In [10]:
import pandas as pd
import numpy as np
from pathlib import Path

# Directory holding the CSV exports to profile. Set it to "NYPL-menus" to
# profile the raw data instead of the cleaned copy.
ROOT_FOLDER = "NYPL-menus-cleaned"

# Integer positions of each table inside the `dataset` list that the load
# cell fills in sorted file order.
DISH, MENU, MENU_ITEM, MENU_PAGE = range(4)

# Data Profiling Workflow Instructions

## Accessing Datasets

Each table is stored in the `dataset` list and is accessed by indexing the list with the corresponding integer constant:

```python
dataset[MENU]          # Menu data
dataset[MENU_PAGE]     # Menu page data
dataset[MENU_ITEM]     # Menu item data
dataset[DISH]          # Dish data
```

## Profiling the Cleaned Dataset

Update the ROOT_FOLDER variable to point to your cleaned data directory:

```python
ROOT_FOLDER = "NYPL-menus-cleaned"  # Path to cleaned data
```

In [11]:
# Load every CSV in ROOT_FOLDER into `dataset`, one DataFrame per file.
#
# The DISH / MENU / MENU_ITEM / MENU_PAGE constants index this list by
# position, so the load order has to be identical on every platform. A plain
# sorted() over Path objects compares case-insensitively on Windows but
# case-sensitively on POSIX, where "MenuItem_fixed.csv" sorts before
# "Menu_fixed.csv" ("I" < "_") and the constants would select the wrong
# tables. Sorting on the lower-cased file name yields
# Dish, Menu, MenuItem, MenuPage everywhere.
dataset = []

for filename in sorted(Path(ROOT_FOLDER).iterdir(), key=lambda p: p.name.lower()):
    print(filename)
    # Every directory entry is listed, but only CSV files are loaded.
    if filename.name.endswith(".csv"):
        dataset.append(pd.read_csv(filename, na_values=[""]))


NYPL-menus-cleaned\Dish_fixed.csv
NYPL-menus-cleaned\Menu_fixed.csv
NYPL-menus-cleaned\MenuItem_fixed.csv
NYPL-menus-cleaned\MenuPage_fixed.csv


In [12]:
# IC 1: Temporal consistency in Dish
# A dish is flagged when its first_appeared value is greater than its
# last_appeared value.
ic1_violations = dataset[DISH].loc[
    lambda dishes: dishes["first_appeared"] > dishes["last_appeared"]
]

print(f"Violations found: {len(ic1_violations)}")
ic1_violations.head(10)

Violations found: 6


Unnamed: 0,id,name,description,menus_appeared,times_appeared,first_appeared,last_appeared,lowest_price,highest_price
131193,164029,Clear beef broth,,0,1,1900,0,0.25,0.25
163257,204888,Hot roast beef with gravy,,0,1,1900,0,0.25,0.25
197050,250693,SURI LEBERLI - Shredded Calf's Liver Flambe in...,,0,1,1945,0,,
197053,250699,"SWISS MINCED VEAL, ROESTI",,0,1,1945,0,,
237740,301736,Cafe Glacee,,0,2,1940,0,0.4,0.4
244534,309629,Garlic Butter,,0,1,1947,0,0.4,0.4


In [18]:
# Menu items whose dish_id refers to one of the dishes flagged by IC 1.
dataset[MENU_ITEM].loc[
    lambda items: items["dish_id"].isin(
        dataset[DISH].loc[ic1_violations.index, "id"]
    )
]

Unnamed: 0,id,menu_page_id,price,high_price,dish_id,created_at,updated_at,xpos,ypos
567462,595830,29537,0.25,,164029.0,2011-10-09 19:35:27 UTC,2011-10-09 19:35:27 UTC,0.181429,0.286683
567923,596331,29538,0.25,,204888.0,2011-10-10 00:28:58 UTC,2011-10-10 00:28:58 UTC,0.412857,0.645373
731557,769591,57538,,,250693.0,2012-02-01 07:07:00 UTC,2012-02-01 07:07:00 UTC,0.075714,0.617632
731560,769594,57538,,,250699.0,2012-02-01 07:07:27 UTC,2012-02-01 07:07:27 UTC,0.08,0.657833
746034,784298,58037,0.4,,309629.0,2012-02-06 23:57:54 UTC,2012-02-06 23:57:54 UTC,0.295714,0.88867
797877,838191,60252,0.4,,301736.0,2012-03-09 18:14:00 UTC,2012-03-09 18:14:00 UTC,0.554286,0.957065
803940,844311,60248,0.4,,301736.0,2012-03-12 17:41:51 UTC,2012-03-12 17:41:51 UTC,0.545714,0.968808


In [13]:
# IC 2: Date outlier in Menu
# Where a menu has both a call_number and a date, the leading four characters
# of the two fields are compared.

# Helper columns on the Menu frame: leading four characters of each field.
dataset[MENU]["call_prefix"] = dataset[MENU]["call_number"].str.slice(stop=4)
dataset[MENU]["date_prefix"] = dataset[MENU]["date"].str.slice(stop=4)

# A row is reported only when all of the following hold:
#   - call_number and date are both present,
#   - the call_number prefix is numeric,
#   - the date starts with a character other than "1" (regex below),
#   - the two prefixes differ.
ic2_violations = dataset[MENU].loc[
    lambda menus: menus["call_number"].notna()
    & menus["date"].notna()
    & menus["call_prefix"].str.isnumeric()
    & menus["date"].str.match(r"[^1]+")
    & (menus["call_prefix"] != menus["date_prefix"])
]

print(f"Violations found: {len(ic2_violations)}")
ic2_violations.head()[["id", "call_number", "date", "call_prefix", "date_prefix"]]

Violations found: 0


Unnamed: 0,id,call_number,date,call_prefix,date_prefix


In [14]:
# IC 3: Date blank in Menu
# NOTE(review): the result is stored under the name `ic2_violations`, so
# running this cell replaces the IC 2 result -- confirm whether a separate
# name was intended.

# Helper columns on the Menu frame: leading four characters of each field.
dataset[MENU]["call_prefix"] = dataset[MENU]["call_number"].str.slice(stop=4)
dataset[MENU]["date_prefix"] = dataset[MENU]["date"].str.slice(stop=4)

# Report menus that have a call_number but no date, skipping call numbers
# whose four-character prefix is one of the excluded values.
ic2_violations = dataset[MENU].loc[
    lambda menus: menus["call_number"].notna()
    & menus["date"].isna()
    & ~menus["call_prefix"].isin(["Zand", "Soet", "soet", "Bara", "_wot"])
]

print(f"Violations found: {len(ic2_violations)}")
ic2_violations.head(20)[["id", "call_number", "date", "call_prefix", "date_prefix"]]

Violations found: 0


Unnamed: 0,id,call_number,date,call_prefix,date_prefix


In [15]:
# IC 3 (follow-up): undated menus whose call_number prefix is "Bara"
# NOTE(review): the result is stored under the name `ic2_violations`, so
# running this cell replaces whatever an earlier cell stored there -- confirm
# whether a separate name was intended.

# Helper columns on the Menu frame: leading four characters of each field.
dataset[MENU]["call_prefix"] = dataset[MENU]["call_number"].str.slice(stop=4)
dataset[MENU]["date_prefix"] = dataset[MENU]["date"].str.slice(stop=4)

# Keep menus that have a call_number, have no date, carry the "Bara" prefix,
# and whose call_number prefix differs from the date prefix.
ic2_violations = dataset[MENU].loc[
    lambda menus: menus["call_number"].notna()
    & menus["date"].isna()
    & (menus["call_prefix"] == "Bara")
    & (menus["call_prefix"] != menus["date_prefix"])
]

print(f"Violations found: {len(ic2_violations)}")
# Show the full Menu rows (all columns) for the selected index labels.
dataset[MENU].loc[ic2_violations.index, :]

Violations found: 42


Unnamed: 0,id,name,sponsor,event,venue,place,physical_description,occasion,notes,call_number,...,date,location,location_type,currency,currency_symbol,status,page_count,dish_count,call_prefix,date_prefix
9768,27240,,Library,lunch,RESTAURANT,Library,photocopy (5 pages); 8.5 x 14 inches,,Baratta collection; annotated; appetizers and ...,Baratta 113,...,,Library,,Dollars,$,complete,5,78,Bara,
9769,27241,,Alfredo,dinner,RESTAURANT,Alfredo,folder and photocopy; 8.5 x 14 inches; ink mar...,,"Baratta collection; annotated; ""The Original o...",Baratta 129 (2 copies),...,,Alfredo,,Dollars,$,complete,4,74,Bara,
9771,27243,,J.C's,lunch; dinner,RESTAURANT,J.C's,folder; 9 x 12 inches; crease (length-wise),,Baratta collection; extensive wine list by bot...,Baratta 138,...,,J.C's,,Dollars,$,complete,4,110,Bara,
9774,27246,,The New York Plaza,wine list,HOTEL,The Plaza Hotel,booklet; 8.75 x 12 inches; binding missing two...,,Baratta collection; illustrations (cover); col...,Baratta 39,...,,The New York Plaza,,Dollars,$,complete,21,188,Bara,
9775,27247,,Alfredo,dinner,RESTAURANT,"Alfredo; Citicorp Center, 53rd Street and Lexi...",folders (2); 8 x 14 inches; ink markings (cover),,"Baratta collection; annotated; ""The Original o...",Baratta 46,...,,Alfredo,,Dollars,$,complete,8,53,Bara,
9779,27251,,Gramercy Park Hotel,dinner,"HOTEL,RESTAURANT",Gramercy Park Hotel,folder; 8.25 x 11.75 inches; water stain (bott...,RELIGIOUS HOLIDAY,Baratta collection; extensive wine suggestions...,Baratta 82,...,,Gramercy Park Hotel,,Dollars,$,complete,3,84,Bara,
9789,27261,,Ports O' Call,dinner,RESTAURANT,Ports O' Call; Sheraton-Dallas Hotel by Stephe...,"folder with insert (fuzzy, velvet-like cover);...",,Baratta collection; playful section headings; ...,Baratta 56,...,,Ports O' Call,,Dollars,$,complete,12,196,Bara,
9791,27263,,Gramercy Park Hotel,lunch; dinner,"HOTEL,RESTAURANT",Gramercy Park Hotel; 2 Lexington Avenue at 21s...,letterhead (2 pages); 8.5 x 11 inches; metal f...,,Baratta collection; type written menu on hotel...,Baratta 170,...,,Gramercy Park Hotel,,Dollars,$,complete,2,22,Bara,
9795,27267,,Gramercy Park Hotel,buffet,"HOTEL,RESTAURANT",Gramercy Park Hotel; 2 Lexington Avenue at 21s...,letterhead; 8.5 x 11 inches; bent corner (uppe...,,Baratta collection; type written menu on the b...,Baratta 112,...,,Gramercy Park Hotel,,Dollars,$,complete,2,35,Bara,
9796,27268,,Library Room,lunch,RESTAURANT,Library Room,photocopy (2 pages); 8.5 x 14 inches,,Baratta collection; annotated; salads and cold...,Baratta 114,...,,Library Room,,Dollars,$,complete,2,30,Bara,
