In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Folder the CSV files are loaded from. Swap the two assignments to profile the
# raw data instead of the cleaned data.
# ROOT_FOLDER = "NYPL-menus"
ROOT_FOLDER = "NYPL-menus-cleaned"

# Positions of each table inside the `dataset` list. The loader appends the CSV
# files in sorted filename order (Dish, Menu, MenuItem, MenuPage), so these
# indices must follow that same order.
DISH, MENU, MENU_ITEM, MENU_PAGE = range(4)

# Data Profiling Workflow Instructions

## Accessing Datasets

Each dataset can be accessed from the `dataset` list using the following index constants:

```python
dataset[MENU]          # Menu data
dataset[MENU_PAGE]     # Menu page data
dataset[MENU_ITEM]     # Menu item data
dataset[DISH]          # Dish data
```

## Profiling the Cleaned Dataset

Update the ROOT_FOLDER variable to point to your cleaned data directory:

```python
ROOT_FOLDER = "NYPL-menus-cleaned"  # Path to cleaned data
```

In [2]:
# Load every CSV found in ROOT_FOLDER into `dataset`, one DataFrame per file.
# Files are visited in sorted order, so the list positions are determined by
# the file names; every directory entry is printed, CSV or not.
dataset = []

for filename in sorted(Path(ROOT_FOLDER).iterdir()):
    print(filename)
    if not filename.name.endswith(".csv"):
        continue
    # Empty strings are read as missing values (NaN).
    dataset.append(pd.read_csv(filename, na_values=[""]))


NYPL-menus-cleaned\Dish_fixed.csv
NYPL-menus-cleaned\Menu_fixed.csv
NYPL-menus-cleaned\MenuItem_fixed.csv
NYPL-menus-cleaned\MenuPage_fixed.csv


In [3]:
# IC 2: Date outlier in Menu
# A row is reported when call_number and date are both present, the first four
# characters of call_number are numeric, the date starts with something other
# than 18 or 19, and the two 4-character prefixes differ.

menu_df = dataset[MENU]

# First four characters of each field; missing values stay missing.
menu_df["call_prefix"] = menu_df["call_number"].str[:4]
menu_df["date_prefix"] = menu_df["date"].str[:4]

has_call_number = menu_df["call_number"].notna()
has_date = menu_df["date"].notna()
call_prefix_is_numeric = menu_df["call_prefix"].str.isnumeric()
# Either the date does not start with "1", or it starts with "1" followed by a
# character other than "8" or "9".
date_not_18_or_19 = (
    menu_df["date"].str.match(r"[^1]+") |
    menu_df["date"].str.match(r"[1][^89]+")
)
prefixes_differ = menu_df["call_prefix"] != menu_df["date_prefix"]

ic2_violations = menu_df[
    has_call_number &
    has_date &
    call_prefix_is_numeric &
    date_not_18_or_19 &
    prefixes_differ
]

print(f"Before Cleaning Applied: {len(ic2_violations)}")
ic2_violations[["id", "call_number", "date", "call_prefix", "date_prefix"]]

Before Cleaning Applied: 0


Unnamed: 0,id,call_number,date,call_prefix,date_prefix


In [4]:
# IC 3: Date blank with call number year in Menu
# A row is reported when the date is missing although the call_number is
# present and its first four characters are numeric.

menu_df = dataset[MENU]

# First four characters of each field; missing values stay missing.
menu_df["call_prefix"] = menu_df["call_number"].str[:4]
menu_df["date_prefix"] = menu_df["date"].str[:4]

has_call_number = menu_df["call_number"].notna()
date_is_blank = menu_df["date"].isna()
call_prefix_is_numeric = menu_df["call_prefix"].str.isnumeric()

ic3_violations = menu_df[has_call_number & date_is_blank & call_prefix_is_numeric]

print(f"Violations found: {len(ic3_violations)}")
ic3_violations[["id", "call_number", "date", "call_prefix", "date_prefix"]].head(20)

Violations found: 0


Unnamed: 0,id,call_number,date,call_prefix,date_prefix


In [5]:
# IC 4: Date blank with no date info in call number in Menu
# A row is reported when the date is missing, the call_number is present, and
# the call_number starts with one of the listed 4-character prefixes.

menu_df = dataset[MENU]

# First four characters of each field; missing values stay missing.
menu_df["call_prefix"] = menu_df["call_number"].str[:4]
menu_df["date_prefix"] = menu_df["date"].str[:4]

# call_number prefixes this check looks for (compared case-sensitively).
ic4_call_prefixes = ['Zand', 'Soet', 'soet', 'Bara', '_wot']

has_call_number = menu_df["call_number"].notna()
date_is_blank = menu_df["date"].isna()
prefix_is_listed = menu_df["call_prefix"].isin(ic4_call_prefixes)

ic4_violations = menu_df[has_call_number & date_is_blank & prefix_is_listed]

print(f"Violations found: {len(ic4_violations)}")
ic4_violations

Violations found: 0


Unnamed: 0,id,name,sponsor,event,venue,place,physical_description,occasion,notes,call_number,...,date,location,location_type,currency,currency_symbol,status,page_count,dish_count,call_prefix,date_prefix


In [6]:
# IC 5: Date range outside of 1890-1970 in Menu
# Reports the ids of menus whose date year is earlier than 1890 or later than
# 1970. Menus without a usable year are not reported here.
IC5_MIN_YEAR = 1890
IC5_MAX_YEAR = 1970

# errors="coerce" turns blank or non-numeric year prefixes into NaN instead of
# raising, so the check also runs on data that still contains missing dates.
menu_year = pd.to_numeric(dataset[MENU]["date"].str[:4], errors="coerce")

# The range test is parenthesised as a unit: `&` binds tighter than `|`, so
# without the parentheses the notna() guard would only apply to the lower bound.
ic5_violations = dataset[MENU][
    menu_year.notna() & (
        (menu_year < IC5_MIN_YEAR) |
        (menu_year > IC5_MAX_YEAR)
    )
]["id"]

print(f"Violations found: {len(ic5_violations)}")
print(f"Menu dataset size: {len(dataset[MENU])}")
print(f"MenuPage dataset size: {len(dataset[MENU_PAGE])}")
print(f"MenuItem dataset size: {len(dataset[MENU_ITEM])}")
dataset[MENU].loc[ic5_violations.index]

Violations found: 1247
Menu dataset size: 17545
MenuPage dataset size: 66937
MenuItem dataset size: 1332726


Unnamed: 0,id,name,sponsor,event,venue,place,physical_description,occasion,notes,call_number,...,date,location,location_type,currency,currency_symbol,status,page_count,dish_count,call_prefix,date_prefix
39,12503,,POLICE DEPARTMENT OF THE CITY OF NEW YORK,SEVENTH ANNUAL DINNER,GOVT;,DELMONICO'S,FOL; 4.75 x 7.25;,,SEAL ON COVER; FRENCH; INCLUDES WINES SERVED W...,1888-0010,...,1888-01-23,Police Department Of The City Of New York,,,,complete,3,30,1888,1888
48,12515,,THE ALBANY,LUNCH,?,"DENVER, COLO;",CARD;3.5 X 5;,,,1888-0611,...,1888-10-15,The Albany,,,,complete,2,30,1888,1888
49,12516,,REVERE HOUSE,COMPLIMENTARY BANQUET GIVEN BY THE CITY GOVERN...,RESTAURANT,"BOSTON, MA",BROADSIDE; ILLUS; 4.25 X 11.75,,MENU PRINTED IN BLACK ON CREAM SILK RIBBON WIT...,1865-0001,...,1865-09-28,Parker House,,Dollars,$,complete,4,422,1865,1865
147,12635,,,,,,,,,,...,1888-10-15,The Albany,,,,complete,2,30,,1888
148,12636,,,,,,,,,,...,1865-06-09,Revere House,,Dollars,$,complete,4,403,,1865
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11875,29396,,Hotel Mas De Vence,,,,19x14cm folded; 19x28cm open,,,1995-0016_wotm,...,1995-07-03,Hotel Mas De Vence,,,,complete,3,4,1995,1995
11877,29398,,Outpost,,,,21.5x12cm folded; 21.5x35.5cm open,,,1995-0006_wotm,...,1995-04-22,Outpost,,Dollars,$,complete,3,59,1995,1995
11878,29399,,Que Pasta's,,,,21.5x18cm folded; 21.5x35.5cm open,,,1988-0001_wotm,...,1988-01-01,Que Pasta's,,Dollars,$,complete,3,64,1988,1988
11880,29401,,Le Riveria,,,,21x15cm folded; 21x30cm open,,,1990-0012_wotm,...,1990-06-06,Le Riveria,,,,complete,3,7,1990,1990


In [7]:
# IC 6: Date blank in Dish with Dependency on Menu
# For every dish, derive the earliest and latest menu year it appears on
# (MenuItem -> MenuPage -> Menu) and compare them with the first_appeared /
# last_appeared values stored in Dish.

# Menu year as a number. errors="coerce" maps blank or non-numeric prefixes to
# NaN instead of raising; such menus are skipped by the min/max aggregation.
dataset[MENU]["date_prefix"] = pd.to_numeric(
    dataset[MENU]["date"].str[:4], errors="coerce"
)

# Only the join keys and the year are needed, so the lookups are trimmed before
# joining them onto the (large) MenuItem table.
page_to_menu = dataset[MENU_PAGE][["id", "menu_id"]].rename(
    columns={"id": "menu_page_id"}
)
menu_to_year = dataset[MENU][["id", "date_prefix"]].rename(
    columns={"id": "menu_id"}
)

dish_appearances = (
    dataset[MENU_ITEM][["dish_id", "menu_page_id"]]
    .merge(page_to_menu, on="menu_page_id")
    .merge(menu_to_year, on="menu_id")
    .groupby("dish_id")["date_prefix"]
    .agg(["min", "max"])
    .reset_index()
    .rename(columns={"min": "calc_first", "max": "calc_last"})
)

# Left join keeps dishes that never appear on a dated menu (calc_* stay NaN).
dishes = dataset[DISH].merge(
    dish_appearances, left_on="id", right_on="dish_id", how="left")

# first_appeared is a violation when a year can be derived from the menus but
# the stored value is blank, 0, 1, or later than the derived first year.
ic6_violations_6_first = dishes[
    dishes["calc_first"].notna() & (
        dishes["first_appeared"].isna() |
        (dishes["first_appeared"] == 0) |
        (dishes["first_appeared"] == 1) |
        (dishes["first_appeared"] > dishes["calc_first"])
    )
]

# last_appeared is a violation when a year can be derived from the menus but
# the stored value is blank, 0, 2928, or earlier than the derived last year.
ic6_violations_6_last = dishes[
    dishes["calc_last"].notna() & (
        dishes["last_appeared"].isna() |
        (dishes["last_appeared"] == 0) |
        (dishes["last_appeared"] == 2928) |
        (dishes["last_appeared"] < dishes["calc_last"])
    )
]

print(f"Violations found: {len(ic6_violations_6_first) + len(ic6_violations_6_last)}")
ic6_violations_6_first[
    ["id", "name", "first_appeared", "last_appeared", "calc_first", "calc_last"]
]

Violations found: 0


Unnamed: 0,id,name,first_appeared,last_appeared,calc_first,calc_last


In [8]:
# IC 7: Date zero in Dish with no dependencies on Menu
dish_df = dataset[DISH]
first_is_zero = dish_df["first_appeared"] == 0
last_is_zero = dish_df["last_appeared"] == 0

# Dishes whose first_appeared or last_appeared is 0.
ic7_violations_1 = dish_df[first_is_zero | last_is_zero]

# Follow the flagged dishes through the related tables:
# menu items using those dishes -> pages holding those items -> menus owning those pages.
ic7_violations_2 = dataset[MENU_ITEM][
    dataset[MENU_ITEM]["dish_id"].isin(ic7_violations_1["id"])
]
ic7_violations_3 = dataset[MENU_PAGE][
    dataset[MENU_PAGE]["id"].isin(ic7_violations_2["menu_page_id"])
]
ic7_violations_4 = dataset[MENU][
    dataset[MENU]["id"].isin(ic7_violations_3["menu_id"])
]

# Per-column breakdowns: first_appeared equal to 0 or 1, and last_appeared equal to 0.
ic7_violations_first = dish_df[first_is_zero | (dish_df['first_appeared'] == 1)]
ic7_violations_last = dish_df[last_is_zero]

print(f"Violations found: {len(ic7_violations_1)}")
ic7_violations_1
# Alternative views of the related rows (uncomment one to display it instead):
# dataset[MENU_ITEM].loc[ic7_violations_2.index]
# dataset[MENU_PAGE].loc[ic7_violations_3.index]
# dataset[MENU].loc[ic7_violations_4.index]


Violations found: 0


Unnamed: 0,id,name,description,menus_appeared,times_appeared,first_appeared,last_appeared,lowest_price,highest_price


In [9]:
# IC 8: Temporal consistency in Dish
# A dish is reported when first_appeared is later than last_appeared.
dish_df = dataset[DISH]
first_after_last = dish_df["first_appeared"].gt(dish_df["last_appeared"])
ic8_violations = dish_df.loc[first_after_last]

print(f"Violations found: {len(ic8_violations)}")
ic8_violations.head(10)

Violations found: 0


Unnamed: 0,id,name,description,menus_appeared,times_appeared,first_appeared,last_appeared,lowest_price,highest_price


In [10]:
# Date range percentile in Dish
# Estimates a year range from the distribution of first_appeared / last_appeared
# and reports how many dishes fall entirely inside it.

# Percentiles that bound the reported range; the printed label is derived from
# these values so the message always matches the computation.
LOWER_PERCENTILE = 5
UPPER_PERCENTILE = 98

# Keep only dishes where both years are present (NaN would make np.percentile
# return NaN). Temporal inconsistencies are NOT filtered out here.
valid_dishes = dataset[DISH][
    dataset[DISH]['first_appeared'].notna() &
    dataset[DISH]['last_appeared'].notna()
]

# Range bounds, truncated to whole years.
start_year = int(np.percentile(valid_dishes['first_appeared'], LOWER_PERCENTILE))
end_year = int(np.percentile(valid_dishes['last_appeared'], UPPER_PERCENTILE))

# Count dishes whose whole [first_appeared, last_appeared] span lies in the range.
in_range = valid_dishes[
    (valid_dishes['first_appeared'] >= start_year) &
    (valid_dishes['last_appeared'] <= end_year)
]
coverage = len(in_range) / len(valid_dishes)

print(f"{LOWER_PERCENTILE}th-{UPPER_PERCENTILE}th percentile range: {start_year}-{end_year}")
print(f"Covers {coverage:.1%} of dishes ({len(in_range)}/{len(valid_dishes)})")

90% percentile range: 1896-1997
Covers 93.5% of dishes (396002/423397)


In [None]:
# IC 9: Date range outside of 1880-2000 in Dish
# A dish is reported when both years are present and either first_appeared is
# after 2000 or last_appeared is before 1880.
# NOTE(review): a dish that starts before 1880 but is last seen inside the range
# (or starts inside it and ends after 2000) is not reported - confirm intent.
dish_df = dataset[DISH]

both_years_present = (
    dish_df["first_appeared"].notna() &
    dish_df["last_appeared"].notna()
)
starts_after_range = dish_df["first_appeared"] > 2000
ends_before_range = dish_df["last_appeared"] < 1880

ic9_violations = dish_df[both_years_present & (starts_after_range | ends_before_range)]
print(f"Violations found: {len(ic9_violations)}")
ic9_violations[["id", "name", "first_appeared", "last_appeared"]].head(10)

Violations found: 7065


Unnamed: 0,id,name,first_appeared,last_appeared
2173,2605,SAUTERNES.,1865.0,1865.0
3252,4055,"Soups without meats,",1865.0,1865.0
3269,4078,Bropiled Blue Fish,1865.0,1865.0
3273,4082,"Broiled Cusk, Cream Sauce",1865.0,1865.0
3373,4203,Stewed Eels,1858.0,1865.0
3401,4239,"Baked Cod, Port Sauce",1858.0,1865.0
3424,4265,"Chicken and Pork, White Sauce",1865.0,1865.0
3427,4269,Shoulder Corned Pork,1865.0,1865.0
3459,4308,Stewed Calf's Liver,1865.0,1865.0
3460,4309,"Loin of Veal, Stuffed",1865.0,1865.0


In [None]:
# IC 10: Blank lowest_price and highest_price in Dish
# A dish is reported when both price columns are missing.
dish_df = dataset[DISH]
both_prices_blank = dish_df[["lowest_price", "highest_price"]].isna().all(axis=1)

ic10_violations_1 = dish_df[both_prices_blank]
print(f"Violations found: {len(ic10_violations_1)}")
ic10_violations_1[["id", "name", "lowest_price", "highest_price"]].head(10)

Violations found: 29100


Unnamed: 0,id,name,lowest_price,highest_price
32,34,Russian Caviare on Toast,,
35,39,Potage a la Victoria,,
55,60,Hafergrutze,,
58,63,Apfelsinen,,
60,65,Milchreis,,
79,87,Hot or cold ribs of beef,,
127,135,Consomme aux Quenelle's,,
128,136,Milk rice,,
161,170,Baked Stuffed Mullet & Sauce Pomard,,
293,346,Grilled Mutton Chops,,


In [None]:
# IC 10: Blank lowest_price or highest_price in Dish with Dependency on Menu Item
# A dish is reported when one of its stored prices is blank although a price
# can be derived from the MenuItem rows that reference it.

# Lowest and highest price recorded for each dish across all menu items.
menu_item_price = (
    dataset[MENU_ITEM][['dish_id', 'price']]
    .groupby('dish_id')['price']
    .agg(['min', 'max'])
    .reset_index()
    .rename(columns={'min': 'calc_lowest', 'max': 'calc_highest'})
)

# Attach the derived prices to Dish (later cells read calc_lowest/calc_highest).
# Columns left by a previous run of this cell are dropped first, so re-running
# does not create *_x / *_y duplicates and break the lookups below.
merged_price_columns = ['dish_id', 'calc_lowest', 'calc_highest']
dataset[DISH] = (
    dataset[DISH]
    .drop(columns=merged_price_columns, errors='ignore')
    .merge(menu_item_price, left_on='id', right_on='dish_id', how='left')
)

# Either stored price is blank (lowest OR highest) while MenuItem has a price.
ic10_violations_2 = dataset[DISH][
    (dataset[DISH]["lowest_price"].isna() | dataset[DISH]["highest_price"].isna()) &
    (dataset[DISH]["calc_lowest"].notna() | dataset[DISH]["calc_highest"].notna())
]

print(f"Violations found: {len(ic10_violations_2)}")
dataset[DISH].loc[ic10_violations_2.index]

Violations found: 136


Unnamed: 0,id,menu_page_id,price,high_price,dish_id,created_at,updated_at,xpos,ypos
15858,17901,6662,0.40,,466153.0,2011-04-21 14:36:32 UTC,2017-01-17 21:48:21 UTC,0.300000,0.236921
304296,320363,43733,0.25,,420362.0,2011-06-03 14:52:50 UTC,2014-04-19 22:19:43 UTC,0.751429,0.480178
327103,343755,42018,0.30,,200117.0,2011-06-07 21:57:34 UTC,2014-03-17 17:13:42 UTC,0.565714,0.446548
434680,455897,33946,0.25,,492912.0,2011-07-12 20:00:16 UTC,2016-08-30 01:38:40 UTC,0.265714,0.532618
466802,488990,44633,5.25,,491556.0,2011-07-29 22:21:52 UTC,2017-02-25 09:51:33 UTC,0.522857,0.278766
...,...,...,...,...,...,...,...,...,...
1332282,1385421,75531,0.20,,374348.0,2016-12-27 04:22:42 UTC,2016-12-27 04:22:42 UTC,0.342667,0.870699
1332284,1385423,75531,0.00,,381296.0,2016-12-27 04:23:05 UTC,2016-12-27 04:23:05 UTC,0.529333,0.884465
1332329,1385468,75428,0.20,,374348.0,2016-12-27 07:54:26 UTC,2016-12-27 07:54:54 UTC,0.674667,0.865621
1332331,1385470,75428,0.00,,381296.0,2016-12-27 07:55:22 UTC,2016-12-27 07:55:22 UTC,0.766667,0.879146


In [None]:
# IC 10: Blank lowest_price and highest_price in Dish with No price info in Menu Item
# A dish is reported when both stored prices and both prices derived from
# MenuItem (calc_lowest / calc_highest, which must already be present on the
# Dish table) are missing.
dish_df = dataset[DISH]
price_columns = ["lowest_price", "highest_price", "calc_lowest", "calc_highest"]
no_price_anywhere = dish_df[price_columns].isna().all(axis=1)

ic10_violations_3 = dish_df[no_price_anywhere]

print(f"Violations found: {len(ic10_violations_3)}")
ic10_violations_3

Violations found: 53500


Unnamed: 0,id,menu_page_id,price,high_price,dish_id,created_at,updated_at,xpos,ypos
32,37,142,,,34.0,2011-03-31 20:38:08 UTC,2011-03-31 20:38:08 UTC,0.352857,0.338290
37,42,142,,,39.0,2011-03-31 21:27:06 UTC,2011-03-31 21:27:06 UTC,0.181429,0.380227
58,65,134,,,60.0,2011-04-06 17:15:07 UTC,2011-04-06 17:15:07 UTC,0.230000,0.503738
61,68,134,,,63.0,2011-04-08 03:55:55 UTC,2011-04-08 03:55:55 UTC,0.205714,0.482629
63,70,134,,,65.0,2011-04-08 03:56:52 UTC,2011-04-08 03:56:52 UTC,0.058571,0.503738
...,...,...,...,...,...,...,...,...,...
1332493,1385656,51618,,,36683.0,2017-05-20 21:41:34 UTC,2017-05-20 21:41:34 UTC,0.617333,0.574743
1332513,1385680,26010,,,381048.0,2017-05-31 03:46:33 UTC,2017-05-31 03:46:33 UTC,0.270667,0.661833
1332515,1385682,26010,,,403489.0,2017-05-31 03:52:17 UTC,2017-05-31 03:52:17 UTC,0.420000,0.755181
1332524,1385695,58509,,,188087.0,2017-06-02 13:08:26 UTC,2017-06-02 13:08:26 UTC,0.301333,0.740746
