In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

# Folder the raw CSVs are read from. Swap in the commented line below to run
# the workflow against previously exported output instead.
ROOT_FOLDER = "NYPL-menus"
# ROOT_FOLDER = "NYPL-menus-cleaned"

# Folder the cleaned CSVs are written to; created up front if it is missing.
OUTPUT_FOLDER = "NYPL-menus-cleaned"
Path(OUTPUT_FOLDER).mkdir(parents=True, exist_ok=True)

# Output file names, one per loaded table; populated by the loading cell.
OUTPUT_FILE = list()

# Positions of each table inside the `dataset` list.
DISH, MENU, MENU_ITEM, MENU_PAGE = range(4)

# Data Cleaning Workflow Instructions

## Accessing Datasets

Each dataset can be accessed from the `dataset` list using the index constants defined above:

```python
dataset[MENU]          # Menu data
dataset[MENU_PAGE]     # Menu page data
dataset[MENU_ITEM]     # Menu item data
dataset[DISH]          # Dish data
```

## Cleaning the Cleaned Dataset

To re-run this workflow on data that has already been cleaned, set the `ROOT_FOLDER` variable to the cleaned data directory:

```python
ROOT_FOLDER = "NYPL-menus-cleaned"  # Path to cleaned data
```

In [2]:
# Load every CSV found in ROOT_FOLDER, in alphabetical order, into `dataset`,
# and record the matching export name at the same position in OUTPUT_FILE.
# The DISH / MENU / MENU_ITEM / MENU_PAGE index constants rely on this order.
dataset = []
# Reset in place so re-running this cell does not append duplicate names.
OUTPUT_FILE.clear()

print(f"Loading datasets from {ROOT_FOLDER}...")
for filename in sorted(Path(ROOT_FOLDER).iterdir()):
    # Filter *before* touching OUTPUT_FILE: a stray non-CSV file would
    # otherwise add a name with no frame and shift every later output name.
    if not filename.name.endswith(".csv"):
        continue
    print(f"Loading {filename.name}...")
    OUTPUT_FILE.append(f"{filename.stem}_fixed.csv")
    # Empty strings are read as missing values.
    dataset.append(pd.read_csv(filename, na_values=[""]))

Loading datasets from NYPL-menus...
Loading Dish.csv...
Loading Menu.csv...
Loading MenuItem.csv...
Loading MenuPage.csv...


In [3]:
# IC 1: Temporal consistency in Dish
# Flag dishes whose first_appeared value is greater than last_appeared.
dishes = dataset[DISH]
first_after_last = dishes["first_appeared"] > dishes["last_appeared"]
ic1_violations = dishes[first_after_last]

print(f"Violations found: {len(ic1_violations)}")
ic1_violations.head(10)

Violations found: 6


Unnamed: 0,id,name,description,menus_appeared,times_appeared,first_appeared,last_appeared,lowest_price,highest_price
131193,164029,Clear beef broth,,0,1,1900,0,0.25,0.25
163257,204888,Hot roast beef with gravy,,0,1,1900,0,0.25,0.25
197050,250693,SURI LEBERLI - Shredded Calf's Liver Flambe in...,,0,1,1945,0,,
197053,250699,"SWISS MINCED VEAL, ROESTI",,0,1,1945,0,,
237740,301736,Cafe Glacee,,0,2,1940,0,0.4,0.4
244534,309629,Garlic Butter,,0,1,1947,0,0.4,0.4


In [4]:
# IC 1: Temporal consistency in Dish cleaning
# Repair each flagged row by copying first_appeared into last_appeared.
ic1_rows = ic1_violations.index
dataset[DISH].loc[ic1_rows, "last_appeared"] = dataset[DISH].loc[ic1_rows, "first_appeared"]

# The count printed is the number of rows flagged before cleaning.
print(f"After Cleaning Applied: {len(ic1_violations)}")
dataset[DISH].loc[ic1_rows]

After Cleaning Applied: 6


Unnamed: 0,id,name,description,menus_appeared,times_appeared,first_appeared,last_appeared,lowest_price,highest_price
131193,164029,Clear beef broth,,0,1,1900,1900,0.25,0.25
163257,204888,Hot roast beef with gravy,,0,1,1900,1900,0.25,0.25
197050,250693,SURI LEBERLI - Shredded Calf's Liver Flambe in...,,0,1,1945,1945,,
197053,250699,"SWISS MINCED VEAL, ROESTI",,0,1,1945,1945,,
237740,301736,Cafe Glacee,,0,2,1940,1940,0.4,0.4
244534,309629,Garlic Butter,,0,1,1947,1947,0.4,0.4


In [5]:
# IC 2: Date outlier in Menu
menus = dataset[MENU]

# Helper columns: the first four characters of call_number and of date
# (missing values stay missing).
menus["call_prefix"] = menus["call_number"].str[:4]
menus["date_prefix"] = menus["date"].str[:4]

# A row is flagged when both fields are present, the call-number prefix is
# numeric, the date does not begin with "1", and the two prefixes disagree.
call_number_present = menus["call_number"].notna()
date_present = menus["date"].notna()
prefix_is_numeric = menus["call_prefix"].str.isnumeric()
date_not_starting_with_1 = menus["date"].str.match(r"[^1]+")
prefixes_differ = menus["call_prefix"] != menus["date_prefix"]

ic2_violations = menus[
    call_number_present
    & date_present
    & prefix_is_numeric
    & date_not_starting_with_1
    & prefixes_differ
]

print(f"Before Cleaning Applied: {len(ic2_violations)}")
ic2_columns = ["id", "call_number", "date", "call_prefix", "date_prefix"]
menus.loc[ic2_violations.index][ic2_columns]

Before Cleaning Applied: 4


Unnamed: 0,id,call_number,date,call_prefix,date_prefix
525,13112,1900-2328,0190-03-06,1900,190
14659,32265,1918-0387_wotm,2928-03-26,1918,2928
16917,34727,1912-0667_wotm,0001-01-01,1912,1
16918,34728,1912-0668_wotm,0001-01-01,1912,1


In [6]:

# IC 2: Date outlier in Menu Cleaning
# Replace the first four characters of each flagged date with the call-number
# prefix, keeping everything after them unchanged.
ic2_rows = ic2_violations.index
year_from_call_number = dataset[MENU].loc[ic2_rows, "call_prefix"]
rest_of_date = dataset[MENU].loc[ic2_rows, "date"].str[4:]
dataset[MENU].loc[ic2_rows, "date"] = year_from_call_number + rest_of_date

# The count printed is the number of rows flagged before cleaning.
print(f"After Cleaning Applied: {len(ic2_violations)}")
dataset[MENU].loc[ic2_rows][["id", "call_number", "date", "call_prefix", "date_prefix"]]

After Cleaning Applied: 4


Unnamed: 0,id,call_number,date,call_prefix,date_prefix
525,13112,1900-2328,1900-03-06,1900,190
14659,32265,1918-0387_wotm,1918-03-26,1918,2928
16917,34727,1912-0667_wotm,1912-01-01,1912,1
16918,34728,1912-0668_wotm,1912-01-01,1912,1


In [7]:
# IC 3: Date blank with call number year in Menu
menus = dataset[MENU]

# Refresh the helper columns (first four characters of call_number / date).
menus["call_prefix"] = menus["call_number"].str[:4]
menus["date_prefix"] = menus["date"].str[:4]

# Flag rows with a call number whose prefix is numeric but whose date is missing.
call_number_present = menus["call_number"].notna()
date_missing = menus["date"].isna()
prefix_is_numeric = menus["call_prefix"].str.isnumeric()
ic3_violations = menus[call_number_present & date_missing & prefix_is_numeric]

print(f"Violations found: {len(ic3_violations)}")
ic3_columns = ["id", "call_number", "date", "call_prefix", "date_prefix"]
ic3_violations[ic3_columns].head(20)

Violations found: 47


Unnamed: 0,id,call_number,date,call_prefix,date_prefix
465,13042,1900-2517,,1900,
4598,20978,1906-783,,1906,
5025,21467,1886-036,,1886,
5400,21969,1887-028,,1887,
8633,25998,1900-189,,1900,
8732,26119,1899-606,,1899,
10093,27576,1910-881,,1910,
10426,27912,1973-0020_wotm,,1973,
10526,28012,1977-0004_wotm,,1977,
10560,28062,1978-0021_wotm,,1978,


In [8]:
# IC 3: Date blank with call number year in Menu Cleaning
# Build a placeholder date from the call-number prefix plus "-01-01".
ic3_rows = ic3_violations.index
placeholder_dates = dataset[MENU].loc[ic3_rows, "call_prefix"] + "-01-01"
dataset[MENU].loc[ic3_rows, "date"] = placeholder_dates

# The count printed is the number of rows flagged before cleaning.
print(f"After Cleaning Applied: {len(ic3_violations)}")
dataset[MENU].loc[ic3_rows][["id", "call_number", "date", "call_prefix", "date_prefix"]]

After Cleaning Applied: 47


Unnamed: 0,id,call_number,date,call_prefix,date_prefix
465,13042,1900-2517,1900-01-01,1900,
4598,20978,1906-783,1906-01-01,1906,
5025,21467,1886-036,1886-01-01,1886,
5400,21969,1887-028,1887-01-01,1887,
8633,25998,1900-189,1900-01-01,1900,
8732,26119,1899-606,1899-01-01,1899,
10093,27576,1910-881,1910-01-01,1910,
10426,27912,1973-0020_wotm,1973-01-01,1973,
10526,28012,1977-0004_wotm,1977-01-01,1977,
10560,28062,1978-0021_wotm,1978-01-01,1978,


In [9]:
# IC 4: Date blank with no date info in call number in Menu
menus = dataset[MENU]

# Refresh the helper columns (first four characters of call_number / date).
menus["call_prefix"] = menus["call_number"].str[:4]
menus["date_prefix"] = menus["date"].str[:4]

# Flag rows with a missing date whose call-number prefix is one of the known
# non-year prefixes listed here.
undated_prefixes = ['Zand', 'Soet', 'soet', 'Bara', '_wot']
call_number_present = menus["call_number"].notna()
date_missing = menus["date"].isna()
prefix_is_undated = menus["call_prefix"].isin(undated_prefixes)
ic4_violations = menus[call_number_present & date_missing & prefix_is_undated]

print(f"Violations found: {len(ic4_violations)}")
menus.loc[ic4_violations.index]

Violations found: 536


Unnamed: 0,id,name,sponsor,event,venue,place,physical_description,occasion,notes,call_number,...,date,location,location_type,currency,currency_symbol,status,page_count,dish_count,call_prefix,date_prefix
8913,26347,,Pan American,,AIRLINE,,Folder; 8.5 x 6 inches,,Dieter Zander Collection; cover is an illustra...,Zander 11,...,,Pan American,,,,complete,3,30,Zand,
8915,26349,,American Export Lines,Captain's Dinner,STEAMSHIP,S.S. Independence,Booklet; 8.5 x 11.5 inches,,"Dieter Zander Collection; captain, officers, a...",Zander 13,...,,American Export Lines,,,,complete,4,28,Zand,
8918,26352,,American Airlines,,AIRLINE,,Folded set of postcards; 7 x 4.5 in. folded; 7...,,Dieter Zander Collection; Americana themed; sc...,Zander 16,...,,American Airlines,,,,complete,3,34,Zand,
8921,26355,,Pan American,,AIRLINE,,Folder; 11 x 9 inches,,Dieter Zander Collection; cover is a watercolo...,Zander 19 undated,...,,Pan American,,,,complete,3,68,Zand,
8922,26356,,Pan American,,AIRLINE,,Tri-fold; 9.75 x 13.25 in. folded; 9.75 x 26 i...,,"Dieter Zander Collection, cover is a watercolo...",Zander 21 undated,...,,Pan American,,,,complete,3,47,Zand,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10224,27708,,Dinner held by Pennsylvania Railroad (Railroad...,,,,10 x 7 in. fold. ; ill.,,,Soete 235A-B,...,,Pennsylvania Railroad,,Dollars,$,complete,4,101,Soet,
10225,27709,,Dinner held by Pennsylvania Railroad (Railroad...,,,,10 x 7 in. fold. ; ill.,,,Soete 237A-C,...,,Pennsylvania Railroad,,Dollars,$,complete,5,91,Soet,
10226,27710,,Dinner held by Pennsylvania Railroad,,,,,,,Soete 209A,...,,Dinner Held By Pennsylvania Railroad,,Dollars,$,complete,2,101,Soet,
10228,27712,,Exec Committe California Midwinter Int Exposition,,,,,,,soete 56,...,,Exec Committe California Midwinter Int Exposition,,,,complete,3,0,soet,


In [10]:

# IC 4: Date blank with no date info in call number in Menu Cleaning
# Fill missing dates from the neighbouring rows: forward-fill first, then
# back-fill whatever is still missing at the top of the column.
# The former trailing .interpolate(method='nearest') was dropped: the column
# holds strings (object dtype), for which interpolate is deprecated, and it
# had nothing left to fill once ffill/bfill had run.
# NOTE: this fills *every* missing date in the column, not only the IC 4 rows.
dataset[MENU]["date"] = dataset[MENU]["date"].ffill().bfill()

# The count printed is the number of rows flagged before cleaning.
print(f"After Cleaning Applied: {len(ic4_violations)}")
dataset[MENU].loc[ic4_violations.index][["id", "call_number", "date", "call_prefix", "date_prefix"]]

After Cleaning Applied: 536


  dataset[MENU]["date"] = dataset[MENU]["date"].ffill().bfill().interpolate(method='nearest')


Unnamed: 0,id,call_number,date,call_prefix,date_prefix
8913,26347,Zander 11,1989-01-01,Zand,
8915,26349,Zander 13,1969-12-01,Zand,
8918,26352,Zander 16,1952-01-01,Zand,
8921,26355,Zander 19 undated,1963-07-29,Zand,
8922,26356,Zander 21 undated,1963-07-29,Zand,
...,...,...,...,...,...
10224,27708,Soete 235A-B,1919-05-12,Soet,
10225,27709,Soete 237A-C,1919-05-12,Soet,
10226,27710,Soete 209A,1919-05-12,Soet,
10228,27712,soete 56,1931-11-01,soet,


In [None]:
# IC 5: Date range outside of 1890-1970 in Menu
# Parse the leading four characters of the date as a year. errors="coerce"
# turns missing or non-numeric prefixes into NaN instead of raising.
menu_year = pd.to_numeric(dataset[MENU]["date"].str[:4], errors="coerce")

# Parenthesise the range test explicitly: `&` binds tighter than `|`, so
# without the parentheses the notna() guard only covered the "< 1890" clause.
year_out_of_range = menu_year.notna() & ((menu_year < 1890) | (menu_year > 1970))

# ids of the offending menus; the Series index still holds their row labels.
ic5_violations = dataset[MENU].loc[year_out_of_range, "id"]

print(f"Violations found: {len(ic5_violations)}")
print(f"Menu dataset size: {len(dataset[MENU])}")
print(f"MenuPage dataset size: {len(dataset[MENU_PAGE])}")
print(f"MenuItem dataset size: {len(dataset[MENU_ITEM])}")
dataset[MENU].loc[ic5_violations.index]

Violations found: 1248
Menu dataset size: 17545
MenuPage dataset size: 66937
MenuItem dataset size: 1332726
Dish dataset size: 423397


Unnamed: 0,id,name,sponsor,event,venue,place,physical_description,occasion,notes,call_number,...,date,location,location_type,currency,currency_symbol,status,page_count,dish_count,call_prefix,date_prefix
39,12503,,POLICE DEPARTMENT OF THE CITY OF NEW YORK,SEVENTH ANNUAL DINNER,GOVT;,DELMONICO'S,FOL; 4.75 x 7.25;,,SEAL ON COVER; FRENCH; INCLUDES WINES SERVED W...,1888-0010,...,1888-01-23,Police Department Of The City Of New York,,,,complete,3,30,1888,1888
48,12515,,THE ALBANY,LUNCH,?,"DENVER, COLO;",CARD;3.5 X 5;,,,1888-0611,...,1888-10-15,The Albany,,,,complete,2,30,1888,1888
49,12516,,REVERE HOUSE,COMPLIMENTARY BANQUET GIVEN BY THE CITY GOVERN...,RESTAURANT,"BOSTON, MA",BROADSIDE; ILLUS; 4.25 X 11.75,,MENU PRINTED IN BLACK ON CREAM SILK RIBBON WIT...,1865-0001,...,1865-09-28,Parker House,,Dollars,$,complete,4,422,1865,1865
147,12635,,,,,,,,,,...,1888-10-15,The Albany,,,,complete,2,30,,1888
148,12636,,,,,,,,,,...,1865-06-09,Revere House,,Dollars,$,complete,4,403,,1865
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11875,29396,,Hotel Mas De Vence,,,,19x14cm folded; 19x28cm open,,,1995-0016_wotm,...,1995-07-03,Hotel Mas De Vence,,,,complete,3,4,1995,1995
11877,29398,,Outpost,,,,21.5x12cm folded; 21.5x35.5cm open,,,1995-0006_wotm,...,1995-04-22,Outpost,,Dollars,$,complete,3,59,1995,1995
11878,29399,,Que Pasta's,,,,21.5x18cm folded; 21.5x35.5cm open,,,1988-0001_wotm,...,1988-01-01,Que Pasta's,,Dollars,$,complete,3,64,1988,1988
11880,29401,,Le Riveria,,,,21x15cm folded; 21x30cm open,,,1990-0012_wotm,...,1990-06-06,Le Riveria,,,,complete,3,7,1990,1990


Violations found: 1248<br>
Menu dataset size: 17545<br>
MenuPage dataset size: 66937<br>
MenuItem dataset size: 1332726<br>

In [None]:
# IC 5: Date range outside of 1890-1970 in Menu Cleaning
# Keep only the menus whose id was not flagged by IC 5.
menu_is_flagged = dataset[MENU]["id"].isin(ic5_violations)
dataset[MENU] = dataset[MENU][~menu_is_flagged]

# Cascading the removal to MenuPage / MenuItem is disabled for now
# (unlinked items), so only the Menu table shrinks in this cell.
# NOTE(review): the disabled MenuItem filter negates isin(), which would keep
# only items whose page is *absent* from MenuPage -- confirm before re-enabling.
# dataset[MENU_PAGE] = dataset[MENU_PAGE][~dataset[MENU_PAGE]['menu_id'].isin(ic5_violations)]
# dataset[MENU_ITEM] = dataset[MENU_ITEM][~dataset[MENU_ITEM]['menu_page_id'].isin(dataset[MENU_PAGE]['id'])]

# The first count is the number of menus flagged before cleaning.
print(f"After Cleaning Applied: {len(ic5_violations)}")
for table_label, table_index in (("Menu", MENU), ("MenuPage", MENU_PAGE), ("MenuItem", MENU_ITEM)):
    print(f"{table_label} dataset size: {len(dataset[table_index])}")

After Cleaning Applied: 1248
Menu dataset size: 16297
MenuPage dataset size: 61868
MenuItem dataset size: 90151


In [15]:
# IC 5: Clean up (remove dishes that no longer appear in any menu items)
print(f"Dish dataset size: {len(dataset[DISH])}")

print("Cleaning up Dish dataset...")
# Distinct dish ids still referenced by at least one MenuItem row.
active_dish_ids = dataset[MENU_ITEM]['dish_id'].unique()
dish_is_active = dataset[DISH]['id'].isin(active_dish_ids)
dataset[DISH] = dataset[DISH].loc[dish_is_active]

print(f"Dish dataset size: {len(dataset[DISH])}")

Dish dataset size: 423397
Cleaning up Dish dataset...
Dish dataset size: 67463


In [None]:
# Export the cleaned dataset
# Remove the helper columns added by the IC 2-4 checks. Reassigning the result
# (rather than inplace=True) together with errors="ignore" makes this cell safe
# to re-run: a second run no longer raises KeyError for the missing columns.
dataset[MENU] = dataset[MENU].drop(columns=["date_prefix", "call_prefix"], errors="ignore")

# OUTPUT_FILE[i] is the export name recorded for dataset[i] by the loading cell.
for i, frame in enumerate(dataset):
    frame.to_csv(Path(OUTPUT_FOLDER) / OUTPUT_FILE[i], index=False)