In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

# --- Input / output locations -------------------------------------------
# Folder the CSV tables are read from. Switch to the commented value to run
# the notebook again on files it has already cleaned.
ROOT_FOLDER = "NYPL-menus"
# ROOT_FOLDER = "NYPL-menus-cleaned"

# Folder the cleaned CSV files are written to; created up front if missing.
OUTPUT_FOLDER = "NYPL-menus-cleaned"
Path(OUTPUT_FOLDER).mkdir(parents=True, exist_ok=True)

# Output file names, filled in (in load order) by the loading cell.
OUTPUT_FILE = []

# --- Positions of each table inside the `dataset` list -------------------
# The loading cell appends tables in sorted file-name order
# (Dish, Menu, MenuItem, MenuPage), which is what these indexes encode.
DISH = 0
MENU = 1
MENU_ITEM = 2
MENU_PAGE = 3

# Data Cleaning Workflow Instructions

## Accessing Datasets

Each dataset is stored in the `dataset` list and is accessed by position, using the index constants defined in the first cell:

```python
dataset[MENU]          # Menu data
dataset[MENU_PAGE]     # Menu page data
dataset[MENU_ITEM]     # Menu item data
dataset[DISH]          # Dish data
```

## Cleaning the Cleaned Dataset

Update the ROOT_FOLDER variable to point to your cleaned data directory:

```python
ROOT_FOLDER = "NYPL-menus-cleaned"  # Path to cleaned data
```

In [2]:
# Load every CSV table found in ROOT_FOLDER into `dataset` and record, at the
# same position in OUTPUT_FILE, the file name the cleaned table is saved under.
dataset = []
OUTPUT_FILE = []

print(f"Loading datasets from {ROOT_FOLDER}...")

# Keep only real CSV files so `dataset` and OUTPUT_FILE always have the same
# length: a stray entry such as ".ipynb_checkpoints" would otherwise add a name
# to OUTPUT_FILE without adding a frame to `dataset`, shifting every export.
#
# Sort on the table name with any "_fixed" suffix removed. A plain file-name
# sort places "Menu_fixed.csv" after "MenuItem_fixed.csv"/"MenuPage_fixed.csv"
# on case-sensitive platforms ("_" sorts after uppercase letters), which would
# break the DISH / MENU / MENU_ITEM / MENU_PAGE positions when already cleaned
# files are loaded again.
csv_paths = sorted(
    (path for path in Path(ROOT_FOLDER).iterdir()
     if path.is_file() and path.name.endswith(".csv")),
    key=lambda path: path.name[:-len(".csv")].replace("_fixed", ""),
)

for filename in csv_paths:
    print(f"Loading {filename.name}...")
    # "X.csv" -> "X_fixed.csv"; an input that is already "_fixed" keeps one suffix.
    OUTPUT_FILE.append(filename.name.replace(".csv", "_fixed.csv").replace("_fixed_fixed", "_fixed"))
    # Empty fields are read as missing values.
    dataset.append(pd.read_csv(filename, na_values=[""]))
print("Datasets loaded.\n")

print("output files:")
print(OUTPUT_FILE)

Loading datasets from NYPL-menus...
Loading Dish.csv...
Loading Menu.csv...
Loading MenuItem.csv...
Loading MenuPage.csv...
Datasets loaded.

output files:
['Dish_fixed.csv', 'Menu_fixed.csv', 'MenuItem_fixed.csv', 'MenuPage_fixed.csv']


In [3]:
# IC 2: Date outlier in Menu
#
# Flags menus whose date does not start with "18" or "19" and whose first
# four date characters differ from the (numeric) first four characters of
# the call number.
menu = dataset[MENU]

# Helper columns: leading four characters of call_number and of date
# (missing values stay missing).
menu["call_prefix"] = menu["call_number"].str[:4]
menu["date_prefix"] = menu["date"].str[:4]

has_both = menu["call_number"].notna() & menu["date"].notna()
numeric_call_prefix = menu["call_prefix"].str.isnumeric()
# Date starts with something other than "1", or with "1" followed by
# something other than "8"/"9".
not_18xx_19xx = (
    menu["date"].str.match(r"[^1]+") |
    menu["date"].str.match(r"[1][^89]+")
)
prefix_mismatch = menu["call_prefix"] != menu["date_prefix"]

ic2_violations = menu[has_both & numeric_call_prefix & not_18xx_19xx & prefix_mismatch]

print(f"Before Cleaning Applied: {len(ic2_violations)}")
menu.loc[ic2_violations.index, ["id", "call_number", "date", "call_prefix", "date_prefix"]]

Before Cleaning Applied: 5


Unnamed: 0,id,call_number,date,call_prefix,date_prefix
525,13112,1900-2328,0190-03-06,1900,190
6195,22951,1901-213,1091-01-27,1901,1091
14659,32265,1918-0387_wotm,2928-03-26,1918,2928
16917,34727,1912-0667_wotm,0001-01-01,1912,1
16918,34728,1912-0668_wotm,0001-01-01,1912,1


In [4]:

# IC 2: Date outlier in Menu - cleaning
#
# Replace the first four characters of each flagged date with the call
# number prefix, keeping the rest of the date string unchanged.
menu = dataset[MENU]
ic2_rows = ic2_violations.index

corrected_dates = menu.loc[ic2_rows, "call_prefix"] + menu.loc[ic2_rows, "date"].str[4:]
menu.loc[ic2_rows, "date"] = corrected_dates

# The count is the number of rows flagged earlier (i.e. rows rewritten here);
# the violation check is not re-run. date_prefix is likewise not refreshed.
print(f"After Cleaning Applied: {len(ic2_violations)}")
menu.loc[ic2_rows, ["id", "call_number", "date", "call_prefix", "date_prefix"]]

After Cleaning Applied: 5


Unnamed: 0,id,call_number,date,call_prefix,date_prefix
525,13112,1900-2328,1900-03-06,1900,190
6195,22951,1901-213,1901-01-27,1901,1091
14659,32265,1918-0387_wotm,1918-03-26,1918,2928
16917,34727,1912-0667_wotm,1912-01-01,1912,1
16918,34728,1912-0668_wotm,1912-01-01,1912,1


In [5]:
# IC 3: Blank date in Menu although the call number starts with a year
menu = dataset[MENU]

# Refresh the helper columns (leading four characters; missing stays missing).
menu["call_prefix"] = menu["call_number"].str[:4]
menu["date_prefix"] = menu["date"].str[:4]

# A row violates IC 3 when it has a call number with a numeric four-character
# prefix but no date at all.
missing_date = menu["call_number"].notna() & menu["date"].isna()
numeric_call_prefix = menu["call_prefix"].str.isnumeric()
ic3_violations = menu[missing_date & numeric_call_prefix]

print(f"Violations found: {len(ic3_violations)}")
ic3_violations[["id", "call_number", "date", "call_prefix", "date_prefix"]].head(20)

Violations found: 47


Unnamed: 0,id,call_number,date,call_prefix,date_prefix
465,13042,1900-2517,,1900,
4598,20978,1906-783,,1906,
5025,21467,1886-036,,1886,
5400,21969,1887-028,,1887,
8633,25998,1900-189,,1900,
8732,26119,1899-606,,1899,
10093,27576,1910-881,,1910,
10426,27912,1973-0020_wotm,,1973,
10526,28012,1977-0004_wotm,,1977,
10560,28062,1978-0021_wotm,,1978,


In [6]:
# IC 3: Blank date with call number year in Menu - cleaning
#
# Build a date from the call number prefix, using "-01-01" as the
# month/day placeholder.
menu = dataset[MENU]
ic3_rows = ic3_violations.index

placeholder_dates = menu.loc[ic3_rows, "call_prefix"] + "-01-01"
menu.loc[ic3_rows, "date"] = placeholder_dates

# Count of rows flagged earlier (rows filled in here); not a re-check.
print(f"After Cleaning Applied: {len(ic3_violations)}")
menu.loc[ic3_rows, ["id", "call_number", "date", "call_prefix", "date_prefix"]]

After Cleaning Applied: 47


Unnamed: 0,id,call_number,date,call_prefix,date_prefix
465,13042,1900-2517,1900-01-01,1900,
4598,20978,1906-783,1906-01-01,1906,
5025,21467,1886-036,1886-01-01,1886,
5400,21969,1887-028,1887-01-01,1887,
8633,25998,1900-189,1900-01-01,1900,
8732,26119,1899-606,1899-01-01,1899,
10093,27576,1910-881,1910-01-01,1910,
10426,27912,1973-0020_wotm,1973-01-01,1973,
10526,28012,1977-0004_wotm,1977-01-01,1977,
10560,28062,1978-0021_wotm,1978-01-01,1978,


In [7]:
# IC 4: Blank date in Menu with no date information in the call number
menu = dataset[MENU]

# Refresh the helper columns (leading four characters; missing stays missing).
menu["call_prefix"] = menu["call_number"].str[:4]
menu["date_prefix"] = menu["date"].str[:4]

# Four-character call number prefixes that this check looks for
# (e.g. "Zander 11", "Soete 235A-B" in the data shown below).
undated_call_prefixes = ["Zand", "Soet", "soet", "Bara", "_wot"]

# A row violates IC 4 when it has one of those call number prefixes and no date.
ic4_violations = menu[
    menu["call_number"].notna()
    & menu["date"].isna()
    & menu["call_prefix"].isin(undated_call_prefixes)
]

print(f"Violations found: {len(ic4_violations)}")
menu.loc[ic4_violations.index]

Violations found: 536


Unnamed: 0,id,name,sponsor,event,venue,place,physical_description,occasion,notes,call_number,...,date,location,location_type,currency,currency_symbol,status,page_count,dish_count,call_prefix,date_prefix
8913,26347,,Pan American,,AIRLINE,,Folder; 8.5 x 6 inches,,Dieter Zander Collection; cover is an illustra...,Zander 11,...,,Pan American,,,,complete,3,30,Zand,
8915,26349,,American Export Lines,Captain's Dinner,STEAMSHIP,S.S. Independence,Booklet; 8.5 x 11.5 inches,,"Dieter Zander Collection; captain, officers, a...",Zander 13,...,,American Export Lines,,,,complete,4,28,Zand,
8918,26352,,American Airlines,,AIRLINE,,Folded set of postcards; 7 x 4.5 in. folded; 7...,,Dieter Zander Collection; Americana themed; sc...,Zander 16,...,,American Airlines,,,,complete,3,34,Zand,
8921,26355,,Pan American,,AIRLINE,,Folder; 11 x 9 inches,,Dieter Zander Collection; cover is a watercolo...,Zander 19 undated,...,,Pan American,,,,complete,3,68,Zand,
8922,26356,,Pan American,,AIRLINE,,Tri-fold; 9.75 x 13.25 in. folded; 9.75 x 26 i...,,"Dieter Zander Collection, cover is a watercolo...",Zander 21 undated,...,,Pan American,,,,complete,3,47,Zand,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10224,27708,,Dinner held by Pennsylvania Railroad (Railroad...,,,,10 x 7 in. fold. ; ill.,,,Soete 235A-B,...,,Pennsylvania Railroad,,Dollars,$,complete,4,101,Soet,
10225,27709,,Dinner held by Pennsylvania Railroad (Railroad...,,,,10 x 7 in. fold. ; ill.,,,Soete 237A-C,...,,Pennsylvania Railroad,,Dollars,$,complete,5,91,Soet,
10226,27710,,Dinner held by Pennsylvania Railroad,,,,,,,Soete 209A,...,,Dinner Held By Pennsylvania Railroad,,Dollars,$,complete,2,101,Soet,
10228,27712,,Exec Committe California Midwinter Int Exposition,,,,,,,soete 56,...,,Exec Committe California Midwinter Int Exposition,,,,complete,3,0,soet,


In [8]:

# IC 4: Blank date with no date info in call number in Menu - cleaning
#
# Forward-fill the date column: every missing date takes the date of the
# closest preceding row that has one. Note this fills ALL missing dates in
# the column, not only the rows flagged by IC 4; a missing value in the very
# first row(s) has nothing to copy from and stays missing.
menu = dataset[MENU]
menu["date"] = menu["date"].ffill()
# .interpolate(method='nearest')

# Count of rows flagged earlier; not a re-check. date_prefix is not refreshed.
print(f"After Cleaning Applied: {len(ic4_violations)}")
menu.loc[ic4_violations.index, ["id", "call_number", "date", "call_prefix", "date_prefix"]]

After Cleaning Applied: 536


Unnamed: 0,id,call_number,date,call_prefix,date_prefix
8913,26347,Zander 11,1989-01-01,Zand,
8915,26349,Zander 13,1969-12-01,Zand,
8918,26352,Zander 16,1952-01-01,Zand,
8921,26355,Zander 19 undated,1963-07-29,Zand,
8922,26356,Zander 21 undated,1963-07-29,Zand,
...,...,...,...,...,...
10224,27708,Soete 235A-B,1919-05-12,Soet,
10225,27709,Soete 237A-C,1919-05-12,Soet,
10226,27710,Soete 209A,1919-05-12,Soet,
10228,27712,soete 56,1931-11-01,soet,


In [9]:
# # IC 5: Date range outside of 1890-1970 in Menu
# ic5_violations = dataset[MENU][
#     dataset[MENU]["date"].notna() & 
#     (dataset[MENU]["date"].str[:4].astype(int) < 1890) | 
#     (dataset[MENU]["date"].str[:4].astype(int) > 1970)
# ]["id"]

# print(f"Violations found: {len(ic5_violations)}")
# print(f"Menu dataset size: {len(dataset[MENU])}")
# print(f"MenuPage dataset size: {len(dataset[MENU_PAGE])}")
# print(f"MenuItem dataset size: {len(dataset[MENU_ITEM])}")
# dataset[MENU].loc[ic5_violations.index]

Violations found: 1248<br>
Menu dataset size: 17545<br>
MenuPage dataset size: 66937<br>
MenuItem dataset size: 1332726<br>

In [10]:
# # IC 5: Date range outside of 1890-1970 in Menu Cleaning
# # dataset[MENU] = dataset[MENU][
# #     ~dataset[MENU].isin(ic5_violations)
# # ]
# dataset[MENU] = dataset[MENU][~dataset[MENU]['id'].isin(ic5_violations)]

# # Cant remove yet, due to unlinked items
# # dataset[MENU_PAGE] = dataset[MENU_PAGE][~dataset[MENU_PAGE]['menu_id'].isin(ic5_violations)]
# # dataset[MENU_ITEM] = dataset[MENU_ITEM][~dataset[MENU_ITEM]['menu_page_id'].isin(dataset[MENU_PAGE]['id'])]

# print(f"After Cleaning Applied: {len(ic5_violations)}")
# print(f"Menu dataset size: {len(dataset[MENU])}")
# print(f"MenuPage dataset size: {len(dataset[MENU_PAGE])}")
# print(f"MenuItem dataset size: {len(dataset[MENU_ITEM])}")

In [11]:
# # IC 5: Clean up (remove dishes that no longer appear in any menu items)
# print(f"Dish dataset size: {len(dataset[DISH])}")

# print("Cleaning up Dish dataset...")
# active_dish_ids = dataset[MENU_ITEM]['dish_id'].unique()
# dataset[DISH] = dataset[DISH][
#     dataset[DISH]['id'].isin(active_dish_ids)
# ]
    
# print(f"Dish dataset size: {len(dataset[DISH])}")

In [12]:
# IC 6: Dish first/last appearance must agree with the menus the dish is on
#
# For every dish, derive the earliest and latest menu year it appears on
# (Dish <- MenuItem <- MenuPage <- Menu) and flag dishes whose stored
# first_appeared / last_appeared is missing, a placeholder, or narrower than
# that derived range.

# Menu year as an integer. astype('int64') raises if any date is still
# missing, so the date clean-up cells above must have run first.
dataset[MENU]["date_prefix"] = dataset[MENU]["date"].str[:4].astype('int64')

dish_appearances = (
    dataset[MENU_ITEM][['id', 'dish_id', 'menu_page_id']]
    .merge(dataset[MENU_PAGE][['id', 'menu_id']],
           left_on='menu_page_id', right_on='id', suffixes=('', '_page'))
    # Only the key and the year are needed from Menu; merging the full table
    # would just carry every other Menu column through the join.
    .merge(dataset[MENU][['id', 'date_prefix']], left_on='menu_id', right_on='id')
    .groupby('dish_id')['date_prefix']
    .agg(['min', 'max'])
    .reset_index()
    .rename(columns={'min': 'calc_first', 'max': 'calc_last'})
)

# Left join keeps one row per dish, in Dish order; validate guards the
# "one derived range per dish" assumption.
dishes = dataset[DISH].merge(
    dish_appearances, left_on='id', right_on='dish_id', how='left',
    validate='many_to_one')

# merge() on columns discards the left frame's index and returns a fresh
# 0..n-1 index. The cleaning cell writes back into dataset[DISH] using the
# index labels of the violation frames below, so re-attach the Dish index to
# keep the labels of `dishes` and dataset[DISH] referring to the same rows
# even if Dish was filtered earlier.
dishes.index = dataset[DISH].index

# Stored first year is missing or one of the placeholder values 0 / 1.
first_is_placeholder = (
    dishes['first_appeared'].isna() | dishes['first_appeared'].isin([0, 1]))
# Stored last year is missing or one of the placeholder values 0 / 2928.
last_is_placeholder = (
    dishes['last_appeared'].isna() | dishes['last_appeared'].isin([0, 2928]))

# Only dishes that have a derived range can be checked (and later repaired).
ic6_violations_6_first = dishes[
    dishes['calc_first'].notna() & (
        first_is_placeholder |
        (dishes['first_appeared'] > dishes['calc_first']))
]

ic6_violations_6_last = dishes[
    dishes['calc_last'].notna() & (
        last_is_placeholder |
        (dishes['last_appeared'] < dishes['calc_last']))
]

# The count adds both checks; a dish failing both is counted twice.
print(f"Violations found: {len(ic6_violations_6_first) + len(ic6_violations_6_last)}")
dishes.loc[
    ic6_violations_6_first.index,
    ["id", "name", "first_appeared", "last_appeared", "calc_first", "calc_last"]
]

Violations found: 110910


Unnamed: 0,id,name,first_appeared,last_appeared,calc_first,calc_last
13,15,Celery,1,2928,1852.0,2012.0
24,26,Clams,1881,1970,1859.0,1970.0
25,27,Oysters,1862,1963,1859.0,1990.0
34,38,Apple Sauce,1,1987,1856.0,1987.0
72,78,Vegetable,1892,1987,1889.0,1987.0
...,...,...,...,...,...,...
423392,515673,Boiled: Corned beef & cabbage,0,0,1882.0,1882.0
423393,515674,Boiled: Knuckle of Veal & Bacon,0,0,1882.0,1882.0
423394,515675,Roast: Turkey & Cranberry Sauce,0,0,1882.0,1882.0
423395,515676,"Claret: Chateau Larose, Cruse et Fils Freres",0,0,1883.0,1883.0


In [13]:
# IC 6: Dish dates with dependency on Menu - cleaning
#
# Overwrite the flagged first/last years with the years derived from the
# menus (calc_first / calc_last in `dishes`). Rows are matched by index label
# between `dishes` and dataset[DISH].
dish = dataset[DISH]
ic6_first_rows = ic6_violations_6_first.index
ic6_last_rows = ic6_violations_6_last.index

dish.loc[ic6_first_rows, 'first_appeared'] = dishes.loc[ic6_first_rows, 'calc_first']
dish.loc[ic6_last_rows, 'last_appeared'] = dishes.loc[ic6_last_rows, 'calc_last']

# Any value still missing afterwards takes the value of the preceding row.
for year_column in ('first_appeared', 'last_appeared'):
    dish[year_column] = dish[year_column].ffill()

# Reports the number of first_appeared rows flagged earlier (not a re-check).
print(f"After Cleaning Applied: {len(ic6_violations_6_first)}")
dish.loc[ic6_first_rows, ['id', 'first_appeared', 'last_appeared']]

After Cleaning Applied: 54311


Unnamed: 0,id,first_appeared,last_appeared
13,15,1852,2012
24,26,1859,1970
25,27,1859,1990
34,38,1856,1987
72,78,1889,1987
...,...,...,...
423392,515673,1882,1882
423393,515674,1882,1882
423394,515675,1882,1882
423395,515676,1883,1883


In [14]:
# IC 7: Zero dates in Dish with no dependencies on Menu
dish = dataset[DISH]
menu_item = dataset[MENU_ITEM]
menu_page = dataset[MENU_PAGE]
menu = dataset[MENU]

# Dishes whose first or last appearance year is 0.
ic7_violations_1 = dish[dish["first_appeared"].eq(0) | dish["last_appeared"].eq(0)]

# Trace those dishes through MenuItem -> MenuPage -> Menu (kept for inspection;
# see the commented display lines at the end of the cell).
ic7_violations_2 = menu_item[menu_item["dish_id"].isin(ic7_violations_1["id"])]
ic7_violations_3 = menu_page[menu_page["id"].isin(ic7_violations_2["menu_page_id"])]
ic7_violations_4 = menu[menu["id"].isin(ic7_violations_3["menu_id"])]

# Row selections used by the cleaning cell: first year of 0 or 1, last year of 0.
ic7_violations_first = dish[dish['first_appeared'].isin([0, 1])]
ic7_violations_last = dish[dish['last_appeared'].eq(0)]

print(f"Violations found: {len(ic7_violations_1)}")
dish.loc[ic7_violations_1.index]
# dataset[MENU_ITEM].loc[ic7_violations_2.index]
# dataset[MENU_PAGE].loc[ic7_violations_3.index]
# dataset[MENU].loc[ic7_violations_4.index]


Violations found: 4805


Unnamed: 0,id,name,description,menus_appeared,times_appeared,first_appeared,last_appeared,lowest_price,highest_price
70131,88446,toddy,,0,1,0,1969,0.0,0.0
106416,132992,Pate en Croute,,0,1,0,0,0.0,0.0
110907,138404,"Caldo Xochitl (chicken broth, rice, tomatoes, ...",,1,0,0,0,0.0,0.0
110908,138405,"Ceviche de Acapulco (marinated fish, lemon jui...",,1,0,0,0,0.0,0.0
110909,138406,Chiles Rellenos con Picadillo (sweet peppers s...,,1,0,0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
423321,515602,Fig Cômpote,,1,0,0,0,0.0,0.0
423323,515604,Compôte of Apricots,,0,0,0,0,0.0,0.0
423339,515620,Filet de Soles à l'Amiral,,0,0,0,0,0.0,0.0
423365,515646,Lebendfrischer Helgoländer Hummer - Zubereitun...,,0,0,0,0,0.0,0.0


In [15]:
# IC 7: Zero dates in Dish with no dependencies on Menu - cleaning
#
# Blank out the placeholder years selected by the detection cell, then
# forward-fill so each blanked value takes the year of the preceding dish row.
dish = dataset[DISH]

# Every row in ic7_violations_first / ic7_violations_last holds a placeholder
# year, so blank the whole selection. The previous `.replace(0, np.nan)` only
# cleared 0 and therefore left first_appeared == 1 (which ic7_violations_first
# also selects) untouched.
# Series.mask returns a new column (float once NaN is introduced), which
# avoids writing NaN into an integer column row by row.
blank_first = dish.index.isin(ic7_violations_first.index)
blank_last = dish.index.isin(ic7_violations_last.index)

# A blanked value in the very first row(s) has nothing to copy from and
# stays missing after the forward fill.
dish['first_appeared'] = dish['first_appeared'].mask(blank_first).ffill()
dish['last_appeared'] = dish['last_appeared'].mask(blank_last).ffill()

# Reports the number of rows flagged by the detection cell (not a re-check).
print(f"After Cleaning Applied: {len(ic7_violations_1)}")
dish.loc[ic7_violations_1.index, ['id', 'first_appeared', 'last_appeared']]

After Cleaning Applied: 4805


Unnamed: 0,id,first_appeared,last_appeared
70131,88446,1906.0,1969.0
106416,132992,1937.0,1971.0
110907,138404,1973.0,1973.0
110908,138405,1973.0,1973.0
110909,138406,1973.0,1973.0
...,...,...,...
423321,515602,1953.0,1953.0
423323,515604,1953.0,1953.0
423339,515620,1910.0,1910.0
423365,515646,1965.0,1965.0


In [16]:
# IC 8: Temporal consistency in Dish
#
# A dish cannot first appear after it last appeared.
dish = dataset[DISH]
ic8_violations = dish[dish["first_appeared"].gt(dish["last_appeared"])]

print(f"Violations found: {len(ic8_violations)}")
ic8_violations.head(10)

Violations found: 3


Unnamed: 0,id,name,description,menus_appeared,times_appeared,first_appeared,last_appeared,lowest_price,highest_price
197050,250693,SURI LEBERLI - Shredded Calf's Liver Flambe in...,,0,1,1945.0,1901.0,,
197053,250699,"SWISS MINCED VEAL, ROESTI",,0,1,1945.0,1901.0,,
241858,306637,CARRE DE PORC FROID,,0,1,1945.0,1944.0,0.0,0.0


In [17]:
# IC 8: Temporal consistency in Dish - cleaning
#
# For the flagged rows, raise last_appeared to first_appeared so the range
# is no longer inverted.
dish = dataset[DISH]
ic8_rows = ic8_violations.index
dish.loc[ic8_rows, "last_appeared"] = dish.loc[ic8_rows, "first_appeared"]

# Count of rows flagged earlier (rows rewritten here); not a re-check.
print(f"After Cleaning Applied: {len(ic8_violations)}")
dish.loc[ic8_rows]

After Cleaning Applied: 3


Unnamed: 0,id,name,description,menus_appeared,times_appeared,first_appeared,last_appeared,lowest_price,highest_price
197050,250693,SURI LEBERLI - Shredded Calf's Liver Flambe in...,,0,1,1945.0,1945.0,,
197053,250699,"SWISS MINCED VEAL, ROESTI",,0,1,1945.0,1945.0,,
241858,306637,CARRE DE PORC FROID,,0,1,1945.0,1945.0,0.0,0.0


In [18]:
# IC 9: Date range outside of 1880-2000 in Dish
#
# A dish is flagged when its whole appearance range lies outside the window:
# it first appeared after 2000, or it last appeared before 1880. Dishes whose
# range merely overlaps the window are kept.
dish = dataset[DISH]

both_years_present = dish["first_appeared"].notna() & dish["last_appeared"].notna()
entirely_outside = (dish["first_appeared"] > 2000) | (dish["last_appeared"] < 1880)

ic9_violations = dish[both_years_present & entirely_outside]
print(f"Violations found: {len(ic9_violations)}")
dish.loc[ic9_violations.index, ["id", "name", "first_appeared", "last_appeared"]].head(10)

Violations found: 7065


Unnamed: 0,id,name,first_appeared,last_appeared
2173,2605,SAUTERNES.,1865.0,1865.0
3252,4055,"Soups without meats,",1865.0,1865.0
3269,4078,Bropiled Blue Fish,1865.0,1865.0
3273,4082,"Broiled Cusk, Cream Sauce",1865.0,1865.0
3373,4203,Stewed Eels,1858.0,1865.0
3401,4239,"Baked Cod, Port Sauce",1858.0,1865.0
3424,4265,"Chicken and Pork, White Sauce",1865.0,1865.0
3427,4269,Shoulder Corned Pork,1865.0,1865.0
3459,4308,Stewed Calf's Liver,1865.0,1865.0
3460,4309,"Loin of Veal, Stuffed",1865.0,1865.0


In [None]:
# IC 9: Date range outside of 1880-2000 in Dish - cleaning
#
# Remove the flagged dishes (matched by id) from the Dish table.
# NOTE(review): MenuItem rows that reference the removed dish ids are not
# touched in this cell - confirm that is handled elsewhere.
is_flagged = dataset[DISH]['id'].isin(ic9_violations['id'])
dataset[DISH] = dataset[DISH].loc[~is_flagged]
print(f"After Cleaning Applied: {len(ic9_violations)}")

# Remaining year range of each column after the removal.
first_range = dataset[DISH]['first_appeared'].agg(['min', 'max'])
print(first_range)

last_range = dataset[DISH]['last_appeared'].agg(['min', 'max'])
print(last_range)

After Cleaning Applied: 7065
min    1851.0
max    2000.0
Name: first_appeared, dtype: float64
min    1880.0
max    2015.0
Name: last_appeared, dtype: float64


In [None]:
# Export the cleaned dataset
#
# Writes each table in `dataset` to OUTPUT_FOLDER under the file name stored
# at the same position in OUTPUT_FILE.

# Drop the helper columns added during the Menu checks. errors="ignore" keeps
# the cell re-runnable: a second run (columns already gone) no longer raises
# KeyError.
dataset[MENU] = dataset[MENU].drop(columns=["date_prefix", "call_prefix"], errors="ignore")

# Tables and file names are paired by position, so a length mismatch means at
# least one table would be written under another table's name.
if len(dataset) != len(OUTPUT_FILE):
    raise ValueError(
        f"{len(dataset)} datasets loaded but {len(OUTPUT_FILE)} output file names "
        "recorded; refusing to export under mismatched names."
    )

for table, output_name in zip(dataset, OUTPUT_FILE):
    table.to_csv(Path(OUTPUT_FOLDER) / output_name, index=False)