In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# ROOT_FOLDER = "NYPL-menus"
ROOT_FOLDER = "NYPL-menus-cleaned"

MENU = 1
MENU_PAGE = 3
MENU_ITEM = 2
DISH = 0

# Data Profiling Workflow Instructions

## Accessing Datasets

Each dataset can be accessed from the dataset collection using the following keys:

```python
dataset["MENU"]          # Menu data
dataset["MENU_PAGE"]     # Menu page data
dataset["MENU_ITEM"]     # Menu item data 
dataset["DISH"]          # Dish data
```

## Profiling the Cleaned Dataset

Update the ROOT_FOLDER variable to point to your cleaned data directory:

```python
ROOT_FOLDER = "NYPL-menus-cleaned"  # Path to cleaned data
```

In [2]:
dataset = []

for filename in sorted(Path(ROOT_FOLDER).iterdir()):
    print(filename)
    if filename.name.endswith(".csv"):
        dataset.append(pd.read_csv(filename, na_values=[""]))


NYPL-menus-cleaned\Dish_fixed.csv
NYPL-menus-cleaned\Menu_fixed.csv
NYPL-menus-cleaned\MenuItem_fixed.csv
NYPL-menus-cleaned\MenuPage_fixed.csv


In [3]:
# IC 1: Temporal consistency in Dish
ic1_violations = dataset[DISH][dataset[DISH]["first_appeared"] > dataset[DISH]["last_appeared"]]

print(f"Violations found: {len(ic1_violations)}")
ic1_violations.head(10)

Violations found: 0


Unnamed: 0,id,name,description,menus_appeared,times_appeared,first_appeared,last_appeared,lowest_price,highest_price


In [4]:
dataset[MENU_ITEM][dataset[MENU_ITEM]["dish_id"].isin(dataset[DISH].loc[ic1_violations.index, "id"])]

Unnamed: 0,id,menu_page_id,price,high_price,dish_id,created_at,updated_at,xpos,ypos


In [5]:
# IC 2: Date Outliner in Menu

# Extract first 4 digits of call_number (if not null)
dataset[MENU]["call_prefix"] = dataset[MENU]["call_number"].str[:4]

# Extract first 4 digits of date (if not null)
dataset[MENU]["date_prefix"] = dataset[MENU]["date"].str[:4]

# Check constraint: call_prefix == date_prefix when both exist
ic2_violations = dataset[MENU][
    dataset[MENU]["call_number"].notna() & 
    dataset[MENU]["date"].notna() & 
    dataset[MENU]["call_prefix"].str.isnumeric() &
    dataset[MENU]["date"].str.match(r"[^1]+") & 
    (dataset[MENU]["call_prefix"] != dataset[MENU]["date_prefix"])
]

print(f"Violations found: {len(ic2_violations)}")
ic2_violations[["id", "call_number", "date", "call_prefix", "date_prefix"]].head()

Violations found: 0


Unnamed: 0,id,call_number,date,call_prefix,date_prefix


In [6]:
# IC 3: Date Blank with call number year in Menu

# Extract first 4 digits of call_number (if not null)
dataset[MENU]["call_prefix"] = dataset[MENU]["call_number"].str[:4]

# Extract first 4 digits of date (if not null)
dataset[MENU]["date_prefix"] = dataset[MENU]["date"].str[:4]

# Check constraint: date should not be blank when call_prefix is numeric
ic3_violations = dataset[MENU][
    dataset[MENU]["call_number"].notna() & 
    dataset[MENU]["date"].isna() &
    dataset[MENU]["call_prefix"].str.isnumeric()
]

print(f"Violations found: {len(ic3_violations)}")
ic3_violations[["id", "call_number", "date", "call_prefix", "date_prefix"]].head(20)

Violations found: 0


Unnamed: 0,id,call_number,date,call_prefix,date_prefix


In [7]:
# IC 4: Date Blank with no date info in call number in Menu

# Extract first 4 digits of call_number (if not null)
dataset[MENU]["call_prefix"] = dataset[MENU]["call_number"].str[:4]

# Extract first 4 digits of date (if not null)
dataset[MENU]["date_prefix"] = dataset[MENU]["date"].str[:4]

# Check constraint: date should not be blank when call_prefix is numeric
ic4_violations = dataset[MENU][
    dataset[MENU]["call_number"].notna() & 
    dataset[MENU]["date"].isna() & (
    (dataset[MENU]["call_prefix"] == 'Zand') |
    (dataset[MENU]["call_prefix"] == 'Soet') |
    (dataset[MENU]["call_prefix"] == 'soet') |
    (dataset[MENU]["call_prefix"] == 'Bara') |
    (dataset[MENU]["call_prefix"] == '_wot'))
]

print(f"Violations found: {len(ic4_violations)}")
dataset[MENU].loc[ic4_violations.index]

Violations found: 0


Unnamed: 0,id,name,sponsor,event,venue,place,physical_description,occasion,notes,call_number,...,date,location,location_type,currency,currency_symbol,status,page_count,dish_count,call_prefix,date_prefix


In [8]:
# IC 5: Date range outsite of 1890-1970 in Menu
ic5_violations = dataset[MENU][
    dataset[MENU]["date"].notna() & 
    (dataset[MENU]["date"].str[:4].astype(int) < 1890) | 
    (dataset[MENU]["date"].str[:4].astype(int) > 1970)
]

print(f"Violations found: {len(ic5_violations)}")
dataset[MENU].loc[ic5_violations.index]

Violations found: 0


Unnamed: 0,id,name,sponsor,event,venue,place,physical_description,occasion,notes,call_number,...,date,location,location_type,currency,currency_symbol,status,page_count,dish_count,call_prefix,date_prefix


In [9]:
# IC 6: Date blank in Dish dependencies on All Dataset
ic6_violations_1 = dataset[DISH][
    (dataset[DISH]["first_appeared"] == 0) | 
    (dataset[DISH]["last_appeared"] == 0)
]

ic6_violations_2 = dataset[MENU_ITEM][
    dataset[MENU_ITEM]["dish_id"].isin(dataset[DISH].loc[ic6_violations_1.index, "id"])
]

ic6_violations_3 = dataset[MENU_PAGE][
    dataset[MENU_PAGE]["id"].isin(ic6_violations_2["menu_page_id"])
]

ic6_violations_4 = dataset[MENU][
    dataset[MENU]["id"].isin(ic6_violations_3["menu_id"])
]

print(f"Violations found: {len(ic6_violations_4)}")
# dataset[DISH].loc[ic6_violations.index]
# dataset[MENU_ITEM].loc[ic6_violations_2.index]
# dataset[MENU_PAGE].loc[ic6_violations_3.index]
dataset[MENU].loc[ic6_violations_4.index]

Violations found: 1163


Unnamed: 0,id,name,sponsor,event,venue,place,physical_description,occasion,notes,call_number,...,date,location,location_type,currency,currency_symbol,status,page_count,dish_count,call_prefix,date_prefix
124,12610,,,,,,,,,,...,1900-04-20,Hotel Imperial,,,,complete,2,15,,1900
294,12841,,HOTEL MARLBOROUGH,BREAKFAST,COMMERCIAL,"[66 STREET AND BROADWAY,NEW YORK,NY]",CARD;ILL;COL;7X11;,DAILY;,COAT OF ARMS;DATE HANDWRITTEN BY FB;,1900-2226,...,1900-03-02,Hotel Marlborough,,Dollars,$,complete,2,317,1900,1900
331,12882,,CLAREMONT HOTEL,MENU,COMMERCIAL,,CARD; ILLUS; 6X11;,,PRICED MENU & PRICED WINE LISTS; PRICES APPEAR...,1900-2740,...,1900-04-09,Claremont Hotel,,Dollars,$,complete,2,226,1900,1900
338,12896,,HOTEL MARIE ANTOINETTE,LUNCHEON,COMMERCIAL,66TH STREET AND BR6ADWAY,CARD; 6.5 X 9.75;,,PRICED MENU; CREST;,1900-2015,...,1900-02-18,Hotel Marie Antoinette,,Dollars,$,under review,2,82,1900,1900
345,12905,,SAN REMO HOTEL,DINNER,COMMERCIAL,"75TH ST & CENTRAL PARK WEST, NY",FOLDER; 6.5 X 10.5;,DAILY,,1900-1042,...,1900-02-13,San Remo Hotel,,Dollars,$,complete,4,243,1900,1900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15989,35097,Hotel Earlington,Hotel Earlington,,,,,,New York Nautical School,1913-0325_wotm,...,1913-03-29,Hotel Earlington,,,,complete,8,19,1913,1913
15990,35099,Waldorf Astoria,Waldorf Astoria,,,,,,Missouri Society of the City of New York,1913-0327_wotm,...,1913-03-29,Waldorf Astoria,,,,complete,8,19,1913,1913
15991,35102,Palace Hotel,Palace Hotel,,,,,,Northern California Hotel Association,1913-0330_wotm,...,1913-03-29,Palace Hotel,,,,complete,3,21,1913,1913
15992,35103,Waldorf Astoria,Waldorf Astoria,,,,,,Wall Paper Manufacturers Association,1913-0331_wotm,...,1913-03-29,Waldorf Astoria,,,,complete,11,22,1913,1913


In [10]:
# IC 6: Date blank in Dish

# Extract first 4 digits of date (if not null)
dataset[MENU]["date_prefix"] = dataset[MENU]["date"].str[:4].astype('int64')

dish_appearances = (
    dataset[MENU_ITEM][['id', 'dish_id', 'menu_page_id']]
    .merge(dataset[MENU_PAGE][['id', 'menu_id']], 
            left_on='menu_page_id', right_on='id', suffixes=('', '_page'))
    .merge(dataset[MENU], left_on='menu_id', right_on='id')
    .groupby('dish_id')['date_prefix']
    .agg(['min', 'max'])
    .reset_index()
    .rename(columns={'min': 'calc_first', 'max': 'calc_last'})
)

dishes = dataset[DISH].merge(
    dish_appearances, left_on='id', right_index=True, how='left')

ic6_violations_6_first = dishes[
    (dishes['calc_first'].notna() &(
    (dishes['first_appeared'].isna()) |
    (dishes['first_appeared'] == 0) |
    (dishes['calc_first'].notna() & (dishes['first_appeared'] > dishes['calc_first']))))
]

ic6_violations_6_last = dishes[
    (dishes['calc_last'].notna()) & (
    (dishes['last_appeared'].isna()) |
    (dishes['last_appeared'] == 0) |
    (dishes['calc_last'].notna() & (dishes['last_appeared'] < dishes['calc_last'])))
]

print(f"Violations found: {len(ic6_violations_6_first) + len(ic6_violations_6_last)}")
dishes.loc[ic6_violations_6_first.index ][
    ["id", "name", "first_appeared", "last_appeared", "calc_first", "calc_last"]
]

Violations found: 0


Unnamed: 0,id,name,first_appeared,last_appeared,calc_first,calc_last
