In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

# --- Paths ---
# Folder the CSV files are read from. Swap in the commented-out value to run
# the checks against previously cleaned output instead of the raw data.
ROOT_FOLDER = "NYPL-menus"
# ROOT_FOLDER = "NYPL-menus-cleaned"

# Folder the cleaned CSV files are written to; created up front if missing.
OUTPUT_FOLDER = "NYPL-menus-cleaned"
Path(OUTPUT_FOLDER).mkdir(parents=True, exist_ok=True)

# --- Positions of each table inside the `dataset` list ---
DISH = 0
MENU = 1
MENU_ITEM = 2
MENU_PAGE = 3

# Output file names, ordered so that OUTPUT_FILE[<table constant>] is the
# file name used for that table.
OUTPUT_FILE = [
    "Dish_fixed.csv",
    "Menu_fixed.csv",
    "MenuItem_fixed.csv",
    "MenuPage_fixed.csv",
]

# Data Cleaning Workflow Instructions

## Accessing Datasets

Each table is stored in the `dataset` list and is accessed with the integer index constants defined in the configuration cell (not with string keys):

```python
dataset[MENU]          # Menu data
dataset[MENU_PAGE]     # Menu page data
dataset[MENU_ITEM]     # Menu item data
dataset[DISH]          # Dish data
```

## Cleaning the Cleaned Dataset

To run this workflow again on previously cleaned output, set the `ROOT_FOLDER` variable in the configuration cell to the cleaned data directory:

```python
ROOT_FOLDER = "NYPL-menus-cleaned"  # Path to cleaned data
```

In [None]:
# Load the tables into `dataset`, a list indexed by the DISH / MENU /
# MENU_ITEM / MENU_PAGE constants.
#
# Each CSV is matched to its slot by file name instead of by its position in
# the sorted directory listing. Position-based loading breaks as soon as the
# "*_fixed.csv" files are read back: in a case-sensitive sort "I" and "P"
# come before "_", so "MenuItem_fixed.csv" and "MenuPage_fixed.csv" sort ahead
# of "Menu_fixed.csv" and every Menu* table lands in the wrong slot. Any
# extra CSV in the folder would shift the positions as well.
#
# NOTE(review): assumes the raw files are named Dish.csv, Menu.csv,
# MenuItem.csv and MenuPage.csv (matched case-insensitively) - confirm
# against the contents of ROOT_FOLDER.
TABLE_INDEX = {
    "dish": DISH,
    "menu": MENU,
    "menuitem": MENU_ITEM,
    "menupage": MENU_PAGE,
}
FIXED_SUFFIX = "_fixed"  # suffix carried by the files this notebook exports

dataset = [None] * (max(TABLE_INDEX.values()) + 1)

for filename in sorted(Path(ROOT_FOLDER).iterdir()):
    print(filename)
    if not filename.name.endswith(".csv"):
        continue

    # Normalise "Menu_fixed" / "Menu" to the lookup key "menu".
    table_key = filename.stem.lower()
    if table_key.endswith(FIXED_SUFFIX):
        table_key = table_key[: -len(FIXED_SUFFIX)]

    if table_key not in TABLE_INDEX:
        print(f"Skipping unrecognized CSV: {filename}")
        continue

    slot = TABLE_INDEX[table_key]
    if dataset[slot] is not None:
        # e.g. both Menu.csv and Menu_fixed.csv present - refuse to guess.
        raise ValueError(f"More than one CSV found for table '{table_key}' in {ROOT_FOLDER}")

    # Only the empty string is added as an extra NA marker.
    dataset[slot] = pd.read_csv(filename, na_values=[""])

missing_tables = [key for key, slot in TABLE_INDEX.items() if dataset[slot] is None]
if missing_tables:
    raise FileNotFoundError(f"No CSV found in {ROOT_FOLDER} for table(s): {missing_tables}")

In [None]:
# IC 1: Temporal consistency in Dish
# Flag every Dish row whose "first_appeared" value is greater than its
# "last_appeared" value.
dish = dataset[DISH]
first_after_last = dish["first_appeared"] > dish["last_appeared"]
ic1_violations = dish[first_after_last]

print(f"Violations found: {len(ic1_violations)}")
ic1_violations.head(10)

In [None]:
# IC 1: Temporal consistency in Dish cleaning
# Repair each flagged row by overwriting "last_appeared" with that row's own
# "first_appeared" value.
dish = dataset[DISH]
flagged_rows = ic1_violations.index
dish.loc[flagged_rows, "last_appeared"] = dish.loc[flagged_rows, "first_appeared"]

# The count printed here is the size of the violation set found before
# cleaning, i.e. the number of rows the repair was applied to.
print(f"After Cleaning Applied: {len(flagged_rows)}")
dish.loc[flagged_rows]

In [12]:
# IC 2: Date outlier in Menu
menu = dataset[MENU]

# Helper columns: the first 4 characters of call_number and of date
# (missing values stay missing).
menu["call_prefix"] = menu["call_number"].str[:4]
menu["date_prefix"] = menu["date"].str[:4]

# A row is a violation when call_number and date are both present, the
# call_number prefix is numeric, the date does not begin with the character
# "1", and the two 4-character prefixes differ.
has_call_number = menu["call_number"].notna()
has_date = menu["date"].notna()
prefix_is_numeric = menu["call_prefix"].str.isnumeric()
date_not_starting_with_1 = menu["date"].str.match(r"[^1]+")
prefixes_differ = menu["call_prefix"] != menu["date_prefix"]

ic2_violations = menu[
    has_call_number
    & has_date
    & prefix_is_numeric
    & date_not_starting_with_1
    & prefixes_differ
]

print(f"Before Cleaning Applied: {len(ic2_violations)}")
menu.loc[ic2_violations.index][["id", "call_number", "date", "call_prefix", "date_prefix"]]

Before Cleaning Applied: 4


Unnamed: 0,id,call_number,date,call_prefix,date_prefix
525,13112,1900-2328,0190-03-06,1900,190
14659,32265,1918-0387_wotm,2928-03-26,1918,2928
16917,34727,1912-0667_wotm,0001-01-01,1912,1
16918,34728,1912-0668_wotm,0001-01-01,1912,1


In [13]:

# IC 2: Date outlier in Menu cleaning
# Rebuild each flagged date as the call_number prefix followed by everything
# after the first 4 characters of the existing date.
menu = dataset[MENU]
flagged_rows = ic2_violations.index
date_remainder = menu.loc[flagged_rows, "date"].str[4:]
menu.loc[flagged_rows, "date"] = menu.loc[flagged_rows, "call_prefix"] + date_remainder

# The count is the size of the pre-cleaning violation set. date_prefix is not
# recomputed in this cell, so the table below still shows the old prefixes.
print(f"After Cleaning Applied: {len(flagged_rows)}")
menu.loc[flagged_rows][["id", "call_number", "date", "call_prefix", "date_prefix"]]

After Cleaning Applied: 4


Unnamed: 0,id,call_number,date,call_prefix,date_prefix
525,13112,1900-2328,1900-03-06,1900,190
14659,32265,1918-0387_wotm,1918-03-26,1918,2928
16917,34727,1912-0667_wotm,1912-01-01,1912,1
16918,34728,1912-0668_wotm,1912-01-01,1912,1


In [14]:
# IC 3: Date blank in Menu
menu = dataset[MENU]

# Recompute the helper columns: the first 4 characters of call_number and of
# date (missing values stay missing).
menu["call_prefix"] = menu["call_number"].str[:4]
menu["date_prefix"] = menu["date"].str[:4]

# Constraint: date should not be blank when the call_number prefix is numeric.
has_call_number = menu["call_number"].notna()
date_is_blank = menu["date"].isna()
prefix_is_numeric = menu["call_prefix"].str.isnumeric()

ic3_violations = menu[has_call_number & date_is_blank & prefix_is_numeric]

print(f"Violations found: {len(ic3_violations)}")
ic3_violations[["id", "call_number", "date", "call_prefix", "date_prefix"]].head(20)

Violations found: 47


Unnamed: 0,id,call_number,date,call_prefix,date_prefix
465,13042,1900-2517,,1900,
4598,20978,1906-783,,1906,
5025,21467,1886-036,,1886,
5400,21969,1887-028,,1887,
8633,25998,1900-189,,1900,
8732,26119,1899-606,,1899,
10093,27576,1910-881,,1910,
10426,27912,1973-0020_wotm,,1973,
10526,28012,1977-0004_wotm,,1977,
10560,28062,1978-0021_wotm,,1978,


In [15]:
# IC 3: Date blank in Menu cleaning
# Fill each blank date with the call_number prefix followed by "-01-01".
menu = dataset[MENU]
flagged_rows = ic3_violations.index
menu.loc[flagged_rows, "date"] = menu.loc[flagged_rows, "call_prefix"] + "-01-01"

# The count is the size of the pre-cleaning violation set.
print(f"After Cleaning Applied: {len(flagged_rows)}")
menu.loc[flagged_rows][["id", "call_number", "date", "call_prefix", "date_prefix"]]

After Cleaning Applied: 47


Unnamed: 0,id,call_number,date,call_prefix,date_prefix
465,13042,1900-2517,1900-01-01,1900,
4598,20978,1906-783,1906-01-01,1906,
5025,21467,1886-036,1886-01-01,1886,
5400,21969,1887-028,1887-01-01,1887,
8633,25998,1900-189,1900-01-01,1900,
8732,26119,1899-606,1899-01-01,1899,
10093,27576,1910-881,1910-01-01,1910,
10426,27912,1973-0020_wotm,1973-01-01,1973,
10526,28012,1977-0004_wotm,1977-01-01,1977,
10560,28062,1978-0021_wotm,1978-01-01,1978,


In [16]:
# Export the cleaned dataset
# Remove the helper columns added by the IC 2 / IC 3 checks so they are not
# written out. errors="ignore" keeps this cell re-runnable: without it a
# second run (or a run that skipped the IC 2 / IC 3 cells) raises KeyError
# because the columns are no longer present.
dataset[MENU] = dataset[MENU].drop(columns=["date_prefix", "call_prefix"], errors="ignore")

# Fail before writing anything if there are more tables than output names;
# otherwise the loop would stop with an IndexError after some files were
# already written.
if len(dataset) > len(OUTPUT_FILE):
    raise ValueError(
        f"{len(dataset)} tables loaded but only {len(OUTPUT_FILE)} output file names configured"
    )

# dataset[i] is written to OUTPUT_FILE[i]; the row index is not written.
for table, output_name in zip(dataset, OUTPUT_FILE):
    table.to_csv(Path(OUTPUT_FOLDER) / output_name, index=False)