In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Folder containing the CSV exports to preview. Swap in the commented-out
# value below to look at the cleaned copy of the data instead.
ROOT_FOLDER = "NYPL-menus"
# ROOT_FOLDER = "NYPL-menus-cleaned"

# Positions of the tables inside the `dataset` list. The list is filled from
# the CSV files in sorted path order, so the constants are listed in that
# same order and numbered 0..3.
DISH, MENU, MENU_ITEM, MENU_PAGE = range(4)

# Data Preview Workflow Instructions

## Accessing Datasets

Each dataset can be accessed from the `dataset` list using the index constants defined in the first cell:

```python
dataset[MENU]          # Menu data
dataset[MENU_PAGE]     # Menu page data
dataset[MENU_ITEM]     # Menu item data
dataset[DISH]          # Dish data
```

## Preview the Cleaned Dataset

Update the ROOT_FOLDER variable to point to your cleaned data directory:

```python
ROOT_FOLDER = "NYPL-menus-cleaned"  # Path to cleaned data
```

In [2]:
# Collect the CSV files directly under ROOT_FOLDER. Sorting the paths fixes
# the load order, which is what the DISH / MENU / MENU_ITEM / MENU_PAGE index
# constants rely on.
csv_paths = [
    entry
    for entry in sorted(Path(ROOT_FOLDER).iterdir())
    if entry.name.endswith(".csv")
]

# One DataFrame per CSV, in the same order; empty fields are read as NaN.
dataset = [pd.read_csv(csv_path, na_values=[""]) for csv_path in csv_paths]

In [6]:
# Show the first rows of one table. Exactly one line should be uncommented;
# move the comment marker to preview a different table.
# dataset[MENU].head()
# dataset[MENU_PAGE].head()
# dataset[MENU_ITEM].head()
dataset[DISH].head()

Unnamed: 0,id,name,description,menus_appeared,times_appeared,first_appeared,last_appeared,lowest_price,highest_price
0,1,Consomme printaniere royal,,8,8,1897,1927,0.2,0.4
1,2,Chicken gumbo,,111,117,1895,1960,0.1,0.8
2,3,Tomato aux croutons,,13,13,1893,1917,0.25,0.4
3,4,Onion au gratin,,41,41,1900,1971,0.25,1.0
4,5,St. Emilion,,66,68,1881,1981,0.0,18.0


In [6]:
# Print the column names, non-null counts and dtypes of one table.
# Move the comment marker to inspect a different table.
dataset[MENU].info()
# dataset[MENU_PAGE].info()
# dataset[MENU_ITEM].info()
# dataset[DISH].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17545 entries, 0 to 17544
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    17545 non-null  int64  
 1   name                  3197 non-null   object 
 2   sponsor               15984 non-null  object 
 3   event                 8154 non-null   object 
 4   venue                 8119 non-null   object 
 5   place                 8123 non-null   object 
 6   physical_description  14763 non-null  object 
 7   occasion              3791 non-null   object 
 8   notes                 10613 non-null  object 
 9   call_number           15983 non-null  object 
 10  keywords              0 non-null      float64
 11  language              0 non-null      float64
 12  date                  16959 non-null  object 
 13  location              17545 non-null  object 
 14  location_type         0 non-null      float64
 15  currency           

In [33]:
# Print the number of missing (NaN) values in each column of one table.
# Move the comment marker to check a different table.
# print(dataset[MENU].isna().sum())
# print(dataset[MENU_PAGE].isna().sum())
print(dataset[MENU_ITEM].isna().sum())
# print(dataset[DISH].isna().sum())

id                    0
menu_page_id          0
price            445916
high_price      1240821
dish_id             241
created_at            0
updated_at            0
xpos                  0
ypos                  0
dtype: int64


In [15]:
# Summary statistics for one table. include="all" reports on every column,
# so text columns get count/unique/top/freq alongside the numeric
# mean/std/quantiles. Move the comment marker to summarise a different table.
dataset[MENU].describe(include="all")
# dataset[MENU_PAGE].describe(include="all")
# dataset[MENU_ITEM].describe(include="all")
# dataset[DISH].describe(include="all")

Unnamed: 0,id,name,sponsor,event,venue,place,physical_description,occasion,notes,call_number,keywords,language,date,location,location_type,currency,currency_symbol,status,page_count,dish_count
count,17545.0,3197,15984,8154,8119,8123,14763,3791,10613,15983,0.0,0.0,16959,17545,0.0,6456,6456,17545,17545.0,17545.0
unique,,797,6370,1770,233,3714,6268,423,6969,15936,,,6599,6283,,42,34,2,,
top,,Waldorf Astoria,Waldorf Astoria,DINNER,COMMERCIAL,EN ROUTE,BROADSHEET; ILLUS; 6.5 X 7.75;,DAILY;,Dieter Zander Collection.,1899-806,,,1915-01-01,Waldorf Astoria,,Dollars,$,complete,,
freq,,475,691,1827,4704,293,127,828,210,3,,,263,703,,5549,5579,17371,,
mean,25325.953377,,,,,,,,,,,,,,,,,,3.484412,75.617213
std,6431.552203,,,,,,,,,,,,,,,,,,3.295643,98.435862
min,12463.0,,,,,,,,,,,,,,,,,,1.0,0.0
25%,20742.0,,,,,,,,,,,,,,,,,,2.0,20.0
50%,26165.0,,,,,,,,,,,,,,,,,,2.0,35.0
75%,30707.0,,,,,,,,,,,,,,,,,,4.0,93.0


In [32]:
# Attach every menu-item row to the dish it refers to.
dish_table = dataset[DISH]
menu_item_table = dataset[MENU_ITEM]

# Left join: every dish row is kept, matched on dish.id == menu_item.dish_id.
# Both tables have an `id` column, so pandas suffixes them as `id_x` (dish)
# and `id_y` (menu item) in the result.
dishJoinMenuItem = dish_table.merge(
    menu_item_table,
    how="left",
    left_on="id",
    right_on="dish_id",
)

dishJoinMenuItem.head()

Unnamed: 0,id_x,name,description,menus_appeared,times_appeared,first_appeared,last_appeared,lowest_price,highest_price,id_y,menu_page_id,price,high_price,dish_id,created_at,updated_at,xpos,ypos
0,1,Consomme printaniere royal,,8,8,1897,1927,0.2,0.4,1.0,1389.0,0.4,,1.0,2011-03-28 15:00:44 UTC,2011-04-19 04:33:15 UTC,0.111429,0.254735
1,1,Consomme printaniere royal,,8,8,1897,1927,0.2,0.4,73003.0,248.0,,,1.0,2011-04-26 20:51:20 UTC,2011-04-26 20:51:20 UTC,0.344286,0.498271
2,1,Consomme printaniere royal,,8,8,1897,1927,0.2,0.4,140158.0,5129.0,0.2,,1.0,2011-05-01 14:33:47 UTC,2012-12-23 21:55:44 UTC,0.264286,0.423484
3,1,Consomme printaniere royal,,8,8,1897,1927,0.2,0.4,168416.0,1677.0,0.4,,1.0,2011-05-04 20:00:45 UTC,2011-05-04 20:00:45 UTC,0.1,0.254735
4,1,Consomme printaniere royal,,8,8,1897,1927,0.2,0.4,244132.0,24766.0,,,1.0,2011-05-17 23:09:51 UTC,2011-05-17 23:09:51 UTC,0.38,0.33331
