In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

# Folders that hold the raw and the cleaned CSV files, respectively.
ROOT_FOLDER, ROOT_FOLDER_CLEANED = "NYPL-menus", "NYPL-menus-cleaned"

# Positional indices used to pick one table out of the `dataset` and
# `dataset_cleaned` lists. Those lists are filled by reading the CSV files in
# sorted filename order, so these values must mirror that order.
# NOTE(review): presumably the files sort as dish, menu, menu item, menu page —
# confirm against the folder contents.
DISH, MENU, MENU_ITEM, MENU_PAGE = range(4)

# Data Preview Workflow Instructions

## Accessing Datasets

Each dataset is stored in a list and is accessed by position, using the integer index constants `MENU`, `MENU_PAGE`, `MENU_ITEM` and `DISH` (not string keys):

```python
dataset[MENU]          # Menu data
dataset[MENU_PAGE]     # Menu page data
dataset[MENU_ITEM]     # Menu item data
dataset[DISH]          # Dish data


dataset_cleaned[MENU]          # Menu data cleaned
dataset_cleaned[MENU_PAGE]     # Menu page data cleaned
dataset_cleaned[MENU_ITEM]     # Menu item data cleaned
dataset_cleaned[DISH]          # Dish data cleaned
```

In [3]:
# Read every CSV file found directly inside the raw-data folder into a list of
# DataFrames. Files are visited in sorted order, so a table's position in the
# list is determined by its filename; empty fields are parsed as NaN.
dataset = [
    pd.read_csv(csv_path, na_values=[""])
    for csv_path in sorted(Path(ROOT_FOLDER).iterdir())
    if csv_path.name.endswith(".csv")
]

In [4]:
# Read every CSV file found directly inside the cleaned-data folder into a list
# of DataFrames. Files are visited in sorted order, so a table's position in
# the list is determined by its filename; empty fields are parsed as NaN.
dataset_cleaned = [
    pd.read_csv(csv_path, na_values=[""])
    for csv_path in sorted(Path(ROOT_FOLDER_CLEANED).iterdir())
    if csv_path.name.endswith(".csv")
]

In [5]:
# Preview the first five rows of the raw menu table.
# To preview another raw table, replace MENU with MENU_PAGE, MENU_ITEM or DISH.
dataset[MENU].head(n=5)

Unnamed: 0,id,name,sponsor,event,venue,place,physical_description,occasion,notes,call_number,keywords,language,date,location,location_type,currency,currency_symbol,status,page_count,dish_count
0,12463,,HOTEL EASTMAN,BREAKFAST,COMMERCIAL,"HOT SPRINGS, AR",CARD; 4.75X7.5;,EASTER;,,1900-2822,,,1900-04-15,Hotel Eastman,,,,complete,2,67
1,12464,,REPUBLICAN HOUSE,[DINNER],COMMERCIAL,"MILWAUKEE, [WI];",CARD; ILLUS; COL; 7.0X9.0;,EASTER;,WEDGEWOOD BLUE CARD; WHITE EMBOSSED GREEK KEY ...,1900-2825,,,1900-04-15,Republican House,,,,under review,2,34
2,12465,,NORDDEUTSCHER LLOYD BREMEN,FRUHSTUCK/BREAKFAST;,COMMERCIAL,DAMPFER KAISER WILHELM DER GROSSE;,CARD; ILLU; COL; 5.5X8.0;,,"MENU IN GERMAN AND ENGLISH; ILLUS, STEAMSHIP A...",1900-2827,,,1900-04-16,Norddeutscher Lloyd Bremen,,,,complete,2,84
3,12466,,NORDDEUTSCHER LLOYD BREMEN,LUNCH;,COMMERCIAL,DAMPFER KAISER WILHELM DER GROSSE;,CARD; ILLU; COL; 5.5X8.0;,,"MENU IN GERMAN AND ENGLISH; ILLUS, HARBOR SCEN...",1900-2828,,,1900-04-16,Norddeutscher Lloyd Bremen,,,,complete,2,63
4,12467,,NORDDEUTSCHER LLOYD BREMEN,DINNER;,COMMERCIAL,DAMPFER KAISER WILHELM DER GROSSE;,FOLDER; ILLU; COL; 5.5X7.5;,,"MENU IN GERMAN AND ENGLISH; ILLUS, HARBOR SCEN...",1900-2829,,,1900-04-16,Norddeutscher Lloyd Bremen,,,,complete,4,33


In [6]:
# Preview the first five rows of the cleaned menu table.
# To preview another cleaned table, replace MENU with MENU_PAGE, MENU_ITEM or DISH.
dataset_cleaned[MENU].head(n=5)

Unnamed: 0,id,name,sponsor,event,venue,place,physical_description,occasion,notes,call_number,...,language,date,location,location_type,currency,currency_symbol,status,page_count,dish_count,cleaned_place
0,12463,,HOTEL EASTMAN,BREAKFAST,COMMERCIAL,"HOT SPRINGS, AR",CARD; 4.75X7.5;,EASTER;,,1900-2822,...,,1900-04-15,Hotel Eastman,,Dollars,,complete,2,67,United States
1,12464,,REPUBLICAN HOUSE,[DINNER],COMMERCIAL,"MILWAUKEE, [WI];",CARD; ILLUS; COL; 7.0X9.0;,EASTER;,WEDGEWOOD BLUE CARD; WHITE EMBOSSED GREEK KEY ...,1900-2825,...,,1900-04-15,Republican House,,Dollars,,under review,2,34,United States
2,12465,,NORDDEUTSCHER LLOYD BREMEN,FRUHSTUCK/BREAKFAST;,COMMERCIAL,DAMPFER KAISER WILHELM DER GROSSE;,CARD; ILLU; COL; 5.5X8.0;,,"MENU IN GERMAN AND ENGLISH; ILLUS, STEAMSHIP A...",1900-2827,...,,1900-04-16,Norddeutscher Lloyd Bremen,,,,complete,2,84,Shipboard
3,12466,,NORDDEUTSCHER LLOYD BREMEN,LUNCH;,COMMERCIAL,DAMPFER KAISER WILHELM DER GROSSE;,CARD; ILLU; COL; 5.5X8.0;,,"MENU IN GERMAN AND ENGLISH; ILLUS, HARBOR SCEN...",1900-2828,...,,1900-04-16,Norddeutscher Lloyd Bremen,,,,complete,2,63,Shipboard
4,12467,,NORDDEUTSCHER LLOYD BREMEN,DINNER;,COMMERCIAL,DAMPFER KAISER WILHELM DER GROSSE;,FOLDER; ILLU; COL; 5.5X7.5;,,"MENU IN GERMAN AND ENGLISH; ILLUS, HARBOR SCEN...",1900-2829,...,,1900-04-16,Norddeutscher Lloyd Bremen,,,,complete,4,33,Shipboard


In [7]:
# Print the column dtypes and non-null counts of one raw table.
# The active line inspects the dish table; uncomment another line to inspect
# a different raw table instead.
# dataset[MENU].info()
# dataset[MENU_PAGE].info()
# dataset[MENU_ITEM].info()
dataset[DISH].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423397 entries, 0 to 423396
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              423397 non-null  int64  
 1   name            423397 non-null  object 
 2   description     0 non-null       float64
 3   menus_appeared  423397 non-null  int64  
 4   times_appeared  423397 non-null  int64  
 5   first_appeared  423397 non-null  int64  
 6   last_appeared   423397 non-null  int64  
 7   lowest_price    394297 non-null  float64
 8   highest_price   394297 non-null  float64
dtypes: float64(3), int64(5), object(1)
memory usage: 29.1+ MB


In [8]:
# Print the column dtypes and non-null counts of one cleaned table.
# The active line inspects the dish table; uncomment another line to inspect
# a different cleaned table instead.
# dataset_cleaned[MENU].info()
# dataset_cleaned[MENU_PAGE].info()
# dataset_cleaned[MENU_ITEM].info()
dataset_cleaned[DISH].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 387783 entries, 0 to 387782
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              387783 non-null  int64  
 1   name            387783 non-null  object 
 2   description     0 non-null       float64
 3   menus_appeared  387783 non-null  int64  
 4   times_appeared  387783 non-null  int64  
 5   first_appeared  387783 non-null  float64
 6   last_appeared   387783 non-null  float64
 7   lowest_price    387783 non-null  float64
 8   highest_price   387783 non-null  float64
 9   dish_id         380045 non-null  float64
dtypes: float64(6), int64(3), object(1)
memory usage: 29.6+ MB


In [9]:
# Print the number of missing (NaN) values in each column of one raw table.
# The active line reports on the dish table; uncomment further lines to
# report on other raw tables as well (print lets several reports share a cell).
# print(dataset[MENU].isna().sum())
# print(dataset[MENU_PAGE].isna().sum())
# print(dataset[MENU_ITEM].isna().sum())
print(dataset[DISH].isna().sum())

id                     0
name                   0
description       423397
menus_appeared         0
times_appeared         0
first_appeared         0
last_appeared          0
lowest_price       29100
highest_price      29100
dtype: int64


In [10]:
# Summary statistics for one raw table. include="all" makes describe() cover
# non-numeric columns too (count / unique / top / freq) alongside the numeric
# summaries. Only the last uncommented expression in the cell is displayed.
# dataset[MENU].describe(include="all")
# dataset[MENU_PAGE].describe(include="all")
dataset[MENU_ITEM].describe(include="all")
# dataset[DISH].describe(include="all")

Unnamed: 0,id,menu_page_id,price,high_price,dish_id,created_at,updated_at,xpos,ypos
count,1332726.0,1332726.0,886810.0,91905.0,1332485.0,1332726,1332726,1332726.0,1332726.0
unique,,,,,,1291090,1295796,,
top,,,,,,2011-04-29 02:04:21 UTC,2011-04-29 02:04:21 UTC,,
freq,,,,,,6,6,,
mean,697898.4,47594.87,12.838627,8.106321,158011.0,,,0.3890132,0.5497062
std,399980.7,22039.21,499.547387,90.095222,167762.0,,,0.2208378,0.2239532
min,1.0,130.0,0.0,0.0,1.0,,,0.0,0.0
25%,350251.2,32049.0,0.25,0.5,5089.0,,,0.184286,0.3669165
50%,702410.5,53371.0,0.4,1.25,80700.0,,,0.377143,0.567294
75%,1045549.0,66823.0,1.0,3.0,332524.0,,,0.565333,0.7385308


In [11]:
# Left-join the raw dish table to the raw menu-item table on
# dish "id" == menu item "dish_id". Every dish row is kept; a dish with several
# matching menu items yields one output row per item. Column names that occur
# in both tables receive pandas' default "_x" / "_y" suffixes.
dishJoinMenuItem = dataset[DISH].merge(
    dataset[MENU_ITEM],
    left_on="id",
    right_on="dish_id",
    how="left",
)

# Show the first five joined rows.
dishJoinMenuItem.head(n=5)

Unnamed: 0,id_x,name,description,menus_appeared,times_appeared,first_appeared,last_appeared,lowest_price,highest_price,id_y,menu_page_id,price,high_price,dish_id,created_at,updated_at,xpos,ypos
0,1,Consomme printaniere royal,,8,8,1897,1927,0.2,0.4,1.0,1389.0,0.4,,1.0,2011-03-28 15:00:44 UTC,2011-04-19 04:33:15 UTC,0.111429,0.254735
1,1,Consomme printaniere royal,,8,8,1897,1927,0.2,0.4,73003.0,248.0,,,1.0,2011-04-26 20:51:20 UTC,2011-04-26 20:51:20 UTC,0.344286,0.498271
2,1,Consomme printaniere royal,,8,8,1897,1927,0.2,0.4,140158.0,5129.0,0.2,,1.0,2011-05-01 14:33:47 UTC,2012-12-23 21:55:44 UTC,0.264286,0.423484
3,1,Consomme printaniere royal,,8,8,1897,1927,0.2,0.4,168416.0,1677.0,0.4,,1.0,2011-05-04 20:00:45 UTC,2011-05-04 20:00:45 UTC,0.1,0.254735
4,1,Consomme printaniere royal,,8,8,1897,1927,0.2,0.4,244132.0,24766.0,,,1.0,2011-05-17 23:09:51 UTC,2011-05-17 23:09:51 UTC,0.38,0.33331


In [12]:
# Rows of the raw menu table whose "currency" value is missing.
dataset[MENU].loc[lambda menus: menus["currency"].isna()]

Unnamed: 0,id,name,sponsor,event,venue,place,physical_description,occasion,notes,call_number,keywords,language,date,location,location_type,currency,currency_symbol,status,page_count,dish_count
0,12463,,HOTEL EASTMAN,BREAKFAST,COMMERCIAL,"HOT SPRINGS, AR",CARD; 4.75X7.5;,EASTER;,,1900-2822,,,1900-04-15,Hotel Eastman,,,,complete,2,67
1,12464,,REPUBLICAN HOUSE,[DINNER],COMMERCIAL,"MILWAUKEE, [WI];",CARD; ILLUS; COL; 7.0X9.0;,EASTER;,WEDGEWOOD BLUE CARD; WHITE EMBOSSED GREEK KEY ...,1900-2825,,,1900-04-15,Republican House,,,,under review,2,34
2,12465,,NORDDEUTSCHER LLOYD BREMEN,FRUHSTUCK/BREAKFAST;,COMMERCIAL,DAMPFER KAISER WILHELM DER GROSSE;,CARD; ILLU; COL; 5.5X8.0;,,"MENU IN GERMAN AND ENGLISH; ILLUS, STEAMSHIP A...",1900-2827,,,1900-04-16,Norddeutscher Lloyd Bremen,,,,complete,2,84
3,12466,,NORDDEUTSCHER LLOYD BREMEN,LUNCH;,COMMERCIAL,DAMPFER KAISER WILHELM DER GROSSE;,CARD; ILLU; COL; 5.5X8.0;,,"MENU IN GERMAN AND ENGLISH; ILLUS, HARBOR SCEN...",1900-2828,,,1900-04-16,Norddeutscher Lloyd Bremen,,,,complete,2,63
4,12467,,NORDDEUTSCHER LLOYD BREMEN,DINNER;,COMMERCIAL,DAMPFER KAISER WILHELM DER GROSSE;,FOLDER; ILLU; COL; 5.5X7.5;,,"MENU IN GERMAN AND ENGLISH; ILLUS, HARBOR SCEN...",1900-2829,,,1900-04-16,Norddeutscher Lloyd Bremen,,,,complete,4,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17517,35492,Hotel Sherman,Hotel Sherman,,,,,,5 images,1913-0723_wotm,,,1913-09-18,Hotel Sherman,,,,complete,5,20
17518,35493,Ritz Carlton,Ritz Carlton,,,,,,2 images,1913-0724_wotm,,,1913-09-20,Ritz Carlton,,,,complete,2,9
17539,35514,Woolpack Hotel,Woolpack Hotel,,,,,,2 images,1913-0745_wotm,,,1913-09-24,Woolpack Hotel,,,,complete,2,8
17540,35515,Hotel LaSalle,Hotel LaSalle,,,,,,1 image,1913-0746_wotm,,,1913-09-24,Hotel LaSalle,,,,complete,1,22
