In [24]:
import pandas as pd

In [25]:
#importing dataset of NYC restaurant information, violations, and grades from the Dept of Mental Health and Hygiene
#https://data.cityofnewyork.us/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/43nn-pn8j

#original file was upwards of 170mb, preliminary data cleaning involved manually removing unnecessary columns in Excel

df = pd.read_csv("./DOHMH_New_York_City_Restaurant_Inspection_Results.csv")
df.head()

Unnamed: 0,DBA,BORO,CUISINE DESCRIPTION,VIOLATION CODE,SCORE,GRADE,Latitude,Longitude
0,YANKEE JZ PIZZA,Bronx,Pizza,10F,17.0,B,40.829178,-73.875707
1,ARIANA KEBAB HOUSE,Manhattan,Afghan,10B,5.0,A,40.764993,-73.987856
2,TAP NYC,Manhattan,Café/Coffee/Tea,08C,20.0,,40.777716,-73.978556
3,GEORGES RESTAURANT,Brooklyn,American,10F,7.0,A,40.641088,-74.014474
4,Q & I BAKERY,Brooklyn,Bakery,06C,22.0,,40.628482,-74.006099


How scores and grades are determined
https://www1.nyc.gov/assets/doh/downloads/pdf/rii/restaurant-grading-faq.pdf

GRADE DEFINITIONS
N = Not Yet Graded
A = Grade A
B = Grade B
C = Grade C
Z = Grade Pending
P= Grade Pending issued on re-opening following an initial inspection that resulted in a closure

In [26]:
print(df.shape)
print('\n')
print(df.dtypes)
print('\n')
print(df.columns)

(397490, 8)


DBA                     object
BORO                    object
CUISINE DESCRIPTION     object
VIOLATION CODE          object
SCORE                  float64
GRADE                   object
Latitude               float64
Longitude              float64
dtype: object


Index(['DBA', 'BORO', 'CUISINE DESCRIPTION', 'VIOLATION CODE', 'SCORE',
       'GRADE', 'Latitude', 'Longitude'],
      dtype='object')


In [27]:
#renaming columns

df = df.rename(columns={"DBA":"Name","BORO":"Borough", "CUISINE DESCRIPTION":"Cuisine", "VIOLATION CODE":"Violation Code", "SCORE":"Score", "GRADE":"Grade"})

df.columns

Index(['Name', 'Borough', 'Cuisine', 'Violation Code', 'Score', 'Grade',
       'Latitude', 'Longitude'],
      dtype='object')

In [28]:
#removing rows with null values from Name, Violation Code, and Grade rows

df = df.dropna(subset=['Name', 'Violation Code', 'Grade'])
del df['Score']

df.head()

Unnamed: 0,Name,Borough,Cuisine,Violation Code,Grade,Latitude,Longitude
0,YANKEE JZ PIZZA,Bronx,Pizza,10F,B,40.829178,-73.875707
1,ARIANA KEBAB HOUSE,Manhattan,Afghan,10B,A,40.764993,-73.987856
3,GEORGES RESTAURANT,Brooklyn,American,10F,A,40.641088,-74.014474
7,ROBERTA'S PIZZA & BAKERY,Brooklyn,Pizza,10F,A,40.704818,-73.934015
8,"DUNKIN', BASKIN ROBBINS",Manhattan,Donuts,10B,A,40.750971,-73.996492


In [29]:
df.shape

(201343, 7)

In [35]:
df.shape

(74679, 7)

In [54]:
#Filtering to only Manhattan restaurant data
df = df[df.Borough == "Manhattan"]

#removing rows with missing Longitude and Latitude values
df = df[df.Latitude != 0]
df = df[df.Longitude != 0]

#removing rows with Grades of "N", "G", and "Z", which are unnecessary for our analysis
df = df[df.Grade != "N"]
df = df[df.Grade != "G"]
df = df[df.Grade != "Z"]

#may keep p grades for some data viz
df = df[df.Grade != "P"]

df.head(20)

Unnamed: 0,Name,Borough,Cuisine,Violation Code,Grade,Latitude,Longitude
1,ARIANA KEBAB HOUSE,Manhattan,Afghan,10B,A,40.764993,-73.987856
8,"DUNKIN', BASKIN ROBBINS",Manhattan,Donuts,10B,A,40.750971,-73.996492
13,SPRING PLACE,Manhattan,Mediterranean,05D,A,40.720721,-74.005916
14,THAI BKK,Manhattan,Thai,10F,B,40.802967,-73.938432
16,UPTOWN BOURBON,Manhattan,Café/Coffee/Tea,10F,A,40.829264,-73.948388
23,JOE COFFEE COMPANY,Manhattan,Café/Coffee/Tea,10F,A,40.71183,-74.012087
24,MCDONALD'S,Manhattan,Hamburgers,08A,A,40.729312,-73.993469
27,SHALEL,Manhattan,Mediterranean,10B,A,40.775641,-73.978625
32,ELMO,Manhattan,American,08A,A,40.741864,-73.997279
46,LA SALLE DUMPLING ROOM,Manhattan,Chinese,06E,A,40.813912,-73.959596


In [63]:
#getting summaries
df["Grade"].value_counts()
df["Cuisine"].value_counts()
df["Violation Code"].value_counts()

10F    15337
08A     7242
06D     6395
10B     5861
06C     5252
       ...  
03D        2
06I        1
04I        1
06H        1
04B        1
Name: Violation Code, Length: 63, dtype: int64

In [37]:
#importing csv of violation code descriptions and their maximum penalty
#original dataset was stored as a word doc PDF and converted to a tabular format with Excel
#https://www1.nyc.gov/assets/doh/downloads/pdf/rii/ri-violation-penalty.pdf

codedf = pd.read_csv("./ViolationCodes.csv")
codedf.head()

Unnamed: 0,Violation Code,Violation Group,Violation Description,Maximum Penalty
0,02A,2,Food temperature storage violation,600
1,02B,2,Food temperature storage violation,600
2,02C,2,Food temperature storage violation,300
3,02D,2,Food temperature storage violation,300
4,02E,2,Food temperature storage violation,200


In [39]:
#index df by violation code
#codedf.set_index('Violation Code')

In [41]:
#merge restaurant data and violation code data frame on the "Violation Code" column
mergeddf = pd.merge(df, codedf, on='Violation Code')
mergeddf.head()

Unnamed: 0,Name,Borough,Cuisine,Violation Code,Grade,Latitude,Longitude,Violation Group,Violation Description,Maximum Penalty
0,ARIANA KEBAB HOUSE,Manhattan,Afghan,10B,A,40.764993,-73.987856,10,Facility and machinery related,200
1,"DUNKIN', BASKIN ROBBINS",Manhattan,Donuts,10B,A,40.750971,-73.996492,10,Facility and machinery related,200
2,SHALEL,Manhattan,Mediterranean,10B,A,40.775641,-73.978625,10,Facility and machinery related,200
3,WHISKEY TOWN,Manhattan,American,10B,A,40.725795,-73.990371,10,Facility and machinery related,200
4,GO GO CURRY,Manhattan,Japanese,10B,A,40.740985,-73.995919,10,Facility and machinery related,200


In [67]:
mergeddf.columns

Index(['Name', 'Borough', 'Cuisine', 'Violation Code', 'Grade', 'Latitude',
       'Longitude', 'Violation Group', 'Violation Description',
       'Maximum Penalty'],
      dtype='object')

In [68]:
#counts of each grade... most restaurants are in the A&B grade range!
gradecounts = df.groupby('Grade').count()
gradecounts

Unnamed: 0_level_0,Name,Borough,Cuisine,Violation Code,Latitude,Longitude
Grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,61796,61796,61796,61796,61722,61722
B,9183,9183,9183,9183,9175,9175
C,3700,3700,3700,3700,3692,3692


In [69]:
#group combined dataframe by Grades
groupedbygrade = df.groupby('Grade')

In [80]:
#group of A grade restaurants
groupA= groupedbygrade.get_group('A')
del groupA['Borough']
groupA.head()

Unnamed: 0,Name,Cuisine,Violation Code,Grade,Latitude,Longitude
1,ARIANA KEBAB HOUSE,Afghan,10B,A,40.764993,-73.987856
8,"DUNKIN', BASKIN ROBBINS",Donuts,10B,A,40.750971,-73.996492
13,SPRING PLACE,Mediterranean,05D,A,40.720721,-74.005916
16,UPTOWN BOURBON,Café/Coffee/Tea,10F,A,40.829264,-73.948388
23,JOE COFFEE COMPANY,Café/Coffee/Tea,10F,A,40.71183,-74.012087


In [71]:
#converting A grade group to dictionary
groupA.reset_index(inplace=True)
Adata=groupA.to_dict("records")
#Adata

In [81]:
#group of B grade restaurants
groupB= groupedbygrade.get_group('B')
del groupB['Borough']
groupB.head()

Unnamed: 0,Name,Cuisine,Violation Code,Grade,Latitude,Longitude
14,THAI BKK,Thai,10F,B,40.802967,-73.938432
71,TEN DEGREES,American,04C,B,40.727249,-73.984471
86,GFG BAKERY,Bakery,06A,B,40.710274,-74.00549
183,CALLE DAO CHELSEA,Chinese/Cuban,04N,B,40.747186,-74.002963
197,CHIU HONG BAKERY,Bakery,10F,B,40.719573,-73.996281


In [48]:
#converting B grade group to dictionary
groupB.reset_index(inplace=True)
Bdata=groupB.to_dict("records")
#Bdata

In [82]:
#group of C grade restaurants
groupC= groupedbygrade.get_group('C')
del groupC['Borough']
groupC.head()

Unnamed: 0,Name,Cuisine,Violation Code,Grade,Latitude,Longitude
50,SUN SAI GAI RESTAURANT,Chinese,06D,C,40.717139,-73.998795
79,AUGUST GATHERINGS,Chinese,02G,C,40.718786,-74.001053
350,DALLAS BBQ,Barbecue,04N,C,40.770262,-73.960218
545,SYMPOSIUM GREEK RESTAURANT,Greek,04N,C,40.80549,-73.963947
677,CECI RESTAURANT,Spanish,04M,C,40.863419,-73.926435


In [50]:
#converting C grade group to dictionary
groupC.reset_index(inplace=True)
Cdata=groupC.to_dict("records")
#Cdata

In [52]:
#group of P grade restaurants
#groupP= groupedbygrade.get_group('P')
#groupP.head()

#converting P grade group to dictionary
#groupP.reset_index(inplace=True)
#Pdata=groupP.to_dict("records")
#Pdata

In [53]:
#Prepping to export dataframes to MongoDB
import pymongo

In [166]:
# The default port used by MongoDB is 27017
# https://docs.mongodb.com/manual/reference/default-mongodb-port/
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Define the 'classDB' database in Mongo
db = client["restaurantgradesDB"]

#collections by grade
collectionA = db["GroupA"]
collectionB = db["GroupB"]
collectionC = db["GroupC"]
#collectionP = db["GroupP"]

In [None]:
# Insert data from grade groups into MongoDB collection
collectionA.insert_many(Adata)

In [None]:
collectionB.insert_many(Bdata)

In [None]:
collectionC.insert_many(Cdata)

In [None]:
#collectionP.insert_many(Pdata)