In [81]:
import pandas as pd

In [82]:
#importing dataset of NYC restaurant information, violations, and grades from the Dept of Mental Health and Hygiene
#https://data.cityofnewyork.us/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/43nn-pn8j



#original file was upwards of 170mb, preliminary data cleaning involving removing unnecessary columns was done in Excel

df = pd.read_csv("./DOHMH_New_York_City_Restaurant_Inspection_Results.csv")
df.head()

Unnamed: 0,DBA,BORO,CUISINE DESCRIPTION,VIOLATION CODE,SCORE,GRADE,Latitude,Longitude
0,YANKEE JZ PIZZA,Bronx,Pizza,10F,17.0,B,40.829178,-73.875707
1,ARIANA KEBAB HOUSE,Manhattan,Afghan,10B,5.0,A,40.764993,-73.987856
2,TAP NYC,Manhattan,Café/Coffee/Tea,08C,20.0,,40.777716,-73.978556
3,GEORGES RESTAURANT,Brooklyn,American,10F,7.0,A,40.641088,-74.014474
4,Q & I BAKERY,Brooklyn,Bakery,06C,22.0,,40.628482,-74.006099


How scores and grades are determined
https://www1.nyc.gov/assets/doh/downloads/pdf/rii/restaurant-grading-faq.pdf

GRADE DEFINITIONS
N = Not Yet Graded
A = Grade A
B = Grade B
C = Grade C
Z = Grade Pending
P= Grade Pending issued on re-opening following an initial inspection that resulted in a closure

In [83]:
df.shape

(397490, 8)

In [84]:
df.dtypes

DBA                     object
BORO                    object
CUISINE DESCRIPTION     object
VIOLATION CODE          object
SCORE                  float64
GRADE                   object
Latitude               float64
Longitude              float64
dtype: object

In [85]:
df.columns

Index(['DBA', 'BORO', 'CUISINE DESCRIPTION', 'VIOLATION CODE', 'SCORE',
       'GRADE', 'Latitude', 'Longitude'],
      dtype='object')

In [86]:
#renaming columns

df = df.rename(columns={"DBA":"Name","BORO":"Borough", "CUISINE DESCRIPTION":"Cuisine", "VIOLATION CODE":"Violation Code", "SCORE":"Score", "GRADE":"Grade"})

df.columns

Index(['Name', 'Borough', 'Cuisine', 'Violation Code', 'Score', 'Grade',
       'Latitude', 'Longitude'],
      dtype='object')

In [87]:
#removing rows with null values from Name, Violation Code, and Grade rows

df = df.dropna(subset=['Name', 'Violation Code', 'Grade'])
del df['Score']

df.head()

Unnamed: 0,Name,Borough,Cuisine,Violation Code,Grade,Latitude,Longitude
0,YANKEE JZ PIZZA,Bronx,Pizza,10F,B,40.829178,-73.875707
1,ARIANA KEBAB HOUSE,Manhattan,Afghan,10B,A,40.764993,-73.987856
3,GEORGES RESTAURANT,Brooklyn,American,10F,A,40.641088,-74.014474
7,ROBERTA'S PIZZA & BAKERY,Brooklyn,Pizza,10F,A,40.704818,-73.934015
8,"DUNKIN', BASKIN ROBBINS",Manhattan,Donuts,10B,A,40.750971,-73.996492


In [88]:
#df.shape

In [89]:
df = df[df.Borough != "Manhattan"]

#removing rows with missing Longitude and Latitude values
df = df[df.Latitude != 0]
df = df[df.Longitude != 0]

#removing rows with Grades of "N", "G", and "Z", which are unnecessary for our analysis
df = df[df.Grade != "N"]
df = df[df.Grade != "G"]
df = df[df.Grade != "Z"]
df.shape

(117818, 7)

In [90]:
df.head()

Unnamed: 0,Name,Borough,Cuisine,Violation Code,Grade,Latitude,Longitude
0,YANKEE JZ PIZZA,Bronx,Pizza,10F,B,40.829178,-73.875707
3,GEORGES RESTAURANT,Brooklyn,American,10F,A,40.641088,-74.014474
7,ROBERTA'S PIZZA & BAKERY,Brooklyn,Pizza,10F,A,40.704818,-73.934015
11,GOOD HOPE RESTAURANT,Queens,Chinese,08A,B,40.687063,-73.822045
15,MORNING GLORY BAKERY,Queens,Bakery,10F,A,40.72813,-73.863607


In [91]:
#importing csv of violation code descriptions and their maximum penalty
#original dataset was stored as a word doc PDF and converted to a tabular format with Excel
#https://www1.nyc.gov/assets/doh/downloads/pdf/rii/ri-violation-penalty.pdf

codedf = pd.read_csv("./ViolationCodes.csv")
codedf.head()

Unnamed: 0,Violation Code,Violation Group,Violation Description,Maximum Penalty
0,02A,2,Food temperature storage violation,600
1,02B,2,Food temperature storage violation,600
2,02C,2,Food temperature storage violation,300
3,02D,2,Food temperature storage violation,300
4,02E,2,Food temperature storage violation,200


In [92]:
#index df by violation code

codedf.set_index('Violation Code')

Unnamed: 0_level_0,Violation Group,Violation Description,Maximum Penalty
Violation Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
02A,2,Food temperature storage violation,600
02B,2,Food temperature storage violation,600
02C,2,Food temperature storage violation,300
02D,2,Food temperature storage violation,300
02E,2,Food temperature storage violation,200
...,...,...,...
22B,22,Failure to take reasonable precautions to prot...,200
22C,22,Failure to take reasonable precautions to prot...,200
22E,22,Failure to take reasonable precautions to prot...,200
22F,22,Failure to take reasonable precautions to prot...,1000


In [93]:
#merge restaurant data and violation code data frame on the "Violation Code" column

mergeddf = pd.merge(df, codedf, on='Violation Code')
mergeddf

Unnamed: 0,Name,Borough,Cuisine,Violation Code,Grade,Latitude,Longitude,Violation Group,Violation Description,Maximum Penalty
0,YANKEE JZ PIZZA,Bronx,Pizza,10F,B,40.829178,-73.875707,10,Facility and machinery related,200
1,GEORGES RESTAURANT,Brooklyn,American,10F,A,40.641088,-74.014474,10,Facility and machinery related,200
2,ROBERTA'S PIZZA & BAKERY,Brooklyn,Pizza,10F,A,40.704818,-73.934015,10,Facility and machinery related,200
3,MORNING GLORY BAKERY,Queens,Bakery,10F,A,40.728130,-73.863607,10,Facility and machinery related,200
4,LA SABROSURA,Queens,"Latin (Cuban, Dominican, Puerto Rican, South &...",10F,A,40.766028,-73.919554,10,Facility and machinery related,200
...,...,...,...,...,...,...,...,...,...,...
117813,EMPANADAS MONUMENTAL,Bronx,"Latin (Cuban, Dominican, Puerto Rican, South &...",03G,C,40.854345,-73.911797,3,Food not from an approved source,300
117814,GOOD MORNING BROOKLYN,Brooklyn,Salads,03G,A,40.706331,-73.922465,3,Food not from an approved source,300
117815,THE WILD GOOSE,Queens,Irish,03G,C,40.744798,-73.910247,3,Food not from an approved source,300
117816,ANGIE'S BREAKFAST SPOT,Brooklyn,"Latin (Cuban, Dominican, Puerto Rican, South &...",03G,A,40.684938,-73.914031,3,Food not from an approved source,300


In [94]:
mergeddf.columns

Index(['Name', 'Borough', 'Cuisine', 'Violation Code', 'Grade', 'Latitude',
       'Longitude', 'Violation Group', 'Violation Description',
       'Maximum Penalty'],
      dtype='object')

In [95]:
#counts of each grade... most restaurants are in the A&B grade range!
gradecounts = df.groupby('Grade').count()
gradecounts

Unnamed: 0_level_0,Name,Borough,Cuisine,Violation Code,Latitude,Longitude
Grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,95054,95054,95054,95054,94972,94972
B,15654,15654,15654,15654,15644,15644
C,5565,5565,5565,5565,5565,5565
P,1545,1545,1545,1545,1542,1542


In [96]:
#group combined dataframe by Grades
groupedbygrade = df.groupby('Grade')

In [97]:
#group of A grade restaurants
groupA= groupedbygrade.get_group('A')
groupA.head()

Unnamed: 0,Name,Borough,Cuisine,Violation Code,Grade,Latitude,Longitude
3,GEORGES RESTAURANT,Brooklyn,American,10F,A,40.641088,-74.014474
7,ROBERTA'S PIZZA & BAKERY,Brooklyn,Pizza,10F,A,40.704818,-73.934015
15,MORNING GLORY BAKERY,Queens,Bakery,10F,A,40.72813,-73.863607
18,LA SABROSURA,Queens,"Latin (Cuban, Dominican, Puerto Rican, South &...",10F,A,40.766028,-73.919554
28,FRENCH LOUIE,Brooklyn,French,10F,A,40.68817,-73.988061


In [98]:
#converting A grade group to dictionary
groupA.reset_index(inplace=True)
Adata=groupA.to_dict("records")
Adata

[{'index': 3,
  'Name': 'GEORGES RESTAURANT',
  'Borough': 'Brooklyn',
  'Cuisine': 'American',
  'Violation Code': '10F',
  'Grade': 'A',
  'Latitude': 40.641088399999994,
  'Longitude': -74.01447431},
 {'index': 7,
  'Name': "ROBERTA'S PIZZA & BAKERY",
  'Borough': 'Brooklyn',
  'Cuisine': 'Pizza',
  'Violation Code': '10F',
  'Grade': 'A',
  'Latitude': 40.70481776,
  'Longitude': -73.93401504},
 {'index': 15,
  'Name': 'MORNING GLORY BAKERY',
  'Borough': 'Queens',
  'Cuisine': 'Bakery',
  'Violation Code': '10F',
  'Grade': 'A',
  'Latitude': 40.7281301,
  'Longitude': -73.8636072},
 {'index': 18,
  'Name': 'LA SABROSURA',
  'Borough': 'Queens',
  'Cuisine': 'Latin (Cuban, Dominican, Puerto Rican, South & Central American)',
  'Violation Code': '10F',
  'Grade': 'A',
  'Latitude': 40.76602777,
  'Longitude': -73.91955368},
 {'index': 28,
  'Name': 'FRENCH LOUIE',
  'Borough': 'Brooklyn',
  'Cuisine': 'French',
  'Violation Code': '10F',
  'Grade': 'A',
  'Latitude': 40.68816986,
 

In [99]:
#group of B grade restaurants

groupB= groupedbygrade.get_group('B')
groupB.head()

Unnamed: 0,Name,Borough,Cuisine,Violation Code,Grade,Latitude,Longitude
0,YANKEE JZ PIZZA,Bronx,Pizza,10F,B,40.829178,-73.875707
11,GOOD HOPE RESTAURANT,Queens,Chinese,08A,B,40.687063,-73.822045
37,JR PRIMOS RESTAURANT,Bronx,Spanish,10F,B,40.865413,-73.867511
43,MOJAVE,Queens,Southwestern,04H,B,40.775332,-73.911679
55,FULL KEE SEAFOOD RESTURANT,Queens,Chinese,08A,B,40.760941,-73.830333


In [100]:
#converting B grade group to dictionary

groupB.reset_index(inplace=True)
Bdata=groupB.to_dict("records")
#Bdata

In [101]:
#group of C grade restaurants

groupC= groupedbygrade.get_group('C')
groupC.head()

Unnamed: 0,Name,Borough,Cuisine,Violation Code,Grade,Latitude,Longitude
90,AUTENTICO TACOS AL SUADERO,Queens,Mexican,05D,C,40.748151,-73.879038
131,OBAAPA AFRICAN & CARIBBEAN RESTAURANT,Bronx,African,04L,C,40.860269,-73.902719
189,EIGHT JANE FOOD,Queens,Asian,10F,C,40.761359,-73.83104
325,YOLIE'S BAR & RESTAURANT,Brooklyn,Creole,04H,C,40.644856,-73.922561
376,LA CASA DE JULIA,Queens,Mexican,04N,C,40.745965,-73.899844


In [102]:
#converting C grade group to dictionary

groupC.reset_index(inplace=True)
Cdata=groupC.to_dict("records")
#Cdata

In [103]:
#group of P grade restaurants

groupP= groupedbygrade.get_group('P')
groupP.head()

Unnamed: 0,Name,Borough,Cuisine,Violation Code,Grade,Latitude,Longitude
53,TASTY THAI,Queens,Thai,10F,P,40.699992,-73.899383
70,SUMELA TO GO,Brooklyn,Turkish,10B,P,40.689353,-73.992514
149,GOLDEN PUNJAB INDIAN RESTAURANT,Queens,Indian,10F,P,40.692517,-73.816913
703,KOREAN NOODLE HOUSE,Queens,Korean,10F,P,40.759449,-73.772734
807,NEW CHEUN HING KITCHEN,Queens,Chinese,08A,P,40.691699,-73.819793


In [104]:
#converting P grade group to dictionary

groupP.reset_index(inplace=True)
Pdata=groupP.to_dict("records")
#Pdata

In [105]:
#Prepping to export dataframes to MongoDB
import pymongo

In [106]:
# The default port used by MongoDB is 27017
# https://docs.mongodb.com/manual/reference/default-mongodb-port/
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Define the 'classDB' database in Mongo
db = client["restaurantgradesDB"]

#collections by grade
collectionA = db["GroupA"]
collectionB = db["GroupB"]
collectionC = db["GroupC"]
collectionP = db["GroupP"]

In [107]:
# Insert data from grade groups into MongoDB collection
collectionA.insert_many(Adata)

<pymongo.results.InsertManyResult at 0x1390705f0>

In [108]:
collectionB.insert_many(Bdata)

<pymongo.results.InsertManyResult at 0x12293f410>

In [109]:
collectionC.insert_many(Cdata)

<pymongo.results.InsertManyResult at 0x13834bd20>

In [110]:
collectionP.insert_many(Pdata)

<pymongo.results.InsertManyResult at 0x12f84a3c0>