https://www1.nyc.gov/site/doh/business/food-operators/the-inspection-process.page

See https://www1.nyc.gov/assets/doh/downloads/pdf/rii/blue-book.pdf 

In [None]:
!sudo pip install pandas-profiling[notebook]

## Download Latest Dataset

In [None]:
# Download the CSV export of dataset 43nn-pn8j from data.cityofnewyork.us
# and save it in the working directory as restaurants.csv
!curl 'https://data.cityofnewyork.us/api/views/43nn-pn8j/rows.csv?accessType=DOWNLOAD' -o restaurants.csv

In [None]:
import pandas as pd
import numpy as np
df = pd.read_csv("restaurants.csv", dtype = 'object')

In [None]:
initial_size = len(df)
initial_size

In [None]:
# Render our plots inline, inside the notebook
%matplotlib inline
# Ask the inline backend to produce 'retina' (high-resolution) figures
%config InlineBackend.figure_format = 'retina'

In [None]:
df.dtypes

In [None]:
# Adding underscores in all column names
cols = df.columns
cols = cols.map(lambda x: x.replace(' ', '_').upper())
df.columns = cols

In [None]:
df.dtypes

## Cleaning Individual Columns

In [None]:
df.CAMIS.isnull().sum()

In [None]:
df.DBA.isnull().sum()

In [None]:
# Checks that all CAMIS values correpond to a unique DBA value
# ie the same CAMIS always has the same DBA
max_cardinality = df[['DBA','CAMIS']].pivot_table(
    index='CAMIS',
    values='DBA',
    aggfunc=pd.Series.nunique
).DBA.max()

assert(max_cardinality==1)

### INSPECTION_TYPE

In [None]:
df.INSPECTION_TYPE.value_counts()

In [None]:
sum(df.INSPECTION_TYPE.isnull())

In [None]:
# We create a column "TO_DELETE" to mark the entries that we are not interested in.
# We will perform first the inspection/analysis on all the attributes, and then delete the rows

# Drop all cases where inspection is NULL
df['TO_DELETE'] = df.INSPECTION_TYPE.isnull()

In [None]:
sum(df['TO_DELETE'])

In [None]:
# Break INSPECTION_TYPE into two columns, and also delete some inspection types

# Create a new column that contains the results of the split on the '/'' character
lst = df.INSPECTION_TYPE.str.split(' / ').values.tolist()
lst = [ l if type(l)==type(list()) else ['',''] for l in lst ]
t = pd.DataFrame(lst)


In [None]:
t[0].value_counts()

In [None]:
t[1].value_counts()

In [None]:
# Keep only cycle inspections. Ignore admin, pre-permit, transfat, and calorie posting inspections
# We will only keep the initial inspections and the re-inspections. The other types are border cases

keep = df.INSPECTION_TYPE.isin( ['Cycle Inspection / Initial Inspection', 'Cycle Inspection / Re-inspection'])


In [None]:
# Check how many we will drop: ~keep means negation on the keep and summing up adds up the 'True'
sum(~keep)

In [None]:
# Adding the "not keep" entries into the TO_DELETE

# The |= operator is a shortcut for df['TO_DELETE'] = df['TO_DELETE'] | ~keep
# We use the bit-OR operator (|), as we want to keep the existing deletions, and add the ones from the 
# additional condition
df['TO_DELETE'] |= ~keep

In [None]:
# Rename the two entries that we will keep into simpler values

df["INSPECTION_TYPE"].replace(
    to_replace='Cycle Inspection / Initial Inspection',
    value = 'Initial Inspection',
    inplace=True
)
df["INSPECTION_TYPE"].replace(
    to_replace='Cycle Inspection / Re-inspection',
    value = 'Re-inspection',
    inplace=True
)

In [None]:
sum(df['TO_DELETE'])

### BORO

In [None]:
df.BORO.value_counts()

In [None]:
sum(df.BORO.isnull())

In [None]:
# Replace the "Missing" in BORO with null
# df.BORO = df.BORO.replace('Missing', np.NaN)

In [None]:
missing = set(df[df.BORO.isnull()].CAMIS)
# missing

In [None]:
sum(df.BORO == '0')

In [None]:
# Remove the entries with null/missing BORO value
# Not worth trying to fix.
df['TO_DELETE'] |= (df.BORO == '0')

In [None]:
sum(df['TO_DELETE'])

### BUILDING

In [None]:
sum(df.BUILDING.isnull())

In [None]:
# Inspect the entries with missing street number
# df[df.BUILDING.isnull()]

In [None]:
# Dropping the violations listed without street number
# Most are in train stations and in airports, but there
# are a few others in 'regular' locations

df['TO_DELETE'] |= df.BUILDING.isnull()

In [None]:
sum(df['TO_DELETE'])

### STREET

In [None]:
# Check that no street values are empty
sum(df.STREET.isnull())

In [None]:
df['TO_DELETE'] |= df.STREET.isnull()

In [None]:
sum(df['TO_DELETE'])

### ZIPCODE

In [None]:
sum(df.ZIPCODE.isnull())

In [None]:
len(set(df[df.ZIPCODE.isnull()].CAMIS))

In [None]:
df['TO_DELETE'] |= df.ZIPCODE.isnull()

In [None]:
# TODO: Fill in the missing zip codes, perhaps with geocoding of the addresses
# For now, we just drop the cases without ZIPCODE

### CUISINE DESCRIPTION

In [None]:
df.CUISINE_DESCRIPTION.value_counts()

In [None]:
len(df.CUISINE_DESCRIPTION.value_counts())

In [None]:
sum(df.CUISINE_DESCRIPTION.isnull())

In [None]:
# Shorten the two long cuisine labels.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df.CUISINE_DESCRIPTION: that form operates on a
# temporary object and does not update df when pandas Copy-on-Write is active.
df['CUISINE_DESCRIPTION'] = df['CUISINE_DESCRIPTION'].replace({
    'Café/Coffee/Tea': 'Cafe',
    'Latin (Cuban, Dominican, Puerto Rican, South & Central American)': 'Latin',
})


### INSPECTION_DATE

In [None]:
df["INSPECTION_DATE"] = pd.to_datetime(df["INSPECTION_DATE"], format="%m/%d/%Y")

In [None]:
df.INSPECTION_DATE.describe(datetime_is_numeric=True)

In [None]:
sum(df.INSPECTION_DATE.isnull())

In [None]:
df.INSPECTION_DATE.hist(range=(pd.to_datetime('2010-01-01'),pd.to_datetime('2014-12-31')))

In [None]:
# Drop the 1900-01-01 inspections. These are all incorrect and we cannot fix them
df['TO_DELETE'] |= (df['INSPECTION_DATE'] == '1900-01-01')

In [None]:
# After analysis, it seems that we have minimal number of inspections before 2015
df['TO_DELETE'] |=  (df['INSPECTION_DATE'] < '2015-01-01')

In [None]:
sum(df['TO_DELETE'])

### ACTION

In [None]:
sum(df.ACTION.isnull())

In [None]:
df["ACTION"].value_counts()

In [None]:
df["ACTION"].replace(
    to_replace='Violations were cited in the following area(s).',
    value = 'Violations found',
    inplace=True
)

In [None]:
df["ACTION"].replace(
    to_replace='No violations were recorded at the time of this inspection.',
    value = 'No violations',
    inplace=True
)

In [None]:
df["ACTION"].replace(
    to_replace='Establishment Closed by DOHMH.  Violations were cited in the following area(s) and those requiring immediate action were addressed.',
    value = 'Establishment closed',
    inplace=True
)

In [None]:
df["ACTION"].replace(
    to_replace='Establishment re-opened by DOHMH',
    value = 'Establishment re-opened',
    inplace=True
)

In [None]:
df["ACTION"].replace(
    to_replace='Establishment re-closed by DOHMH',
    value = 'Establishment re-closed',
    inplace=True
)

In [None]:
df["ACTION"].value_counts()

In [None]:
# Drop the Establishment re-opened and re-closed cases
# as the inspection scores for these can be misleading
df['TO_DELETE'] |=  (df.ACTION == 'Establishment re-closed')
df['TO_DELETE'] |=  (df.ACTION == 'Establishment re-opened')
df['TO_DELETE'] |=  df.ACTION.isnull()

In [None]:
sum(df['TO_DELETE'])

### SCORE

In [None]:
df["SCORE"] = pd.to_numeric(df["SCORE"])

In [None]:
df.SCORE.describe()

In [None]:
len(df)

In [None]:
len( df[ df.SCORE < 0 ] )

In [None]:
df['TO_DELETE'] |= (df.SCORE < 0)

In [None]:
sum(df['TO_DELETE'])

In [None]:
# Drop cases reported as "no violations but with positive score"
df['TO_DELETE'] |= ((df.SCORE > 0)  & (df.ACTION == 'No violations'))

# Drop cases with zero score but with violations found
df['TO_DELETE'] |= ((df.SCORE == 0)  & (df.ACTION == 'Violations found'))

In [None]:
# Drop incorrectly scored inspections
df['TO_DELETE'] |=  (df.GRADE=='A') & (df.SCORE>13)

df['TO_DELETE'] |=  (df.GRADE=='B') & ( (df.SCORE<14) | (df.SCORE>27) )

# Drop incorrectly scored inspections
df['TO_DELETE'] |=  (df.GRADE=='C') & (df.SCORE<28)


In [None]:
sum(df['TO_DELETE'])

### RECORD_DATE

In [None]:
df["RECORD_DATE"] = pd.to_datetime(df["RECORD_DATE"], format="%m/%d/%Y")

In [None]:
# Drop record date field, as it only contains a single value
df = df.drop( 'RECORD_DATE', axis = 'columns')

### GRADE

In [None]:
df.GRADE.value_counts()

In [None]:
sum(df.GRADE.isnull())

In [None]:
df.query(" GRADE == 'G' ")

In [None]:
# Seems that all the "G" correspond to "A".
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df.GRADE: that form operates on a temporary
# object and does not update df when pandas Copy-on-Write is active.
df['GRADE'] = df['GRADE'].replace('G', 'A')

In [None]:
# P assigned to 'Establishment re-opened' actions
df.query(" GRADE == 'P' ")

In [None]:
# P assigned to 'Establishment re-opened' actions
df.query(" GRADE == 'P' ").ACTION.value_counts()

In [None]:
# P assigned to 'Reopening Inspection' inspection types
df.query(" GRADE == 'P' ").INSPECTION_TYPE.value_counts()

In [None]:
# df.query(" GRADE == 'Z' ").SCORE.hist(bins=100)

In [None]:
# TODO: Figure out what Z, and Not Yet Graded means in GRADE
# Until then, we just replace these values with NULL, keeping only the A, B, C grades

# 
df.GRADE.value_counts()

In [None]:
# Mark the rows whose grade is Z, P, N or 'Not Yet Graded' for deletion
excluded_grades = ['Z', 'P', 'N', 'Not Yet Graded']
df['TO_DELETE'] = df['TO_DELETE'] | df['GRADE'].isin(excluded_grades)

### GRADE_DATE

In [None]:
df["GRADE_DATE"] = pd.to_datetime(df["GRADE_DATE"], format="%m/%d/%Y")


In [None]:
# Grade date and inspection date should be the same. 
assert np.abs((df.GRADE_DATE - df.INSPECTION_DATE).dropna()).sum().days == 0

In [None]:
# Check that is there is a grade date, a grade is also assigned
assert sum ( ~df.GRADE_DATE.isnull() & df.GRADE.isnull() )  == 0

In [None]:
# We do not need GRADE DATE if we have INSPECTION DATE
df = df.drop("GRADE_DATE", axis='columns')

### VIOLATION_CODE and VIOLATION_DESCRIPTION

In [None]:
df.VIOLATION_CODE.isnull().sum()

In [None]:
# All the cases where violation code is NULL should be either cases that we delete
# or a "No violations" case
check = df[df.VIOLATION_CODE.isnull() & ~df.TO_DELETE & (df.ACTION!='No violations')]

assert( len(check) == 0 )


In [None]:
df.VIOLATION_DESCRIPTION.isnull().sum()

In [None]:
# Checks that all VIOLATION_CODE has unique VIOLATION_DESCRIPTION
df[['VIOLATION_CODE','VIOLATION_DESCRIPTION']].drop_duplicates().pivot_table(
    index='VIOLATION_CODE',
    values='VIOLATION_DESCRIPTION',
    aggfunc=pd.Series.nunique
).sort_values('VIOLATION_DESCRIPTION', ascending=False)


In [None]:
# In the dataset we have a NULL violation, when there is no violation
# To make this more explicit, we replace NULL with 000
df.VIOLATION_CODE.fillna('000', inplace=True)

In [None]:
df.VIOLATION_CODE.isnull().sum()

In [None]:
## LATITUDE and LONGITUDE

In [None]:
df['LONGITUDE'] = pd.to_numeric(df['LONGITUDE'])
df['LATITUDE'] = pd.to_numeric(df['LATITUDE'])

### MISC analysis

In [None]:
# Find the unique values in each column
# 
# df.describe(include = [np.object, 'category']).T['unique']
unique = df.describe(include = 'all').T['unique'].sort_values()

for column in unique.index:
    if unique[column] < 200:
        print(df[column].value_counts())
        print("=====")

In [None]:
df.dtypes

## Deleting Entries

In this section, we use the results of the analysis above and delete the (additional) entries that will not be useful in our analysis. (It is important to document these deletions, as others may need to go back to the original source if the entries that we keep are not sufficient for their purposes.)

In [None]:
len(df)

In [None]:
assert len(df) == initial_size

In [None]:
df = df[ df.TO_DELETE == False].copy()

In [None]:
len(df)

In [None]:
# ProfileReport comes from the pandas-profiling package installed at the top of the notebook
from pandas_profiling import ProfileReport
# Build a profiling report of the cleaned dataframe in explorative mode
profile = ProfileReport(df, title="Pandas Profiling Report", explorative=True)

In [None]:
# Show the report as the cell output
profile

## Additional Data Quality Checks

In [None]:


assert sum(df.INSPECTION_TYPE.isnull()) == 0

assert len(set(df.INSPECTION_TYPE.values)) == 2

df["INSPECTION_TYPE"] =  pd.Categorical(df["INSPECTION_TYPE"], ordered=False)

In [None]:
# Check that no borough entries are empty
assert sum(df.BORO.isnull()) == 0

df["BORO"] =  pd.Categorical(df["BORO"], ordered=False)

In [None]:
# Check that no street numbers are empty
assert sum(df.BUILDING.isnull()) == 0

In [None]:
assert sum(df.STREET.isnull()) == 0

In [None]:

assert sum(df.CUISINE_DESCRIPTION.isnull()) == 0

df["CUISINE_DESCRIPTION"] =  pd.Categorical(df["CUISINE_DESCRIPTION"], ordered=False)

In [None]:
# We only keep three different actions
assert len(set(df.ACTION.values)) == 3

# No action is empty
assert sum(df.ACTION.isnull()) == 0

df["ACTION"] =  pd.Categorical(df["ACTION"], ordered=False)

In [None]:
# The below drops any grade values other than A, B, C, and converts the remaining entries to null
df["GRADE"] =  pd.Categorical(df["GRADE"], categories = ['A', 'B', 'C'], ordered=True)

# https://www1.nyc.gov/assets/doh/downloads/pdf/rii/how-we-score-grade.pdf
# 0-13 get an A
assert sum( (df.GRADE=='A') & (df.SCORE>13)) == 0

# 14-27 get a B
assert sum( (df.GRADE=='B') & ( (df.SCORE<14) | (df.SCORE>27) ) ) == 0

# 28- get a C
assert sum( (df.GRADE=='C') & (df.SCORE<28) ) == 0

# TODO: In principle, a NULL grade is only when the score is above 14, and it was an initial inspection

In [None]:
# Check that is there is a grade date, a grade is also assigned
# assert sum ( ~df.GRADE_DATE.isnull() & df.GRADE.isnull() ) == 0

In [None]:
df["VIOLATION_CODE"] =  pd.Categorical(df["VIOLATION_CODE"], ordered=False)
df["CRITICAL_FLAG"] =  pd.Categorical(df["CRITICAL_FLAG"], ordered=False)


## Normalization

### Violation Codes

In [None]:
# Recreating the table at https://www1.nyc.gov/assets/doh/downloads/pdf/rii/ri-violation-penalty.pdf

violation_codes = df [ ['VIOLATION_CODE', 'VIOLATION_DESCRIPTION', 'CRITICAL_FLAG'] ].drop_duplicates()
violation_codes = violation_codes.rename( {
    'VIOLATION_DESCRIPTION' : 'DESCRIPTION',
    'CRITICAL_FLAG' : 'CRITICAL'
},  axis = 'columns').sort_values('VIOLATION_CODE').set_index('VIOLATION_CODE')#.drop(np.nan)
violation_codes.head(5)

In [None]:
# Drop the description and critical part from the main dataframe
df = df.drop(['VIOLATION_DESCRIPTION' ,  'CRITICAL_FLAG'], axis='columns')

In [None]:
df

### Restaurants

In [None]:
restaurants =  df [ ['CAMIS', 'DBA', 'BUILDING', 'STREET', 'ZIPCODE', 'BORO', 'PHONE', 'CUISINE_DESCRIPTION'] ].drop_duplicates()

In [None]:
# Check that we have the same attributes for a given CAMIS
# and the we do not have duplicate CAMIS values
assert len(restaurants) == len(set(restaurants.CAMIS.values))

In [None]:
# TODO: Pass the addresses through Google Maps API and get the x and y coordinates and fix zipcodes etc

restaurants.head(5)

In [None]:
restaurants.PHONE.value_counts().head(20)

In [None]:
# __ = restaurants.PHONE.value_counts().head(10).index.values[7]

In [None]:
# restaurants.PHONE.replace(to_replace=__, value=np.nan, inplace=True)

In [None]:
# Citi Field concessions
# restaurants.query("PHONE == '7185958100'").head(5)

In [None]:
# Madison Square Garden concession stands
# restaurants.query("PHONE == '2124656273'").head(5)

In [None]:
restaurants.DBA.value_counts() 

In [None]:
df = df.drop (['DBA', 'BUILDING', 'STREET', 'ZIPCODE', 'BORO', 'PHONE', 'CUISINE_DESCRIPTION'], axis='columns')

### Inspections

In [None]:
# Each inspection has multiple violations. We want to keep just the inspections for now
inspection = df.drop('VIOLATION_CODE', axis='columns').drop_duplicates().sort_values(['INSPECTION_DATE', 'CAMIS'])

In [None]:
# We create an ID for each inspection here
inspection = inspection.reset_index().drop('index', axis='columns').reset_index().rename({'index': 'INSPECTION_ID'}, axis='columns')

In [None]:
inspection.ACTION.value_counts()

In [None]:
# Ensure that the inspection table contains 
# no duplicate pairs for 'INSPECTION_DATE', 'CAMIS'
pvt = inspection.pivot_table(
    index = ['INSPECTION_DATE', 'CAMIS'],
    values = 'INSPECTION_ID',
    aggfunc = 'count'
)
pvt [ pvt.INSPECTION_ID > 1 ]

# assert len(pvt [ pvt.INSPECTION_ID > 1 ]) == 0 

In [None]:
# df[ (df.CAMIS =='41007054') & (df.INSPECTION_DATE == '2017-03-03')  ].sort_values('VIOLATION_CODE')

In [None]:
# df[ (df.CAMIS =='50048062') & (df.INSPECTION_DATE == '2018-10-30')  ].sort_values('VIOLATION_CODE')

In [None]:
df[ (df.CAMIS =='40911114') & (df.INSPECTION_DATE == '2017-11-04')  ].sort_values('VIOLATION_CODE')

In [None]:
# df[ (df.CAMIS =='41485450') & (df.INSPECTION_DATE == '2018-04-12')  ]

In [None]:
inspection.INSPECTION_TYPE.value_counts()

In [None]:
inspection_stats = inspection.pivot_table(
    index = 'CAMIS',
    aggfunc = ['min', 'max', 'count'],
    values = 'INSPECTION_DATE'
)

In [None]:
# Distribution of last inspection across all restaurants
inspection_stats['max'].sort_values('INSPECTION_DATE').reset_index().pivot_table(
    index='INSPECTION_DATE',
    aggfunc='count'
).resample('1W').sum().plot()

In [None]:
# Longevity 
# (inspection_stats['max'] - inspection_stats['min'])['INSPECTION DATE'].sort_values()

In [None]:
# Distribution of all inspections
inspection['INSPECTION_DATE'].value_counts().sort_index().resample('1W').sum().plot()

In [None]:
len(df)

In [None]:
# Link every violation row of df to the inspection it came from.
# `inspection` holds the distinct rows of df without VIOLATION_CODE, so
# joining on all of those shared columns maps each df row to exactly one
# inspection. Joining only on CAMIS and INSPECTION_DATE would attach a
# violation to every inspection that shares that pair, whenever the pair
# occurs more than once in `inspection`.
inspection_keys = [column for column in inspection.columns if column != 'INSPECTION_ID']
violations = pd.merge(
    inspection,
    df[inspection_keys + ['VIOLATION_CODE']],
    on=inspection_keys,
    how='inner',
)
# Every row of df must have found its (single) inspection
assert len(violations) == len(df)

violations = violations[['INSPECTION_ID', 'VIOLATION_CODE']].drop_duplicates()
len(violations)

In [None]:
# Report the size of each of the normalized tables
table_sizes = (
    ("Violations", violations),
    ("Inspections", inspection),
    ("Restaurants", restaurants),
    ("Violation Codes", violation_codes),
)
for label, table in table_sizes:
    print(f"{label}: {len(table)}")