https://www1.nyc.gov/site/doh/business/food-operators/the-inspection-process.page

See https://www1.nyc.gov/assets/doh/downloads/pdf/rii/blue-book.pdf 

## Download Latest Dataset

In [1]:
!curl 'https://data.cityofnewyork.us/api/views/43nn-pn8j/rows.csv?accessType=DOWNLOAD' -o restaurants.csv

In [2]:
!rm restaurants.csv.gz
!gzip restaurants.csv

In [3]:
import pandas as pd
import numpy as np
df = pd.read_csv("restaurants.csv.gz", dtype = 'object')

In [0]:
initial_size = len(df)
initial_size

In [0]:
# Render our plots inline
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [0]:
df.dtypes

In [0]:
# Adding underscores in all column names
cols = df.columns
cols = cols.map(lambda x: x.replace(' ', '_'))
df.columns = cols

In [0]:
df.dtypes

## Cleaning Individual Columns

### INSPECTION_TYPE

In [0]:
df.INSPECTION_TYPE.value_counts()

In [0]:
sum(df.INSPECTION_TYPE.isnull())

In [0]:
# We create a column "TO_DELETE" to mark the entries that we are not interested in.
# We will perform first the inspection/analysis on all the attributes, and then delete the rows

# Drop all cases where inspection is NULL
df['TO_DELETE'] = df.INSPECTION_TYPE.isnull()

In [0]:
sum(df['TO_DELETE'])

In [0]:
# Break INSPECTION_TYPE into two columns, and also delete some inspection types

# Build a two-column frame holding the result of splitting on the ' / ' separator.
# Entries that did not split into a list (e.g. the NaN produced for a null
# INSPECTION_TYPE) are replaced by a pair of empty strings.
lst = df.INSPECTION_TYPE.str.split(' / ').values.tolist()
lst = [ l if isinstance(l, list) else ['',''] for l in lst ]
t = pd.DataFrame(lst)


In [0]:
t[0].value_counts()

In [0]:
t[1].value_counts()

In [0]:
# Keep only cycle inspections. Ignore admin, pre-permit, transfat, and calorie posting inspections
# We will only keep the initial inspections and the re-inspections. The other types are border cases

keep = df.INSPECTION_TYPE.isin( ['Cycle Inspection / Initial Inspection', 'Cycle Inspection / Re-inspection'])


In [0]:
# Check how many we will drop: ~keep means negation on the keep and summing up adds up the 'True'
sum(~keep)

In [0]:
# Adding the "not keep" entries into the TO_DELETE

# The |= operator is a shortcut for df['TO_DELETE'] = df['TO_DELETE'] | ~keep
# We use the bit-OR operator (|), as we want to keep the existing deletions, and add the ones from the 
# additional condition
df['TO_DELETE'] |= ~keep

In [0]:
# Rename the two entries that we will keep into simpler values.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on df["INSPECTION_TYPE"] mutates an intermediate Series, which is deprecated
# chained assignment and leaves df untouched under pandas Copy-on-Write.
df["INSPECTION_TYPE"] = df["INSPECTION_TYPE"].replace({
    'Cycle Inspection / Initial Inspection': 'Initial Inspection',
    'Cycle Inspection / Re-inspection': 'Re-inspection',
})

In [0]:
sum(df['TO_DELETE'])

### BORO

In [0]:
df.BORO.value_counts()

In [0]:
sum(df.BORO.isnull())

In [0]:
# Replace the "Missing" in BORO with null
# df.BORO = df.BORO.replace('Missing', np.NaN)

In [0]:
missing = set(df[df.BORO.isnull()].CAMIS)
# missing

In [0]:
sum(df.BORO == 'Missing')

In [0]:
# Remove the entries with null/missing BORO value
# Not worth trying to fix.
# Null values are flagged explicitly: `df.BORO == 'Missing'` is False for NaN,
# and the later check that no BORO entry is null would otherwise fail.
df['TO_DELETE'] |= (df.BORO == 'Missing') | df.BORO.isnull()

In [0]:
sum(df['TO_DELETE'])

### BUILDING

In [0]:
sum(df.BUILDING.isnull())

In [0]:
# Inspect the entries with missing street number
# df[df.BUILDING.isnull()]

In [0]:
# Dropping the violations listed without street number
# Most are in train stations and in airports, but there
# are a few others in 'regular' locations

df['TO_DELETE'] |= df.BUILDING.isnull()

In [0]:
sum(df['TO_DELETE'])

### STREET

In [0]:
# Check that no street values are empty
sum(df.STREET.isnull())

In [0]:
df['TO_DELETE'] |= df.STREET.isnull()

In [0]:
sum(df['TO_DELETE'])

### ZIPCODE

In [0]:
sum(df.ZIPCODE.isnull())

In [0]:
len(set(df[df.ZIPCODE.isnull()].CAMIS))

In [0]:
# Filling zipcodes for Restaurants after normalization has occurred

### CUISINE DESCRIPTION

In [0]:
df.CUISINE_DESCRIPTION.value_counts()

In [0]:
len(df.CUISINE_DESCRIPTION.value_counts())

In [0]:
sum(df.CUISINE_DESCRIPTION.isnull())

In [0]:
# Shorten the two longest cuisine labels.
# The cafe label is matched in its mis-encoded spelling (presumably how the raw
# export spells it) and also in its correctly encoded spelling, so the rename
# keeps working if the encoding of the source data is ever repaired.
# The result is assigned back to the column instead of using
# replace(..., inplace=True), which mutates an intermediate Series and leaves
# df untouched under pandas Copy-on-Write.
df["CUISINE_DESCRIPTION"] = df["CUISINE_DESCRIPTION"].replace({
    'CafÃ©/Coffee/Tea': 'Cafe',
    'Café/Coffee/Tea': 'Cafe',
    'Latin (Cuban, Dominican, Puerto Rican, South & Central American)': 'Latin',
})


### INSPECTION_DATE

In [0]:
df["INSPECTION_DATE"] = pd.to_datetime(df["INSPECTION_DATE"], format="%m/%d/%Y")

In [0]:
df.INSPECTION_DATE.describe()

In [0]:
sum(df.INSPECTION_DATE.isnull())

In [0]:
df.INSPECTION_DATE.hist()

In [0]:
# Drop the 1900-01-01 inspections. These are all incorrect and we cannot fix them
df['TO_DELETE'] |= (df['INSPECTION_DATE'] == '1900-01-01')

In [0]:
# After analysis, it seems that we have minimal number of inspections before 2015
df['TO_DELETE'] |=  (df['INSPECTION_DATE'] < '2015-01-01')

In [0]:
sum(df['TO_DELETE'])

### ACTION

In [0]:
sum(df.ACTION.isnull())

In [0]:
df["ACTION"].value_counts()

In [0]:
# Shorten the ACTION label. Assigned back to the column because
# replace(..., inplace=True) on df["ACTION"] mutates an intermediate Series
# and leaves df untouched under pandas Copy-on-Write.
df["ACTION"] = df["ACTION"].replace(
    to_replace='Violations were cited in the following area(s).',
    value='Violations found'
)

In [0]:
# Shorten the ACTION label. Assigned back to the column because
# replace(..., inplace=True) on df["ACTION"] mutates an intermediate Series
# and leaves df untouched under pandas Copy-on-Write.
df["ACTION"] = df["ACTION"].replace(
    to_replace='No violations were recorded at the time of this inspection.',
    value='No violations'
)

In [0]:
# Shorten the ACTION label. Assigned back to the column because
# replace(..., inplace=True) on df["ACTION"] mutates an intermediate Series
# and leaves df untouched under pandas Copy-on-Write.
df["ACTION"] = df["ACTION"].replace(
    to_replace='Establishment Closed by DOHMH.  Violations were cited in the following area(s) and those requiring immediate action were addressed.',
    value='Establishment closed'
)

In [0]:
# Shorten the ACTION label. Assigned back to the column because
# replace(..., inplace=True) on df["ACTION"] mutates an intermediate Series
# and leaves df untouched under pandas Copy-on-Write.
df["ACTION"] = df["ACTION"].replace(
    to_replace='Establishment re-opened by DOHMH',
    value='Establishment re-opened'
)

In [0]:
# Shorten the ACTION label. Assigned back to the column because
# replace(..., inplace=True) on df["ACTION"] mutates an intermediate Series
# and leaves df untouched under pandas Copy-on-Write.
df["ACTION"] = df["ACTION"].replace(
    to_replace='Establishment re-closed by DOHMH',
    value='Establishment re-closed'
)

In [0]:
df["ACTION"].value_counts()

In [0]:
# Drop the Establishment re-opened and re-closed cases
# as the inspection scores for these can be misleading
df['TO_DELETE'] |=  (df.ACTION == 'Establishment re-closed')
df['TO_DELETE'] |=  (df.ACTION == 'Establishment re-opened')
df['TO_DELETE'] |=  df.ACTION.isnull()

In [0]:
sum(df['TO_DELETE'])

### SCORE

In [0]:
df["SCORE"] = pd.to_numeric(df["SCORE"])

In [0]:
df.SCORE.describe()

In [0]:
len(df)

In [0]:
len( df[ df.SCORE < 0 ] )

In [0]:
df['TO_DELETE'] |= (df.SCORE < 0)

In [0]:
sum(df['TO_DELETE'])

In [0]:
# Drop cases reported as "no violations but with positive score"
df['TO_DELETE'] |= ((df.SCORE > 0)  & (df.ACTION == 'No violations'))

# Drop cases with zero score but with violations found
df['TO_DELETE'] |= ((df.SCORE == 0)  & (df.ACTION == 'Violations found'))

In [0]:
# Drop incorrectly scored inspections
df['TO_DELETE'] |=  (df.GRADE=='A') & (df.SCORE>13)

df['TO_DELETE'] |=  (df.GRADE=='B') & ( (df.SCORE<14) | (df.SCORE>27) )

# Drop incorrectly scored inspections
df['TO_DELETE'] |=  (df.GRADE=='C') & (df.SCORE<28)


In [0]:
sum(df['TO_DELETE'])

### RECORD_DATE

In [0]:
df["RECORD_DATE"] = pd.to_datetime(df["RECORD_DATE"], format="%m/%d/%Y")

In [0]:
# Drop record date field, as it only contains a single value
df = df.drop( 'RECORD_DATE', axis = 'columns')

### GRADE

In [0]:
df.GRADE.value_counts()

In [0]:
sum(df.GRADE.isnull())

In [0]:
df.query(" GRADE == 'G' ")

In [0]:
# Seems that all the "G" correspond to "A"
# (assigned back to the column: replace(..., inplace=True) on df.GRADE mutates
# an intermediate Series and leaves df untouched under pandas Copy-on-Write)
df["GRADE"] = df["GRADE"].replace('G', 'A')

In [0]:
# P assigned to 'Establishment re-opened' actions
df.query(" GRADE == 'P' ")

In [0]:
# P assigned to 'Establishment re-opened' actions
df.query(" GRADE == 'P' ").ACTION.value_counts()

In [0]:
# P assigned to 'Reopening Inspection' inspection types
df.query(" GRADE == 'P' ").INSPECTION_TYPE.value_counts()

In [0]:
# df.query(" GRADE == 'Z' ").SCORE.hist(bins=100)

In [0]:
# TODO: Figure out what Z, and Not Yet Graded means in GRADE
# Until then, we just replace these values with NULL, keeping only the A, B, C grades

# 
df.GRADE.value_counts()

In [0]:
df['TO_DELETE'] |=  (df['INSPECTION_TYPE'] != 'Initial Inspection') & (df['SCORE'] < 14) & (df['GRADE'].isnull())
df['TO_DELETE'] |=  (df.GRADE=='Z')
df['TO_DELETE'] |=  (df.GRADE=='P')
df['TO_DELETE'] |=  (df.GRADE=='Not Yet Graded')

### GRADE_DATE

In [0]:
df["GRADE_DATE"] = pd.to_datetime(df["GRADE_DATE"], format="%m/%d/%Y")


In [0]:
# Grade date and inspection date should be the same. 
assert np.abs((df.GRADE_DATE - df.INSPECTION_DATE).dropna()).sum().days == 0

In [0]:
# Check that if there is a grade date, a grade is also assigned
assert sum ( ~df.GRADE_DATE.isnull() & df.GRADE.isnull() )  == 0

In [0]:
# We do not need GRADE DATE if we have INSPECTION DATE
df = df.drop("GRADE_DATE", axis='columns')

### VIOLATION_CODE

In [0]:
# In the dataset we have a NULL violation, when there is no violation
# To make this more explicit, we replace NULL with 000
# (assigned back to the column: fillna(..., inplace=True) on df.VIOLATION_CODE
# mutates an intermediate Series and leaves df untouched under pandas
# Copy-on-Write)
df["VIOLATION_CODE"] = df["VIOLATION_CODE"].fillna('000')

### MISC analysis

In [0]:
# Find the unique values in each column
# 
# df.describe(include = [np.object, 'category']).T['unique']
unique = df.describe(include = 'all').T['unique'].sort_values()

for column in unique.index:
    if unique[column] < 200:
        print(df[column].value_counts())
        print("=====")

In [0]:
df.dtypes

## Deleting Entries

In this section, we use the results of the analysis above, and delete (additional) entries that will not be useful in our analysis. (Note that it is important to document this, as others may want to go back to the original source, if the entries that we leave are not sufficient.)

In [0]:
len(df)

In [0]:
assert len(df) == initial_size

In [0]:
df = df[ df.TO_DELETE == False].copy()

In [0]:
len(df)

In [0]:


assert sum(df.INSPECTION_TYPE.isnull()) == 0

assert len(set(df.INSPECTION_TYPE.values)) == 2

df["INSPECTION_TYPE"] =  pd.Categorical(df["INSPECTION_TYPE"], ordered=False)

In [0]:
# Check that no borough entries are empty
assert sum(df.BORO.isnull()) == 0

df["BORO"] =  pd.Categorical(df["BORO"], ordered=False)

In [0]:
# Check that no street numbers are empty
assert sum(df.BUILDING.isnull()) == 0

In [0]:
assert sum(df.STREET.isnull()) == 0

In [0]:

assert sum(df.CUISINE_DESCRIPTION.isnull()) == 0

df["CUISINE_DESCRIPTION"] =  pd.Categorical(df["CUISINE_DESCRIPTION"], ordered=False)

In [0]:
# We only keep three different actions
assert len(set(df.ACTION.values)) == 3

# No action is empty
assert sum(df.ACTION.isnull()) == 0

df["ACTION"] =  pd.Categorical(df["ACTION"], ordered=False)

In [0]:
# The line below keeps only the grades A, B, and C; any other grade value is converted to null
df["GRADE"] =  pd.Categorical(df["GRADE"], categories = ['A', 'B', 'C'], ordered=True)

# https://www1.nyc.gov/assets/doh/downloads/pdf/rii/how-we-score-grade.pdf
# 0-13 get an A
assert sum( (df.GRADE=='A') & (df.SCORE>13)) == 0

# 14-27 get a B
assert sum( (df.GRADE=='B') & ( (df.SCORE<14) | (df.SCORE>27) ) ) == 0

# 28- get a C
assert sum( (df.GRADE=='C') & (df.SCORE<28) ) == 0

# In principle, a NULL grade should appear only when the score is 14 or above, or when it was an initial inspection
assert sum( (df['INSPECTION_TYPE'] != 'Initial Inspection') & (df['SCORE'] < 14) & (df['GRADE'].isnull()) ) == 0

In [0]:
# Check that if there is a grade date, a grade is also assigned
# assert sum ( ~df.GRADE_DATE.isnull() & df.GRADE.isnull() ) == 0

In [0]:
df["VIOLATION_CODE"] =  pd.Categorical(df["VIOLATION_CODE"], ordered=False)
df["CRITICAL_FLAG"] =  pd.Categorical(df["CRITICAL_FLAG"], ordered=False)


## Normalization

### Violation Codes

In [0]:
# Recreating the table at https://www1.nyc.gov/assets/doh/downloads/pdf/rii/ri-violation-penalty.pdf

violation_codes = df [ ['VIOLATION_CODE', 'VIOLATION_DESCRIPTION', 'CRITICAL_FLAG'] ].drop_duplicates()
violation_codes = violation_codes.rename( {
    'VIOLATION_DESCRIPTION' : 'DESCRIPTION',
    'CRITICAL_FLAG' : 'CRITICAL'
},  axis = 'columns').sort_values('VIOLATION_CODE').set_index('VIOLATION_CODE')#.drop(np.nan)
violation_codes.head(5)

In [0]:
# Drop the description and critical part from the main dataframe
df = df.drop(['VIOLATION_DESCRIPTION' ,  'CRITICAL_FLAG'], axis='columns')

### Restaurants

In [0]:
restaurants =  df [ ['CAMIS', 'DBA', 'BUILDING', 'STREET', 'ZIPCODE', 'BORO', 'PHONE', 'CUISINE_DESCRIPTION', 'Latitude', 'Longitude'] ].drop_duplicates()

In [0]:
# Check that we have the same attributes for a given CAMIS
# and that we do not have duplicate CAMIS values
assert len(restaurants) == len(set(restaurants.CAMIS.values))

In [0]:
# TODO: Pass the addresses through Google Maps API and get the x and y coordinates and fix zipcodes etc

restaurants.head(5)

In [0]:
restaurants.PHONE.value_counts().head(20)

In [0]:
# __ = restaurants.PHONE.value_counts().head(10).index.values[7]

In [0]:
# restaurants.PHONE.replace(to_replace=__, value=np.nan, inplace=True)

In [0]:
# Citi Field concessions
# restaurants.query("PHONE == '7185958100'").head(5)

In [0]:
# Madison Square Garden concession stands
# restaurants.query("PHONE == '2124656273'").head(5)

In [0]:
restaurants.DBA.value_counts() 

In [0]:
df = df.drop (['DBA', 'BUILDING', 'STREET', 'ZIPCODE', 'BORO', 'PHONE', 'CUISINE_DESCRIPTION'], axis='columns')

### Inspections

In [0]:
# Each inspection has multiple violations. We want to keep just the inspections for now
inspection = df.drop('VIOLATION_CODE', axis='columns').drop_duplicates().sort_values(['INSPECTION_DATE', 'CAMIS'])
inspection = inspection.drop(['TO_DELETE'], axis='columns')

In [0]:
# We create an ID for each inspection here
inspection = inspection.reset_index().drop('index', axis='columns').reset_index().rename({'index': 'INSPECTION_ID'}, axis='columns')

In [0]:
inspection.ACTION.value_counts()

In [0]:
# Ensure that the inspection table contains 
# no duplicate pairs for 'INSPECTION_DATE', 'CAMIS'
pvt = inspection.pivot_table(
    index = ['INSPECTION_DATE', 'CAMIS'],
    values = 'INSPECTION_ID',
    aggfunc = 'count'
)
pvt [ pvt.INSPECTION_ID > 1 ]

# assert len(pvt [ pvt.INSPECTION_ID > 1 ]) == 0 

In [0]:
# df[ (df.CAMIS =='41007054') & (df.INSPECTION_DATE == '2017-03-03')  ].sort_values('VIOLATION_CODE')

In [0]:
# df[ (df.CAMIS =='50048062') & (df.INSPECTION_DATE == '2018-10-30')  ].sort_values('VIOLATION_CODE')

In [0]:
df[ (df.CAMIS =='40911114') & (df.INSPECTION_DATE == '2017-11-04')  ].sort_values('VIOLATION_CODE')

In [0]:
# df[ (df.CAMIS =='41485450') & (df.INSPECTION_DATE == '2018-04-12')  ]

In [0]:
inspection.INSPECTION_TYPE.value_counts()

In [0]:
inspection_stats = inspection.pivot_table(
    index = 'CAMIS',
    aggfunc = ['min', 'max', 'count'],
    values = 'INSPECTION_DATE'
)

In [0]:
# Distribution of last inspection across all restaurants
inspection_stats['max'].sort_values('INSPECTION_DATE').reset_index().pivot_table(
    index='INSPECTION_DATE',
    aggfunc='count'
).resample('1W').sum().plot()

In [0]:
# Longevity 
# (inspection_stats['max'] - inspection_stats['min'])['INSPECTION DATE'].sort_values()

In [0]:
# Distribution of all inspections
inspection['INSPECTION_DATE'].value_counts().sort_index().resample('1W').sum().plot()

In [0]:
len(df)

In [0]:
violations = pd.merge(
    inspection,
    df[ ['CAMIS', 'INSPECTION_DATE', 'VIOLATION_CODE' ] ],
    on= ['CAMIS', 'INSPECTION_DATE'],
    how = 'inner'
)
violations = violations[ ['INSPECTION_ID', 'VIOLATION_CODE'] ].drop_duplicates()
len(violations)

In [0]:
inspection = inspection.drop(
        [
            'Longitude', 'Latitude', 'Community_Board', 'Council_District', 'Census_Tract', 'BIN', 'BBL', 'NTA'
        ], axis='columns')

In [0]:
print(f"Violations: {len(violations)}")
print(f"Inspections: {len(inspection)}")
print(f"Restaurants: {len(restaurants)}")
print(f"Violation Codes: {len(violation_codes)}")

## Geocoding restaurants

In [0]:
import requests
def call_google_api(address):
    """Geocode *address* with the Google Maps Geocoding API.

    Args:
        address: Free-text address to look up; it is sent as the ``address``
            query parameter.

    Returns:
        The first entry of the ``results`` list in the JSON response, or
        ``None`` when the response contains no results. In the latter case
        the whole decoded response is printed so the reason is visible.

    Raises:
        requests.exceptions.RequestException: If the HTTP request fails or
            does not complete within the timeout.
    """
    # API key from 'ipeirotis' project. Limited to be called only from ipython.ipeirotis.com
    # We need a different key for Travis or other setting
    GOOGLE_MAPS_API_URL = 'https://maps.googleapis.com/maps/api/geocode/json'
    params = {
        'address': address,
        'key' : 'INSERT-KEY',
        'region': 'INSERT-REGION'
    }
    # requests applies no timeout by default, so a stalled connection would
    # block the whole geocoding run forever; bound the wait instead.
    req = requests.get(GOOGLE_MAPS_API_URL, params=params, timeout=30)

    results = req.json()

    # Use the first result
    if 'results' in results and len(results['results']) > 0:
        return results['results'][0]

    # Something went wrong: show the response and report "no result" explicitly
    print(results)
    return None

In [0]:
def parse(data):
    """Flatten one geocoding result into a plain dictionary.

    Args:
        data: A mapping with the keys ``address_components`` (a list of
            mappings that each carry ``types``, ``long_name`` and
            ``short_name``), ``formatted_address`` and ``geometry`` (holding
            ``location`` with ``lat``/``lng``), and optionally ``plus_code``.

    Returns:
        A dict with the keys ``formatted_address``, ``lon``, ``lat``,
        ``zipcode``, ``country``, ``state``, ``borough``, ``street`` and
        ``street_num``. An address part that is absent from the components
        is ``None``. ``plus_code`` is present only when the input has it.
    """
    def first_match(kind, field):
        # Collect `field` from every component tagged with `kind`, then keep
        # the first value (None when no component carries that tag).
        matches = [part[field] for part in data['address_components'] if kind in part['types']]
        return matches[0] if matches else None

    postal = first_match('postal_code', 'long_name')
    nation = first_match('country', 'long_name')
    region = first_match('administrative_area_level_1', 'short_name')
    boro = first_match('sublocality_level_1', 'short_name')
    road = first_match('route', 'long_name')
    house_number = first_match('street_number', 'long_name')

    formatted = data['formatted_address']
    location = data['geometry']['location']

    result = {
        "formatted_address": formatted,
        "lon": location['lng'],
        "lat": location['lat'],
        "zipcode": postal,
        "country": nation,
        "state": region,
        "borough": boro,
        "street": road,
        "street_num": house_number,
    }
    if 'plus_code' in data:
        result['plus_code'] = data['plus_code']['global_code']

    return result


In [0]:
from tqdm import tqdm
import json
import gzip    

def load_cache():
    """Load the geocoding cache from ``geocoding.json.gz``.

    Returns:
        The decoded JSON content of the gzip-compressed, UTF-8 encoded cache
        file, or an empty dict when the file does not exist yet (first run,
        before any cache has been written).
    """
    try:
        fp = gzip.open('geocoding.json.gz', 'rb')
    except FileNotFoundError:
        # Nothing cached so far: start from an empty cache instead of failing
        return {}

    with fp:
        return json.loads(fp.read().decode('utf-8'))

def update_cache(restaurants, cache):
    """Geocode the restaurants missing from *cache* and prune stale entries.

    Args:
        restaurants: DataFrame with the columns CAMIS, DBA, BUILDING, STREET,
            BORO and ZIPCODE. BUILDING, STREET and BORO are concatenated as
            strings; DBA and ZIPCODE are used only when they are strings.
        cache: dict keyed by CAMIS that holds raw geocoding results. It is
            modified in place.

    Returns:
        The same *cache* object, with a result added for every restaurant
        that could be geocoded and with the keys that are not a CAMIS of
        *restaurants* removed.
    """
    for _, row in tqdm(restaurants.iterrows(), total=restaurants.shape[0]):

        # If we already have the result in cache, skip querying the Google Maps API
        if row.CAMIS in cache:
            continue

        # A missing ZIPCODE is not a string (NaN); leave it out of the query
        zipcode = row.ZIPCODE if isinstance(row.ZIPCODE, str) else ""
        street_address = row.BUILDING + ' ' + row.STREET + ', ' + row.BORO + ' ' + zipcode

        # Query first with the name of the restaurant. A missing name is not
        # a string and cannot be concatenated, so go straight to the address.
        google_result = None
        if isinstance(row.DBA, str):
            google_result = call_google_api(row.DBA + ', ' + street_address)

        # If we do not get an answer with the restaurant name, query just with the address
        if google_result is None:
            google_result = call_google_api(street_address)

        # If still none, then skip
        if google_result is None:
            continue

        cache[row.CAMIS] = google_result

    # Remove geocoding results that we do not need anymore
    notneeded = set(cache.keys()) - set(restaurants.CAMIS.values)
    for n in notneeded:
        del cache[n]

    return cache
     
def write_cache(data):
    """Write *data* to ``geocoding.json.gz`` as gzip-compressed JSON.

    The JSON text is produced with sorted keys and a four-space indent,
    encoded as UTF-8, and written over any existing file of that name.

    Args:
        data: A JSON-serializable object.
    """
    payload = json.dumps(data, indent=4, sort_keys=True).encode('utf-8')
    with gzip.open('geocoding.json.gz', mode='wb') as archive:
        archive.write(payload)

In [0]:
#cache = load_cache()
#cache = update_cache(restaurants, cache)
#write_cache(cache)        

In [0]:
#nogoogle = set(restaurants.CAMIS.values) - set(cache.keys())

In [0]:
#len(nogoogle)

In [0]:
#restaurants [ restaurants.CAMIS.isin(nogoogle) ]

In [0]:
# result = []
# for camis in restaurants.CAMIS.values:
#     g = cache[camis]
#     # print(camis)
#     f = parse(g)
#     f['camis'] = camis
#     result.append(f)
    
# zdf = pd.DataFrame(result)
# zdf

In [0]:
# zdf[zdf.zipcode.isnull()]

In [0]:
# zdf = zdf[zdf.state == 'NY']

In [0]:
# zdf[ ~zdf.borough.isin(['Manhattan', 'Queens', 'Bronx', 'Brooklyn', 'Staten Island'])]

In [0]:
# zdf = zdf[zdf.lat<41]

In [0]:
inspection.query( " CAMIS == '41480442' ")

In [0]:
import seaborn as sns
import geopandas as gpd
import matplotlib.pyplot as plt

# Plot the geocoded restaurants of one borough on top of the borough outline,
# first as a scatter plot and then as a kernel-density overlay.
# NOTE(review): `zdf` is only assigned in a commented-out cell earlier in this
# notebook (`# zdf = pd.DataFrame(result)`); unless that geocoding cell is
# re-enabled, this cell presumably fails with a NameError. TODO confirm.
borough = 'Queens'

# Borough boundaries are read straight from this URL each time the cell runs
shapefile_url = 'https://data.cityofnewyork.us/api/geospatial/cpf4-rkhq?method=export&format=Shapefile'
df_nyc = gpd.GeoDataFrame.from_file(shapefile_url)

# Base layer: outline of the selected borough only
base = df_nyc[df_nyc.boro_name==borough].plot(
    linewidth=0.5,
    color='White',
    edgecolor='Black',
    figsize=(15, 20),
    alpha=0.75)

# One small green dot per geocoded restaurant, drawn on the base axes
scatterplot = zdf[zdf.borough==borough].plot(
    kind='scatter',
    x='lon',
    y='lat',
    s=2,
    c='g',
    alpha=0.4,
    ax=base)


# Density estimate of the same longitude/latitude points, drawn on the same axes
sns.kdeplot(
    zdf[zdf.borough==borough].lon,
    zdf[zdf.borough==borough].lat,
    gridsize=200,
    n_levels=25,
    shade=True,
    alpha=0.3,
    cmap=plt.cm.coolwarm,
    shade_lowest=False,
    ax=scatterplot)

## Store final data to databases

In [0]:
from tqdm import tqdm
# Function to save the data to a MySQL or a SQLite table.
def addToTable(engine, db_name, table, data, useIndex=False):
    """Append *data* to a database table in batches of 50,000 rows.

    Args:
        engine: Connection or engine handed to ``DataFrame.to_sql`` as ``con``.
        db_name: Passed to ``to_sql`` as ``schema``. An empty string means
            that no schema is passed.
        table: Name of the target table; rows are appended to it.
        data: DataFrame whose rows are written.
        useIndex: Whether the index of *data* is written as a column.
    """
    batchsize = 50000
    # Ceiling division, but at least one batch so that an empty frame still
    # goes through to_sql once. The former `len(data) // batchsize + 1`
    # scheduled an extra, empty batch whenever len(data) was an exact
    # multiple of batchsize.
    batches = max(1, (len(data) + batchsize - 1) // batchsize)

    # schema=None is to_sql's own default, so this single call covers both
    # the "no schema" and the "explicit schema" case.
    schema = db_name if db_name != "" else None

    for i in tqdm(range(batches)):
        start = batchsize * i
        end = start + batchsize
        # iloc makes the row slice positional regardless of the index type
        data.iloc[start:end].to_sql(
            name = table,
            schema = schema,
            con = engine,
            if_exists = 'append',
            index = useIndex,
            chunksize = 1000)

## MySQL

In [0]:
from sqlalchemy import create_engine
# Create link with username, password and hostname to connect to database
conn_string = 'mysql://{user}:{password}@{host}/?charset={encoding}'.format(
host = 'localhost', 
user = 'root',
password = 'root',
encoding = 'utf8mb4')
# Create engine to connect to mySQL
engine = create_engine(conn_string)
con = engine.connect()

In [0]:
db_name = 'restaurant_inspections'
charset = 'utf8mb4'
# Drop previous outdated database
sql = f'DROP DATABASE IF EXISTS {db_name}'
engine.execute(sql)
# Create Database restaurant_inspections
sql = f'CREATE DATABASE IF NOT EXISTS {db_name} DEFAULT CHARACTER SET {charset}'
engine.execute(sql)

sql = f'USE {db_name}'
engine.execute(sql)

In [0]:
critical_categories = list(violation_codes.CRITICAL.cat.categories)
critical_categories = '\'' + ('\', \''.join(critical_categories)) + '\''
# Create Violation Codes Table
violationCodesTable = """
CREATE TABLE ViolationCodes (
    VIOLATION_CODE varchar(3) PRIMARY KEY,
    DESCRIPTION text,
    CRITICAL enum(""" + critical_categories + """)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
"""
engine.execute(violationCodesTable)
# Save violations codes to table
addToTable(engine, db_name, table='ViolationCodes', data=violation_codes, useIndex=True)

In [0]:
cuisine_description_categories = list(restaurants.CUISINE_DESCRIPTION.cat.categories)
cuisine_description_categories = '\'' + ('\', \''.join(cuisine_description_categories)) + '\''
boro_categories = list(restaurants.BORO.cat.categories)
boro_categories = '\'' + ('\', \''.join(boro_categories)) + '\''
# Create Restaurants Table
restaurantsTable = """
CREATE TABLE Restaurants (
    CAMIS int(9) unsigned PRIMARY KEY,
    DBA tinytext,
    BUILDING varchar(10),
    STREET tinytext,
    ZIPCODE int(6) unsigned,
    BORO enum(""" + boro_categories + """),
    PHONE varchar(11),
    CUISINE_DESCRIPTION enum(""" + cuisine_description_categories + """),
    Latitude double,
    Longitude double
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
"""
engine.execute(restaurantsTable)
# Add Restaurant data to Restaurants Table
addToTable(engine, db_name, table='Restaurants', data=restaurants)

In [0]:
# Create Inspections Table
inspectionTable = """
CREATE TABLE Inspections (
    INSPECTION_ID int PRIMARY KEY,
    CAMIS int(9) unsigned NOT NULL,
    INSPECTION_DATE Date,
    ACTION enum('Establishment closed', 'No violations', 'Violations found'),
    SCORE float,
    GRADE enum('A','B','C'),
    GRADE_DATE Date,
    RECORD_DATE Date,
    INSPECTION_TYPE enum('Initial Inspection', 'Re-inspection'),
    FOREIGN KEY (CAMIS) REFERENCES Restaurants(CAMIS)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
"""
engine.execute(inspectionTable)
# Save inspection data to Inspections Table
addToTable(engine, db_name, table='Inspections', data=inspection)

In [0]:
# Create Violations Table
violationsTable = """
CREATE TABLE Violations (
    INSPECTION_ID int,
    VIOLATION_CODE varchar(3),
    PRIMARY KEY (INSPECTION_ID, VIOLATION_CODE),
    FOREIGN KEY (INSPECTION_ID) REFERENCES Inspections(INSPECTION_ID),
    FOREIGN KEY (VIOLATION_CODE) REFERENCES ViolationCodes(VIOLATION_CODE)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
"""
engine.execute(violationsTable)
# Save violation data to Violations Table
addToTable(engine, db_name, table='Violations', data=violations)

In [0]:
# Close our connection to the MySQL database
con.close()

## SQLite

In [0]:
# Before saving data to SQLite we first delete the database

In [0]:
!rm restaurant_inspections.db

In [0]:
import sqlite3
# Create and connect to the SQLite database
db_name = 'restaurant_inspections.db'
con = sqlite3.connect(db_name)
cursor = con.cursor()

In [0]:
# Create Violation Codes Table
violationCodesTable = """
CREATE TABLE ViolationCodes (
    VIOLATION_CODE varchar(3) PRIMARY KEY,
    DESCRIPTION text,
    CRITICAL text
);
"""
cursor.execute(violationCodesTable)
# Save violation codes to database
addToTable(con, db_name, table='ViolationCodes', data=violation_codes, useIndex=True)

In [0]:
# Create Restaurants Table
restaurantsTable = """
CREATE TABLE Restaurants (
    CAMIS int unsigned PRIMARY KEY,
    DBA varchar(150),
    BUILDING varchar(10),
    STREET varchar(150),
    ZIPCODE int unsigned,
    BORO varchar(20),
    PHONE varchar(11),
    CUISINE_DESCRIPTION varchar(20),
    Latitude double,
    Longitude double
);
"""
cursor.execute(restaurantsTable)
# Save restaurants data to Restaurants Table
addToTable(con, db_name, table='Restaurants', data=restaurants)

In [0]:
# Create Inspections Table
inspectionTable = """
CREATE TABLE Inspections (
    INSPECTION_ID int PRIMARY KEY,
    CAMIS int NOT NULL,
    INSPECTION_DATE Date,
    ACTION varchar(20),
    SCORE float,
    GRADE nchar(1),
    GRADE_DATE Date,
    RECORD_DATE Date,
    INSPECTION_TYPE varchar(20)
);
"""
cursor.execute(inspectionTable)
# Save inspection data to Inspections Table
addToTable(con, db_name, table='Inspections', data=inspection)

In [0]:
# Create Violations Table
violationsTable = """
CREATE TABLE Violations (
    INSPECTION_ID int,
    VIOLATION_CODE varchar(3),
    PRIMARY KEY (INSPECTION_ID, VIOLATION_CODE)
);
"""
cursor.execute(violationsTable)
# Save violations data to Violations Table
addToTable(con, db_name, table='Violations', data=violations)

In [0]:
# Make sure the changes to the database have been saved
con.commit()
# Close connection to the SQLite database
con.close()

## Save to CSV

In [0]:
violation_codes.to_csv('violationCodes.csv.gz', index=True, compression="gzip")
restaurants.to_csv('restaurantsTable.csv.gz', index=False, compression="gzip")
inspection.to_csv('inspections.csv.gz', index=False, compression="gzip")
violations.to_csv('violations.csv.gz', index=False, compression="gzip")

## Exercise: Figure out the typical points for violations

In [0]:
# Trying to figure out the costliest violations
# Here is the guide that explains how inspections are scored:
# https://www1.nyc.gov/assets/doh/downloads/pdf/rii/how-we-score-grade.pdf


In [0]:
scores = pd.DataFrame(inspection [ ~inspection.SCORE.isnull() ].set_index('INSPECTION_ID').SCORE)
scores.shape

In [0]:
# Pair every inspection with its violation codes, keeping the inspection score.
# Only `join` is bound here. The former `join = violations = ...` also
# overwrote the de-duplicated `violations` table with this three-column frame,
# which is later written to the PostgreSQL `violations` table.
join = pd.merge(
    inspection,
    df[ ['CAMIS', 'INSPECTION_DATE', 'VIOLATION_CODE' ] ],
    on= ['CAMIS', 'INSPECTION_DATE'],
    how = 'inner'
)[ ['INSPECTION_ID', 'VIOLATION_CODE', 'SCORE'] ]
join.head()
len(join)

In [0]:
wide_violations = join [ ~join.SCORE.isnull() ].pivot_table(
    index = 'INSPECTION_ID',
    columns = 'VIOLATION_CODE',
    values = 'SCORE',
    aggfunc = 'count'
).fillna(0)

wide_violations.shape

In [0]:
dataset = pd.merge(
    scores, wide_violations, how='inner', left_index=True, right_index=True
)

dataset.shape

In [0]:
Y = dataset.SCORE.values
X = dataset.drop('SCORE', axis='columns').values

In [0]:
X.shape

In [0]:
Y.shape

In [0]:
# cols = dataset.columns
# cols = cols.map(lambda x : 'V'+x if x!='SCORE' else x)
# dataset.columns = cols


In [0]:
from statsmodels.formula.api import ols, rlm
# ols_model = ols('SCORE ~ V00 + V02A + V02B + V02C + V02D + V02E + V02F + V02G + V02H -1', dataset).fit()
# ols_model.summary()

In [0]:
import statsmodels.api as sm

model = sm.OLS(Y,X)
results = model.fit()
results.summary()

In [0]:
# Attach the fitted coefficient of each violation code as its estimated points.
# The model was fitted on plain arrays, so results.params carries no labels;
# its order is that of the columns of X, i.e. the columns of `dataset` without
# SCORE. Label the coefficients with those codes and look each code up by
# name: assigning the bare array would match rows by position only and fails
# when the two lengths differ. Codes without a coefficient get NaN.
points = pd.Series(
    results.params,
    index=dataset.drop('SCORE', axis='columns').columns.astype(str)
)
violation_codes["POINTS"] = violation_codes.index.astype(str).map(points).to_numpy()
violation_codes = violation_codes.sort_values('POINTS', ascending=False)
violation_codes

In [0]:
# A quick exposure to various options of the "hist" command 
ax = inspection.SCORE.hist(bins=50, # use 50 bars
                          range=(0,50), # x-axis from 0 to 50
                          density=False,  # show normalized count (density=True), or raw counts (density= False)
                          figsize=(10,5), # controls the size of the plot
                          alpha = 0.8 # make the plot 20% transparent
)

ax.set_xlabel("Inspection Score")
ax.set_ylabel("Number of Inspections")

In [0]:
inspection.CAMIS.value_counts().head()

In [0]:
restaurants.set_index('CAMIS').loc['41311804']

In [0]:
restaurants.set_index('CAMIS').loc['41178236']

In [0]:
inspection.query(' CAMIS == "50035784" ')

## Save to PostgreSQL

In [0]:
violation_codes.index.name = violation_codes.index.name.lower()
violation_codes.columns = map(str.lower, violation_codes.columns)
restaurants.columns = map(str.lower, restaurants.columns)
inspection.columns = map(str.lower, inspection.columns)
violations.columns = map(str.lower, violations.columns)

In [0]:
conn_string = 'postgresql://{user}:{password}@{host}:{port}/restaurant_inspections'.format(
    host = cfg.postgres_host, 
    user = cfg.postgres_user,
    password = cfg.postgres_pass,
    port = cfg.postgres_port
)

engine = create_engine(conn_string)
con = engine.connect()

In [0]:
engine.execute('DROP TABLE IF EXISTS violations')
engine.execute('DROP TABLE IF EXISTS inspections')
engine.execute('DROP TABLE IF EXISTS violation_codes')
engine.execute('DROP TABLE IF EXISTS restaurants')

In [0]:
violationCodesTable = """
CREATE TABLE violation_codes (
    violation_code varchar(3) PRIMARY KEY,
    description text,
    critical critical_enum
);
"""

critical_categories = list(violation_codes.critical.cat.categories)
critical_categories = '\'' + ('\', \''.join(critical_categories)) + '\''
engine.execute('DROP TYPE IF EXISTS critical_enum')
engine.execute("CREATE TYPE critical_enum AS ENUM(" + critical_categories +")")
engine.execute(violationCodesTable)

addToTable(engine, "", table='violation_codes', data=violation_codes, useIndex=True)

In [0]:
restaurantsTable = """
CREATE TABLE restaurants (
    camis int PRIMARY KEY,
    dba text,
    building varchar(10),
    street text,
    zipcode int,
    boro boro_enum,
    phone varchar(11),
    cuisine_description cuisine_description_enum,
    latitude double precision,
    longitude double precision
);
"""

cuisine_description_categories = list(restaurants.cuisine_description.cat.categories)
cuisine_description_categories = '\'' + ('\', \''.join(cuisine_description_categories)) + '\''
boro_categories = list(restaurants.boro.cat.categories)
boro_categories = '\'' + ('\', \''.join(boro_categories)) + '\''
engine.execute('DROP TYPE IF EXISTS boro_enum')
engine.execute('DROP TYPE IF EXISTS cuisine_description_enum')
engine.execute("CREATE TYPE boro_enum AS ENUM(" + boro_categories + ")")
engine.execute("CREATE TYPE cuisine_description_enum AS ENUM(" + cuisine_description_categories + ")")
engine.execute(restaurantsTable)

addToTable(engine, "", table='restaurants', data=restaurants)

In [0]:
inspectionTable = """
CREATE TABLE inspections (
    inspection_id int PRIMARY KEY,
    camis int NOT NULL,
    inspection_date Date,
    action action_enum,
    score float,
    grade grade_enum,
    grade_date Date,
    record_date Date,
    inspection_type inspection_type_enum,
    FOREIGN KEY (camis) REFERENCES restaurants(camis)
);
"""
engine.execute('DROP TYPE IF EXISTS action_enum')
engine.execute('DROP TYPE IF EXISTS inspection_type_enum')
engine.execute('DROP TYPE IF EXISTS grade_enum')
engine.execute("CREATE TYPE action_enum AS ENUM('Establishment closed', 'No violations', 'Violations found')")
engine.execute("CREATE TYPE inspection_type_enum AS ENUM('Initial Inspection', 'Re-inspection')")
engine.execute("CREATE TYPE grade_enum AS ENUM('A','B','C')")
engine.execute(inspectionTable)

addToTable(engine, "", table='inspections', data=inspection)

In [0]:
violationsTable = """
CREATE TABLE violations (
    inspection_id int,
    violation_code varchar(3),
    PRIMARY KEY (inspection_id, violation_code),
    FOREIGN KEY (inspection_id) REFERENCES inspections(inspection_id),
    FOREIGN KEY (violation_code) REFERENCES violation_codes(violation_code)
);
"""
engine.execute(violationsTable)

addToTable(engine, "", table='violations', data=violations)

con.close()

## Testing
This part of the script is used to validate the integrity 
of the incoming dataset and point out inconsistencies

In [0]:
import pandas as pd
import logging

In [1]:
def init(log_file='.log'):
    """Configure the root logger to write DEBUG-and-above records to a file.

    The file is opened in 'w' mode, so it is truncated on every call.
    Calling this function more than once is safe: any FileHandler that
    already targets the same file is removed and closed first, so each
    record is written only once.

    Args:
        log_file: Path of the log file. Defaults to '.log' in the current
            working directory.
    """
    import os  # local import: only needed to normalise the log path

    logger = logging.getLogger()
    target = os.path.abspath(log_file)

    # Without this clean-up, re-running the cell (or main()) stacks one more
    # handler per call, duplicating every log line and leaking open files.
    for existing in list(logger.handlers):
        if isinstance(existing, logging.FileHandler) and existing.baseFilename == target:
            logger.removeHandler(existing)
            existing.close()

    handler = logging.FileHandler(log_file, 'w')
    handler.setFormatter(
        logging.Formatter('[%(asctime)s] %(levelname)-8s %(message)s')
    )
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)

In [0]:
def readCSV(path='restaurants.csv.gz'):
    """Load the inspection dataset into a DataFrame of strings.

    Every column is read with dtype 'object', so no value is coerced to a
    number or date while loading.

    Args:
        path: Location of the CSV file to read. Defaults to
            'restaurants.csv.gz'.

    Returns:
        The loaded pandas DataFrame.
    """
    logging.info('Reading CSV...')
    return pd.read_csv(path, dtype='object')

In [0]:
def fixIndeces(data):
    """Rename the columns of `data` in place, replacing spaces with underscores.

    Nothing is returned; the DataFrame passed in is modified.
    """
    def _underscored(label):
        return label.replace(' ', '_')

    data.columns = data.columns.map(_underscored)

In [0]:
def checkColumns(data):
    """Check that the dataset has exactly the expected set of columns.

    Logs which columns are new, missing or duplicated compared with the
    expected list. Column names are reported in sorted order so the log
    output is the same from run to run.

    Args:
        data: DataFrame whose column names have already had spaces replaced
            by underscores.

    Returns:
        True when the columns differ from the expected list (something was
        added, removed or duplicated), False when they match.
    """
    logging.info('***Checking if any columns have been removed or added to the dataset.***')
    expected = [
        'INSPECTION_TYPE', 'CUISINE_DESCRIPTION', 'INSPECTION_DATE',
        'ACTION', 'SCORE', 'RECORD_DATE', 'GRADE', 'GRADE_DATE', 'VIOLATION_CODE',
        'BORO', 'BUILDING', 'STREET', 'CRITICAL_FLAG', 'VIOLATION_DESCRIPTION',
        'CAMIS', 'DBA', 'ZIPCODE', 'PHONE', 'Latitude', 'Longitude', 'Community_Board',
        'Council_District', 'Census_Tract', 'BIN', 'BBL', 'NTA',
    ]
    present = list(data.columns)

    added = sorted(set(present).difference(expected), key=str)
    removed = sorted(set(expected).difference(present), key=str)

    # A repeated column name is neither "added" nor "removed" in set terms,
    # so it has to be detected separately to be reported with any detail.
    seen = set()
    repeated = set()
    for name in present:
        if name in seen:
            repeated.add(name)
        seen.add(name)
    duplicated = sorted(repeated, key=str)

    if not (added or removed or duplicated):
        return False

    logging.error('Inconsistency found...')
    if added:
        logging.error('New columns added to the dataset: %s.', added)
    if removed:
        logging.error('Columns removed from the dataset: %s.', removed)
    if duplicated:
        logging.error('Duplicated columns in the dataset: %s.', duplicated)
    return True

In [0]:
def checkValues(data):
    """Run the value and date checks on the columns the pipeline relies on.

    Every check is executed even when an earlier one has already failed, so
    the log lists all problems in a single pass. Note that the date checks
    (testDates) convert the checked columns of `data` to datetimes in place.

    Returns:
        True if at least one check reported a problem, False otherwise.
    """
    logging.info('***Check the validity of the values of the dataset we use***')

    # (column, values that are expected to appear in that column)
    expected_values = (
        ('INSPECTION_TYPE', [
            'Cycle Inspection / Initial Inspection',
            'Cycle Inspection / Re-inspection',
        ]),
        ('ACTION', [
            'Violations were cited in the following area(s).',
            'No violations were recorded at the time of this inspection.',
            'Establishment Closed by DOHMH.  Violations were cited in the following area(s) and those requiring immediate action were addressed.',
            'Establishment re-opened by DOHMH',
            'Establishment re-closed by DOHMH',
        ]),
        ('GRADE', ['A', 'B', 'C', 'G']),
    )
    date_columns = ('INSPECTION_DATE', 'RECORD_DATE', 'GRADE_DATE')

    # Lists (not generators) so that no check is skipped by short-circuiting.
    outcomes = [testValues(data, name, wanted) for name, wanted in expected_values]
    outcomes.extend([testDates(data, name) for name in date_columns])
    return any(outcomes)

In [0]:
def testColumn(data, column):
    if not column in data.columns:
        logging.error('Inconsistency found...')  
        logging.error('Column "%s" does not exists' % (column))
        return True
    return False

In [0]:
def testDates(data, column):
    """Check that every value in `column` parses as an MM/DD/YYYY date.

    On success the column of `data` is replaced, in place, by the parsed
    datetimes. On failure the column is left as it was and the parsing
    error is logged.

    Returns:
        True if the column is missing or holds a malformed date,
        False if all values parsed.
    """
    logging.info('Checking dates on column "%s"' % column)
    if testColumn(data, column):
        return True

    malformed = False
    try:
        data[column] = pd.to_datetime(data[column], format='%m/%d/%Y')
    except ValueError as problem:
        malformed = True
        logging.error('Inconsistency found...')
        logging.error('There are malformed date values in column "%s"' % (column))
        logging.error(str(problem))
    return malformed

In [0]:
def testValues(data, column, values):
    """Check that each of `values` occurs at least once in `column`.

    Each expected value that does not appear in the column is logged as an
    inconsistency. Values in the column that are not in `values` are not
    reported.

    Returns:
        True if the column is missing or any expected value is absent,
        False otherwise.
    """
    logging.info('Checking values on column "%s"' % column)
    if testColumn(data, column):
        return True

    column_values = data[column].values
    any_absent = False
    for expected in values:
        if expected in column_values:
            continue
        logging.error('Inconsistency found...')  
        logging.error('Value "%s" is not in column "%s"' % (expected, column))
        any_absent = True
    return any_absent

In [0]:
def main():
    """Run all dataset consistency checks and exit non-zero on failure.

    Sets up logging, loads the CSV, normalises the column names, then runs
    the column check and the value checks. Both checks always run, so the
    log lists every problem found.

    Raises:
        SystemExit: with exit code 1 when any check reports an error.
    """
    # `exit` is a helper injected by the `site` module for interactive use
    # and is not guaranteed to exist; sys.exit is the supported way to
    # terminate with a status code.
    import sys

    init()
    logging.info('Running Tests...')
    df = readCSV()
    fixIndeces(df)

    errors = False
    errors |= checkColumns(df)
    errors |= checkValues(df)
    if errors:
        logging.error('Errors found.')
        logging.info('Exiting with error code 1.')
        sys.exit(1)
    logging.info('No errors found.')
    logging.info('Exiting...')

# in order to execute the script you just need to call the main function
# main()