### Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

### Read Chicago Food Inspections Data
Count records and columns.

In [2]:
# Load the raw inspections export; the path is relative to the notebook's working directory.
food_inspections_df = pd.read_csv(filepath_or_buffer='../../Food_Inspections.csv')

In [3]:
# (rows, columns) of the frame as loaded from the CSV.
food_inspections_df.shape

(195116, 17)

### Rename Columns

In [4]:
# Original column headers, listed so the snake_case replacements below can be matched by position.
food_inspections_df.columns.tolist()

['Inspection ID',
 'DBA Name',
 'AKA Name',
 'License #',
 'Facility Type',
 'Risk',
 'Address',
 'City',
 'State',
 'Zip',
 'Inspection Date',
 'Inspection Type',
 'Results',
 'Violations',
 'Latitude',
 'Longitude',
 'Location']

In [5]:
# snake_case replacements for the CSV headers, in the same positional order as the
# original columns (17 names).
columns = [
    'inspection_id',
    'dba_name',
    'aka_name',
    'license_number',
    'facility_type',
    'risk',
    'address',
    'city',
    'state',
    'zip',
    'inspection_date',
    'inspection_type',
    'result',
    'violation',
    'latitude',
    'longitude',
    'location',
]

In [6]:
# Positional rename: the i-th entry of `columns` replaces the i-th original header,
# so the list order must match the frame's column order.
food_inspections_df.columns = columns

### Normalize Casing of Chicago
Accept only correct spellings of the word Chicago, in any letter casing, and normalize them to `CHICAGO`.

In [7]:
# Any value whose string form upper-cases to exactly 'CHICAGO' is rewritten to 'CHICAGO';
# every other value (misspellings, other cities, missing values) is left untouched.
food_inspections_df['city'] = food_inspections_df['city'].mask(
    food_inspections_df['city'].astype(str).str.upper() == 'CHICAGO',
    'CHICAGO',
)

### Filter for Facilities in Chicago Illinois

In [8]:
# Boolean row mask: True only where the city is 'CHICAGO' and the state is 'IL'.
loc_condition = food_inspections_df['city'].eq('CHICAGO') & food_inspections_df['state'].eq('IL')

### Drop Redundant Information
- Only Chicago is considered
- Only Illinois is considered
- Location is encoded as separate latitude and longitude columns

In [9]:
# Keep only the rows selected by `loc_condition`, then remove the columns that are now
# redundant. `columns=` is used because passing the axis positionally (`drop(labels, 1)`)
# is no longer accepted by pandas 2.x.
food_inspections_df = food_inspections_df[loc_condition].drop(columns=['city', 'state', 'location'])

In [10]:
# (rows, columns) after the location filter and after dropping city, state and location.
food_inspections_df.shape

(194684, 14)

### Create Codes Corresponding to Each Violation Type by Parsing Violation Text

In [11]:
def create_violation_code(violation_text):
    """Return the integer code that prefixes a violation text.

    The code is everything before the first '.' in the text, so only the
    leading code of the string is captured.

    Parameters
    ----------
    violation_text : str or missing value
        Raw violation text. Missing values (NaN, None, pd.NA) are accepted.

    Returns
    -------
    int
        The parsed code, or -1 when the text is missing.

    Raises
    ------
    ValueError
        If the text before the first '.' is not an integer.
    """
    # pd.isna covers NaN as well as None / pd.NA, which the `x != x` idiom does not.
    if pd.isna(violation_text):
        return -1
    return int(violation_text.split('.')[0])

In [12]:
# Derive one integer code per row from the violation text (-1 where the text is missing).
food_inspections_df['violation_code'] = food_inspections_df['violation'].map(create_violation_code)

### Create Attribute Dataframes with the Unique Inspection ID for Lookups Following Quantitative Analysis
- Names
- Licenses
- Locations
- Violations
- Dates

In [13]:
# Name lookup keyed by inspection_id.
names = ['inspection_id', 'dba_name', 'aka_name']
names_df = food_inspections_df.loc[:, names]

In [14]:
# License lookup keyed by inspection_id.
licenses = ['inspection_id', 'license_number']
licenses_df = food_inspections_df.loc[:, licenses]

In [15]:
# Address and coordinate lookup keyed by inspection_id.
locations = ['inspection_id', 'address', 'latitude', 'longitude']
locations_df = food_inspections_df.loc[:, locations]

In [16]:
# Violation text/code lookup keyed by inspection_id.
# Stored under its own name: `violations_df` is reassigned further down to the
# violation-code count/prevalence table, which would otherwise discard this lookup.
violations = ['inspection_id', 'violation', 'violation_code']
violation_lookup_df = food_inspections_df[violations]

In [17]:
# Inspection-date lookup keyed by inspection_id.
dates = ['inspection_id', 'inspection_date']
dates_df = food_inspections_df.loc[:, dates]

### Drop Features Not Used in Statistical Analysis
Features such as:

- `DBA Name`
- `AKA Name`
- `License #`
- `Address`
- `Violations`
- `Inspection Date`

May be examined following statistical analysis by joining on `Inspection ID`.  **Note:** future iterations of this work may wish to consider:

- Text from the facility name
- Street level information from the facility address
- Prior inspections of the same facility by performing a temporal analysis of the data using `Inspection Date`

In [18]:
# Columns kept only in the lookup frames; they are removed from the analysis frame.
not_considered = ['dba_name', 'aka_name', 'license_number', 'address', 'violation', 'inspection_date']
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which pandas 2.x rejects.
food_inspections_df = food_inspections_df.drop(columns=not_considered)

### Create Dataframes of Count and Prevalence for Categorical Features
- Facility types
- Violation codes
- Zip codes
- Inspection types
- Results

In [21]:
# Occurrences of each facility type, most frequent first (missing values are not counted).
facilities = food_inspections_df['facility_type'].value_counts()
facilities_df = facilities.rename_axis('facility_type').reset_index(name='count')
# Prevalence is relative to all rows of the frame, not only rows with a facility type.
facilities_df['prevalence'] = facilities_df['count'] / len(food_inspections_df)

In [22]:
# Ten most frequent facility types.
facilities_df.nlargest(10, 'count')

Unnamed: 0,facility_type,count,prevalence
0,Restaurant,129938,0.66743
1,Grocery Store,24829,0.127535
2,School,12062,0.061957
3,Children's Services Facility,3031,0.015569
4,Bakery,2837,0.014572
5,Daycare (2 - 6 Years),2682,0.013776
6,Daycare Above and Under 2 Years,2355,0.012097
7,Long Term Care,1340,0.006883
8,Catering,1190,0.006112
9,Liquor,847,0.004351


In [23]:
# Ten least frequent facility types.
facilities_df.nsmallest(10, 'count')

Unnamed: 0,facility_type,count,prevalence
401,COFFEE ROASTER,1,5e-06
402,GROCERY/LIQUOR,1,5e-06
403,SERVICE BAR/THEATRE,1,5e-06
404,CAT/LIQUOR,1,5e-06
405,NEWSSTAND,1,5e-06
406,FROZEN DESSERTS DISPENSER -NON MOTORIZED,1,5e-06
407,LONG TERM CARE FACILITY,1,5e-06
408,Pop-Up Food Establishment User-Tier II,1,5e-06
409,NP-KIOSK,1,5e-06
410,WHOLESALE BAKERY,1,5e-06


In [24]:
# Occurrences of each violation code, most frequent first; -1 marks rows with no violation text.
violations = food_inspections_df['violation_code'].value_counts()
violations_df = violations.rename_axis('violation_code').reset_index(name='count')
# Prevalence is relative to all rows of the frame.
violations_df['prevalence'] = violations_df['count'] / len(food_inspections_df)

In [25]:
# Ten most frequent violation codes.
violations_df.nlargest(10, 'count')

Unnamed: 0,violation_code,count,prevalence
0,-1,51543,0.264752
1,32,25408,0.130509
2,33,17449,0.089627
3,3,13609,0.069903
4,34,10576,0.054324
5,18,9989,0.051309
6,30,9441,0.048494
7,2,7166,0.036808
8,35,5631,0.028924
9,21,4582,0.023536


In [26]:
# Ten least frequent violation codes.
violations_df.nsmallest(10, 'count')

Unnamed: 0,violation_code,count,prevalence
61,61,1,5e-06
62,63,1,5e-06
60,70,6,3.1e-05
58,15,11,5.7e-05
59,60,11,5.7e-05
57,59,13,6.7e-05
56,50,17,8.7e-05
55,20,18,9.2e-05
54,52,19,9.8e-05
53,54,25,0.000128


In [27]:
# Occurrences of each zip code, most frequent first (missing values are not counted).
zips = food_inspections_df['zip'].value_counts()
zips_df = zips.rename_axis('zip').reset_index(name='count')
# Prevalence is relative to all rows of the frame.
zips_df['prevalence'] = zips_df['count'] / len(food_inspections_df)

In [None]:
# Ten most frequent zip codes.
zips_df.nlargest(10, 'count')

In [None]:
# Ten least frequent zip codes.
zips_df.nsmallest(10, 'count')

In [28]:
# Occurrences of each inspection type, most frequent first (missing values are not counted).
inspections = food_inspections_df['inspection_type'].value_counts()
inspections_df = inspections.rename_axis('inspection_type').reset_index(name='count')
# Prevalence is relative to all rows of the frame.
inspections_df['prevalence'] = inspections_df['count'] / len(food_inspections_df)

In [29]:
# Ten most frequent inspection types.
inspections_df.nlargest(10, 'count')

Unnamed: 0,inspection_type,count,prevalence
0,Canvass,102944,0.528775
1,License,25614,0.131567
2,Canvass Re-Inspection,20457,0.105078
3,Complaint,18093,0.092935
4,License Re-Inspection,8899,0.04571
5,Complaint Re-Inspection,7499,0.038519
6,Short Form Complaint,6717,0.034502
7,Suspected Food Poisoning,850,0.004366
8,Consultation,669,0.003436
9,License-Task Force,604,0.003102


In [30]:
# Ten least frequent inspection types.
inspections_df.nsmallest(10, 'count')

Unnamed: 0,inspection_type,count,prevalence
41,CANVASS,1,5e-06
42,LICENSE WRONG ADDRESS,1,5e-06
43,CANVAS,1,5e-06
44,TASTE OF CHICAGO,1,5e-06
45,Recent inspection,1,5e-06
46,Sample Collection,1,5e-06
47,CANVASS SPECIAL EVENTS,1,5e-06
48,CANVASS SCHOOL/SPECIAL EVENT,1,5e-06
49,CITF,1,5e-06
50,FIRE/COMPLAIN,1,5e-06


In [31]:
# Occurrences of each inspection result, most frequent first (missing values are not counted).
results = food_inspections_df['result'].value_counts()
results_df = results.rename_axis('result').reset_index(name='count')
# Prevalence is relative to all rows of the frame.
results_df['prevalence'] = results_df['count'] / len(food_inspections_df)

In [32]:
# Up to ten most frequent inspection results.
results_df.nlargest(10, 'count')

Unnamed: 0,result,count,prevalence
0,Pass,105369,0.541231
1,Fail,37658,0.193431
2,Pass w/ Conditions,26795,0.137633
3,Out of Business,16757,0.086073
4,No Entry,6198,0.031836
5,Not Ready,1843,0.009467
6,Business Not Located,64,0.000329


### Create Risk Group Feature
Parse the risk level as the integer in the second space-separated token of the risk text (expected to be a value 1-3). If it cannot be parsed, return -1.

In [33]:
def create_risk_groups(risk_text):
    """Extract the numeric risk level from a risk description.

    The level is read as the integer in the second space-separated token of
    the text.

    Parameters
    ----------
    risk_text : str or missing value
        Raw risk description.

    Returns
    -------
    int
        The parsed level, or -1 when the value is not a string, has no second
        token, or the second token is not an integer.
    """
    try:
        return int(risk_text.split(' ')[1])
    # Only parsing failures are mapped to -1; a bare `except:` would also swallow
    # KeyboardInterrupt and SystemExit.
    except (AttributeError, IndexError, ValueError):
        return -1

In [34]:
# Replace the risk text with its integer level (-1 when it cannot be parsed).
# Not idempotent: once the column holds integers, re-running this cell turns every value into -1.
food_inspections_df['risk'] = food_inspections_df['risk'].map(create_risk_groups)

### Format Result
- Encode Pass and Pass w/ Conditions as 0
- Encode Fail as 1
- Encode all others as -1 and filter out

In [35]:
def format_results(result):
    """Encode an inspection result as a binary label.

    Returns 0 for 'Pass' and 'Pass w/ Conditions', 1 for 'Fail', and -1 for
    every other value.
    """
    if result in ('Pass', 'Pass w/ Conditions'):
        return 0
    return 1 if result == 'Fail' else -1

In [36]:
# Encode results as 0 (pass) / 1 (fail) / -1 (other), then keep only the 0/1 rows.
# Not idempotent: re-running on the already encoded column maps every value to -1
# and the filter then removes all rows.
food_inspections_df['result'] = food_inspections_df['result'].map(format_results)
food_inspections_df = food_inspections_df.loc[food_inspections_df['result'].ne(-1)]

In [37]:
# (rows, columns) after removing rows whose result is neither a pass nor a fail.
food_inspections_df.shape

(169822, 9)

### Drop Violation Code

In [38]:
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which pandas 2.x rejects.
food_inspections_df = food_inspections_df.drop(columns='violation_code')

In [63]:
# Keep only categorical levels whose prevalence exceeds a threshold; rarer levels are relabeled 'DROP'.

In [64]:
def prev_filter(df, feature, prevalence='prevalence', prevalence_threshold=0.001):
    """List the values of `feature` whose prevalence is strictly above a threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a `feature` column and a `prevalence` column.
    feature : str
        Name of the column whose values are returned.
    prevalence : str
        Name of the column holding the prevalence of each value.
    prevalence_threshold : float
        Rows with prevalence less than or equal to this value are excluded.

    Returns
    -------
    list
        Values of `feature` for the retained rows, in frame order.
    """
    is_common = df[prevalence].gt(prevalence_threshold)
    return df.loc[is_common, feature].tolist()

In [65]:
# Categorical columns that are filtered by prevalence and then one-hot encoded.
categorical_features = [
    'facility_type',
    'zip',
    'inspection_type',
]

In [66]:
# Map each categorical feature to the list of its values that pass the prevalence filter.
# The count frames are paired with `categorical_features` by position.
feature_dict = {
    feature_name: prev_filter(counts_df, feature_name)
    for feature_name, counts_df in zip(categorical_features,
                                       (facilities_df, zips_df, inspections_df))
}

In [67]:
# Replace every value that is not in the feature's retained list with the sentinel 'DROP'.
# `isin` does the membership test in one vectorized pass instead of scanning the
# retained list once per row inside a Python lambda.
for feature in categorical_features:
    is_kept = food_inspections_df[feature].isin(feature_dict[feature])
    food_inspections_df[feature] = food_inspections_df[feature].where(is_kept, 'DROP')

In [68]:
# Display the frame after rare categorical levels were relabeled as 'DROP'.
food_inspections_df

Unnamed: 0,inspection_id,facility_type,risk,zip,inspection_type,result,latitude,longitude
1,2320793,DROP,2,DROP,DROP,0,41.850451,-87.658798
2,2320830,DROP,2,DROP,DROP,0,41.885699,-87.648789
3,2320717,DROP,1,DROP,DROP,0,41.944974,-87.645660
6,2320574,DROP,1,DROP,DROP,0,41.811990,-87.743128
8,2320544,DROP,1,DROP,DROP,0,41.968336,-87.708783
...,...,...,...,...,...,...,...,...
195111,160276,DROP,1,DROP,DROP,0,41.897476,-87.628368
195112,120297,DROP,1,DROP,DROP,0,41.878307,-87.654440
195113,60325,DROP,1,DROP,DROP,1,41.807551,-87.740097
195114,197264,DROP,1,DROP,DROP,0,41.883237,-87.632556


In [69]:
# One-hot encode the categorical features. Indicator columns are named
# '<feature>__<value>' (double underscore between feature name and value).
feature_df = pd.get_dummies(
    food_inspections_df,
    columns=categorical_features,
    prefix=categorical_features,
    prefix_sep='__',
)

In [70]:
# First rows of the one-hot encoded feature frame.
feature_df.head()

Unnamed: 0,inspection_id,risk,result,latitude,longitude,facility_type__DROP,zip__DROP,inspection_type__DROP
1,2320793,2,0,41.850451,-87.658798,1,1,1
2,2320830,2,0,41.885699,-87.648789,1,1,1
3,2320717,1,0,41.944974,-87.64566,1,1,1
6,2320574,1,0,41.81199,-87.743128,1,1,1
8,2320544,1,0,41.968336,-87.708783,1,1,1


In [None]:
def bar_count_plot(df, feature, count='count'):
    """Draw a horizontal bar chart with one bar per row of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a `feature` column (bar labels) and a `count` column
        (bar lengths).
    feature : str
        Name of the column whose values label the bars.
    count : str
        Name of the column whose values set the bar lengths.
    """
    positions = range(len(df[feature]))

    plt.figure(figsize=(8,16))
    plt.barh(positions, df[count], align='center', alpha=0.5)
    plt.yticks(positions, df[feature])
    # Horizontal bars: lengths (counts) run along x, categories are listed along y.
    plt.xlabel(count)
    plt.ylabel(feature)
    plt.title('Count of {}'.format(feature))
    plt.show()

In [None]:
# One-hot encode the zip column on its own. The column is named 'zip' after the
# rename near the top of the notebook; the empty `[]` subscript and the dangling
# comma were syntax errors.
pd.get_dummies(food_inspections_df['zip'])