### Imports

In [1]:
import pandas as pd

### Read Chicago Food Inspections Data
Count records and columns.

In [2]:
food_inspections_df = pd.read_csv('../../Food_Inspections.csv')

In [3]:
food_inspections_df.shape

(195116, 17)

### Normalize Casing of Chicago
Accept only proper spellings of the word Chicago with mixed casing accepted

In [4]:
food_inspections_df['City'] = food_inspections_df['City'].apply(lambda x: 'CHICAGO'
                                                                if str(x).upper() == 'CHICAGO'
                                                                else x)

### Filter for Facilities in Chicago Illinois

In [5]:
loc_condition = (food_inspections_df['City'] == 'CHICAGO') & (food_inspections_df['State'] == 'IL')

### Drop Redundant Information
- Only Chicago is considered
- Only Illinois is considered
- Location is encoded as separate latitute and longitude columns

In [6]:
food_inspections_df = food_inspections_df[loc_condition].drop(['City', 'State', 'Location'], 1)

In [7]:
food_inspections_df.shape

(194684, 14)

### Create Codes Corresponding to Each Violation Type by Parsing Violation Text

In [8]:
def create_violation_code(violation_text):
    
    if violation_text != violation_text:
        return -1
    else:
        return int(violation_text.split('.')[0])

In [9]:
food_inspections_df['violation_code'] = food_inspections_df['Violations'].apply(create_violation_code)

### Create Attribute Dataframes with the Unique Inspection ID for Lookups Following Quantitative Analysis
- Names
- Licenses
- Locations
- Violations
- Dates

In [10]:
names = ['Inspection ID', 'DBA Name', 'AKA Name']
names_df = food_inspections_df[names]

In [11]:
licenses = ['Inspection ID', 'License #']
licenses_df = food_inspections_df[licenses]

In [12]:
locations = ['Inspection ID', 'Address', 'Latitude', 'Longitude']
locations_df = food_inspections_df[locations]

In [13]:
violations = ['Inspection ID', 'Violations', 'violation_code']
violations_df = food_inspections_df[violations]

In [14]:
dates = ['Inspection ID', 'Inspection Date']
dates_df = food_inspections_df[dates]

### Drop Features Not Used in Statistical Analysis
Features such as:

- `DBA Name`
- `AKA Name`
- `License #`
- `Address`
- `Violations`
- `Inspection Date`

May be examined following statistical analysis by joining on `Inspection ID`.  **Note:** future iterations of this work may wish to consider:

- Text from the the facility name
- Street level information from the facility address
- Prior inspections of the same facility by performing a temporal analysis of the data using `Inspection Date`

In [15]:
not_considered = ['DBA Name', 'AKA Name', 'License #', 'Address', 'Violations', 'Inspection Date']
food_inspections_df = food_inspections_df.drop(not_considered, 1)

### Create Dataframes of Count and Prevalence for Categorical Features
- Facility types
- Violation codes
- Zip codes
- Inspection types

In [16]:
facilities = food_inspections_df['Facility Type'].value_counts()
facilities_df = pd.DataFrame({'facility':facilities.index, 'count':facilities.values})
facilities_df['prevalence'] = facilities_df['count'] / food_inspections_df.shape[0]
facilities_df.nlargest(10, 'count')

Unnamed: 0,facility,count,prevalence
0,Restaurant,129938,0.66743
1,Grocery Store,24829,0.127535
2,School,12062,0.061957
3,Children's Services Facility,3031,0.015569
4,Bakery,2837,0.014572
5,Daycare (2 - 6 Years),2682,0.013776
6,Daycare Above and Under 2 Years,2355,0.012097
7,Long Term Care,1340,0.006883
8,Catering,1190,0.006112
9,Liquor,847,0.004351


In [17]:
violations = food_inspections_df['violation_code'].value_counts()
violations_df = pd.DataFrame({'violation_code':violations.index, 'count':violations.values})
violations_df['prevalence'] = violations_df['count'] / food_inspections_df.shape[0]
violations_df.nlargest(10, 'count')

Unnamed: 0,violation_code,count,prevalence
0,-1,51543,0.264752
1,32,25408,0.130509
2,33,17449,0.089627
3,3,13609,0.069903
4,34,10576,0.054324
5,18,9989,0.051309
6,30,9441,0.048494
7,2,7166,0.036808
8,35,5631,0.028924
9,21,4582,0.023536


In [18]:
zips = food_inspections_df['Zip'].value_counts()
zips_df = pd.DataFrame({'zip':zips.index, 'count':zips.values})
zips_df['prevalence'] = zips_df['count'] / food_inspections_df.shape[0]
zips_df.nlargest(10, 'count')

Unnamed: 0,zip,count,prevalence
0,60614.0,7284,0.037414
1,60647.0,7088,0.036408
2,60657.0,6824,0.035052
3,60622.0,6108,0.031374
4,60611.0,6094,0.031302
5,60608.0,5925,0.030434
6,60618.0,5923,0.030424
7,60625.0,5387,0.02767
8,60639.0,5208,0.026751
9,60607.0,5145,0.026427


In [19]:
inspections = food_inspections_df['Inspection Type'].value_counts()
inspections_df = pd.DataFrame({'inspection':inspections.index, 'count':inspections.values})
inspections_df['prevalence'] = inspections_df['count'] / food_inspections_df.shape[0]

In [20]:
inspections_df.nlargest(10, 'count')

Unnamed: 0,inspection,count,prevalence
0,Canvass,102944,0.528775
1,License,25614,0.131567
2,Canvass Re-Inspection,20457,0.105078
3,Complaint,18093,0.092935
4,License Re-Inspection,8899,0.04571
5,Complaint Re-Inspection,7499,0.038519
6,Short Form Complaint,6717,0.034502
7,Suspected Food Poisoning,850,0.004366
8,Consultation,669,0.003436
9,License-Task Force,604,0.003102


In [21]:
results = food_inspections_df['Results'].value_counts()
results_df = pd.DataFrame({'result':results.index, 'count':results.values})
results_df['prevalence'] = results_df['count'] / food_inspections_df.shape[0]

In [22]:
results_df.nlargest(10, 'count')

Unnamed: 0,result,count,prevalence
0,Pass,105369,0.541231
1,Fail,37658,0.193431
2,Pass w/ Conditions,26795,0.137633
3,Out of Business,16757,0.086073
4,No Entry,6198,0.031836
5,Not Ready,1843,0.009467
6,Business Not Located,64,0.000329


### Create Risk Group Feature
If the feature cannot be found in the middle of the text string as a value 1-3, return -1.

In [23]:
def create_risk_groups(risk_text):
    
    try: 
        risk = int(x.split(' ')[1])
        return risk
    except:
        return -1

In [24]:
food_inspections_df['risk'] = food_inspections_df['Risk'].apply(create_risk_groups)
food_inspections_df = food_inspections_df.drop('Risk', 1)

### Encode Features

In [25]:
food_inspections_df

Unnamed: 0,Inspection ID,Facility Type,Zip,Inspection Type,Results,Latitude,Longitude,violation_code,risk
0,2320831,Grocery Store,60623.0,Canvass,Out of Business,41.855266,-87.712402,-1,-1
1,2320793,Mobile Food Preparer,60608.0,License,Pass,41.850451,-87.658798,-1,-1
2,2320830,Restaurant,60607.0,License,Pass,41.885699,-87.648789,36,-1
3,2320717,Restaurant,60657.0,Canvass Re-Inspection,Pass,41.944974,-87.645660,47,-1
4,2320618,,60647.0,License,Not Ready,41.921076,-87.694138,-1,-1
...,...,...,...,...,...,...,...,...,...
195111,160276,Restaurant,60610.0,Canvass,Pass,41.897476,-87.628368,32,-1
195112,120297,School,60607.0,Canvass,Pass,41.878307,-87.654440,33,-1
195113,60325,School,60632.0,Canvass,Fail,41.807551,-87.740097,9,-1
195114,197264,Restaurant,60602.0,Canvass,Pass,41.883237,-87.632556,30,-1
