# Inspection Data Preprocessing

In [22]:
import pandas as pd
import numpy as np

In [23]:
# Relative path of the food-inspections CSV used as the notebook's input
dataset_csv = '../data/food-inspections.csv'

In [24]:
# Load the raw inspections table into the working DataFrame
fi = pd.read_csv(filepath_or_buffer=dataset_csv)

## Dealing with NA values

`Inspection ID` is a reliable attribute to work with, e.g. for groupby counts, since it contains no NAs:

In [25]:
# True if at least one inspection is missing its ID
inspection_ids = fi['Inspection ID']
inspection_ids.isna().any()

False

In [26]:
# Columns without a single non-NA entry carry no information, so remove them
all_na = fi.isna().all()
empty_columns = fi.columns[all_na]
fi.drop(columns=empty_columns, inplace=True)

In [27]:
# Preview Location next to Latitude/Longitude: it looks redundant with them
coordinate_columns = ['Latitude', 'Longitude', 'Location']
fi.loc[:, coordinate_columns].head()

Unnamed: 0,Latitude,Longitude,Location
0,41.945065,-87.816734,"{'longitude': '41.945064857019986', 'latitude'..."
1,41.895692,-87.620143,"{'longitude': '41.895692401410514', 'latitude'..."
2,41.982582,-87.708996,"{'longitude': '41.98258181784537', 'latitude':..."
3,41.953378,-87.718848,"{'longitude': '41.95337788158545', 'latitude':..."
4,41.793235,-87.777776,"{'longitude': '41.7932347787373', 'latitude': ..."


In [28]:
# Remove the Location column from the working frame (in place)
fi.drop('Location', axis='columns', inplace=True)

In [29]:
# Standardize empty / whitespace-only strings to NaN https://stackoverflow.com/a/21942746
# `regex=True` is required: without it `replace` searches for cells equal to the
# literal text '^\s*$' and leaves blank strings untouched.
fi.replace(r'^\s*$', np.nan, regex=True, inplace=True)

In [30]:
# Discard rows whose State is filled in with anything other than 'IL';
# rows with a missing State are kept.
known_non_il = fi['State'].notna() & fi['State'].ne('IL')
fi.drop(index=fi.index[known_non_il], inplace=True)

Number of NA values in each column that still contains at least one NA:

In [31]:
# Per-column NA totals, restricted to the columns that still contain NAs
has_na = fi.isna().any()
na_columns = fi.columns[has_na]
fi[na_columns].isna().sum()

AKA Name            2450
License #             17
Facility Type       4776
Risk                  73
City                 139
State                 42
Zip                   51
Inspection Type        1
Violations         51782
Latitude             682
Longitude            682
dtype: int64

We should be bothered by missing: `License #` (possibly useful as place identifier), `Facility Type`, `Risk`?, `Zip`, `Inspection Type`? (only 1), geocoordinates

### `License #`

These rows were special events, churches and grocery stores, so they are not part of our restaurant analysis and we can drop them.

In [32]:
# Index labels of the rows that have no license number, then drop those rows
special_events = fi.index[fi['License #'].isna()]
fi.drop(special_events, axis='index', inplace=True)

In [33]:
# Shorter column name: 'License #' becomes 'License'
fi.rename({'License #': 'License'}, axis='columns', inplace=True)

In [34]:
# Inspections recorded under the suspicious license id 0
# TODO: what to do with them?
zero_license = fi['License'] == 0.0
fi.loc[zero_license, 'Inspection ID'].count()

514

### `Facility Type`

Maybe we can get it from earlier inspections of the same place (same `License`)

In [35]:
# Per license: number of inspections whose Facility Type is missing
na_facility_rows = fi.loc[fi['Facility Type'].isna()]
na_facility_type_counts = na_facility_rows.groupby(by=['License'])['Inspection ID'].count()

In [36]:
# Distinct DBA names that appear on rows with a missing Facility Type
dba_names_for_na_facility = fi.loc[fi['Facility Type'].isna(), 'DBA Name'].unique()

In [37]:
# Per license: number of inspections (with or without Facility Type) whose
# DBA name is one of the names collected above
same_name_rows = fi.loc[fi['DBA Name'].isin(dba_names_for_na_facility)]
total_na_facility_counts = same_name_rows.groupby(by=['License'])['Inspection ID'].count()

In [38]:
# Index-aligned difference; a license present in only one of the two series yields NaN
counts_diff = total_na_facility_counts.sub(na_facility_type_counts)

In [39]:
# Licenses with a positive difference, excluding license id 0
positive_diff = counts_diff.loc[counts_diff.gt(0)]
licence_ids = positive_diff.index.values
recoverable_licences = licence_ids[licence_ids != 0]

In [40]:
# License -> known facility type(s): one entry per distinct non-NA type seen for the license
recoverable_rows = fi.loc[fi['License'].isin(recoverable_licences)]
recovered = (
    recoverable_rows
    .groupby(by='License')['Facility Type']
    .unique()
    .explode()
    .dropna()
)

In [41]:
# Merge recovered facility types into fi
def take_not_nan(a, b):
    """Return `a`, falling back to `b` when `pd.isna(a)` is true."""
    if pd.isna(a):
        return b
    return a

# `recovered` can list several facility types for one license (it is built with
# `unique().explode()`). Left-merging against such a non-unique key would duplicate
# every inspection row of that license, so keep only the first type listed per
# license before merging.
facility_by_license = recovered[~recovered.index.duplicated(keep='first')]

# `validate` makes pandas raise if the right-hand keys are not unique, guarding
# against accidental row duplication.
fi = fi.merge(facility_by_license, how='left', left_on='License', right_index=True,
              validate='many_to_one')
# Keep the original facility type where present, otherwise use the recovered one
fi['Facility Type'] = fi['Facility Type_x'].combine(fi['Facility Type_y'], take_not_nan)
fi.drop(columns=['Facility Type_x', 'Facility Type_y'], inplace=True)

In [42]:
# Inspections that still have no Facility Type
still_missing = fi['Facility Type'].isna()
fi.loc[still_missing, 'Inspection ID'].count()

4690

Managed to correct 86 records

### `Zip`
TODO: Process using Maja's solution

In [None]:
# Making sure we are missing lat and long in same rows.
# Compare the boolean NA masks rather than the filtered indexes: the masks always
# have the same length, whereas comparing two indexes of different lengths with
# `==` raises instead of evaluating to False.
fi['Latitude'].isna().equals(fi['Longitude'].isna())

In [None]:
# Rows without a latitude, and which of their address fields contain NAs
missing_location = fi.loc[fi['Latitude'].isna()]
address_fields = ['Address', 'City', 'State', 'Zip']
missing_location.loc[:, address_fields].isna().any()

We have address for all of them, but we're missing zip codes for some

In [None]:
# Subset of the rows without coordinates that also have no zip code
missing_zip = missing_location.loc[missing_location['Zip'].isna()]

In [None]:
# Inspections per State value
counts_by_state = fi.groupby('State')['Inspection ID'].count()
# Rows whose State is not 'IL'; a missing State also compares as not equal
other_states = fi.loc[fi['State'].ne('IL')]