In [32]:
import pandas as pd
import pickle

In [33]:
data = pd.DataFrame(pickle.load(open('../temp/historic-inspections-2021-03-15.pkl', 'rb')))

In [34]:
# Hacemos subset de las columnas que utilizaremos para el modelo
data = data[['inspection_id', 'facility_type', 'risk', 'zip', 'inspection_date','inspection_type', 'results', 'violations', 'latitude', 'longitude']]
data.head()

Unnamed: 0,inspection_id,facility_type,risk,zip,inspection_date,inspection_type,results,violations,latitude,longitude
0,2492768,Restaurant,Risk 2 (Medium),60647,2021-03-12T00:00:00.000,License,Pass,,41.917134801370274,-87.68754385201615
1,2492746,Daycare Above and Under 2 Years,Risk 1 (High),60624,2021-03-11T00:00:00.000,License,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.88591964589597,-87.72596362714377
2,2492736,Restaurant,Risk 1 (High),60636,2021-03-11T00:00:00.000,Canvass,Pass w/ Conditions,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",41.77971768255646,-87.65456552453057
3,2492763,Restaurant,Risk 1 (High),60657,2021-03-11T00:00:00.000,Canvass,Out of Business,,41.93368774378592,-87.64445413171818
4,2492737,Restaurant,Risk 1 (High),60647,2021-03-11T00:00:00.000,Non-Inspection,No Entry,,41.91755731772555,-87.68266439539492


In [35]:
data.dtypes

inspection_id      object
facility_type      object
risk               object
zip                object
inspection_date    object
inspection_type    object
results            object
violations         object
latitude           object
longitude          object
dtype: object

In [36]:
data.isna().sum()

inspection_id          0
facility_type       4900
risk                  69
zip                   47
inspection_date        0
inspection_type        1
results                0
violations         58061
latitude             731
longitude            731
dtype: int64

In [37]:
# Reemplazamos con 0 los mv en zip
data['zip']=data['zip'].fillna(0)

In [38]:
# fill nas en columnas de texto
data[['facility_type', 'risk', 'inspection_type','violations']]=data[['facility_type', 'risk', 'inspection_type','violations']].fillna('na')

In [39]:
# convertimos columnas 'license' y 'zip'. Primero a int y luego a str
data = data.astype({"zip": 'int'})
data = data.astype({"zip": 'str'})
data.dtypes

inspection_id      object
facility_type      object
risk               object
zip                object
inspection_date    object
inspection_type    object
results            object
violations         object
latitude           object
longitude          object
dtype: object

In [40]:
data.isnull().sum()

inspection_id        0
facility_type        0
risk                 0
zip                  0
inspection_date      0
inspection_type      0
results              0
violations           0
latitude           731
longitude          731
dtype: int64

In [42]:
# Cambiamos valor de results a 'Pass' y 'Not Pass' para cualquier otro
data['results'].mask(data['results'] != 'Pass', other = 'Not Pass', inplace=True)

In [43]:
data.groupby('results').count()

Unnamed: 0_level_0,inspection_id,facility_type,risk,zip,inspection_date,inspection_type,violations,latitude,longitude
results,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Not Pass,103704,103704,103704,103704,103704,103704,103704,103422,103422
Pass,113577,113577,113577,113577,113577,113577,113577,113128,113128


In [48]:
def standarize_column_strings(data, columns, excluded_punctuation=".,-*'¿?¡!()", gap_punct="\/"):
    for col in columns:
        data[col] = data[col].apply(lambda x: x.lower().replace(" ", "_"))
        data[col] = data[col].apply(lambda x: x.lower().replace("á", "a"))
        data[col] = data[col].apply(lambda x: x.lower().replace("é", "e"))
        data[col] = data[col].apply(lambda x: x.lower().replace("í", "i"))
        data[col] = data[col].apply(lambda x: x.lower().replace("ó", "o"))
        data[col] = data[col].apply(lambda x: x.lower().replace("ú", "u"))
        data[col] = data[col].apply(lambda x: x.lower().replace("ü", "u"))
        for ch in excluded_punctuation:
            data[col] = data[col].apply(lambda x: x.replace(ch, ""))
        for ch in gap_punct:
            data[col] = data[col].apply(lambda x: x.replace(ch,"_"))

In [50]:
col_text = ['facility_type',
 'risk',
 'inspection_type',
 'results']

In [51]:
#Convertimos variables de texto a formato estándar
standarize_column_strings(data, col_text)
data.head()

Unnamed: 0,inspection_id,facility_type,risk,zip,inspection_date,inspection_type,results,violations,latitude,longitude,num_violations
0,2492768,restaurant,risk_2_medium,60647,2021-03-12T00:00:00.000,license,pass,na,41.917134801370274,-87.68754385201615,0
1,2492746,daycare_above_and_under_2_years,risk_1_high,60624,2021-03-11T00:00:00.000,license,not_pass,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.88591964589597,-87.72596362714377,5
2,2492736,restaurant,risk_1_high,60636,2021-03-11T00:00:00.000,canvass,not_pass,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",41.77971768255646,-87.65456552453057,4
3,2492763,restaurant,risk_1_high,60657,2021-03-11T00:00:00.000,canvass,not_pass,na,41.93368774378592,-87.64445413171818,0
4,2492737,restaurant,risk_1_high,60647,2021-03-11T00:00:00.000,noninspection,not_pass,na,41.91755731772555,-87.68266439539492,0


In [24]:
# Función para reemplazar el valor de una columna específica
def replace_column_value(data, column, search_string, replace_string):
    data[column] = data[column].apply(lambda x: x.replace(search_string, replace_string))

In [25]:
replace_column_value(data,'facility_type', 'restuarant', 'restaurant')
replace_column_value(data,'facility_type', 'theatre', 'theater')
replace_column_value(data,'facility_type', 'herabal', 'herbal')
replace_column_value(data,'facility_type', 'day_care', 'daycare')
replace_column_value(data,'facility_type', 'long_term', 'longterm')

In [26]:
data.loc[data['facility_type'].str.contains('childern|children|1023|5_years_old', case=False, na=None), 'facility_type'] = 'childrens_service_facility'
data.loc[data['facility_type'].str.contains('conv|mart|gas_station_store', case=False, na=None), 'facility_type'] = 'convenience_store'
data.loc[data['facility_type'].str.contains('assis|longterm|nursing|supportive', case=False, na=None), 'facility_type'] = 'assisted_living'
data.loc[data['facility_type'].str.contains('herbal_life|herbalife|herbalcal', case=False, na=None), 'facility_type'] = 'herbalife'
data.loc[data['facility_type'].str.contains('after_school', case=False, na=None), 'facility_type'] = 'after_school'
data.loc[data['facility_type'].str.contains('tavern|pub|brew|wine_tasting|bar_grill|hooka', case=False, na=None), 'facility_type'] = 'bar'
data.loc[data['facility_type'].str.contains('bakery', case=False, na=None), 'facility_type'] = 'bakery'
data.loc[data['facility_type'].str.contains('mobil|truck|mfd', case=False, na=None), 'facility_type'] = 'mobile_food'
data.loc[data['facility_type'].str.contains('kitchen', case=False, na=None), 'facility_type'] = 'kitchen'
data.loc[data['facility_type'].str.contains('restaurant|rstaurant|diner', case=False, na=None), 'facility_type'] = 'restaurant'
data.loc[data['facility_type'].str.contains('retail', case=False, na=None), 'facility_type'] = 'retail'
data.loc[data['facility_type'].str.contains('roof', case=False, na=None), 'facility_type'] = 'rooftop'
data.loc[data['facility_type'].str.contains('grocery', case=False, na=None), 'facility_type'] = 'grocery_store'
data.loc[data['facility_type'].str.contains('liquor', case=False, na=None), 'facility_type'] = 'liquor'
data.loc[data['facility_type'].str.contains('popup', case=False, na=None), 'facility_type'] = 'popup_establishment'
data.loc[data['facility_type'].str.contains('school|college|shcool', case=False, na=None), 'facility_type'] = 'school'
data.loc[data['facility_type'].str.contains('daycare', case=False, na=None), 'facility_type'] = 'daycare'
data.loc[data['facility_type'].str.contains('cafeteria|coffee|cafe', case=False, na=None), 'facility_type'] = 'coffee'
data.loc[data['facility_type'].str.contains('drug_store|pharmacy', case=False, na=None), 'facility_type'] = 'drug_store'
data.loc[data['facility_type'].str.contains('gym|fitness|weight_loss|exercise', case=False, na=None), 'facility_type'] = 'gym'
data.loc[data['facility_type'].str.contains('commissary|machine|commiasary', case=False, na=None), 'facility_type'] = 'vending_machine'
data.loc[data['facility_type'].str.contains('ice_cream|paleteria|gelato', case=False, na=None), 'facility_type'] = 'ice_cream'
data.loc[data['facility_type'].str.contains('banquet', case=False, na=None), 'facility_type'] = 'banquet'
data.loc[data['facility_type'].str.contains('lounge', case=False, na=None), 'facility_type'] = 'lounge'
data.loc[data['facility_type'].str.contains('church|religious', case=False, na=None), 'facility_type'] = 'church'
data.loc[data['facility_type'].str.contains('kiosk', case=False, na=None), 'facility_type'] = 'kiosk'
data.loc[data['facility_type'].str.contains('health|rehab', case=False, na=None), 'facility_type'] = 'health'
data.loc[data['facility_type'].str.contains('event', case=False, na=None), 'facility_type'] = 'events'
data.loc[data['facility_type'].str.contains('donut|hotdog|hot_dog|popcorn|juice|tea|dessert|deli|salad|snack|candy|shake|watermelon|smoothie|food|sushi', case=False, na=None), 'facility_type'] = 'other_food'
data.loc[data['facility_type'].str.contains('poultry|butcher|slaughter|meat', case=False, na=None), 'facility_type'] = 'butcher'
data.loc[data['facility_type'].str.contains('profit', case=False, na=None), 'facility_type'] = 'non_profit'
#data.loc[data['facility_type'].str.contains('na', case=False, na=None), 'facility_type'] = 'not_specified'

In [27]:
data.nunique()

inspection_id      217281
facility_type         121
risk                    5
zip                   113
inspection_date      2833
inspection_type        98
results                 2
violations         158096
latitude            17291
longitude           17291
dtype: int64

In [28]:
#Ahora hacemos limpieza similar para columna de 'inspection_type'
data.loc[data['inspection_type'].str.contains('license', case=False, na=None), 'inspection_type'] = 'license'
data.loc[data['inspection_type'].str.contains('task_force|taskforce', case=False, na=None), 'inspection_type'] = 'task_force'
data.loc[data['inspection_type'].str.contains('canvass|canvas', case=False, na=None), 'inspection_type'] = 'canvas'
data.loc[data['inspection_type'].str.contains('complaint', case=False, na=None), 'inspection_type'] = 'complaint'
data.loc[data['inspection_type'].str.contains('food|sick', case=False, na=None), 'inspection_type'] = 'suspected_food_poisoning'

In [46]:
data['num_violations'] = data['violations'].apply(lambda x: x.count(' | ') + 1 if x != 'na' else 0)

In [47]:
data.groupby('num_violations').count()

Unnamed: 0_level_0,inspection_id,facility_type,risk,zip,inspection_date,inspection_type,results,violations,latitude,longitude
num_violations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,58061,58061,58061,58061,58061,58061,58061,58061,57865,57865
1,19464,19464,19464,19464,19464,19464,19464,19464,19370,19370
2,25515,25515,25515,25515,25515,25515,25515,25515,25385,25385
3,26798,26798,26798,26798,26798,26798,26798,26798,26702,26702
4,23504,23504,23504,23504,23504,23504,23504,23504,23432,23432
5,18708,18708,18708,18708,18708,18708,18708,18708,18650,18650
6,13833,13833,13833,13833,13833,13833,13833,13833,13808,13808
7,9606,9606,9606,9606,9606,9606,9606,9606,9587,9587
8,6814,6814,6814,6814,6814,6814,6814,6814,6795,6795
9,4681,4681,4681,4681,4681,4681,4681,4681,4670,4670
