In [2]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from datetime import datetime
from datetime import timedelta
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
from matplotlib.pyplot import figure
from sodapy import Socrata
import seaborn as sns 

from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

import pickle
import time 

In [None]:
client = Socrata("data.cityofchicago.org", None)
results = client.get("4ijn-s7e5", limit=400000)
df = pd.DataFrame.from_records(results)
col_names = df.columns.to_list()
col_name = []
for i in range(len(col_names)):
    col_name.append(col_names[i].replace(" ", "_").lower())
df.columns =col_name

## Limpieza y preprocesamiento

In [None]:
def standarize_column_strings(df, columns, excluded_punctuation=".,*¿?¡!"):
    for col in columns:
        df[col] = df[col].str.lower().astype(str).str.replace(" ", "_")
        df[col] = df[col].str.lower().astype(str).str.replace("-", "_")
        df[col] = df[col].str.lower().astype(str).str.replace("á", "a")
        df[col] = df[col].str.lower().astype(str).str.replace("é", "e")
        df[col] = df[col].str.lower().astype(str).str.replace("í", "i")
        df[col] = df[col].str.lower().astype(str).str.replace("ó", "o")
        df[col] = df[col].str.lower().astype(str).str.replace("ú", "u")
        df[col] = df[col].str.lower().astype(str).str.replace("ü", "u")
        df[col] = df[col].str.lower().astype(str).str.replace(r"[^a-zA-Z\d\_]+", "")
        for ch in excluded_punctuation:
            df[col] = df[col].str.replace(ch, "")

In [None]:
def cleaning(df):
    '''
    Función que convierte las columnas del Data Frame al tipo y forma que se necesita para
    los análisis posteriores
    
    inputs: Data Frame almacenado en el S3 (ingesta.pkl)
    outputs: Data Frame con las variables en formato adecuado (df_clean.pkl)
        
    '''
    #df = pickle.load(open("ingesta.pkl","rb"))
    nrows_prev = df.shape[0]
    ncols_prev = df.shape[1]
    data_null_prev = df.isnull().sum().sum()
    # Variables de texto
    df['violations']= df['violations'].astype('object')
    df['violations_count'] = df.violations.str.count(r'\|')+1
    df['violations_count'] = df.violations_count.fillna(0)
    df['violations_count'] = df['violations_count'].astype('int')
    # Variables categóricas
    df['dba_name']= df['dba_name'].astype('object')
    df['aka_name']= df['aka_name'].astype('object')
    df['facility_type']= df['facility_type'].astype('category')
    df['risk']= df['risk'].astype('category')
    df['address']= df['address'].astype('category')
    df['city']= df['city'].astype('category')
    df['state']= df['state'].astype('category')
    df['inspection_type']= df['inspection_type'].astype('category')
    df['results']= df['results'].astype('category')
    # Variable label_risk
    df['risk'] = df['risk'].replace(["Risk 1 (High)"],3)
    df['risk'] = df['risk'].replace(["Risk 2 (Medium)"],2)
    df['risk'] = df['risk'].replace(["Risk 3 (Low)"],1)
    df['risk'] = df['risk'].replace(["All"],0)
    df['risk'] = pd.to_numeric(df['risk'], errors='coerce')
    df=df.rename(columns = {'risk':'label_risk'})
    df['label_risk'] = df['label_risk'].fillna(3)
    df['label_risk'] = df['label_risk'].astype('int')
    # Variables de fecha
    df['inspection_date'] = pd.to_datetime(df['inspection_date'], infer_datetime_format=True)
    df['inspection_month']=df['inspection_date'].dt.month
    MONTH = 12
    df['sin_mnth'] = np.sin(2*np.pi*df.inspection_month/MONTH)
    df['cos_mnth'] = np.cos(2*np.pi*df.inspection_month/MONTH)
    df['inspection_weekday']=df['inspection_date'].dt.weekday
    WEEKDAY = 7
    df['sin_wkd'] = np.sin(2*np.pi*df.inspection_weekday/WEEKDAY)
    df['cos_wkd'] = np.cos(2*np.pi*df.inspection_weekday/WEEKDAY)
    # Etiqueta
    df['label_results'] = df['results'].apply(lambda x: int(0) if x == 'Fail' else (int(1) if x in ['Pass','Pass w/Conditions'] else int(2)))
    # Imputación de datos
    df.drop(['violations'],axis = 1, inplace = True)
    df.drop(['results'], axis = 1, inplace = True)
    df.drop(df.loc[df['license_'].isnull()].index, inplace=True)
    df.drop(df.loc[df['zip'].isnull()].index, inplace=True)
    df.drop(df.loc[df['label_results'] == 2].index, inplace=True)
    df['aka_name'] = df['aka_name'].fillna(df['dba_name'])
    df['dba_name']= df['dba_name'].astype(str).str.lower()
    df['aka_name']= df['aka_name'].astype(str).str.lower()
    df['facility_type']= df['facility_type'].astype(str).str.lower()
    df['state']= df['state'].astype(str).str.lower()
    df['inspection_type']= df['inspection_type'].astype(str).str.lower()
    df = df[~df['state'].isin(['wi', 'ny', 'in'])]
    col_text = ['dba_name','aka_name']
    # Eliminamos el '_' que aparece al final en la columna 'license_'
    df.rename(columns={'license_':'license'}, inplace=True)
    standarize_column_strings(df, col_text)
    df_dict_dummy = pd.DataFrame(df['aka_name'])
    df_dict_dummy['facility_type'] = df['facility_type']
    df_dict_dummy.drop(df_dict_dummy.loc[df_dict_dummy['facility_type'].isnull()].index, inplace=True)
    group = df_dict_dummy.groupby('aka_name')
    df_dict_dummy2 = group.apply(lambda x: x['facility_type'].unique())
    df_dict_dummy3 = df_dict_dummy2.to_frame()
    df_dict_dummy3.reset_index(level = 'aka_name', inplace = True)
    df_dict_dummy3 = df_dict_dummy3.rename(columns = {0:'facility_type'})
    df_dict_dummy3['facility_type'] = df_dict_dummy3['facility_type'].apply(lambda x: str(x[0]))
    df2 = pd.merge(df,df_dict_dummy3, how = 'left', on = 'aka_name')
    df2['facility_type_x'] = df2['facility_type_x'].fillna(df2['facility_type_y'])
    df2['facility_type_x'] = df2['facility_type_x'].fillna('restaurant')
    df2=df2.rename(columns = {'facility_type_x':'facility_type'})
    df2.drop(['inspection_id','dba_name','address','city','state','latitude','longitude','location','facility_type_y','inspection_weekday','inspection_month'],axis = 1, inplace = True)
    #pickle.dump(df2,open("df_clean.pkl","wb"))
    nrows_after = df2.shape[0]
    ncols_after = df2.shape[1]
    return df2, nrows_prev, ncols_prev, nrows_after, ncols_after, data_null_prev

## Feature Engineering

In [None]:
def feat_eng(df_fe):
    '''
    Función que realiza la selección de los features que serán utilizdos para la clasificación
    
    inputs: Data Frame limpio (df_clean.pkl)
    outputs: Data Frame con la matriz de diseño para el modelo (df_clean.pkl)
        
    '''
     
    # Transformación a OHE
    df_fe = df_fe.sort_values(by='inspection_date', ascending=True)
    df_input = pd.DataFrame(df_fe[['label_risk','label_results','zip','facility_type']])
    data_input_ohe = pd.get_dummies(df_input)
    etiqueta = data_input_ohe.label_results
    data_input_ohe= data_input_ohe.drop('label_results', axis = 1)
    variables_lista = list(data_input_ohe.columns)
    # Grid Search
    np.random.seed(20201124)
    # ocuparemos un RF
    classifier = RandomForestClassifier(oob_score=True, n_jobs=-1, random_state=1234)
    # separando en train, test
    #X_train, X_test, y_train, y_test = train_test_split(data_input_ohe, etiqueta, test_size=0.3)

    # definicion de los hiperparametros que queremos probar
    hyper_param_grid = {'n_estimators': [300, 400], #'min_samples_leaf': [3,5,7],
                        'max_depth': [7, 10],
                        'min_samples_split': [3],
                        'max_features': [10, 15, 20],
                        'criterion': ['gini']}
    # usamos TimeSeriesSplit para dividir respetando el orden cronológico
    tscv = TimeSeriesSplit(n_splits=3)
    # This was the trickiest part as a newbie. Straight from the docs
    # If you only have experience with CV splits this way
    # of making the splits might seem foreign. Fret not.
    for train_index, test_index in tscv.split(data_input_ohe):
        X_train, X_test = data_input_ohe.iloc[train_index, :], data_input_ohe.iloc[test_index,:]
        y_train, y_test = etiqueta.iloc[train_index], etiqueta.iloc[test_index]
    # ocupemos grid search
    gs = GridSearchCV(classifier, 
                           hyper_param_grid, 
                           scoring = 'precision', return_train_score=True,
                           cv = tscv)
    start_time = time.time()
    gs.fit(X_train, y_train)
    best_rf = gs.best_estimator_
    best_score = gs.best_estimator_.oob_score_
    feature_importance = pd.DataFrame({'importance':\
                                       best_rf.feature_importances_,\
                                       'feature': variables_lista})
    feature_importance=feature_importance.sort_values(by="importance", ascending=False)
    #fi_out = feature_importance.head(10)
    time_exec = time.time() - start_time
    nrows_ohe = data_input_ohe.shape[0]
    ncols_ohe = data_input_ohe.shape[1]
    #print("Tiempo en ejecutar: ", time.time() - start_time)
    return df_input, nrows_ohe, ncols_ohe, float(best_score), time_exec, str(best_rf)

In [None]:
# Pickle con la base de datos original
pickle.dump(df,open("df_raw.pkl","wb"))
df_clean, nrows_prev, ncols_prev, nrows_after, ncols_after, data_null_prev = cleaning(df)

**Extracción de datos del último día de 2020 hacia atrás**

In [None]:
var = df_clean.loc[df_clean.inspection_date > pd.to_datetime('2020-12-31'), :].index
df_clean.drop(var,axis=0).reset_index

In [None]:
meta_clean = pd.DataFrame({'nrows_prev' : nrows_prev,
                        'ncols_prev' : ncols_prev,
                        'nrows_after' : nrows_after,
                        'ncols_after' : ncols_after,
                        'data_null_prev' : data_null_prev}, index = [0])
meta_clean

In [None]:
# Pickles con la base de datos limpia y su metadata
pickle.dump(df_clean,open("df_clean.pkl","wb"))
pickle.dump(meta_clean,open("meta_clean.pkl","wb"))

In [None]:
df_fe, nrows_ohe, ncols_ohe, best_score, time_exec, best_rf = feat_eng(df_clean)

In [None]:
meta_fe = pd.DataFrame({'nrows_ohe' : nrows_ohe,
                        'ncols_ohe' : ncols_ohe,
                        'best_score' : best_score,
                        'time_exec' : time_exec,
                        'best_rf' : best_rf}, index = [0])
meta_fe                        

In [None]:
# Pickles con Feature Engineering y su metadata
pickle.dump(df_fe,open("df_fe.pkl","wb"))
pickle.dump(meta_fe,open("meta_fe.pkl","wb"))

In [None]:
df_raw = pickle.load(open("df_raw.pkl","rb"))

In [None]:
df_clean = pickle.load(open("df_clean.pkl","rb"))

## Entrenamiento

In [3]:
import pickle
df_fe = pickle.load(open("df_fe.pkl","rb"))

**Transformación a OHE**

In [4]:
# Aplicamos OneHot Encoding
data_input_ohe = pd.get_dummies(df_fe)
etiqueta = data_input_ohe.label_results
data_input_ohe= data_input_ohe.drop('label_results', axis = 1)
variables_lista = list(data_input_ohe.columns)

In [5]:
# Hacemos TimeSeriesSplit para obtener las matrices de entrenamiento y prueba
tscv = TimeSeriesSplit(n_splits=3)
for train_index, test_index in tscv.split(data_input_ohe):
    X_train, X_test = data_input_ohe.iloc[train_index, :], data_input_ohe.iloc[test_index,:]
    y_train, y_test = etiqueta.iloc[train_index], etiqueta.iloc[test_index]

In [6]:
# Metadata de las matrices para el modelo
nrows_train = X_train.shape[0]
nrows_test = X_test.shape[0]
meta_train = pd.DataFrame({'nrows_train' : nrows_train,
                           'nrows_test' : nrows_test}, index = [0])
meta_train

Unnamed: 0,nrows_train,nrows_test
0,117340,39113


In [7]:
# Hacemos un solo DF para los Datasets de Entrenamiento y Prueba y con la etiqueta
X_train_1 = X_train.assign(Set = 'entrenamiento')
X_train_1 = X_train_1.assign(etiqueta = y_train)
X_test_1 = X_test.assign(Set = 'prueba')
X_test_1 = X_test_1.assign(etiqueta = y_test)
df_train_test = pd.concat([X_train_1, X_test_1], axis = 0)

df_train_test.shape

(156453, 528)

In [None]:
max(df_train_test.etiqueta) > 0

In [None]:
# Funciones para regresar el DataFrame con las etiquetas y los sets de entrenamiento y 
# y prueba a los cuatro DF Xtrain, Ytrain, Xtst y Ytest
X_train_2 = df_train_test[df_train_test.Set == 'entrenamiento']
y_train_2 = X_train_2.etiqueta
X_train_2 = X_train_2.iloc[:,0:df_train_test.shape[1]-2]


X_test_2 = df_train_test[df_train_test.Set == 'prueba']
y_test_2 = X_test_2.etiqueta
X_test_2 = X_test_2.iloc[:,0:df_train_test.shape[1]-2]

## Selección de Modelo

In [8]:
# Algoritmos a evaluar: DecisionTree y RandomForest
algorithms_dict = {'tree': 'tree_grid_search'}
algorithms = ['tree']
# Hiperparámetros a evaluar en cada algoritmo:
grid_search_dict = {'tree_grid_search': {'max_depth': [5,10,15], 
                                         'min_samples_leaf': [3,5,7]}}

# Configuraciones generales de cada algoritmo a evaluar:
estimators_dict = {'tree': DecisionTreeClassifier(random_state=1111)}
best_estimators = []

In [9]:
start_time = time.time()
for algorithm in algorithms:
    estimator = estimators_dict[algorithm]
    grid_search_to_look = algorithms_dict[algorithm]
    grid_params = grid_search_dict[grid_search_to_look]
    gs = GridSearchCV(estimator, grid_params, scoring='precision', cv=tscv, n_jobs=-1)
    start_time = time.time()
    #train
    gs.fit(X_train, y_train)
    #best estimator
    best_estimators.append(gs)
time_exec = time.time() - start_time

In [10]:
best_tree = best_estimators[0].best_estimator_
best_tree

DecisionTreeClassifier(max_depth=15, min_samples_leaf=5, random_state=1111)

In [11]:
r = pd.DataFrame(best_estimators[0].cv_results_)
r = r.sort_values("rank_test_score")
r.head(2)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
7,3.768035,1.826298,0.152156,0.031171,15,5,"{'max_depth': 15, 'min_samples_leaf': 5}",0.758883,0.773153,0.756926,0.762987,0.007233,1
8,3.313803,1.38662,0.141566,0.013434,15,7,"{'max_depth': 15, 'min_samples_leaf': 7}",0.759076,0.77287,0.756963,0.76297,0.007054,2


In [12]:
lista = r.params.astype(str)
test_mod = "|".join(lista)
lista_2 = r.mean_test_score.astype(str)
mean_scores = "|".join(lista_2)
lista_3 = r.rank_test_score.astype(str)
rank_model = "|".join(lista_2)

In [15]:
test_mod

"{'max_depth': 15, 'min_samples_leaf': 5}|{'max_depth': 15, 'min_samples_leaf': 7}|{'max_depth': 15, 'min_samples_leaf': 3}|{'max_depth': 10, 'min_samples_leaf': 7}|{'max_depth': 10, 'min_samples_leaf': 3}|{'max_depth': 10, 'min_samples_leaf': 5}|{'max_depth': 5, 'min_samples_leaf': 7}|{'max_depth': 5, 'min_samples_leaf': 5}|{'max_depth': 5, 'min_samples_leaf': 3}"

In [13]:
mean_scores

'0.762987478396355|0.7629698087544893|0.7627601253893648|0.7625208491614459|0.7625102065289221|0.7625073151753209|0.7624475294845636|0.7624157249220945|0.7623866831814622'

In [14]:
rank_model

'0.762987478396355|0.7629698087544893|0.7627601253893648|0.7625208491614459|0.7625102065289221|0.7625073151753209|0.7624475294845636|0.7624157249220945|0.7623866831814622'

In [None]:
model = best_tree

In [None]:
m = model.fit(X_train, y_train)
pickle.dump(m, open("best_model.pkl", 'wb'))

In [None]:
m = pickle.load(open("best_model.pkl","rb"))

In [None]:
type(m)

In [None]:
resultados = m.predict(X_test)

In [None]:
res = pd.DataFrame(resultados)
res=res.rename(columns = {0:'label'})
res.label.value_counts()

In [None]:
res_2 = pd.DataFrame(y_test)
res_2.label_results.value_counts()

In [None]:
1 < time_exec

In [None]:
models = pd.DataFrame(grid_search_dict)

In [None]:
models

In [None]:
dic = {'t_exec':time_exec,'best_tree':best_tree}
dic

In [None]:
x = pd.DataFrame(dic,index=[0])
x

In [None]:
x.t_exec

In [None]:
df_fe.shape

In [None]:
# Algoritmos a evaluar: DecisionTree y RandomForest
algorithms_dict = {'random_forest': 'rf_grid_search'}
algorithms = ['random_forest']
# Hiperparámetros a evaluar en cada algoritmo:
grid_search_dict = {'rf_grid_search': {'n_estimators': [500],  
                                      'max_depth': [5,10], 
                                      'min_samples_leaf': [10]}}

# Configuraciones generales de cada algoritmo a evaluar:
estimators_dict = {'random_forest': RandomForestClassifier(oob_score=True, random_state=2222)}
best_estimators = []

In [None]:
start_time = time.time()
for algorithm in algorithms:
    estimator = estimators_dict[algorithm]
    grid_search_to_look = algorithms_dict[algorithm]
    grid_params = grid_search_dict[grid_search_to_look]
    gs = GridSearchCV(estimator, grid_params, scoring='precision', cv=tscv, n_jobs=-1)
    start_time = time.time()
    #train
    gs.fit(X_train, y_train)
    #best estimator
    best_estimators.append(gs)
time_exec = time.time() - start_time

In [None]:
best_rf = best_estimators[0].best_estimator_
best_rf

In [None]:
model_rf = best_rf
m_rf = model_rf.fit(X_train, y_train)
pickle.dump(m, open("best_model_rf.pkl", 'wb'))

In [None]:
resultados_rf = best_rf.predict(X_test)

In [None]:
res_rf = pd.DataFrame(resultados_rf)
res_rf = res_rf.rename(columns = {0:'label'})
res_rf.label.value_counts()