# Feature engineering

In [1]:
import pandas as pd
import re

In [2]:
data_path = '/Users/cbautistap/Dropbox/MCD/Cursos/primavera-2021/dpa/data/food_inspections_clean.csv'

In [3]:
data = pd.read_csv(data_path, parse_dates=['inspection_date'])

In [4]:
data.head()

Unnamed: 0,inspection_id,facility_type,risk,zip,inspection_date,inspection_type,results,violations,latitude,longitude,num_violations
0,2472391,not_specified,risk_2_medium,60610,2021-01-07,license,not_pass,na,41.910736,-87.634551,0
1,2453551,restaurant,risk_1_high,60654,2020-10-20,license,pass,na,41.896585,-87.642996,0
2,2386633,restaurant,risk_1_high,60640,2020-08-28,canvas,not_pass,na,41.976301,-87.668276,0
3,2386595,not_specified,risk_1_high,60618,2020-08-27,complaint,not_pass,na,41.939256,-87.70227,0
4,2386523,restaurant,risk_2_medium,60636,2020-08-26,complaint,pass,na,41.778361,-87.664337,0


In [5]:
data.dtypes

inspection_id               int64
facility_type              object
risk                       object
zip                         int64
inspection_date    datetime64[ns]
inspection_type            object
results                    object
violations                 object
latitude                  float64
longitude                 float64
num_violations              int64
dtype: object

In [6]:
# al importar con pd.read_csv() el data frame lee la columna zip como int64, por lo tanto hay que convertirla a string de nuevo. ESTO SE PUEDE CORREGIR AL GUARDAR COMO PICKEL
data = data.astype({"zip": 'str'})

In [7]:
data.dtypes

inspection_id               int64
facility_type              object
risk                       object
zip                        object
inspection_date    datetime64[ns]
inspection_type            object
results                    object
violations                 object
latitude                  float64
longitude                 float64
num_violations              int64
dtype: object

**Creamos variable de día de la semana 'dow' y de mes 'month', a partir de la variable inspection date**

In [8]:
data['dow'] = data['inspection_date'].dt.day_name()
data['dow'] = data['dow'].str.lower()
data['month'] = data['inspection_date'].dt.month
data.head()

Unnamed: 0,inspection_id,facility_type,risk,zip,inspection_date,inspection_type,results,violations,latitude,longitude,num_violations,dow,month
0,2472391,not_specified,risk_2_medium,60610,2021-01-07,license,not_pass,na,41.910736,-87.634551,0,thursday,1
1,2453551,restaurant,risk_1_high,60654,2020-10-20,license,pass,na,41.896585,-87.642996,0,tuesday,10
2,2386633,restaurant,risk_1_high,60640,2020-08-28,canvas,not_pass,na,41.976301,-87.668276,0,friday,8
3,2386595,not_specified,risk_1_high,60618,2020-08-27,complaint,not_pass,na,41.939256,-87.70227,0,thursday,8
4,2386523,restaurant,risk_2_medium,60636,2020-08-26,complaint,pass,na,41.778361,-87.664337,0,wednesday,8


In [9]:
# Son categorìas, por lo tanto lo convertimos a string
data = data.astype({"month": 'str'})

In [10]:
data.dtypes

inspection_id               int64
facility_type              object
risk                       object
zip                        object
inspection_date    datetime64[ns]
inspection_type            object
results                    object
violations                 object
latitude                  float64
longitude                 float64
num_violations              int64
dow                        object
month                      object
dtype: object

**Ahora, para las variables 'facility_type' y 'inspection_type' agrupamos en categoría 'other' aquellos valores que representan menos del 1% de las observaciones**

In [11]:
freq_it = data['inspection_type'].value_counts(normalize=True)

In [12]:
freq_ft = data['facility_type'].value_counts(normalize=True)

In [13]:
mapping_it = data['inspection_type'].map(freq_it)
mapping_ft = data['facility_type'].map(freq_ft)

In [14]:
data['inspection_type'] = data['inspection_type'].mask(mapping_it < 0.0002, 'other')
data['facility_type'] = data['facility_type'].mask(mapping_ft < 0.0002, 'other')

In [15]:
data['inspection_type'].nunique()

14

In [16]:
data.groupby('inspection_type').count()

Unnamed: 0_level_0,inspection_id,facility_type,risk,zip,inspection_date,results,violations,latitude,longitude,num_violations,dow,month
inspection_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
canvas,137093,137093,137093,137093,137093,137093,137093,136623,136623,137093,137093,137093
complaint,35719,35719,35719,35719,35719,35719,35719,35670,35670,35719,35719,35719
consultation,674,674,674,674,674,674,674,673,673,674,674,674
license,38591,38591,38591,38591,38591,38591,38591,38410,38410,38591,38591,38591
no_entry,72,72,72,72,72,72,72,72,72,72,72,72
noninspection,205,205,205,205,205,205,205,204,204,205,205,205
other,51,51,51,51,51,51,51,51,51,51,51,51
out_of_business,307,307,307,307,307,307,307,306,306,307,307,307
package_liquor_1474,44,44,44,44,44,44,44,43,43,44,44,44
recent_inspection,366,366,366,366,366,366,366,366,366,366,366,366


In [17]:
data['facility_type'].nunique()

30

In [18]:
#ONE-LINER
#data['inspection_type'].mask(data['inspection_type'].map(data['inspection_type'].value_counts(normalize=True)) < 0.0002, 'other')
#data['facility_type'].mask(data['facility_type'].map(data['facility_type'].value_counts(normalize=True)) < 0.0002, 'other')

In [19]:
data.groupby('facility_type').count()

Unnamed: 0_level_0,inspection_id,risk,zip,inspection_date,inspection_type,results,violations,latitude,longitude,num_violations,dow,month
facility_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
assisted_living,1760,1760,1760,1760,1760,1760,1760,1723,1723,1760,1760,1760
bakery,3205,3205,3205,3205,3205,3205,3205,3205,3205,3205,3205,3205
banquet,346,346,346,346,346,346,346,346,346,346,346,346
bar,485,485,485,485,485,485,485,481,481,485,485,485
butcher,148,148,148,148,148,148,148,148,148,148,148,148
catering,1314,1314,1314,1314,1314,1314,1314,1314,1314,1314,1314,1314
childrens_service_facility,3853,3853,3853,3853,3853,3853,3853,3853,3853,3853,3853,3853
church,64,64,64,64,64,64,64,61,61,64,64,64
coffee,177,177,177,177,177,177,177,177,177,177,177,177
convenience_store,184,184,184,184,184,184,184,184,184,184,184,184


In [20]:
data['results'] = data['results'].apply(lambda x: '1' if x in ['pass'] else '0' )

In [21]:
data.head()

Unnamed: 0,inspection_id,facility_type,risk,zip,inspection_date,inspection_type,results,violations,latitude,longitude,num_violations,dow,month
0,2472391,not_specified,risk_2_medium,60610,2021-01-07,license,0,na,41.910736,-87.634551,0,thursday,1
1,2453551,restaurant,risk_1_high,60654,2020-10-20,license,1,na,41.896585,-87.642996,0,tuesday,10
2,2386633,restaurant,risk_1_high,60640,2020-08-28,canvas,0,na,41.976301,-87.668276,0,friday,8
3,2386595,not_specified,risk_1_high,60618,2020-08-27,complaint,0,na,41.939256,-87.70227,0,thursday,8
4,2386523,restaurant,risk_2_medium,60636,2020-08-26,complaint,1,na,41.778361,-87.664337,0,wednesday,8


## Función cool Quique

In [22]:
def extract_violation_num(s) -> int:
    """
    Extrae el número de violación en un string. 
    :param s: string
              texto del registro
    :return: int
    """
    pattern = '_?\d+_'  # guion_bajo opcional + numeros + guion_bajo
    result = re.findall(pattern, s)
    if not result:  # no hay violaciones
        return 0
    else:  # regresa número de violación
        return int(result[0].replace('_', ''))
    

def get_violations_incurred(s) -> list:
    """
    Extra el total de violaciones en un registro. 
    :param s: string
              Una celda de la columna 'violations'
    :return violation_nums: list
            lista con todas las infracciones, por ejemplo, [13,22,55]
    """
    all_violations = s.split('~')
    violation_nums = []
    for violation in all_violations:
        violation_nums.append(extract_violation_num(violation))
    
    return violation_nums
    
def add_extra_columns(df):
    """Añade columnas para one hot encoding."""
    for i in range(1, 80):
        column = 'violation_' + str(i)
        df[column] = 0
    
    return df

def add_one_hot_encoding(df):
    """ 
    Realiza todo el pipeline.
    
    :param df: pandas dataframe
    :return df: dataframe con el one hot encoding
    """
    
    # crear columnas
    df = add_extra_columns(df)
    df['lista_violaciones'] = df.violations.apply(get_violations_incurred)
    
    # luego llenamos df
    for i in range(len(df)):
        violaciones = df['lista_violaciones'][i]
        violaciones = [e for e in violaciones if e!= '0']
        for violacion in violaciones:
            column_index = violacion + 12
            df.iloc[i, column_index] = 1

    # tiramos columnas  temporales y las que están todas en ceros
    df.drop(columns=['lista_violaciones'], inplace = True)
    df = df.loc[(df.sum(axis=1) != 0), (df.sum(axis=0) != 0)]
    
    return df

In [23]:
data = add_one_hot_encoding(data)

Una vez que se tenga la matriz que se va a usar, se divide en X para features e y para etiqueta. Luego se hace el split en train y test y se hace el one hot encoding. Ver como hacer para que también se haga a X_test. Por último, se hace lo de feature importances y listo.

In [24]:
data = data.drop(['inspection_id','inspection_date','violations'], axis=1)

In [25]:
# OBS. para evitar data leaking es necesario primero hacer split en test y train antes del one hot encoding (y que otras actividades de feature engineering?)
# Además, separar en X e y.

In [26]:
data.dtypes

facility_type       object
risk                object
zip                 object
inspection_type     object
results             object
latitude           float64
longitude          float64
num_violations       int64
dow                 object
month               object
dtype: object

In [27]:
X= data.drop('results', axis = 1)
y = data.results

(X.shape, y.shape)

((215130, 9), (215130,))

In [28]:
X.dtypes

facility_type       object
risk                object
zip                 object
inspection_type     object
latitude           float64
longitude          float64
num_violations       int64
dow                 object
month               object
dtype: object

In [29]:
y

0         0
1         1
2         0
3         0
4         1
         ..
215125    0
215126    1
215127    0
215128    1
215129    1
Name: results, Length: 215130, dtype: object

**Nota. El checkpoint solo pide feature engineering, por lo que debería estar completo al dejar el data frame con las feature que nos interesan? o de una vez hacer la separación en features (X) y label (y) -como se hace típicamente para entrenar el modelo-? O también es necesario hacer el one hot encoding? Sin embargo, si se hace el one hot encoding (y utilizando sklearn) se debe hacer el train_test_split antes para evitar data leakage. O PODEMOS NO HACER SPLIT EN TRAIN, TEST Y HACER CV. A continuación se hace la separación**

In [30]:
## Separar en train, test 

import time 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## verificando los tamaños de nuestros 4 sets
print("entrenamiento: X: {}, y: {}".format(X_train.shape, y_train.shape))
print("prueba: X: {}, y: {}".format(X_test.shape, y_test.shape))

entrenamiento: X: (150591, 9), y: (150591,)
prueba: X: (64539, 9), y: (64539,)


## One hot encoding

Con pandas get_dummies()

In [31]:
# Generamos dummies para las variables categóricas
X_train_dum = pd.get_dummies(X_train)

Este es el data frame con las variables a utilizar

In [32]:
X_train_dum

Unnamed: 0,latitude,longitude,num_violations,facility_type_assisted_living,facility_type_bakery,facility_type_banquet,facility_type_bar,facility_type_butcher,facility_type_catering,facility_type_childrens_service_facility,...,month_11,month_12,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9
2432,41.809219,-87.620035,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
21380,41.878228,-87.633805,10,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
122972,41.987161,-87.660031,7,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
40170,41.988575,-87.812978,3,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
105470,41.750727,-87.741269,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132449,41.729337,-87.647870,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
31868,41.820350,-87.616514,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
69660,41.910044,-87.721888,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
192559,41.967682,-87.739305,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [33]:
features_list = list(X_train_dum.columns)

**Con OneHotEncoder y Pipeline de sklearn**

In [34]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

numerical_ix = X.select_dtypes(include=['int64']).columns
categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
t = [('cat', OneHotEncoder(), categorical_ix), ('num', MinMaxScaler(), numerical_ix)]
col_transform = ColumnTransformer(transformers=t, remainder="drop", n_jobs=-1, verbose=True)

In [35]:
# También se puede usar

#X_train = col_transform.fit_transform(X_train)

In [36]:
#col_transform.fit(X_train)

In [37]:
#X_train = col_transform.transform(X_train)

In [38]:
# Si coincide el número de columnas con get_dummies()
#X_train

In [39]:
#col_transform.categories_

In [40]:
import time 

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

**Definimos modelo (Random Forest Classifier) y creamos Pipeline**

In [41]:
model = RandomForestClassifier(n_estimators=1000, criterion='gini',min_samples_split=7, oob_score=True, random_state=1234, n_jobs=-1)
# define the data preparation and modeling pipeline
pipeline = Pipeline(steps=[('prep',col_transform), ('m', model)])
# define the model cross-validation configuration
#cv = KFold(n_splits=10, shuffle=True, random_state=1)
# evaluate the pipeline using cross validation and calculate MAE
scores = cross_val_score(pipeline, X_train, y_train, scoring='precision', cv=5, n_jobs=-1)
print("Tiempo en ejecutar: ", time.time() - start_time)

NameError: name 'start_time' is not defined

## Hyperparamater tunning
**Con GridSearchCV**

In [None]:


# ocuparemos un RF
classifier = RandomForestClassifier(oob_score=True, random_state=1234, n_jobs=-1)

# definicion de los hiperparametros que queremos probar
hyper_param_grid = {'n_estimators': [300, 500, 1000], 
                    #'max_depth': [1, 5, 10],
                    'criterion': ['gini'],
                    'min_samples_split': [3,7,11]}

# ocupemos grid search!
gs = GridSearchCV(classifier, 
                           hyper_param_grid, 
                           scoring = 'precision',
                           cv = 5,
                           return_train_score=True,
                           n_jobs = -1)
start_time = time.time()
gs.fit(X_train, y_train)
print("Tiempo en ejecutar: ", time.time() - start_time)

In [None]:
# n_estimators corresponde al número de árboles que queremos crear
#grid = {'n_estimators': [300, 500, 1000], 'min_samples_leaf': [3,7,11], 
#       'criterion':['gini','entropy']}

#rf = RandomForestClassifier(oob_score=True, n_jobs=-1)
#gs_rf = GridSearchCV(rf, grid, cv=2, scoring='precision', return_train_score=True, n_jobs=-1)

In [None]:
from numpy import mean
from numpy import std
from numpy import absolute
from pandas import read_csv
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
 
# load dataset
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/abalone.csv'
dataframe = read_csv(url, header=None)
# split into inputs and outputs
last_ix = len(dataframe.columns) - 1
X, y = dataframe.drop(last_ix, axis=1), dataframe[last_ix]
print(X.shape, y.shape)
# determine categorical and numerical features
numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
categorical_ix = X.select_dtypes(include=['object', 'bool']).columns
# define the data preparation for the columns
t = [('cat', OneHotEncoder(), categorical_ix), ('num', MinMaxScaler(), numerical_ix)]
col_transform = ColumnTransformer(transformers=t)
# define the model
model = SVR(kernel='rbf',gamma='scale',C=100)
# define the data preparation and modeling pipeline
pipeline = Pipeline(steps=[('prep',col_transform), ('m', model)])
# define the model cross-validation configuration
cv = KFold(n_splits=10, shuffle=True, random_state=1)
# evaluate the pipeline using cross validation and calculate MAE
scores = cross_val_score(pipeline, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# convert MAE scores to positive values
scores = absolute(scores)
# summarize the model performance
print('MAE: %.3f (%.3f)' % (mean(scores), std(scores)))