Author: Judit Lozano Gondolbeu

### Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import warnings
import pickle
import category_encoders as ce
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from numpy import mean
import lightgbm as lgbm
from sklearn.metrics import f1_score

warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5000)

In [2]:
#Leemos nuestra tabla de nuevo
df_data = pd.read_csv('../data/df_data.csv')
df_data.head()

Unnamed: 0,C_MNTH,C_WDAY,C_HOUR,C_RCFG,C_WTHR,C_RSUR,C_RALN,C_TRAF,V_TYPE,P_SAFE,V_YEAR,C_YEAR,P_AGE,P_SEX,TARGET
0,1,1,20,50,1,5,3,3,6,50,1990,1999,41,0,0
1,1,1,20,50,1,5,3,3,1,50,1987,1999,19,0,0
2,1,1,8,50,5,3,6,18,1,50,1986,1999,46,0,0
3,1,1,17,56,1,2,1,1,1,50,1984,1999,28,0,0
4,1,1,17,56,1,2,1,1,1,50,1991,1999,21,0,0


### Codificación de las variables
Applicaremos la siguiente codificación según grupo de variables:

In [3]:
#Grupos por encoding a realizar
numeric_features = ["V_YEAR", 'C_YEAR', "P_AGE"] 
cat_features = list(set(df_data.drop('TARGET', axis=1).columns)-set(numeric_features))

In [4]:
df_data[cat_features] = df_data[cat_features].astype(object)

In [5]:
df_data.dtypes

C_MNTH    object
C_WDAY    object
C_HOUR    object
C_RCFG    object
C_WTHR    object
C_RSUR    object
C_RALN    object
C_TRAF    object
V_TYPE    object
P_SAFE    object
V_YEAR     int64
C_YEAR     int64
P_AGE      int64
P_SEX     object
TARGET     int64
dtype: object

In [6]:
#Definimos train y target
target = df_data[['TARGET']]
train = df_data.drop('TARGET', axis = 1)

In [7]:
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = ce.CatBoostEncoder()

In [8]:
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),('cat', categorical_transformer, cat_features)])

Parametros obtenidos en nuestro estudio de car-accidents:

In [9]:
parametros={'class_weight': {0: 0.2, 1: 0.8}, 'learning_rate': 0.01, 'n_estimators': 1000, 'num_leaves': 50, 'random_state':100}

In [10]:
modelo = Pipeline(steps=[('preprocessor', preprocessor),('model', lgbm.LGBMClassifier(**parametros))])

### Dividimos el dataset en train y test
Dividimos el dataset en train y test manteniendo la proporción de la variable objetivo en las dos partes (separación estratificada)

In [11]:
X = train
y = target

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, shuffle=True, test_size=0.20, random_state=0)

In [13]:
#comprobación de la estratificación
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))

TARGET
0         0.985431
1         0.014569
dtype: float64
TARGET
0         0.985432
1         0.014568
dtype: float64


### Problema de desbalanceo de datos

Para solventar este problema probaremos con la técnica SMOTE.

### SMOTE
SMOTE realiza un oversample de la clase minoritaria segun la configuracion que le demos (Ej: strategy, k-values). Por defecto, el K-value es 5 pero en este caso usaremos un k-value = 6 como ya discutimos en el estudio anterior.

In [14]:
over = SMOTE(sampling_strategy=0.1, k_neighbors=6)
under = RandomUnderSampler(sampling_strategy=0.2)
steps = [('over', over), ('under', under)]
oversample = Pipeline(steps=steps)

# transformación del dataset
X_train_sm, y_train_sm = oversample.fit_resample(X_train, y_train)

counter = y_train.value_counts()
print("Antes de SMOTE ", counter)

counter2 = y_train_sm.value_counts()
print("Despues de SMOTE ", counter2)

Antes de SMOTE  TARGET
0         3095110
1           45758
dtype: int64
Despues de SMOTE  TARGET
0         1547555
1          309511
dtype: int64


In [15]:
# train the model on train set
modelo.fit(X_train_sm, y_train_sm)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['V_YEAR', 'C_YEAR',
                                                   'P_AGE']),
                                                 ('cat', CatBoostEncoder(),
                                                  ['V_TYPE', 'C_RALN', 'C_RSUR',
                                                   'C_MNTH', 'C_HOUR', 'P_SEX',
                                                   'C_WTHR', 'C_RCFG', 'C_WDAY',
                                                   'P_SAFE', 'C_TRAF'])])),
                ('model',
                 LGBMClassifier(class_weight={0: 0.2, 1: 0.8},
                                learning_rate=0.01, n_estimators=1000,
                                num_leaves=50, random_stat

### Guardamos el modelo entrenado para su posterior utilización

In [16]:
#me guardo el modelo entrenado para su posterior utilización
with open('../models/model_flask.pickle', 'wb') as f:
    pickle.dump(modelo, f)

In [17]:
#para traer de nuevo las variables:
#with open('../data/train_smote.pickle', 'rb') as f:
    #X_train_sm, y_train_sm, X_test, y_test = pickle.load(f)