Author: Judit Lozano Gondolbeu

### Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import warnings
import pickle
import category_encoders as ce
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from numpy import mean

warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5000)

In [3]:
#Leemos nuestra tabla de nuevo
df_data = pd.read_csv('../data/df_data.csv')
df_data.head()

Unnamed: 0,C_MNTH,C_WDAY,C_HOUR,C_RCFG,C_WTHR,C_RSUR,C_RALN,C_TRAF,V_TYPE,P_SAFE,V_YEAR,C_YEAR,P_AGE,P_SEX,PP_CAR,TARGET
0,1,1,20,50,1,5,3,3,6,50,1990,1999,41,0,1,0
1,1,1,20,50,1,5,3,3,1,50,1987,1999,19,0,2,0
2,1,1,8,50,5,3,6,18,1,50,1986,1999,46,0,1,0
3,1,1,17,56,1,2,1,1,1,50,1984,1999,28,0,1,0
4,1,1,17,56,1,2,1,1,1,50,1991,1999,21,0,2,0


### Codificación de las variables
Applicaremos la siguiente codificación según grupo de variables:

In [34]:
#Grupos por encoding a realizar
numeric_features = ["V_YEAR", 'C_YEAR', "P_AGE"] 
cat_features = list(set(df_data.drop('TARGET', axis=1).columns)-set(numeric_features))

In [5]:
df_data[cat_features] = df_data[cat_features].astype(object)

In [6]:
df_data.dtypes

C_MNTH    object
C_WDAY    object
C_HOUR    object
C_RCFG    object
C_WTHR    object
C_RSUR    object
C_RALN    object
C_TRAF    object
V_TYPE    object
P_SAFE    object
V_YEAR     int64
C_YEAR     int64
P_AGE      int64
P_SEX     object
PP_CAR    object
TARGET     int64
dtype: object

In [7]:
#Definimos train y target
target = df_data[['TARGET']]
train = df_data.drop('TARGET', axis = 1)

In [8]:
# Catboost encoding
catboost = ce.CatBoostEncoder()
catboost.fit(train, target)
df_coded = catboost.transform(train)
df_coded.head()

Unnamed: 0,C_MNTH,C_WDAY,C_HOUR,C_RCFG,C_WTHR,C_RSUR,C_RALN,C_TRAF,V_TYPE,P_SAFE,V_YEAR,C_YEAR,P_AGE,P_SEX,PP_CAR
0,0.013036,0.013388,0.018474,0.01005,0.013724,0.016415,0.032684,0.011866,0.020918,0.015462,1990,1999,41,0.018556,0.01941
1,0.013036,0.013388,0.018474,0.01005,0.013724,0.016415,0.032684,0.011866,0.012319,0.015462,1987,1999,19,0.018556,0.015573
2,0.013036,0.013388,0.00981,0.01005,0.021564,0.016234,0.030981,0.020896,0.012319,0.015462,1986,1999,46,0.018556,0.01941
3,0.013036,0.013388,0.010621,0.019557,0.013724,0.01263,0.012028,0.004503,0.012319,0.015462,1984,1999,28,0.018556,0.01941
4,0.013036,0.013388,0.010621,0.019557,0.013724,0.01263,0.012028,0.004503,0.012319,0.015462,1991,1999,21,0.018556,0.015573


### Escalado de las variables
Aunque en algunos algoritmos no sea necesario el escalado de variables, pasaremos a realizarlo en este punto para trabajar con los datos en el mismo formato.

In [9]:
scaler = StandardScaler()
model_scaled = scaler.fit(df_coded)
train_scaled = pd.DataFrame(scaler.transform(df_coded), columns=df_coded.columns, index=df_coded.index)
df_scaled = pd.concat([train_scaled, target], axis=1).reset_index(drop=True)
df_scaled.head()

Unnamed: 0,C_MNTH,C_WDAY,C_HOUR,C_RCFG,C_WTHR,C_RSUR,C_RALN,C_TRAF,V_TYPE,P_SAFE,V_YEAR,C_YEAR,P_AGE,P_SEX,PP_CAR,TARGET
0,-1.41349,-0.504111,0.639472,-0.605457,-0.246324,0.98728,2.518029,-0.363466,0.709934,0.074408,-0.584604,-1.533381,-0.150771,0.81667,0.839152,0
1,-1.41349,-0.504111,0.639472,-0.605457,-0.246324,0.98728,2.518029,-0.363466,-0.251504,0.074408,-0.692229,-1.533381,-0.896072,0.81667,0.174065,0
2,-1.41349,-0.504111,-0.779007,-0.605457,2.041762,0.890469,2.281274,0.851024,-0.251504,0.074408,-0.728104,-1.533381,0.018616,0.81667,0.839152,0
3,-1.41349,-0.504111,-0.646253,0.668421,-0.246324,-1.036199,-0.353054,-1.353824,-0.251504,0.074408,-0.799854,-1.533381,-0.591176,0.81667,0.839152,0
4,-1.41349,-0.504111,-0.646253,0.668421,-0.246324,-1.036199,-0.353054,-1.353824,-0.251504,0.074408,-0.548729,-1.533381,-0.828318,0.81667,0.174065,0


In [35]:
#Nos guardamos el dataset codificado y escalado
df_scaled.to_csv('../data/df_coded_scaled.csv')

### Dividimos el dataset en train y test
Dividimos el dataset en train y test manteniendo la proporción de la variable objetivo en las dos partes (separación estratificada)

In [39]:
X = df_scaled.drop('TARGET',axis=1)
y = df_scaled['TARGET']

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, shuffle=True, test_size=0.20, random_state=0)

In [41]:
#comprobación de la estratificación
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))

0    0.985431
1    0.014569
Name: TARGET, dtype: float64
0    0.985432
1    0.014568
Name: TARGET, dtype: float64


### Problema de desbalanceo de datos

Para solventar este problema probaremos con la técnica SMOTE.

### SMOTE
SMOTE realiza un oversample de la clase minoritaria segun la configuracion que le demos (Ej: strategy, k-values). Por defecto, el K-value es 5.

A continuación, procederemos a realizar una busqueda de K-values con un grid entre un rago de valores de entre 4 y 9 para intentar optimizar el sintetizado de la clase minoritaria con vistas a generar un mejor resultado para el modelo que utilicemos. También combinaremos la técnica del SMOTE con un ligero undersampling para lograr unos resultados aún más óptimos.

### Resultados del modelo LogisticRegression() sin aplicar SMOTE
Compararemos la metrica ROC antes y después del oversampling para confirmar que la técnica es apropiada (cuando ésta supere el baseline marcado por la curva ROC sin SMOTE de un modelo LogisticRegression por ejemplo que nos da inicialmente 0.50)

In [44]:
# LGGMC object
lgbc = LogisticRegression()
 
# train the model on train set
lgbc.fit(X_train, y_train)
 
predictions = lgbc.predict(X_test)
score = roc_auc_score(y_test, predictions)
# print classification report
print('Antes de SMOTE - Mean ROC AUC: %.3f' % (score))

Antes de SMOTE - Mean ROC AUC: 0.500


In [19]:
# grid search k value for SMOTE oversampling for imbalanced classification

# values to evaluate
k_values = [4, 5, 6, 7, 8 ,9]
for k in k_values:
# define pipeline
    model = LogisticRegression()
    over = SMOTE(sampling_strategy=0.1, k_neighbors=k)
    under = RandomUnderSampler(sampling_strategy=0.2)
    steps = [('over', over), ('under', under), ('model', model)]
    pipeline = Pipeline(steps=steps)
    # evaluate pipeline
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
    score = mean(scores)
    print('> k=%d, Mean ROC AUC: %.3f' % (k, score))

> k=4, Mean ROC AUC: 0.808
> k=5, Mean ROC AUC: 0.809
> k=6, Mean ROC AUC: 0.809
> k=7, Mean ROC AUC: 0.809
> k=8, Mean ROC AUC: 0.809
> k=9, Mean ROC AUC: 0.809


Optaremos por un k=6 para la realización del SMOTE y nos guardaremos las variables para su utilización en otros cuadernos:

In [30]:
over = SMOTE(sampling_strategy=0.1, k_neighbors=6)
under = RandomUnderSampler(sampling_strategy=0.2)
steps = [('over', over), ('under', under)]
oversample = Pipeline(steps=steps)

# transformación del dataset
X_train_sm, y_train_sm = oversample.fit_resample(X_train, y_train)

# summarize the new class distribution
counter = Counter(y_train)
print("Antes de SMOTE ", counter)

counter2 = Counter(y_train_sm)
print("Después de SMOTE ", counter2)

Antes de SMOTE  Counter({0: 3095110, 1: 45758})
Después de SMOTE  Counter({0: 1547555, 1: 309511})


In [45]:
# Modelo - Después de aplicar SMOTE
lgbc2 = LogisticRegression()
 
# train the model on train set
lgbc2.fit(X_train_sm, y_train_sm)
 
predictions2 = lgbc2.predict(X_test)
score2 = roc_auc_score(y_test, predictions2)
# print classification report
print('Después de SMOTE - Mean ROC AUC: %.3f' % (score2))

Después de SMOTE - Mean ROC AUC: 0.580


### Comentarios

Como vemos aplicando la técnica SMOTE en combinación con un ligero undersampling hemos conseguido aumentar el resultado de la metrica ROC de 0.50 a 0.58 usando como ejemplo un modelo LogisticRegression.

In [31]:
#me guardo el train y test para su posterior utilización
with open('../data/train_smote.pickle', 'wb') as f:
    pickle.dump([X_train_sm, y_train_sm, X_test, y_test], f)

In [36]:
#para traer de nuevo las variables:
#with open('../data/train_smote.pickle', 'rb') as f:
    #X_train_sm, y_train_sm, X_test, y_test = pickle.load(f)