### Libraries

In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
import scipy.stats as stats
from sklearn.model_selection import train_test_split, GridSearchCV
from dython.nominal import associations
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import ADASYN
from matplotlib import pyplot
from numpy import where
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
import warnings
import pickle
from sklearn.dummy import DummyClassifier
from collections import Counter
import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression


from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5000)

In [10]:
# nos guardamos la tabla a CSV
#df_data.to_csv('datos/df_data.csv', index=False)

In [67]:
#Leemos nuestra tabla de nuevo
df_data = pd.read_csv('datos/df_data.csv')
df_data.head()

Unnamed: 0,C_MNTH,C_WDAY,C_HOUR,C_RCFG,C_WTHR,C_RSUR,C_RALN,C_TRAF,V_TYPE,P_SAFE,V_YEAR,C_YEAR,P_AGE,P_SEX,TARGET
0,1,1,20,50,1,5,3,3,6,50,1990,1999,41,0,0
1,1,1,20,50,1,5,3,3,1,50,1987,1999,19,0,0
2,1,1,8,50,5,3,6,18,1,50,1986,1999,46,0,0
3,1,1,17,56,1,2,1,1,1,50,1984,1999,28,0,0
4,1,1,17,56,1,2,1,1,1,50,1991,1999,21,0,0


In [68]:
df_data.dtypes

C_MNTH    int64
C_WDAY    int64
C_HOUR    int64
C_RCFG    int64
C_WTHR    int64
C_RSUR    int64
C_RALN    int64
C_TRAF    int64
V_TYPE    int64
P_SAFE    int64
V_YEAR    int64
C_YEAR    int64
P_AGE     int64
P_SEX     int64
TARGET    int64
dtype: object

### Codificación de las variables
Applicaremos la siguiente codificación según grupo de variables:

In [69]:
#Grupos select_dtypesn tipo de encoding a realizar
numeric_features = ["V_YEAR", 'C_YEAR', "P_AGE"] 
cat_features = list(set(df_data.drop('TARGET', axis=1).columns)-set(numeric_features))

In [70]:
df_data[cat_features] = df_data[cat_features].astype(object)

In [71]:
df_data.dtypes

C_MNTH    object
C_WDAY    object
C_HOUR    object
C_RCFG    object
C_WTHR    object
C_RSUR    object
C_RALN    object
C_TRAF    object
V_TYPE    object
P_SAFE    object
V_YEAR     int64
C_YEAR     int64
P_AGE      int64
P_SEX     object
TARGET     int64
dtype: object

In [72]:
#Definimos train y target
target = df_data[['TARGET']]
train = df_data.drop('TARGET', axis = 1)

In [73]:
# Catboost encoding
catboost = ce.CatBoostEncoder()
catboost.fit(train, target)
df_coded = catboost.transform(train)
df_coded.head()

Unnamed: 0,C_MNTH,C_WDAY,C_HOUR,C_RCFG,C_WTHR,C_RSUR,C_RALN,C_TRAF,V_TYPE,P_SAFE,V_YEAR,C_YEAR,P_AGE,P_SEX
0,0.013036,0.013388,0.018474,0.01005,0.013724,0.016415,0.032684,0.011866,0.020918,0.015462,1990,1999,41,0.018556
1,0.013036,0.013388,0.018474,0.01005,0.013724,0.016415,0.032684,0.011866,0.012319,0.015462,1987,1999,19,0.018556
2,0.013036,0.013388,0.00981,0.01005,0.021564,0.016234,0.030981,0.020896,0.012319,0.015462,1986,1999,46,0.018556
3,0.013036,0.013388,0.010621,0.019557,0.013724,0.01263,0.012028,0.004503,0.012319,0.015462,1984,1999,28,0.018556
4,0.013036,0.013388,0.010621,0.019557,0.013724,0.01263,0.012028,0.004503,0.012319,0.015462,1991,1999,21,0.018556


### Escalado de las variables
Aunque en algunos salgoritmos no sea necesario el escalado de variables, pasaremos a realizarlo en este punto para trabajar con los datos en formato unificado.

In [74]:
scaler = StandardScaler()
model_scaled = scaler.fit(df_coded)
train_scaled = pd.DataFrame(scaler.transform(df_coded), columns=df_coded.columns, index=df_coded.index)
df_scaled = pd.concat([train_scaled, target], axis=1).reset_index(drop=True)
df_scaled.head()

Unnamed: 0,C_MNTH,C_WDAY,C_HOUR,C_RCFG,C_WTHR,C_RSUR,C_RALN,C_TRAF,V_TYPE,P_SAFE,V_YEAR,C_YEAR,P_AGE,P_SEX,TARGET
0,-1.41349,-0.504111,0.639472,-0.605457,-0.246324,0.98728,2.518029,-0.363466,0.709934,0.074408,-0.584604,-1.533381,-0.150771,0.81667,0
1,-1.41349,-0.504111,0.639472,-0.605457,-0.246324,0.98728,2.518029,-0.363466,-0.251504,0.074408,-0.692229,-1.533381,-0.896072,0.81667,0
2,-1.41349,-0.504111,-0.779007,-0.605457,2.041762,0.890469,2.281274,0.851024,-0.251504,0.074408,-0.728104,-1.533381,0.018616,0.81667,0
3,-1.41349,-0.504111,-0.646253,0.668421,-0.246324,-1.036199,-0.353054,-1.353824,-0.251504,0.074408,-0.799854,-1.533381,-0.591176,0.81667,0
4,-1.41349,-0.504111,-0.646253,0.668421,-0.246324,-1.036199,-0.353054,-1.353824,-0.251504,0.074408,-0.548729,-1.533381,-0.828318,0.81667,0


In [76]:
df_scaled.to_csv('datos/df_coded_scaled.csv', index=False)

### Dividimos el dataset en train y test
Dividimos el dataset en train y test manteniendo la proporción de la variable objetivo en las dos partes (separación estratificada)

In [77]:
X = df_scaled.drop('TARGET',axis=1)
y = df_scaled['TARGET']

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, shuffle=True, test_size=0.20, random_state=0)

In [79]:
#comprobación de la estratificación
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))

0    0.985431
1    0.014569
Name: TARGET, dtype: float64
0    0.985432
1    0.014568
Name: TARGET, dtype: float64


### Problema de desbalanceo de datos

Para solventar este problema probaremos con la técnica SMOTE y con la técnica SMOTE Tomeklinks y comprobaremos cuál nos dá mejores resultados.

### SMOTE
SMOTE realiza un oversample de la clase minoritaria segun la configuracion que le demos (Ej: strategy, k-values). Por defecto, el K-value es 5. Sin embargo, procederemos a realizar una busqueda de K-values con un grid entre un rago de valores de entre 1 y 7 para intentar optimizar el sintetizado de la clase minoritaria con vistas a generar un resultado más apropiado para el modelo que utilicemos.

Por ejemplo, para un LogisticRegression, procederemos a analizar cual serían el K_value en SMOTE que nos proporcionaría el mejor resultado.

### Resultados del modelo LogisticRegression sin aplicar SMOTE

In [88]:
from sklearn.metrics import confusion_matrix, classification_report

# logistic regression object
lr = LogisticRegression()
 
# train the model on train set
lr.fit(X_train, y_train.ravel())
 
predictions = lr.predict(X_test)
 
# print classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99    773779
           1       0.20      0.00      0.00     11439

    accuracy                           0.99    785218
   macro avg       0.59      0.50      0.50    785218
weighted avg       0.97      0.99      0.98    785218



In [89]:
def print_binary_evaluation(y_true, y_pred):
    results_dict = {'accuracy': accuracy_score(y_true, y_pred),
                    'recall': recall_score(y_true, y_pred),
                    'precision': precision_score(y_true, y_pred),
                    'f1_score': f1_score(y_true, y_pred)}
    return results_dict

In [90]:
print(lr)
print("model score: %.3f" % lr.score(X_test, y_test))
print_binary_evaluation(y_test, predictions)

LogisticRegression()
model score: 0.985


{'accuracy': 0.9853989592699097,
 'recall': 0.0007867820613690008,
 'precision': 0.20454545454545456,
 'f1_score': 0.0015675346163894451}

In [91]:
print("Before OverSampling, counts of label '1': {}".format(sum(y_train == 1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train == 0)))
 
# import SMOTE module from imblearn library
# pip install imblearn (if you don't have imblearn in your system)
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 2)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel())
 
print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))
 
print("After OverSampling, counts of label '1': {}".format(sum(y_train_res == 1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res == 0)))

Before OverSampling, counts of label '1': 45758
Before OverSampling, counts of label '0': 3095110 

After OverSampling, the shape of train_X: (6190220, 14)
After OverSampling, the shape of train_y: (6190220,) 

After OverSampling, counts of label '1': 3095110
After OverSampling, counts of label '0': 3095110


In [92]:
lr1 = LogisticRegression()
lr1.fit(X_train_res, y_train_res.ravel())
predictions = lr1.predict(X_test)
 
# print classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.99      0.72      0.83    773779
           1       0.03      0.67      0.06     11439

    accuracy                           0.72    785218
   macro avg       0.51      0.69      0.45    785218
weighted avg       0.98      0.72      0.82    785218



In [93]:
print(lr)
print("model score: %.3f" % lr.score(X_test, y_test))
print_binary_evaluation(y_test, predictions)

LogisticRegression()
model score: 0.985


{'accuracy': 0.7191773494749228,
 'recall': 0.6692018533088556,
 'precision': 0.034116535489219085,
 'f1_score': 0.06492322436465564}

In [100]:
# grid search k value for SMOTE oversampling for imbalanced classification
import lightgbm as lgb

from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# values to evaluate
k_values = [7]
for k in k_values:
# define pipeline
model = lgb.LGBMClassifier()
over = SMOTE(sampling_strategy=0.1, k_neighbors=k)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('over', over), ('under', under), ('model', model)]
pipeline = Pipeline(steps=steps)
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
score = mean(scores)
print('> k=%d, Mean ROC AUC: %.3f' % (k, score))

> k=1, Mean ROC AUC: 0.780
> k=2, Mean ROC AUC: 0.781
> k=3, Mean ROC AUC: 0.781
> k=4, Mean ROC AUC: 0.782
> k=5, Mean ROC AUC: 0.782
> k=6, Mean ROC AUC: 0.783
> k=7, Mean ROC AUC: 0.783


In [107]:
counter = Counter(y_train)
print(counter)

Counter({0: 3095110, 1: 45758})


In [108]:
# borderline-SMOTE for imbalanced dataset
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# values to evaluate
k_values = [7]



for k in k_values:
# define pipeline
    model = LogisticRegression()
    over = SMOTE(sampling_strategy=0.1, k_neighbors=k)
    under = RandomUnderSampler(sampling_strategy=0.5)
    steps = [('over', over), ('under', under), ('model', model)]
    pipeline = Pipeline(steps=steps)
    # evaluate pipeline
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
    score = mean(scores)
    print('> k=%d, Mean ROC AUC: %.3f' % (k, score))


> k=7, Mean ROC AUC: 0.763


In [160]:
# borderline-SMOTE for imbalanced dataset
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# values to evaluate
k_values = [7]



for k in k_values:
# define pipeline
    model = LogisticRegression()
    over = SMOTE(sampling_strategy=0.1, k_neighbors=k)
    under = RandomUnderSampler(sampling_strategy=0.1)
    steps = [('over', over), ('under', under), ('model', model)]
    pipeline = Pipeline(steps=steps)
    # evaluate pipeline
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
    score = mean(scores)
    print('> k=%d, Mean ROC AUC: %.3f' % (k, score))


> k=7, Mean ROC AUC: 0.763


In [131]:
#Probando el nuevo dataset con un logistic Regression


In [132]:
#Probando el nuevo dataset con un logistic Regression
lr2 = LogisticRegression()
lr2.fit(X_train_sm, y_train_sm.ravel())
predictions = lr2.predict(X_test)
 
# print classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.99      0.91      0.95    773779
           1       0.06      0.39      0.10     11439

    accuracy                           0.90    785218
   macro avg       0.52      0.65      0.52    785218
weighted avg       0.98      0.90      0.93    785218



In [133]:
print(lr2)
print("model score: %.3f" % lr2.score(X_test, y_test))
print_binary_evaluation(y_test, predictions)

LogisticRegression()
model score: 0.898


{'accuracy': 0.8977048412033346,
 'recall': 0.3916426261036804,
 'precision': 0.057550260132314214,
 'f1_score': 0.10035392679539447}

In [126]:
#Una vez que ya tienes el K_value
# summarize class distribution
counter = Counter(y_train)
print(counter)
# define pipeline
over = SMOTE(sampling_strategy=0.3, k_neighbors=7)
under = RandomUnderSampler(sampling_strategy=0.9)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset
X_train_sm, y_train_sm = pipeline.fit_resample(X_train, y_train)
# summarize the new class distribution
counter = Counter(y_train_sm)
print(counter)

Counter({0: 3095110, 1: 45758})
Counter({0: 1031703, 1: 928533})


In [127]:
#Probando el nuevo dataset con un logistic Regression


In [128]:
#Probando el nuevo dataset con un logistic Regression
lr2 = LogisticRegression()
lr2.fit(X_train_sm, y_train_sm.ravel())
predictions = lr2.predict(X_test)
 
# print classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.99      0.76      0.86    773779
           1       0.04      0.63      0.07     11439

    accuracy                           0.76    785218
   macro avg       0.52      0.69      0.47    785218
weighted avg       0.98      0.76      0.85    785218



In [129]:
print(lr2)
print("model score: %.3f" % lr2.score(X_test, y_test))
print_binary_evaluation(y_test, predictions)

LogisticRegression()
model score: 0.760


{'accuracy': 0.7601685137121156,
 'recall': 0.6256665792464376,
 'precision': 0.0374329872643113,
 'f1_score': 0.07063967547400732}

In [134]:


#Una vez que ya tienes el K_value
# summarize class distribution
counter = Counter(y_train)
print(counter)
# define pipeline
over = SMOTE(sampling_strategy=0.5, k_neighbors=7)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset
X_train_sm, y_train_sm = pipeline.fit_resample(X_train, y_train)
# summarize the new class distribution
counter = Counter(y_train_sm)
print(counter)

Counter({0: 3095110, 1: 45758})
Counter({0: 3095110, 1: 1547555})


In [135]:
#Probando el nuevo dataset con un logistic Regression
lr2 = LogisticRegression()
lr2.fit(X_train_sm, y_train_sm.ravel())
predictions = lr2.predict(X_test)
 
# print classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.99      0.90      0.95    773779
           1       0.06      0.39      0.10     11439

    accuracy                           0.90    785218
   macro avg       0.52      0.65      0.52    785218
weighted avg       0.98      0.90      0.93    785218



In [136]:
print(lr2)
print("model score: %.3f" % lr2.score(X_test, y_test))
print_binary_evaluation(y_test, predictions)

LogisticRegression()
model score: 0.897


{'accuracy': 0.8974310318917804,
 'recall': 0.3926916688521724,
 'precision': 0.05752779058449874,
 'f1_score': 0.10035409894663941}

In [137]:


#Una vez que ya tienes el K_value
# summarize class distribution
counter = Counter(y_train)
print(counter)
# define pipeline
over = SMOTE(sampling_strategy=0.1, k_neighbors=7)
under = RandomUnderSampler(sampling_strategy=0.1)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset
X_train_sm, y_train_sm = pipeline.fit_resample(X_train, y_train)
# summarize the new class distribution
counter = Counter(y_train_sm)
print(counter)

Counter({0: 3095110, 1: 45758})
Counter({0: 3095110, 1: 309511})


In [138]:
#Probando el nuevo dataset con un logistic Regression
lr2 = LogisticRegression()
lr2.fit(X_train_sm, y_train_sm.ravel())
predictions = lr2.predict(X_test)
 
# print classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99    773779
           1       0.14      0.07      0.10     11439

    accuracy                           0.98    785218
   macro avg       0.56      0.53      0.54    785218
weighted avg       0.97      0.98      0.98    785218



In [139]:
print(lr2)
print("model score: %.3f" % lr2.score(X_test, y_test))
print_binary_evaluation(y_test, predictions)

LogisticRegression()
model score: 0.980


{'accuracy': 0.9801329057662967,
 'recall': 0.07159716758457907,
 'precision': 0.14123124676668392,
 'f1_score': 0.09502262443438915}

In [140]:


#Una vez que ya tienes el K_value
# summarize class distribution
counter = Counter(y_train)
print(counter)
# define pipeline
over = SMOTE(sampling_strategy=0.1, k_neighbors=7)
under = RandomUnderSampler(sampling_strategy=0.2)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset
X_train_sm, y_train_sm = pipeline.fit_resample(X_train, y_train)
# summarize the new class distribution
counter = Counter(y_train_sm)
print(counter)

Counter({0: 3095110, 1: 45758})
Counter({0: 1547555, 1: 309511})


In [141]:
#Probando el nuevo dataset con un logistic Regression
lr2 = LogisticRegression()
lr2.fit(X_train_sm, y_train_sm.ravel())
predictions = lr2.predict(X_test)
 
# print classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98    773779
           1       0.10      0.18      0.13     11439

    accuracy                           0.97    785218
   macro avg       0.55      0.58      0.56    785218
weighted avg       0.97      0.97      0.97    785218



In [142]:
print(lr2)
print("model score: %.3f" % lr2.score(X_test, y_test))
print_binary_evaluation(y_test, predictions)

LogisticRegression()
model score: 0.966


{'accuracy': 0.9660450983039105,
 'recall': 0.17571466037241018,
 'precision': 0.10445356753105026,
 'f1_score': 0.13102144579883973}

In [169]:
# borderline-SMOTE for imbalanced dataset
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# values to evaluate

model = LogisticRegression()
over = SMOTE(sampling_strategy=0.1, k_neighbors=7)
under = RandomUnderSampler(sampling_strategy=0.2)
steps = [('over', over), ('under', under), ('model', model)]
pipeline = Pipeline(steps=steps)
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
score = mean(scores)
print('> k=%d, Mean ROC AUC: %.3f' % (k, score))


> k=7, Mean ROC AUC: 0.763


In [143]:


#Una vez que ya tienes el K_value
# summarize class distribution
counter = Counter(y_train)
print(counter)
# define pipeline
over = SMOTE(sampling_strategy=0.2, k_neighbors=7)
under = RandomUnderSampler(sampling_strategy=0.2)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset
X_train_sm, y_train_sm = pipeline.fit_resample(X_train, y_train)
# summarize the new class distribution
counter = Counter(y_train_sm)
print(counter)

Counter({0: 3095110, 1: 45758})
Counter({0: 3095110, 1: 619022})


In [144]:
#Probando el nuevo dataset con un logistic Regression
lr2 = LogisticRegression()
lr2.fit(X_train_sm, y_train_sm.ravel())
predictions = lr2.predict(X_test)
 
# print classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98    773779
           1       0.10      0.17      0.13     11439

    accuracy                           0.97    785218
   macro avg       0.55      0.58      0.56    785218
weighted avg       0.97      0.97      0.97    785218



In [145]:
print(lr2)
print("model score: %.3f" % lr2.score(X_test, y_test))
print_binary_evaluation(y_test, predictions)

LogisticRegression()
model score: 0.966


{'accuracy': 0.9661113219513562,
 'recall': 0.1748404580820002,
 'precision': 0.10432423973710292,
 'f1_score': 0.1306762495916367}

In [150]:


#Una vez que ya tienes el K_value
# summarize class distribution
counter = Counter(y_train)
print(counter)
# define pipeline
over = SMOTE(sampling_strategy=0.3, k_neighbors=7)
under = RandomUnderSampler(sampling_strategy=0.3)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset
X_train_sm, y_train_sm = pipeline.fit_resample(X_train, y_train)
# summarize the new class distribution
counter = Counter(y_train_sm)
print(counter)

Counter({0: 3095110, 1: 45758})
Counter({0: 3095110, 1: 928533})


In [151]:
#Probando el nuevo dataset con un logistic Regression
lr2 = LogisticRegression()
lr2.fit(X_train_sm, y_train_sm.ravel())
predictions = lr2.predict(X_test)
 
# print classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.99      0.96      0.97    773779
           1       0.08      0.25      0.12     11439

    accuracy                           0.95    785218
   macro avg       0.53      0.60      0.55    785218
weighted avg       0.98      0.95      0.96    785218



In [152]:
print(lr2)
print("model score: %.3f" % lr2.score(X_test, y_test))
print_binary_evaluation(y_test, predictions)

LogisticRegression()
model score: 0.948


{'accuracy': 0.9476272831239223,
 'recall': 0.24958475391205526,
 'precision': 0.08066111032631727,
 'f1_score': 0.12191997266942818}

In [165]:
# borderline-SMOTE for imbalanced dataset
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# values to evaluate
k_values = [7]



for k in k_values:
# define pipeline
    model = LogisticRegression()
    over = SMOTE(sampling_strategy=0.1, k_neighbors=k)
    under = RandomUnderSampler(sampling_strategy=0.2)
    steps = [('over', over), ('under', under), ('model', model)]
    pipeline = Pipeline(steps=steps)
    # evaluate pipeline
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
    score = mean(scores)
    print('> k=%d, Mean ROC AUC: %.3f' % (k, score))


> k=7, Mean ROC AUC: 0.763


In [153]:


#Una vez que ya tienes el K_value
# summarize class distribution
counter = Counter(y_train)
print(counter)
# define pipeline
over = SMOTE(sampling_strategy=0.2, k_neighbors=7)
under = RandomUnderSampler(sampling_strategy=0.2)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset
X_train_sm, y_train_sm = pipeline.fit_resample(X_train, y_train)
# summarize the new class distribution
counter = Counter(y_train_sm)
print(counter)

Counter({0: 3095110, 1: 45758})
Counter({0: 3095110, 1: 619022})


In [154]:
#Probando el nuevo dataset con un logistic Regression
lr2 = xgb.XGBClassifier()
lr2.fit(X_train_sm, y_train_sm.ravel())
predictions = lr2.predict(X_test)
 
# print classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99    773779
           1       0.70      0.00      0.01     11439

    accuracy                           0.99    785218
   macro avg       0.84      0.50      0.50    785218
weighted avg       0.98      0.99      0.98    785218



In [155]:
print(lr2)
print("model score: %.3f" % lr2.score(X_test, y_test))
print_binary_evaluation(y_test, predictions)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
model score: 0.985


{'accuracy': 0.9854702770440821,
 'recall': 0.0045458519101320045,
 'precision': 0.7027027027027027,
 'f1_score': 0.00903326674194389}

In [170]:
model = xgb.XGBClassifier(use_label_encoder=False)
over = SMOTE(sampling_strategy=0.1, k_neighbors=7)
under = RandomUnderSampler(sampling_strategy=0.2)
steps = [('over', over), ('under', under), ('model', model)]
pipeline = Pipeline(steps=steps)
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
score = mean(scores)
print('> k=%d, Mean ROC AUC: %.3f' % (k, score))

> k=7, Mean ROC AUC: 0.790


In [156]:


#Una vez que ya tienes el K_value
# summarize class distribution
counter = Counter(y_train)
print(counter)
# define pipeline
over = SMOTE(sampling_strategy=0.2, k_neighbors=7)
under = RandomUnderSampler(sampling_strategy=0.2)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset
X_train_sm, y_train_sm = pipeline.fit_resample(X_train, y_train)
# summarize the new class distribution
counter = Counter(y_train_sm)
print(counter)

Counter({0: 3095110, 1: 45758})
Counter({0: 3095110, 1: 619022})


In [157]:
#Probando el nuevo dataset con un logistic Regression
lr2 = lgb.LGBMClassifier()
lr2.fit(X_train_sm, y_train_sm.ravel())
predictions = lr2.predict(X_test)
 
# print classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99    773779
           1       0.48      0.00      0.00     11439

    accuracy                           0.99    785218
   macro avg       0.73      0.50      0.50    785218
weighted avg       0.98      0.99      0.98    785218



In [158]:
print(lr2)
print("model score: %.3f" % lr2.score(X_test, y_test))
print_binary_evaluation(y_test, predictions)

LGBMClassifier()
model score: 0.985


{'accuracy': 0.985430797561951,
 'recall': 0.0008742022904100009,
 'precision': 0.47619047619047616,
 'f1_score': 0.001745200698080279}

In [168]:
model = lgb.LGBMClassifier()
over = SMOTE(sampling_strategy=0.1, k_neighbors=7)
under = RandomUnderSampler(sampling_strategy=0.1)
steps = [('over', over), ('under', under), ('model', model)]
pipeline = Pipeline(steps=steps)
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
score = mean(scores)
print('> k=%d, Mean ROC AUC: %.3f' % (k, score))

> k=7, Mean ROC AUC: 0.783


In [159]:
#### grid search k value for SMOTE oversampling for imbalanced classification
import lightgbm as lgb

from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# values to evaluate
k_values = [7]
for k in k_values:
# define pipeline
    model = lgb.LGBMClassifier()
    over = SMOTE(sampling_strategy=0.1, k_neighbors=k)
    under = RandomUnderSampler(sampling_strategy=0.5)
    steps = [('over', over), ('under', under), ('model', model)]
    pipeline = Pipeline(steps=steps)
    # evaluate pipeline
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
    score = mean(scores)
    print('> k=%d, Mean ROC AUC: %.3f' % (k, score))

IndentationError: expected an indented block (3954110982.py, line 17)

In [55]:
# borderline-SMOTE for imbalanced dataset
from collections import Counter
from imblearn.over_sampling import SMOTE
# summarize class distribution
counter = Counter(y_train)
print(counter)
# transform the dataset
oversample = SMOTE(sampling_strategy=0.35, random_state=0) #aquí le especifíco el ratio que quiero de la clase minoritaria
# uso random_state para que los datos puedan ser reproducidos en otro ordenador usando el mismo código.
X_train, y_train = oversample.fit_resample(X_train, y_train)
# summarize the new class distribution
counter = Counter(y_train)
print(counter)

Counter({0: 3095110, 1: 45758})


ValueError: Unknown label type: 'unknown'

In [None]:
with open('train.pickle', 'wb') as f:
    pickle.dump([X_train, y_train, X_test, y_test], f)

### Escalado de las variables
Aunque en algunos salgoritmos no sea necesario el escalado de variables, pasaremos a realizarlo en este punto para trabajar con los datos en formato unificado.

In [None]:
scaler = StandardScaler()
model_scaled = scaler.fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

In [None]:
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train_pca, y_train)
print(rfc)
print("model score: %.3f" % rfc.score(X_test_pca, y_test))
y_pred = rfc.predict(X_test_pca)
print_binary_evaluation(y_test, y_pred)

In [None]:
with open('train_scaled.pickle', 'wb') as f:
    pickle.dump([X_train_scaled, y_train, X_test_scaled, y_test], f)

In [10]:
with open('train_scaled.pickle', 'rb') as f:
    X_train_scaled, y_train, X_test_scaled, y_test = pickle.load(f)

### SMOTE and Tomek Links Undersampling
SMOTE realiza un oversample de la clase minoritaria como hemos visto anteriormente. Tome Links identifica partes de observaciones cercanas en el dataset de diferentes clases y elimina uno o los dos elementos según especifiquemos. El objetivo de Tome Links es hacer que el límite que define al conjunto de una clase y al conjunto de la otra sea menos ambiguo. A través de la funcion SMOTETomek se combinan ambas técnicas.
En nuestro caso configuraremos Tomek para eliminar los links de la clase mayoritaria:

In [None]:
# borderline-SMOTE for imbalanced dataset
from collections import Counter
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
# summarize class distribution
counter = Counter(y_train)
print(counter)
# transform the dataset
oversample = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'), random_state=0) 
X_train_tk, y_train_tk = oversample.fit_resample(X_train_scaled, y_train)
# summarize the new class distribution
counter = Counter(y_train)
print(counter)

Counter({0: 3095110, 1: 1083288})


### Escalado de las variables
Volvemos a realizar el escalado después de aplicar el SMOTETomek y nos guardamos las variables para poder contrastar en el siguiente cuaderno.

In [None]:
scaler = StandardScaler()
model_scaled = scaler.fit(X_train_tk)
X_train_scaled_tk = pd.DataFrame(scaler.transform(X_train_tk), columns=X_train_tk.columns, index=X_train_tk.index)

In [None]:
with open('train_scaled_tk.pickle', 'wb') as f:
    pickle.dump([X_train_scaled_tk, y_train_tk, X_test_scaled, y_test], f)

In [None]:
#with open('train.pickle', 'rb') as f:
 #   X_train, y_train, X_test, y_test = pickle.load(f)