# **Modelamiento**

## Importamos librerías necesarias

In [1]:
# importemos las librerías básicas a usar

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

## Lectura de datos

In [8]:
df = pd.read_csv('datasets/data_to_model.csv', index_col=0)
df['Exited'] = df['Exited_C'].map({'Yes':1, 'No':0})
df.drop(columns=['Exited_C'], inplace=True)
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [9]:
df.select_dtypes('number').head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,42,2,0.0,1,1,1,101348.88,1
1,608,41,1,83807.86,1,0,1,112542.58,0
2,502,42,8,159660.8,3,1,0,113931.57,1
3,699,39,1,0.0,2,0,0,93826.63,0
4,850,43,2,125510.82,1,1,1,79084.1,0


## Baseline

A continuación de plantea una regresión logística como línea base, para tener como referencia su performance, y a partir de ahí intentar mejorar las métricas obtenidas con otros modelos y configuraciones

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

df2 = df.copy()

to_encode = df.select_dtypes('object').columns.tolist()
to_scale = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

transformer = ColumnTransformer([
    ('OHE', OneHotEncoder(sparse=False, handle_unknown='ignore'), to_encode),
    ('scaler', MinMaxScaler(), to_scale)
], verbose_feature_names_out=False, remainder='passthrough').set_output(transform='pandas')

X = df.drop(columns=['Exited'])
y = df.Exited.values
X = transformer.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y)
print(f"Tamaño de base de entrenamiento{X_train.shape}")
print(f"Tamaño de base de test{X_test.shape}")

Tamaño de base de entrenamiento(7500, 13)
Tamaño de base de test(2500, 13)


Logistic Regression

In [65]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

lr = LogisticRegression()
lr.fit(X_train, y_train)
print(f"AUC para logistic regression: {roc_auc_score(y_test, lr.predict_proba(X_test)[:, 1]):.2%}")

AUC para logistic regression: 77.77%


SVC

In [61]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

param_grid = {'kernel': ['poly', 'rbf', 'sigmoid'], 'C':[0.5, 1, 1.5, 5]}
grid = GridSearchCV(SVC(probability=True), param_grid=param_grid, n_jobs=-1, scoring='roc_auc', cv = 5, return_train_score=False)
grid.fit(X_train, y_train)
pd.DataFrame(grid.cv_results_).sort_values(by='rank_test_score', ascending=True).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,20.382377,1.014559,0.228395,0.054718,5.0,poly,"{'C': 5, 'kernel': 'poly'}",0.810648,0.849724,0.828666,0.820166,0.820571,0.825955,0.013184,1
6,13.707832,1.000083,0.275858,0.077819,1.5,poly,"{'C': 1.5, 'kernel': 'poly'}",0.804727,0.84279,0.827857,0.812465,0.812313,0.82003,0.01364,2
3,14.087158,2.019397,0.280617,0.031255,1.0,poly,"{'C': 1, 'kernel': 'poly'}",0.799901,0.837262,0.824817,0.808415,0.808687,0.815816,0.013415,3
10,16.377122,1.214917,0.344521,0.129538,5.0,rbf,"{'C': 5, 'kernel': 'rbf'}",0.792057,0.836433,0.820616,0.804741,0.809924,0.812754,0.014987,4
0,13.493886,0.705708,0.322875,0.127703,0.5,poly,"{'C': 0.5, 'kernel': 'poly'}",0.799569,0.830918,0.820424,0.802598,0.80679,0.81206,0.011823,5


In [66]:
print(f"AUC para support vector machine: {roc_auc_score(y_test, grid.best_estimator_.predict_proba(X_test)[:, 1]):.2%}")

AUC para support vector machine: 80.64%


Decision Tree

In [68]:
from sklearn.tree import DecisionTreeClassifier

param_grid2 = {'max_depth': [10,15,20,25,30], 'min_samples_split':[60, 80, 100]}
grid2 = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid2, n_jobs=-1, scoring='roc_auc', cv = 5, return_train_score=False)
grid2.fit(X_train, y_train)
pd.DataFrame(grid2.cv_results_).sort_values(by='rank_test_score', ascending=True).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,0.045389,0.003712,0.006905,0.001091,10,100,"{'max_depth': 10, 'min_samples_split': 100}",0.830422,0.857362,0.836011,0.825966,0.839763,0.837905,0.010809,1
1,0.047672,0.004945,0.006968,0.001962,10,80,"{'max_depth': 10, 'min_samples_split': 80}",0.829545,0.844051,0.822017,0.824469,0.840762,0.832169,0.008766,2
0,0.047827,0.00378,0.007244,0.001186,10,60,"{'max_depth': 10, 'min_samples_split': 60}",0.823611,0.839582,0.817865,0.820535,0.841068,0.828532,0.00981,3
5,0.052626,0.013097,0.006825,0.001217,15,100,"{'max_depth': 15, 'min_samples_split': 100}",0.82281,0.843228,0.832064,0.817662,0.825712,0.828295,0.008798,4
11,0.0581,0.008132,0.010012,0.004044,25,100,"{'max_depth': 25, 'min_samples_split': 100}",0.821648,0.842585,0.832377,0.817232,0.826662,0.8281,0.008831,5


In [69]:
print(f"AUC para decision tree: {roc_auc_score(y_test, grid2.best_estimator_.predict_proba(X_test)[:, 1]):.2%}")

AUC para decision tree: 82.86%


Random Forest Classifier

In [15]:
from sklearn.ensemble import RandomForestClassifier

param_grid3 = {'n_estimators': [100, 150, 200], 'max_depth': [10,15,20,25,30],'min_samples_split':[60, 80, 100]}
grid3 = GridSearchCV(RandomForestClassifier(), param_grid=param_grid3, n_jobs=-1, scoring='roc_auc', cv = 5, return_train_score=False)
grid3.fit(X_train, y_train)
pd.DataFrame(grid3.cv_results_).sort_values(by='rank_test_score', ascending=True).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
20,2.165972,0.043781,0.091862,0.006949,20,60,200,"{'max_depth': 20, 'min_samples_split': 60, 'n_...",0.842944,0.885254,0.864504,0.851071,0.856707,0.860096,0.014419,1
29,2.594234,0.256369,0.099906,0.012524,25,60,200,"{'max_depth': 25, 'min_samples_split': 60, 'n_...",0.842708,0.886096,0.864032,0.850769,0.855169,0.859755,0.014863,2
36,1.597316,0.092406,0.102344,0.013466,30,60,100,"{'max_depth': 30, 'min_samples_split': 60, 'n_...",0.838924,0.886656,0.864589,0.852939,0.855602,0.859742,0.015778,3
2,2.208682,0.230537,0.118828,0.011678,10,60,200,"{'max_depth': 10, 'min_samples_split': 60, 'n_...",0.843696,0.883668,0.86317,0.850264,0.857769,0.859714,0.013678,4
10,1.528289,0.182844,0.085354,0.015597,15,60,150,"{'max_depth': 15, 'min_samples_split': 60, 'n_...",0.841358,0.885405,0.863796,0.851351,0.856603,0.859703,0.014787,5


In [70]:
print(f"AUC para random forest: {roc_auc_score(y_test, grid3.best_estimator_.predict_proba(X_test)[:, 1]):.2%}")

AUC para random forest: 85.95%


In [71]:
from sklearn.neighbors import KNeighborsClassifier

param_grid4 = {'n_neighbors': [5, 10, 15], 'p': [1,2],'algorithm':['ball_tree', 'kd_tree', 'brute']}
grid4 = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid4, n_jobs=-1, scoring='roc_auc', cv = 5, return_train_score=False)
grid4.fit(X_train, y_train)
pd.DataFrame(grid4.cv_results_).sort_values(by='rank_test_score', ascending=True).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_n_neighbors,param_p,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,0.029222,0.008684,0.282678,0.038465,ball_tree,15,1,"{'algorithm': 'ball_tree', 'n_neighbors': 15, ...",0.778049,0.794129,0.765989,0.746985,0.771921,0.771415,0.015404,1
10,0.023527,0.003565,0.267334,0.03525,kd_tree,15,1,"{'algorithm': 'kd_tree', 'n_neighbors': 15, 'p...",0.778049,0.794129,0.765989,0.746985,0.771921,0.771415,0.015404,1
16,0.007329,0.002035,0.339361,0.015476,brute,15,1,"{'algorithm': 'brute', 'n_neighbors': 15, 'p': 1}",0.778049,0.794129,0.765989,0.746985,0.771921,0.771415,0.015404,1
8,0.029385,0.003837,0.23573,0.029373,kd_tree,10,1,"{'algorithm': 'kd_tree', 'n_neighbors': 10, 'p...",0.773698,0.788049,0.764548,0.732381,0.767576,0.76525,0.018316,4
2,0.02879,0.006062,0.268303,0.058205,ball_tree,10,1,"{'algorithm': 'ball_tree', 'n_neighbors': 10, ...",0.773698,0.788049,0.764548,0.732381,0.767576,0.76525,0.018316,4


In [73]:
print(f"AUC para k-nn classifier: {roc_auc_score(y_test, grid4.best_estimator_.predict_proba(X_test)[:, 1]):.2%}")
df2.head()

AUC para k-nn classifier: 79.30%


### Feature selection

Se escogeran las variables que tengan mayor relación con la variable objetivo

In [105]:
cat_cols = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']
num_cols = df2.drop(columns=cat_cols).columns.tolist()
num_cols.remove('Exited')

Variables numéricas

In [106]:
from sklearn.feature_selection import f_classif, SelectKBest

X = df2[num_cols]
y = df2.Exited.values

num_kbest = SelectKBest(f_classif,k=4).set_output(transform='pandas')
best_nums = num_kbest.fit_transform(df2[num_cols], y)

Variables categóricas

In [107]:
from sklearn.feature_selection import chi2
from sklearn.preprocessing import OrdinalEncoder

# codificamos las variables categóricas
encoder = OrdinalEncoder().set_output(transform='pandas')
X = df2[cat_cols]
X =encoder.fit_transform(X)
y = df2.Exited.values
cat_kbest = SelectKBest(chi2,k=3).set_output(transform='pandas')
best_cats = cat_kbest.fit_transform(X, y)

Nuevo dataframe

In [109]:
new_df = pd.concat([best_nums, df2[cat_cols][best_cats.columns.tolist()]], axis = 1)
new_df['Exited'] = y
new_df.head()

Unnamed: 0,CreditScore,Age,Balance,NumOfProducts,Geography,Gender,IsActiveMember,Exited
0,619.0,42.0,0.0,1.0,France,Female,1,1
1,608.0,41.0,83807.86,1.0,Spain,Female,1,0
2,502.0,42.0,159660.8,3.0,France,Female,0,1
3,699.0,39.0,0.0,2.0,France,Female,0,0
4,850.0,43.0,125510.82,1.0,Spain,Female,1,0


Nuevos datos de prueba y entrenamiento

In [115]:
to_encode = new_df.select_dtypes('object').columns.tolist()
to_scale = ['CreditScore', 'Age', 'Balance', 'NumOfProducts']

transformer = ColumnTransformer([
    ('OHE', OneHotEncoder(sparse=False, handle_unknown='ignore'), to_encode),
    ('scaler', MinMaxScaler(), to_scale)
], verbose_feature_names_out=False, remainder='passthrough').set_output(transform='pandas')

X = new_df.drop(columns=['Exited'])
y = new_df.Exited.values
X = transformer.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y)
print(f"Tamaño de base de entrenamiento: {X_train.shape}")
print(f"Tamaño de base de test: {X_test.shape}")

Tamaño de base de entrenamiento: (7500, 10)
Tamaño de base de test: (2500, 10)


SVC

In [116]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

param_grid = {'kernel': ['poly', 'rbf', 'sigmoid'], 'C':[0.5, 1, 1.5, 5]}
grid = GridSearchCV(SVC(probability=True), param_grid=param_grid, n_jobs=-1, scoring='roc_auc', cv = 5, return_train_score=False)
grid.fit(X_train, y_train)
pd.DataFrame(grid.cv_results_).sort_values(by='rank_test_score', ascending=True).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,14.414601,1.582171,0.23123,0.043439,5.0,poly,"{'C': 5, 'kernel': 'poly'}",0.800393,0.824159,0.815681,0.830009,0.826172,0.819283,0.010546,1
10,12.043933,1.006279,0.383444,0.097685,5.0,rbf,"{'C': 5, 'kernel': 'rbf'}",0.799709,0.808874,0.810792,0.819094,0.822518,0.812197,0.008038,2
6,11.991981,1.700629,0.208125,0.061642,1.5,poly,"{'C': 1.5, 'kernel': 'poly'}",0.793464,0.81624,0.80441,0.827626,0.817322,0.811812,0.01176,3
3,10.196768,0.595232,0.18717,0.043076,1.0,poly,"{'C': 1, 'kernel': 'poly'}",0.792475,0.812906,0.802738,0.823721,0.81402,0.809172,0.01067,4
7,12.295433,0.388839,0.415091,0.082385,1.5,rbf,"{'C': 1.5, 'kernel': 'rbf'}",0.793478,0.803336,0.799997,0.816817,0.806262,0.803978,0.007701,5


In [117]:
print(f"AUC para support vector machine: {roc_auc_score(y_test, grid.best_estimator_.predict_proba(X_test)[:, 1]):.2%}")

AUC para support vector machine: 82.64%


Decision tree

In [118]:
from sklearn.tree import DecisionTreeClassifier

param_grid2 = {'max_depth': [10,15,20,25,30], 'min_samples_split':[60, 80, 100]}
grid2 = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid2, n_jobs=-1, scoring='roc_auc', cv = 5, return_train_score=False)
grid2.fit(X_train, y_train)
pd.DataFrame(grid2.cv_results_).sort_values(by='rank_test_score', ascending=True).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,0.036331,0.006403,0.007538,0.001245,10,100,"{'max_depth': 10, 'min_samples_split': 100}",0.852235,0.843704,0.843547,0.806155,0.852299,0.839588,0.017157,1
1,0.038627,0.005361,0.009568,0.003156,10,80,"{'max_depth': 10, 'min_samples_split': 80}",0.852816,0.838026,0.839943,0.805688,0.83868,0.835031,0.015644,2
5,0.045307,0.013145,0.01513,0.008202,15,100,"{'max_depth': 15, 'min_samples_split': 100}",0.840223,0.838605,0.837063,0.812045,0.840457,0.833678,0.010886,3
8,0.039487,0.005633,0.01083,0.004806,20,100,"{'max_depth': 20, 'min_samples_split': 100}",0.839751,0.837981,0.83564,0.811507,0.840457,0.833067,0.010908,4
11,0.042035,0.005027,0.009044,0.00353,25,100,"{'max_depth': 25, 'min_samples_split': 100}",0.839751,0.837981,0.83564,0.811507,0.840457,0.833067,0.010908,4


In [120]:
print(f"AUC para decision tree: {roc_auc_score(y_test, grid2.best_estimator_.predict_proba(X_test)[:, 1]):.2%}")

AUC para decision tree: 84.69%


Random Forest Classifier

In [121]:
param_grid3 = {'n_estimators': [100, 150, 200], 'max_depth': [10,15,20,25,30],'min_samples_split':[60, 80, 100]}
grid3 = GridSearchCV(RandomForestClassifier(), param_grid=param_grid3, n_jobs=-1, scoring='roc_auc', cv = 5, return_train_score=False)
grid3.fit(X_train, y_train)
pd.DataFrame(grid3.cv_results_).sort_values(by='rank_test_score', ascending=True).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
40,1.39825,0.123478,0.139633,0.023498,30,80,150,"{'max_depth': 30, 'min_samples_split': 80, 'n_...",0.862752,0.861484,0.858264,0.851766,0.861659,0.859185,0.004,1
14,1.477203,0.034226,0.098119,0.008159,15,80,200,"{'max_depth': 15, 'min_samples_split': 80, 'n_...",0.861407,0.862534,0.859794,0.850339,0.861477,0.85911,0.004472,2
36,1.052313,0.132982,0.050314,0.002585,30,60,100,"{'max_depth': 30, 'min_samples_split': 60, 'n_...",0.862339,0.861098,0.85913,0.850506,0.862304,0.859075,0.004441,3
29,2.024852,0.176412,0.167313,0.011683,25,60,200,"{'max_depth': 25, 'min_samples_split': 60, 'n_...",0.861667,0.861431,0.858878,0.850741,0.862648,0.859073,0.004348,4
11,1.805262,0.077299,0.09797,0.007022,15,60,200,"{'max_depth': 15, 'min_samples_split': 60, 'n_...",0.861119,0.863564,0.856715,0.851317,0.862605,0.859064,0.004529,5


In [122]:
print(f"AUC para random forest: {roc_auc_score(y_test, grid3.best_estimator_.predict_proba(X_test)[:, 1]):.2%}")

AUC para random forest: 87.82%


## Añadimos pesos a las clases

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

param_grid = {'kernel': ['poly', 'rbf', 'sigmoid'], 'C':[0.5, 1, 1.5, 5]}
grid = GridSearchCV(SVC(probability=True,), param_grid=param_grid, n_jobs=-1, scoring='roc_auc', cv = 5, return_train_score=False)
grid.fit(X_train, y_train)
pd.DataFrame(grid.cv_results_).sort_values(by='rank_test_score', ascending=True).head(5)