# **Modelamiento**

## Importamos librerías necesarias

In [55]:
# importemos las librerías básicas a usar

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

## Lectura de datos

In [2]:
df = pd.read_csv('datasets/data_to_model.csv', index_col=0)
df['Exited'] = df['Exited_C'].map({'Yes':1, 'No':0})
df.drop(columns=['Exited_C'], inplace=True)
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
df.select_dtypes('number').head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,42,2,0.0,1,1,1,101348.88,1
1,608,41,1,83807.86,1,0,1,112542.58,0
2,502,42,8,159660.8,3,1,0,113931.57,1
3,699,39,1,0.0,2,0,0,93826.63,0
4,850,43,2,125510.82,1,1,1,79084.1,0


## Baseline

In [56]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

df2 = df.copy()

to_encode = df.select_dtypes('object').columns.tolist()
to_scale = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

transformer = ColumnTransformer([
    ('OHE', OneHotEncoder(sparse=False, handle_unknown='ignore'), to_encode),
    ('scaler', MinMaxScaler(), to_scale)
], verbose_feature_names_out=False, remainder='passthrough').set_output(transform='pandas')

X = df.drop(columns=['Exited'])
y = df.Exited.values
X = transformer.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y)
print(f"Tamaño de base de entrenamiento{X_train.shape}")
print(f"Tamaño de base de test{X_test.shape}")

Tamaño de base de entrenamiento(7500, 13)
Tamaño de base de test(2500, 13)


Logistic Regression

In [57]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr = LogisticRegression()
lr.fit(X_train, y_train)
print(classification_report(y_train, lr.predict(X_train)))

              precision    recall  f1-score   support

           0       0.83      0.97      0.89      5976
           1       0.60      0.20      0.30      1524

    accuracy                           0.81      7500
   macro avg       0.71      0.58      0.60      7500
weighted avg       0.78      0.81      0.77      7500



SVC

In [60]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

param_grid = {'kernel': ['poly', 'rbf', 'sigmoid'], 'C':[0.5, 1, 1.5, 5]}
grid = GridSearchCV(SVC(), param_grid=param_grid, n_jobs=-1, scoring='roc_auc', cv = 5, return_train_score=False)
grid.fit(X_train, y_train)
pd.DataFrame(grid.cv_results_).sort_values(by='rank_test_score', ascending=True).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,4.161606,0.755153,0.246971,0.028056,5.0,poly,"{'C': 5, 'kernel': 'poly'}",0.818383,0.832617,0.80777,0.846332,0.812761,0.823573,0.014095,1
6,3.141755,0.294857,0.27046,0.025789,1.5,poly,"{'C': 1.5, 'kernel': 'poly'}",0.812794,0.822141,0.797292,0.839322,0.804928,0.815296,0.014574,2
3,2.95607,0.546084,0.264248,0.01239,1.0,poly,"{'C': 1, 'kernel': 'poly'}",0.812651,0.818037,0.791821,0.836318,0.799149,0.811595,0.01549,3
10,3.035098,0.722,0.460349,0.05834,5.0,rbf,"{'C': 5, 'kernel': 'rbf'}",0.807714,0.816402,0.78911,0.829223,0.7971,0.80791,0.014119,4
0,3.11891,1.396086,0.356445,0.145048,0.5,poly,"{'C': 0.5, 'kernel': 'poly'}",0.810869,0.808988,0.788381,0.830913,0.7922,0.80627,0.015191,5


In [62]:
print(classification_report(y_train, grid.best_estimator_.predict(X_train)))

              precision    recall  f1-score   support

           0       0.86      0.98      0.92      5976
           1       0.83      0.40      0.54      1524

    accuracy                           0.86      7500
   macro avg       0.85      0.69      0.73      7500
weighted avg       0.86      0.86      0.84      7500



Decision Tree

In [74]:
from sklearn.tree import DecisionTreeClassifier

param_grid2 = {'max_depth': [10,15,20,25,30], 'min_samples_split':[60, 80, 100]}
grid2 = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid2, n_jobs=-1, scoring='roc_auc', cv = 5, return_train_score=False)
grid2.fit(X_train, y_train)
pd.DataFrame(grid2.cv_results_).sort_values(by='rank_test_score', ascending=True).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,0.053442,0.007388,0.007742,0.001722,10,100,"{'max_depth': 10, 'min_samples_split': 100}",0.833557,0.825479,0.815382,0.847541,0.817673,0.827927,0.011704,1
1,0.057659,0.009195,0.010301,0.00249,10,80,"{'max_depth': 10, 'min_samples_split': 80}",0.827425,0.823906,0.807193,0.848479,0.816277,0.824656,0.013791,2
5,0.071364,0.036916,0.012626,0.009208,15,100,"{'max_depth': 15, 'min_samples_split': 100}",0.825801,0.817195,0.811702,0.846814,0.820739,0.82445,0.01209,3
11,0.052473,0.004372,0.008882,0.001541,25,100,"{'max_depth': 25, 'min_samples_split': 100}",0.823878,0.815259,0.81011,0.846874,0.819097,0.823044,0.012744,4
8,0.088337,0.041514,0.01485,0.007471,20,100,"{'max_depth': 20, 'min_samples_split': 100}",0.823657,0.815259,0.81011,0.846874,0.819137,0.823008,0.012739,5


Random Forest Classifier

In [75]:
from sklearn.ensemble import RandomForestClassifier

param_grid3 = {'n_estimators': [100, 150, 200], 'max_depth': [10,15,20,25,30],'min_samples_split':[60, 80, 100]}
grid3 = GridSearchCV(RandomForestClassifier(), param_grid=param_grid3, n_jobs=-1, scoring='roc_auc', cv = 5, return_train_score=False)
grid3.fit(X_train, y_train)
pd.DataFrame(grid3.cv_results_).sort_values(by='rank_test_score', ascending=True).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
30,1.551123,0.694959,0.108687,0.030414,25,80,100,"{'max_depth': 25, 'min_samples_split': 80, 'n_...",0.85641,0.854017,0.841161,0.877811,0.851356,0.856151,0.012012,1
27,1.615287,0.659457,0.105223,0.032808,25,60,100,"{'max_depth': 25, 'min_samples_split': 60, 'n_...",0.857364,0.853694,0.840977,0.876776,0.850113,0.855785,0.011823,2
37,1.811621,0.038269,0.11694,0.012835,30,60,150,"{'max_depth': 30, 'min_samples_split': 60, 'n_...",0.856347,0.854607,0.84298,0.875882,0.848719,0.855707,0.011131,3
28,1.854025,0.08117,0.112904,0.008921,25,60,150,"{'max_depth': 25, 'min_samples_split': 60, 'n_...",0.857219,0.855225,0.839141,0.875555,0.8505,0.855528,0.011814,4
29,3.318862,1.161743,0.219875,0.084857,25,60,200,"{'max_depth': 25, 'min_samples_split': 60, 'n_...",0.857153,0.854901,0.839619,0.876559,0.8494,0.855526,0.012133,5
