In [1]:
import pandas as pd 

In [2]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [3]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.inspection import permutation_importance
import multiprocessing
from sklearn.ensemble import GradientBoostingClassifier


In [4]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import jaccard_score
from sklearn.metrics import log_loss

In [5]:
path = 'C:/Users/isaac/Dropbox/Apps/ShareLaTeX/Donde2020'

In [6]:
# Read data 
data = pd.read_csv(path + '/DB/Master.csv') 
# Work only with choice
data = data[data.producto.isin([4,5])]
data.head()



Unnamed: 0,Enc,f_encuesta,edo_boleta,n_empeno,prenda,prenda_tipo,pr_recup,val_pren,genero,edad,...,grf_dummy_choose_same3,grf_dummy_trabajo1,grf_dummy_trabajo2,grf_dummy_trabajo3,grf_dummy_trabajo4,grf_dummy_trabajo5,grf_dummy_trabajo6,pr_prob,OC,cont_OC
0,18.0,9/25/2012,1.0,,78133041,2.0,100.0,1895.252563,1.0,47.0,...,1,1.0,0.0,0.0,0.0,0.0,0.0,50.908798,1.0,49.091202
23,6.0,11/26/2012,0.0,,42125569,3.0,90.0,5000.0,1.0,74.0,...,1,0.0,0.0,0.0,0.0,1.0,0.0,64.866508,1.0,25.133492
40,24.0,11/20/2012,0.0,,78134840,2.0,100.0,3000.0,,,...,1,,,,,,,,,
49,67.0,9/25/2012,1.0,,5524248,3.0,100.0,2052.516602,1.0,43.0,...,1,0.0,0.0,1.0,0.0,0.0,0.0,46.471657,1.0,53.528343
66,,,,,80116767,,,,,,...,1,,,,,,,,,


In [7]:
# Distinguish numerical variables
numeric_features = ['edad', 'pr_recup', 'log_prestamo', 'val_pren_pr']

categorical_features = ['genero','pres_antes', 'plan_gasto', 'pb', 'faltas', 
                        'masqueprepa']

# Pre-processing of numerical variables 
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('polynomial', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler())])
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse = False)


# First step of Pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [8]:
X = data[['edad', 'pr_recup', 'log_prestamo', 'val_pren_pr',
          'genero','pres_antes', 'plan_gasto', 'pb', 'faltas',
          'masqueprepa']]

y = data['choose_commitment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)

In [9]:
from sklearn.neighbors import KNeighborsClassifier
# Hyperparameters - grid
# ==============================================================================
param_grid = {'model__n_neighbors'  : [2, 3, 4, 5, 6],
              'model__weights'     : ['uniform', 'distance'],
              'model__algorithm'     : ['auto', 'ball_tree', 'kd_tree', 'brute']
             }


knn_est = KNeighborsClassifier(
    leaf_size=30,
    p=2,
    metric='minkowski',
    metric_params=None,
    n_jobs=None)

# KNN Model
KNN = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', knn_est)])


# Grid search by cross-validation
# ==============================================================================
grid = GridSearchCV(KNN,
        param_grid = param_grid,
        scoring    = 'f1_weighted',
        n_jobs     = multiprocessing.cpu_count() - 1,
        cv         = RepeatedKFold(n_splits=3, n_repeats=1, random_state=123), 
        refit      = True,
        verbose    = 0,
        return_train_score = True
       )

grid.fit(X = X_train, y = y_train)

# Results
# ==============================================================================
results = pd.DataFrame(grid.cv_results_)
results.filter(regex = '(param.*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False) \
    .head(4)

Unnamed: 0,param_model__algorithm,param_model__n_neighbors,param_model__weights,mean_test_score,std_test_score,mean_train_score,std_train_score
39,brute,6,distance,0.855741,0.008748,0.97744,0.002209
29,kd_tree,6,distance,0.855741,0.008748,0.97744,0.002209
19,ball_tree,6,distance,0.855741,0.008748,0.97744,0.002209
9,auto,6,distance,0.855741,0.008748,0.97744,0.002209


In [10]:
# Recover the best model
final_model_knn = grid.best_estimator_

In [11]:
from sklearn.tree import DecisionTreeClassifier
# Hyperparameters - grid
# ==============================================================================
param_grid = {'model__max_depth'  : [2, 3, 4, 5, 6, 7, 8, 10, 20, 50]
             }


dt_est = DecisionTreeClassifier(criterion="entropy")

# DT Model
DT = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', dt_est)])


# Grid search by cross-validation
# ==============================================================================
grid = GridSearchCV(DT,
        param_grid = param_grid,
        scoring    = 'f1_weighted',
        n_jobs     = multiprocessing.cpu_count() - 1,
        cv         = RepeatedKFold(n_splits=3, n_repeats=1, random_state=123), 
        refit      = True,
        verbose    = 0,
        return_train_score = True
       )

grid.fit(X = X_train, y = y_train)

# Results
# ==============================================================================
results = pd.DataFrame(grid.cv_results_)
results.filter(regex = '(param.*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False) \
    .head(4)

Unnamed: 0,param_model__max_depth,mean_test_score,std_test_score,mean_train_score,std_train_score
3,5,0.854549,0.018377,0.871573,0.004595
6,8,0.852244,0.011417,0.904707,0.001299
5,7,0.852052,0.01508,0.88956,0.004672
4,6,0.851786,0.014495,0.879864,0.005094


In [12]:
# Recover the best model
final_model_dt = grid.best_estimator_

In [13]:
from sklearn import svm
# Hyperparameters - grid
# ==============================================================================
param_grid = {'model__kernel': ['poly', 'rbf', 'sigmoid', 'precomputed'],
              'model__C'     : [0.01, 0.1, 0.2, 0.3, 0.4 , 0.5, 0.6, 0.7, 0.8, 0.9, 1]
             }


svm_est = svm.SVC()

# SVM Model
SVM = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', svm_est)])


# Grid search by cross-validation
# ==============================================================================
grid = GridSearchCV(SVM,
        param_grid = param_grid,
        scoring    = 'f1_weighted',
        n_jobs     = multiprocessing.cpu_count() - 1,
        cv         = RepeatedKFold(n_splits=3, n_repeats=1, random_state=123), 
        refit      = True,
        verbose    = 0,
        return_train_score = True
       )

grid.fit(X = X_train, y = y_train)

# Results
# ==============================================================================
results = pd.DataFrame(grid.cv_results_)
results.filter(regex = '(param.*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False) \
    .head(4)

 0.84624497        nan 0.85026593 0.85026593 0.84168184        nan
 0.85026593 0.85026593 0.83739197        nan 0.85026593 0.85026593
 0.83629678        nan 0.851158   0.85026593 0.83461494        nan
 0.85096611 0.85026593 0.83435629        nan 0.85077421 0.85026593
 0.83143834        nan 0.85059116 0.85026593 0.83161864        nan
 0.85039924 0.85026593 0.83120983        nan 0.85039924 0.85026593
 0.82991971        nan]
 0.84544451        nan 0.85379348 0.85024988 0.83958509        nan
 0.85510804 0.85024988 0.83416852        nan 0.85553649 0.85024988
 0.83409831        nan 0.85597207 0.85024988 0.83195287        nan
 0.85726512 0.85024988 0.83044458        nan 0.85812583 0.85024988
 0.82919454        nan 0.85812583 0.85024988 0.82628267        nan
 0.85855138 0.85024988 0.82688231        nan 0.85897262 0.8511448
 0.8261118         nan]


Unnamed: 0,param_model__C,param_model__kernel,mean_test_score,std_test_score,mean_train_score,std_train_score
20,0.5,poly,0.851158,0.01302,0.855972,0.007016
24,0.6,poly,0.850966,0.012751,0.857265,0.006115
28,0.7,poly,0.850774,0.012484,0.858126,0.005787
32,0.8,poly,0.850591,0.012649,0.858126,0.005787


In [16]:
# Recover the best model
final_model_svm = grid.best_estimator_

In [22]:
from sklearn.linear_model import LogisticRegression
# Hyperparameters - grid
# ==============================================================================
param_grid = {'model__penalty'  : ['l1', 'l2', 'elasticnet'],
              'model__C'     : [0.0001, 0.001, 0.01, 0.1, 1, 10, 20]
             }


logit_est = LogisticRegression(random_state = 0,
                              max_iter = 10000)

# Logit Model
LOGIT = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', logit_est)])


# Grid search by cross-validation
# ==============================================================================
grid = GridSearchCV(LOGIT,
        param_grid = param_grid,
        scoring    = 'f1_weighted',
        n_jobs     = multiprocessing.cpu_count() - 1,
        cv         = RepeatedKFold(n_splits=3, n_repeats=1, random_state=123), 
        refit      = True,
        verbose    = 0,
        return_train_score = True
       )

grid.fit(X = X_train, y = y_train)

# Results
# ==============================================================================
results = pd.DataFrame(grid.cv_results_)
results.filter(regex = '(param.*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False) \
    .head(4)

        nan 0.85026593        nan        nan 0.85026593        nan
        nan 0.85026593        nan        nan 0.85097495        nan
        nan 0.85097495        nan]
        nan 0.8511448         nan        nan 0.8511448         nan
        nan 0.85158837        nan        nan 0.85237733        nan
        nan 0.85228402        nan]


Unnamed: 0,param_model__C,param_model__penalty,mean_test_score,std_test_score,mean_train_score,std_train_score
16,10.0,l2,0.850975,0.013184,0.852377,0.006346
19,20.0,l2,0.850975,0.013184,0.852284,0.006478
1,0.0001,l2,0.850266,0.011775,0.85025,0.005876
4,0.001,l2,0.850266,0.011775,0.85025,0.005876


In [23]:
# Recover the best model
final_model_logit = grid.best_estimator_

In [24]:
from catboost import CatBoostClassifier, Pool

In [27]:
# Grid de hiperparámetros evaluados
# ==============================================================================
param_grid = {'model__iterations'  : [500, 1000, 1500],
              'model__depth'     : [3, 5, 6, 10, 20],
              'model__learning_rate' : [0.0005, 0.001, 0.01, 0.1]
             }


CBC = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', CatBoostClassifier(random_state=123))])


# Búsqueda por grid search con validación cruzada
# ==============================================================================
grid = GridSearchCV(
        estimator  = CBC,
        param_grid = param_grid,
        scoring    = 'f1_weighted',
        n_jobs     = multiprocessing.cpu_count() - 1,
        cv         = RepeatedKFold(n_splits=3, n_repeats=1, random_state=123), 
        refit      = True,
        verbose    = 0,
        return_train_score = True
       )

grid.fit(X = X_train, y = y_train)

# Resultados
# ==============================================================================
resultados = pd.DataFrame(grid.cv_results_)
resultados.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False) \
    .head(4)

 0.84971634 0.85870687 0.85026593 0.85026593 0.84984961 0.85674583
 0.85026593 0.85026593 0.851158   0.8568358  0.85026593 0.85026593
 0.85147332 0.85470021 0.85026593 0.85026593 0.85331799 0.85420234
 0.85026593 0.85026593 0.85026593 0.85347385 0.85026593 0.85026593
 0.8530407  0.85125349 0.85026593 0.85026593 0.85309965 0.85311138
 0.85026593 0.85026593 0.85490673 0.84966446 0.85026593 0.85026593
 0.85622784 0.84940806 0.85026593 0.85026593 0.85582205 0.85107925
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan]
 0.86478416 0.95877239 0.85024988 0.85024988 0.87808785 0.96756021
 0.85024988 0.85024988 0.85727258 0.96513382 0.85024988 0.85024988
 0.88361273 0.97312194 0.85024988 0.85024988 0.91007154 0.97560847
 0.85024988 0.85024988 0.86192171 0.97054413 0.85024988 0.85024988
 0.90131519 0.97478156 0.85024988 0.85024988 0.92730064 0.97581324
 0.85024988 0.85069728 0.9196324  0.97395629 0.85024988 0.852

0:	learn: 0.6081593	total: 147ms	remaining: 1m 13s
1:	learn: 0.5450580	total: 149ms	remaining: 37.1s
2:	learn: 0.4951235	total: 150ms	remaining: 24.8s
3:	learn: 0.4563776	total: 151ms	remaining: 18.7s
4:	learn: 0.4273382	total: 152ms	remaining: 15.1s
5:	learn: 0.4044686	total: 153ms	remaining: 12.6s
6:	learn: 0.3864116	total: 154ms	remaining: 10.8s
7:	learn: 0.3728540	total: 155ms	remaining: 9.55s
8:	learn: 0.3619299	total: 156ms	remaining: 8.53s
9:	learn: 0.3523790	total: 158ms	remaining: 7.72s
10:	learn: 0.3442933	total: 159ms	remaining: 7.05s
11:	learn: 0.3389672	total: 159ms	remaining: 6.49s
12:	learn: 0.3349855	total: 160ms	remaining: 6s
13:	learn: 0.3309114	total: 161ms	remaining: 5.6s
14:	learn: 0.3280717	total: 162ms	remaining: 5.25s
15:	learn: 0.3256515	total: 164ms	remaining: 4.95s
16:	learn: 0.3237388	total: 164ms	remaining: 4.67s
17:	learn: 0.3218388	total: 165ms	remaining: 4.43s
18:	learn: 0.3197666	total: 166ms	remaining: 4.21s
19:	learn: 0.3182321	total: 167ms	remaining:

261:	learn: 0.2404661	total: 485ms	remaining: 441ms
262:	learn: 0.2402836	total: 488ms	remaining: 439ms
263:	learn: 0.2402384	total: 490ms	remaining: 438ms
264:	learn: 0.2401178	total: 492ms	remaining: 437ms
265:	learn: 0.2400106	total: 494ms	remaining: 435ms
266:	learn: 0.2398526	total: 496ms	remaining: 433ms
267:	learn: 0.2396000	total: 498ms	remaining: 431ms
268:	learn: 0.2393068	total: 500ms	remaining: 429ms
269:	learn: 0.2390942	total: 502ms	remaining: 428ms
270:	learn: 0.2388921	total: 505ms	remaining: 427ms
271:	learn: 0.2386852	total: 507ms	remaining: 425ms
272:	learn: 0.2384022	total: 510ms	remaining: 424ms
273:	learn: 0.2381093	total: 512ms	remaining: 422ms
274:	learn: 0.2381033	total: 514ms	remaining: 420ms
275:	learn: 0.2379195	total: 516ms	remaining: 419ms
276:	learn: 0.2378166	total: 518ms	remaining: 417ms
277:	learn: 0.2376080	total: 520ms	remaining: 416ms
278:	learn: 0.2369837	total: 522ms	remaining: 414ms
279:	learn: 0.2368075	total: 525ms	remaining: 412ms
280:	learn: 

432:	learn: 0.2127401	total: 829ms	remaining: 128ms
433:	learn: 0.2127260	total: 831ms	remaining: 126ms
434:	learn: 0.2125677	total: 833ms	remaining: 125ms
435:	learn: 0.2125201	total: 836ms	remaining: 123ms
436:	learn: 0.2122048	total: 838ms	remaining: 121ms
437:	learn: 0.2120814	total: 840ms	remaining: 119ms
438:	learn: 0.2120034	total: 842ms	remaining: 117ms
439:	learn: 0.2118042	total: 844ms	remaining: 115ms
440:	learn: 0.2116536	total: 847ms	remaining: 113ms
441:	learn: 0.2115146	total: 849ms	remaining: 111ms
442:	learn: 0.2114491	total: 851ms	remaining: 109ms
443:	learn: 0.2113318	total: 853ms	remaining: 108ms
444:	learn: 0.2111921	total: 855ms	remaining: 106ms
445:	learn: 0.2108410	total: 857ms	remaining: 104ms
446:	learn: 0.2107594	total: 859ms	remaining: 102ms
447:	learn: 0.2106331	total: 861ms	remaining: 99.9ms
448:	learn: 0.2102911	total: 863ms	remaining: 98ms
449:	learn: 0.2102419	total: 865ms	remaining: 96.1ms
450:	learn: 0.2101547	total: 867ms	remaining: 94.1ms
451:	learn

Unnamed: 0,param_model__depth,param_model__iterations,param_model__learning_rate,mean_test_score,std_test_score,mean_train_score,std_train_score
3,3,500,0.1,0.858945,0.012572,0.932408,0.002456
7,3,1000,0.1,0.858707,0.009864,0.958772,0.000785
15,5,500,0.1,0.856836,0.0074,0.965134,0.002232
11,3,1500,0.1,0.856746,0.010362,0.96756,0.001387


In [28]:
final_model_cbc = grid.best_estimator_

In [29]:
# Grid de hiperparámetros evaluados
# ==============================================================================
param_grid = {'model__n_estimators'  : [50, 100, 500, 1000],
              'model__max_features'  : ['auto', 'sqrt', 'log2'],
              'model__max_depth'     : [None, 1, 3, 5, 10, 20],
              'model__subsample'     : [0.5, 1],
              'model__learning_rate' : [0.001, 0.01, 0.1]
             }


GBC = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', GradientBoostingClassifier(random_state=123))])


# Búsqueda por grid search con validación cruzada
# ==============================================================================
grid = GridSearchCV(
        estimator  = GBC,
        param_grid = param_grid,
        scoring    = 'f1_weighted',
        n_jobs     = multiprocessing.cpu_count() - 1,
        cv         = RepeatedKFold(n_splits=3, n_repeats=1, random_state=123), 
        refit      = True,
        verbose    = 0,
        return_train_score = True
       )

grid.fit(X = X_train, y = y_train)

# Resultados
# ==============================================================================
resultados = pd.DataFrame(grid.cv_results_)
resultados.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False) \
    .head(4)

Unnamed: 0,param_model__learning_rate,param_model__max_depth,param_model__max_features,param_model__n_estimators,param_model__subsample,mean_test_score,std_test_score,mean_train_score,std_train_score
357,0.1,3,log2,500,1,0.861695,0.009437,0.958927,0.003567
379,0.1,5,log2,100,1,0.85899,0.012044,0.949885,0.005548
363,0.1,5,auto,100,1,0.858794,0.011062,0.958484,0.0034
231,0.01,5,sqrt,1000,1,0.857738,0.008431,0.949643,0.005448


In [30]:
final_model_gbc = grid.best_estimator_

In [31]:
from sklearn.ensemble import RandomForestClassifier

In [36]:
# Grid de hiperparámetros evaluados
# ==============================================================================
param_grid = {'model__n_estimators': [150, 200],
              'model__max_features': [5, 7, 9],
              'model__max_depth'   : [None, 3, 10, 20],
              'model__criterion'   : ['gini', 'entropy']
             }



RFC = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', RandomForestClassifier(random_state=123))])


# Búsqueda por grid search con validación cruzada
# ==============================================================================
grid = GridSearchCV(
        estimator  = RFC,
        param_grid = param_grid,
        scoring    = 'f1_weighted',
        n_jobs     = multiprocessing.cpu_count() - 1,
        cv         = RepeatedKFold(n_splits=5, n_repeats=3, random_state=123), 
        refit      = True,
        verbose    = 0,
        return_train_score = True
       )

grid.fit(X = X_train, y = y_train)

# Resultados
# ==============================================================================
resultados = pd.DataFrame(grid.cv_results_)
resultados.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False) \
    .head(4)

Unnamed: 0,param_model__criterion,param_model__max_depth,param_model__max_features,param_model__n_estimators,mean_test_score,std_test_score,mean_train_score,std_train_score
23,gini,20,9,200,0.857437,0.014202,0.972045,0.001951
19,gini,20,5,200,0.857266,0.014105,0.971827,0.001757
22,gini,20,9,150,0.857099,0.014671,0.971934,0.002095
43,entropy,20,5,200,0.857039,0.013687,0.969853,0.001978


In [1]:
final_model_rfc = grid.best_estimator_

NameError: name 'grid' is not defined

In [None]:
#Metric Results


metric_results_oos = pd.DataFrame({'KNN' :  [accuracy_score(y_test, final_model_knn.predict(X_test)),
    f1_score(y_test, final_model_knn.predict(X_test), average = 'weighted'),
    jaccard_score(y_test, final_model_knn.predict(X_test), average = 'weighted'),
    log_loss(y_test, final_model_knn.predict(X_test))],
                                     'Decision Tree' :  [accuracy_score(y_test, final_model_dt.predict(X_test)),
    f1_score(y_test, final_model_dt.predict(X_test), average = 'weighted'),
    jaccard_score(y_test, final_model_dt.predict(X_test), average = 'weighted'),
    log_loss(y_test, final_model_dt.predict(X_test))], 
                                     'SVM' :  [accuracy_score(y_test, final_model_svm.predict(X_test)),
    f1_score(y_test, final_model_svm.predict(X_test), average = 'weighted'),
    jaccard_score(y_test, final_model_svm.predict(X_test), average = 'weighted'),
    log_loss(y_test, final_model_svm.predict(X_test))],
                                     'Logit' :  [accuracy_score(y_test, final_model_logit.predict(X_test)),
    f1_score(y_test, final_model_logit.predict(X_test), average = 'weighted'),
    jaccard_score(y_test, final_model_logit.predict(X_test), average = 'weighted'),
    log_loss(y_test, final_model_logit.predict(X_test))],
                                    'GBC' :  [accuracy_score(y_test, final_model_gbc.predict(X_test)),
    f1_score(y_test, final_model_gbc.predict(X_test), average = 'weighted'),
    jaccard_score(y_test, final_model_gbc.predict(X_test), average = 'weighted'),
    log_loss(y_test, final_model_gbc.predict(X_test))],
                                    'CBC' :  [accuracy_score(y_test, final_model_cbc.predict(X_test)),
    f1_score(y_test, final_model_cbc.predict(X_test), average = 'weighted'),
    jaccard_score(y_test, final_model_cbc.predict(X_test), average = 'weighted'),
    log_loss(y_test, final_model_cbc.predict(X_test))]
                        })
     
     
# Change the row indexes
metric_results_oos.index = ['Accuracy', 'F1', 'Jaccard', 'Log-loss']


metric_results_oos

In [None]:
metric_results_ins = pd.DataFrame({'KNN' :  [accuracy_score(y_train, final_model_knn.predict(X_train)),
    f1_score(y_train, final_model_knn.predict(X_train), average = 'weighted'),
    jaccard_score(y_train, final_model_knn.predict(X_train), average = 'weighted'),
    log_loss(y_train, final_model_knn.predict(X_train))],
                                     'Decision Tree' :  [accuracy_score(y_train, final_model_dt.predict(X_train)),
    f1_score(y_train, final_model_dt.predict(X_train), average = 'weighted'),
    jaccard_score(y_train, final_model_dt.predict(X_train), average = 'weighted'),
    log_loss(y_train, final_model_dt.predict(X_train))], 
                                     'SVM' :  [accuracy_score(y_train, final_model_svm.predict(X_train)),
    f1_score(y_train, final_model_svm.predict(X_train), average = 'weighted'),
    jaccard_score(y_train, final_model_svm.predict(X_train), average = 'weighted'),
    log_loss(y_train, final_model_svm.predict(X_train))],
                                     'Logit' :  [accuracy_score(y_train, final_model_logit.predict(X_train)),
    f1_score(y_train, final_model_logit.predict(X_train), average = 'weighted'),
    jaccard_score(y_train, final_model_logit.predict(X_train), average = 'weighted'),
    log_loss(y_train, final_model_logit.predict(X_train))],
                                     'GBC' :  [accuracy_score(y_train, final_model_gbc.predict(X_train)),
    f1_score(y_train, final_model_gbc.predict(X_train), average = 'weighted'),
    jaccard_score(y_train, final_model_gbc.predict(X_train), average = 'weighted'),
    log_loss(y_train, final_model_gbc.predict(X_train))],
                                     'CBC' :  [accuracy_score(y_train, final_model_cbc.predict(X_train)),
    f1_score(y_train, final_model_cbc.predict(X_train), average = 'weighted'),
    jaccard_score(y_train, final_model_cbc.predict(X_train), average = 'weighted'),
    log_loss(y_train, final_model_cbc.predict(X_train))]
                        })
     
# Change the row indexes
metric_results_ins.index = ['Accuracy', 'F1', 'Jaccard', 'Log-loss']


metric_results_ins

In [None]:
from sklearn.calibration import calibration_curve
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import brier_score_loss
from sklearn.metrics import classification_report

In [None]:
df_pred = pd.read_csv(path + '/DB/Master.csv') 
pred = df_pred[['edad', 'pr_recup', 'log_prestamo', 'val_pren_pr',
          'genero','pres_antes', 'plan_gasto', 'pb', 'faltas',
          'masqueprepa']]

dict_classifiers = {
    "Logit": [final_model_logit, LOGIT],
    "KNN": [final_model_knn, KNN],
    "GBC": [final_model_gbc, GBC],
    "DT": [final_model_dt, DT],
    "CBC": [final_model_cbc, CBC]
}


In [None]:

prop_choose = df_pred[['prenda']]

for model in dict_classifiers:
    params = dict_classifiers[model][0].get_params()
    dict_classifiers[model][1].set_params(**params)
    dict_classifiers[model][1].fit(X,y)
    print(model,'accuracy :', accuracy_score(y,  dict_classifiers[model][1].predict(X)))
    dta = pd.DataFrame(dict_classifiers[model][1].predict_proba(pred))
    name0 = 'pr_'+model+'_0'
    name1 = 'pr_'+model+'_1'    
    dta.rename(columns={0: name0, 1: name1}, inplace=True)
    pre = pd.DataFrame(dict_classifiers[model][1].predict(pred))
    name = 'pre_'+model
    pre.rename(columns={0: name}, inplace=True)
    prop_choose = pd.concat([prop_choose, dta, pre], axis=1)

In [None]:
prop_choose

In [None]:
import os
prop_choose.to_csv(os.path.join(path + '/_aux/prop_choose.csv') , index = False)


In [None]:
importancia = permutation_importance(
                estimator    = GBC,
                X            = X,
                y            = y,
                n_repeats    = 5,
                scoring      = 'f1_weighted',
                n_jobs       = multiprocessing.cpu_count() - 1,
                random_state = 123
             )

# Se almacenan los resultados (media y desviación) en un dataframe
df_importancia = pd.DataFrame(
                    {k: importancia[k] for k in ['importances_mean', 'importances_std']}
                 )
df_importancia['feature'] = X_train.columns
df_importancia.sort_values('importances_mean', ascending=False)

In [None]:
import matplotlib.pyplot as plt
# Gráfico
fig, ax = plt.subplots(figsize=(5, 6))
df_importancia = df_importancia.sort_values('importances_mean', ascending=True)
ax.barh(
    df_importancia['feature'],
    df_importancia['importances_mean'],
    xerr=df_importancia['importances_std'],
    align='center',
    alpha=0
)
ax.plot(
    df_importancia['importances_mean'],
    df_importancia['feature'],
    marker="D",
    linestyle="",
    alpha=0.8,
    color="r"
)
ax.set_title('Importancia de los predictores (train)')
ax.set_xlabel('Incremento del error tras la permutación');