# CUNEF MUCD 2022/2023

## Autor: Jose Antonio Nazar

In [1]:
#Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso, LogisticRegression
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn import svm
import statsmodels.api as sm
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LinearRegression

from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, \
                            silhouette_score, recall_score, precision_score, make_scorer, \
                            roc_auc_score, f1_score, precision_recall_curve
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.dummy import DummyClassifier 
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
import pickle
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score, log_loss
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.metrics import fbeta_score

from aux_func import cargar_modelo
from aux_func import evaluate_model
from imblearn.over_sampling import SMOTE


  from pandas import MultiIndex, Int64Index


In [2]:
#Read data
xtrain = pd.read_parquet("./datos/xtrain.parquet")
ytrain = pd.read_parquet("./datos/ytrain.parquet")
xtest = pd.read_parquet("./datos/xtest.parquet")
ytest = pd.read_parquet("./datos/ytest.parquet")

In [3]:
#Load preprocessor
preprocessor = cargar_modelo('./datos/preprocessor.pickle')

### Modelos

- Base
- Regresión logística y Lasso
- Random Forest
- XGBoost
- LightGBM
- SVM
- Ada Boost


Tras observar nuestro problema vemos que los datos están desbalanceados por lo que probarémos a usar en algunos modelos un oversample de la clase minoritaria mediante el algoritmo SMOTE. En función de los resultados optaré o no por usar el SMOTE en el resto de modelos

## Modelo Base

In [4]:
modelo_base = Pipeline(steps=[
    ('preprocesador', preprocessor),
    ('clasificador', DummyClassifier(strategy='most_frequent', random_state=0))])

In [5]:
#Training
#modelo_base.fit(xtrain, ytrain)

In [6]:
#Keep model with pickle
#with open('C:/Users/Jose/Documents/1-CUNEF/cuatri 1/aprendizaje automatico/fraudulentos/datos/modelo_base.pickle', 'wb') as f:
    #pickle.dump(modelo_base, f)

In [7]:
#Skip fit and run from this cell
with open('./datos/modelo_base.pickle', 'rb') as f:
    modelo_base = pickle.load(f)

In [8]:
#predictions
ypred = modelo_base.predict(xtest)
ypred_prob = modelo_base.predict_proba(xtest)
evaluate_model(ytest,ypred,ypred_prob)

ROC-AUC score of the model: 0.5
Accuracy of the model: 0.9989128102424719

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    209487
           1       0.00      0.00      0.00       228

    accuracy                           1.00    209715
   macro avg       0.50      0.50      0.50    209715
weighted avg       1.00      1.00      1.00    209715


Confusion matrix: 
[[209487      0]
 [   228      0]]



In [9]:
#F2 score
fbeta_score(ytest, ypred, beta=2, average='macro')

0.49989118638340757

## Modelo Regresión logistica y lasso

In [10]:
clf = Pipeline(steps=[
    ('preprocesador', preprocessor), 
    ('clasificador', LogisticRegression(C=1.5,random_state=0, n_jobs=2, penalty='l1', solver='liblinear', tol= 0.0005))])

In [11]:
#Training
#clf.fit(xtrain, ytrain)

In [12]:
#Keep model with pickle
#with open('C:/Users/Jose/Documents/1-CUNEF/cuatri 1/aprendizaje automatico/fraudulentos/datos/Lasso.pickle', 'wb') as f:
    #pickle.dump(clf, f)

In [13]:
#Skip fit and run from this cell
with open('./datos/Lasso.pickle', 'rb') as f:
    clf = pickle.load(f)

In [14]:
#predictions
ypred = clf.predict(xtest)
ypred_prob = clf.predict_proba(xtest)
evaluate_model(ytest, ypred, ypred_prob)

ROC-AUC score of the model: 0.9763121213651493
Accuracy of the model: 0.9992847435805736

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    209487
           1       0.91      0.38      0.54       228

    accuracy                           1.00    209715
   macro avg       0.95      0.69      0.77    209715
weighted avg       1.00      1.00      1.00    209715


Confusion matrix: 
[[209478      9]
 [   141     87]]



In [15]:
#threshold adjust

# keep probabilities for the positive outcome only
yhat = ypred_prob[:, 1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytest, yhat)

gmeans = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

ypred_new_threshold = (ypred_prob[:,1]>thresholds[ix]).astype(int)
evaluate_model(ytest,ypred_new_threshold,ypred_prob)

Best Threshold=0.001520, G-Mean=0.936
ROC-AUC score of the model: 0.9763121213651493
Accuracy of the model: 0.929256371742603

Classification report: 
              precision    recall  f1-score   support

           0       1.00      0.93      0.96    209487
           1       0.01      0.94      0.03       228

    accuracy                           0.93    209715
   macro avg       0.51      0.93      0.50    209715
weighted avg       1.00      0.93      0.96    209715


Confusion matrix: 
[[194665  14822]
 [    14    214]]



In [16]:
#F2 score
fbeta_score(ytest, ypred, beta=2, average='macro')

0.7156893280538892

**Modelo Regresión Logística y Lasso con SMOTE**

In [17]:
clf_S = Pipeline(steps=[
    ('preprocesador', preprocessor),
    ('smote', SMOTE(sampling_strategy='auto')),
    ('clasificador', LogisticRegression(C=1.5,random_state=0, penalty='l1', solver='liblinear', tol= 0.0005))])

In [18]:
#Training
#clf_S.fit(xtrain, ytrain)

In [19]:
#keep model with pickle
#with open('./datos/LassoSMOTE.pickle', 'wb') as f:
    #pickle.dump(clf_S, f)

In [20]:
#Skip fit and run from this cell
with open('./datos/LassoSMOTE.pickle', 'rb') as f:
    clf_S = pickle.load(f)

In [21]:
#predictions
ypred = clf_S.predict(xtest)
ypred_prob = clf_S.predict_proba(xtest)
evaluate_model(ytest, ypred, ypred_prob)

ROC-AUC score of the model: 0.9810568574409717
Accuracy of the model: 0.9490499010561954

Classification report: 
              precision    recall  f1-score   support

           0       1.00      0.95      0.97    209487
           1       0.02      0.94      0.04       228

    accuracy                           0.95    209715
   macro avg       0.51      0.95      0.51    209715
weighted avg       1.00      0.95      0.97    209715


Confusion matrix: 
[[198815  10672]
 [    13    215]]



In [22]:
#threshold adjust

# keep probabilities for the positive outcome only
yhat = ypred_prob[:, 1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytest, yhat)

gmeans = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

ypred_new_threshold = (ypred_prob[:,1]>thresholds[ix]).astype(int)
evaluate_model(ytest,ypred_new_threshold,ypred_prob)

Best Threshold=0.528625, G-Mean=0.949
ROC-AUC score of the model: 0.9810568574409717
Accuracy of the model: 0.9547147318980521

Classification report: 
              precision    recall  f1-score   support

           0       1.00      0.95      0.98    209487
           1       0.02      0.94      0.04       228

    accuracy                           0.95    209715
   macro avg       0.51      0.95      0.51    209715
weighted avg       1.00      0.95      0.98    209715


Confusion matrix: 
[[200004   9483]
 [    14    214]]



In [23]:
#F2 score
fbeta_score(ytest, ypred, beta=2, average='macro')

0.5249615422892426

Vemos que los resultados son muy parecidos en el modelo con y sin SMOTE, incluso siendo un poco peores con con el SMOTE por lo que probarémos con otros modelos

## Random Forest

In [24]:
clf_1 = Pipeline(steps=[
    ('preprocesador', preprocessor), 
    ('clasificador', RandomForestClassifier(n_jobs=-1, random_state=0))])

In [25]:
#Training
#clf_1.fit(xtrain, ytrain)

In [26]:
#keep model with pickle
#with open('./datos/Randomforest.pickle', 'wb') as f:
    #pickle.dump(clf_1, f)

In [27]:
#Skip fit and run from this cell
with open('./datos/RandomForest.pickle', 'rb') as f:
    clf_1 = pickle.load(f)

In [28]:
#predictions
ypred = clf_1.predict(xtest)
ypred_prob = clf_1.predict_proba(xtest)
evaluate_model(ytest,ypred,ypred_prob)

ROC-AUC score of the model: 0.9955077185629491
Accuracy of the model: 0.9999475478625754

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    209487
           1       1.00      0.96      0.98       228

    accuracy                           1.00    209715
   macro avg       1.00      0.98      0.99    209715
weighted avg       1.00      1.00      1.00    209715


Confusion matrix: 
[[209486      1]
 [    10    218]]



In [29]:
#threshold adjust

# keep probabilities for the positive outcome only
yhat = ypred_prob[:, 1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytest, yhat)

gmeans = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

ypred_new_threshold = (ypred_prob[:,1]>thresholds[ix]).astype(int)
evaluate_model(ytest,ypred_new_threshold,ypred_prob)

Best Threshold=0.090000, G-Mean=0.991
ROC-AUC score of the model: 0.9955077185629491
Accuracy of the model: 0.9998760222206328

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    209487
           1       0.91      0.98      0.94       228

    accuracy                           1.00    209715
   macro avg       0.96      0.99      0.97    209715
weighted avg       1.00      1.00      1.00    209715


Confusion matrix: 
[[209466     21]
 [     5    223]]



In [30]:
#F2 score
fbeta_score(ytest, ypred, beta=2, average='macro')

0.9818677644568494

In [31]:
#check the overfiting
ypred = clf_1.predict(xtrain)
ypred_prob = clf_1.predict_proba(xtrain)

# keep probabilities for the positive outcome only
yhat = ypred_prob[:, 1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytrain, yhat)

gmeans = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

ypred_new_threshold = (ypred_prob[:,1]>thresholds[ix]).astype(int)
evaluate_model(ytrain,ypred_new_threshold,ypred_prob)

Best Threshold=0.020000, G-Mean=0.991
ROC-AUC score of the model: 0.9927850989400407
Accuracy of the model: 0.9990344038337744

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    837946
           1       0.53      0.98      0.69       914

    accuracy                           1.00    838860
   macro avg       0.77      0.99      0.84    838860
weighted avg       1.00      1.00      1.00    838860


Confusion matrix: 
[[837154    792]
 [    18    896]]



podemos ver un cambio muy significativo por lo que el modelo no generaliza bien

**Modelo Randomforest con SMOTE**

In [32]:
clf_1_S = Pipeline(steps=[
    ('preprocesador', preprocessor),
    ('smote', SMOTE(sampling_strategy='auto', n_jobs=-1)),
    ('clasificador', RandomForestClassifier(n_jobs=-1, random_state=0))])

In [33]:
#Training
#clf_1_S.fit(xtrain, ytrain)

In [34]:
#keep model with pickle
#with open('./datos/RandomForestSMOTE.pickle', 'wb') as f:
    #pickle.dump(clf_1_S, f)

In [35]:
#Skip fit and run from this cell
with open('./datos/RandomForestSMOTE.pickle', 'rb') as f:
    clf_1_S = pickle.load(f)

In [36]:
#predictions
ypred = clf_1_S.predict(xtest)
ypred_prob = clf_1_S.predict_proba(xtest)
evaluate_model(ytest,ypred,ypred_prob)

ROC-AUC score of the model: 0.9999550384527482
Accuracy of the model: 0.9999475478625754

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    209487
           1       1.00      0.96      0.98       228

    accuracy                           1.00    209715
   macro avg       1.00      0.98      0.99    209715
weighted avg       1.00      1.00      1.00    209715


Confusion matrix: 
[[209486      1]
 [    10    218]]



In [37]:
#threshold adjust

# keep probabilities for the positive outcome only
yhat = ypred_prob[:, 1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytest, yhat)

gmeans = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

ypred_new_threshold = (ypred_prob[:,1]>thresholds[ix]).astype(int)
evaluate_model(ytest,ypred_new_threshold,ypred_prob)

Best Threshold=0.080000, G-Mean=0.997
ROC-AUC score of the model: 0.9999550384527482
Accuracy of the model: 0.9986362444269604

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    209487
           1       0.44      0.99      0.61       228

    accuracy                           1.00    209715
   macro avg       0.72      0.99      0.81    209715
weighted avg       1.00      1.00      1.00    209715


Confusion matrix: 
[[209203    284]
 [     2    226]]



In [38]:
#F2 score
fbeta_score(ytest, ypred, beta=2, average='macro')

0.9818677644568494

podemos observar mejores resultados si no usamos el algorítmo SMOTE. No obstante, haré una última prueba con el siguiente modelo.

In [39]:
#check the overfiting
ypred = clf_1_S.predict(xtrain)
ypred_prob = clf_1_S.predict_proba(xtrain)

# keep probabilities for the positive outcome only
yhat = ypred_prob[:, 1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytrain, yhat)

gmeans = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

ypred_new_threshold = (ypred_prob[:,1]>thresholds[ix]).astype(int)
evaluate_model(ytrain,ypred_new_threshold,ypred_prob)

Best Threshold=0.050000, G-Mean=0.993
ROC-AUC score of the model: 0.9975891698101989
Accuracy of the model: 0.9970936747490642

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    837946
           1       0.27      0.99      0.43       914

    accuracy                           1.00    838860
   macro avg       0.64      0.99      0.71    838860
weighted avg       1.00      1.00      1.00    838860


Confusion matrix: 
[[835521   2425]
 [    13    901]]



## XGBoost

In [40]:
clf_2 = Pipeline(steps=[
    ('preprocesador', preprocessor),
    ('clasificador', XGBClassifier(n_jobs=-1, random_state=0))])

In [41]:
#Training
#clf_2.fit(xtrain, ytrain)

In [42]:
#keep model with pickle
#with open('./datos/XGBoost.pickle', 'wb') as f:
    #pickle.dump(clf_2, f)

In [43]:
#Skip fit and run from this cell
with open('./datos/XGBoost.pickle', 'rb') as f:
    clf_2 = pickle.load(f)

In [44]:
#predictions
ypred = clf_2.predict(xtest)
ypred_prob = clf_2.predict_proba(xtest)
evaluate_model(ytest,ypred,ypred_prob)

ROC-AUC score of the model: 0.9999672550128513
Accuracy of the model: 0.9999475478625754

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    209487
           1       1.00      0.95      0.98       228

    accuracy                           1.00    209715
   macro avg       1.00      0.98      0.99    209715
weighted avg       1.00      1.00      1.00    209715


Confusion matrix: 
[[209487      0]
 [    11    217]]



In [45]:
#threshold adjust

# keep probabilities for the positive outcome only
yhat = ypred_prob[:, 1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytest, yhat)

gmeans = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

ypred_new_threshold = (ypred_prob[:,1]>thresholds[ix]).astype(int)
evaluate_model(ytest,ypred_new_threshold,ypred_prob)

Best Threshold=0.001511, G-Mean=0.998
ROC-AUC score of the model: 0.9999672550128513
Accuracy of the model: 0.9954128221634122

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    209487
           1       0.19      1.00      0.32       228

    accuracy                           1.00    209715
   macro avg       0.60      1.00      0.66    209715
weighted avg       1.00      1.00      1.00    209715


Confusion matrix: 
[[208526    961]
 [     1    227]]



In [46]:
#F2 score
fbeta_score(ytest, ypred, beta=2, average='macro')

0.9805084780963368

In [47]:
#check the overfiting
ypred = clf_2.predict(xtrain)
ypred_prob = clf_2.predict_proba(xtrain)

# keep probabilities for the positive outcome only
yhat = ypred_prob[:, 1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytrain, yhat)

gmeans = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

ypred_new_threshold = (ypred_prob[:,1]>thresholds[ix]).astype(int)
evaluate_model(ytrain,ypred_new_threshold,ypred_prob)

Best Threshold=0.002403, G-Mean=0.993
ROC-AUC score of the model: 0.9987324448365487
Accuracy of the model: 0.9967658488901605

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    837946
           1       0.25      0.99      0.40       914

    accuracy                           1.00    838860
   macro avg       0.63      0.99      0.70    838860
weighted avg       1.00      1.00      1.00    838860


Confusion matrix: 
[[835244   2702]
 [    11    903]]



**Modelo XGBoost con Smote**

In [48]:
clf_2_S = Pipeline(steps=[
    ('preprocesador', preprocessor),
    ('smote', SMOTE(sampling_strategy='auto', n_jobs=-1)),
    ('clasificador', XGBClassifier(n_jobs=-1, random_state=0))])


In [49]:
#Training
#clf_2_S.fit(xtrain, ytrain)

In [50]:
#keep model with pickle
#with open('./datos/XGBoostSMOTE.pickle', 'wb') as f:
    #pickle.dump(clf_2_S, f)

In [51]:
#Skip fit and run from this cell
with open('./datos/XGBoostSMOTE.pickle', 'rb') as f:
    clf_2_S = pickle.load(f)

In [52]:
#predictions
ypred = clf_2_S.predict(xtest)
ypred_prob = clf.predict_proba(xtest)
evaluate_model(ytest,ypred,ypred_prob)

ROC-AUC score of the model: 0.9763121213651493
Accuracy of the model: 0.9998712538445033

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    209487
           1       0.97      0.91      0.94       228

    accuracy                           1.00    209715
   macro avg       0.98      0.96      0.97    209715
weighted avg       1.00      1.00      1.00    209715


Confusion matrix: 
[[209480      7]
 [    20    208]]



In [53]:
#threshold adjust

# keep probabilities for the positive outcome only
yhat = ypred_prob[:, 1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytest, yhat)

gmeans = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

ypred_new_threshold = (ypred_prob[:,1]>thresholds[ix]).astype(int)
evaluate_model(ytest,ypred_new_threshold,ypred_prob)

Best Threshold=0.001520, G-Mean=0.936
ROC-AUC score of the model: 0.9763121213651493
Accuracy of the model: 0.929256371742603

Classification report: 
              precision    recall  f1-score   support

           0       1.00      0.93      0.96    209487
           1       0.01      0.94      0.03       228

    accuracy                           0.93    209715
   macro avg       0.51      0.93      0.50    209715
weighted avg       1.00      0.93      0.96    209715


Confusion matrix: 
[[194665  14822]
 [    14    214]]



In [54]:
#F2 score
fbeta_score(ytest, ypred, beta=2, average='macro')

0.9613790392532321

Tras observar los resultados como son mejores sin usar el SMOTE decido no probarlo con el resto de modelos

In [55]:
#check the overfiting
ypred = clf_2_S.predict(xtrain)
ypred_prob = clf_2_S.predict_proba(xtrain)

# keep probabilities for the positive outcome only
yhat = ypred_prob[:, 1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytrain, yhat)

gmeans = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

ypred_new_threshold = (ypred_prob[:,1]>thresholds[ix]).astype(int)
evaluate_model(ytrain,ypred_new_threshold,ypred_prob)

Best Threshold=0.015297, G-Mean=0.989
ROC-AUC score of the model: 0.9974634599501384
Accuracy of the model: 0.9919748229740362

Classification report: 
              precision    recall  f1-score   support

           0       1.00      0.99      1.00    837946
           1       0.12      0.98      0.21       914

    accuracy                           0.99    838860
   macro avg       0.56      0.99      0.60    838860
weighted avg       1.00      0.99      1.00    838860


Confusion matrix: 
[[831228   6718]
 [    14    900]]



### Modelo LightGBM

In [56]:
clf_3 = Pipeline(steps=[
    ('preprocesador', preprocessor),
    ('clasificador', lgb.LGBMClassifier(n_jobs=-1, random_state=0))])

In [57]:
#Training
#clf_3.fit(xtrain, ytrain)

In [58]:
#keep model with pickle
#with open('./datos/LightGBM.pickle', 'wb') as f:
    #pickle.dump(clf_3, f)

In [59]:
#Skip fit and run from this cell
with open('./datos/LightGBM.pickle', 'rb') as f:
    clf_3 = pickle.load(f)

In [60]:
#predictions
ypred = clf_3.predict(xtest)
ypred_prob = clf_3.predict_proba(xtest)
evaluate_model(ytest,ypred,ypred_prob)

ROC-AUC score of the model: 0.6673520397656465
Accuracy of the model: 0.9977350213384831

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    209487
           1       0.23      0.46      0.31       228

    accuracy                           1.00    209715
   macro avg       0.61      0.73      0.65    209715
weighted avg       1.00      1.00      1.00    209715


Confusion matrix: 
[[209135    352]
 [   123    105]]



In [61]:
#threshold adjust

# keep probabilities for the positive outcome only
yhat = ypred_prob[:, 1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytest, yhat)

gmeans = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

ypred_new_threshold = (ypred_prob[:,1]>thresholds[ix]).astype(int)
evaluate_model(ytest,ypred_new_threshold,ypred_prob)

Best Threshold=0.000116, G-Mean=0.684
ROC-AUC score of the model: 0.6673520397656465
Accuracy of the model: 0.9968004196170994

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    209487
           1       0.16      0.46      0.24       228

    accuracy                           1.00    209715
   macro avg       0.58      0.73      0.62    209715
weighted avg       1.00      1.00      1.00    209715


Confusion matrix: 
[[208938    549]
 [   122    106]]



In [62]:
#F2 score
fbeta_score(ytest, ypred, beta=2, average='macro')

0.6910148070979458

In [63]:
#check the overfitting

ypred = clf_3.predict(xtrain)
ypred_prob = clf_3.predict_proba(xtrain)

# keep probabilities for the positive outcome only
yhat = ypred_prob[:, 1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytrain, yhat)

gmeans = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

ypred_new_threshold = (ypred_prob[:,1]>thresholds[ix]).astype(int)
evaluate_model(ytrain,ypred_new_threshold,ypred_prob)

Best Threshold=0.000001, G-Mean=0.707
ROC-AUC score of the model: 0.6899348152613314
Accuracy of the model: 0.9893402951624825

Classification report: 
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    837946
           1       0.05      0.50      0.09       914

    accuracy                           0.99    838860
   macro avg       0.53      0.75      0.54    838860
weighted avg       1.00      0.99      0.99    838860


Confusion matrix: 
[[829458   8488]
 [   454    460]]



### Modelo SVM

In [64]:
clf_4 = Pipeline(steps=[
    ('preprocesador', preprocessor),
    ('clasificador', SVC(random_state=0, probability=True))])

In [65]:
#Training
#clf_4.fit(xtrain, ytrain)

In [66]:
#keep model with pickle
#with open('./datos/SVM.pickle', 'wb') as f:
    #pickle.dump(clf_4, f)

In [67]:
#Skip fit and run from this cell
with open('./datos/SVM.pickle', 'rb') as f:
    clf_4 = pickle.load(f)

In [68]:
#predictions
ypred = clf_4.predict(xtest)
ypred_prob = clf_4.predict_proba(xtest)
evaluate_model(ytest,ypred,ypred_prob)

ROC-AUC score of the model: 0.9022037878831655
Accuracy of the model: 0.9992609016999261

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    209487
           1       1.00      0.32      0.49       228

    accuracy                           1.00    209715
   macro avg       1.00      0.66      0.74    209715
weighted avg       1.00      1.00      1.00    209715


Confusion matrix: 
[[209487      0]
 [   155     73]]



In [69]:
#threshold adjust

# keep probabilities for the positive outcome only
yhat = ypred_prob[:, 1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytest, yhat)

gmeans = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

ypred_new_threshold = (ypred_prob[:,1]>thresholds[ix]).astype(int)
evaluate_model(ytest,ypred_new_threshold,ypred_prob)

Best Threshold=0.003078, G-Mean=0.854
ROC-AUC score of the model: 0.9022037878831655
Accuracy of the model: 0.9899101161099587

Classification report: 
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    209487
           1       0.08      0.73      0.14       228

    accuracy                           0.99    209715
   macro avg       0.54      0.86      0.57    209715
weighted avg       1.00      0.99      0.99    209715


Confusion matrix: 
[[207432   2055]
 [    61    167]]



In [70]:
#F2 score
fbeta_score(ytest, ypred, beta=2, average='macro')

0.6852052084932867

In [71]:
#check the overfitting

ypred = clf_4.predict(xtrain)
ypred_prob = clf_4.predict_proba(xtrain)

# keep probabilities for the positive outcome only
yhat = ypred_prob[:, 1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytrain, yhat)

gmeans = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

ypred_new_threshold = (ypred_prob[:,1]>thresholds[ix]).astype(int)
evaluate_model(ytrain,ypred_new_threshold,ypred_prob)

Best Threshold=0.002233, G-Mean=0.880
ROC-AUC score of the model: 0.9224268405016892
Accuracy of the model: 0.9750423193381494

Classification report: 
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    837946
           1       0.03      0.79      0.06       914

    accuracy                           0.98    838860
   macro avg       0.52      0.88      0.53    838860
weighted avg       1.00      0.98      0.99    838860


Confusion matrix: 
[[817200  20746]
 [   190    724]]



### Modelo ADA Boost

In [72]:
clf_5 = Pipeline(steps=[
    ('preprocesador', preprocessor),
    ('clasificador', AdaBoostClassifier(n_estimators=100, random_state=0))])

In [73]:
#Training
#clf_5.fit(xtrain, ytrain)

In [74]:
#keep model with pickle
#with open('./datos/ADABoost.pickle', 'wb') as f:
    #pickle.dump(clf_5, f)

In [75]:
#Skip fit and run from this cell
with open('./datos/ADABoost.pickle', 'rb') as f:
    clf_5 = pickle.load(f)

In [76]:
#predictions
ypred = clf_5.predict(xtest)
ypred_prob = clf_5.predict_proba(xtest)
evaluate_model(ytest,ypred,ypred_prob)

ROC-AUC score of the model: 0.9971091243027348
Accuracy of the model: 0.9995136256347901

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    209487
           1       0.93      0.60      0.73       228

    accuracy                           1.00    209715
   macro avg       0.96      0.80      0.86    209715
weighted avg       1.00      1.00      1.00    209715


Confusion matrix: 
[[209476     11]
 [    91    137]]



In [77]:
#threshold adjust

# keep probabilities for the positive outcome only
yhat = ypred_prob[:, 1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytest, yhat)

gmeans = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

ypred_new_threshold = (ypred_prob[:,1]>thresholds[ix]).astype(int)
evaluate_model(ytest,ypred_new_threshold,ypred_prob)

Best Threshold=0.482597, G-Mean=0.975
ROC-AUC score of the model: 0.9971091243027348
Accuracy of the model: 0.9718284338268602

Classification report: 
              precision    recall  f1-score   support

           0       1.00      0.97      0.99    209487
           1       0.04      0.97      0.07       228

    accuracy                           0.97    209715
   macro avg       0.52      0.97      0.53    209715
weighted avg       1.00      0.97      0.98    209715


Confusion matrix: 
[[203585   5902]
 [     6    222]]



In [78]:
#F2 score
fbeta_score(ytest, ypred, beta=2, average='macro')

0.8230487693291013

In [79]:
#check the overfitting

ypred = clf_5.predict(xtrain)
ypred_prob = clf_5.predict_proba(xtrain)

# keep probabilities for the positive outcome only
yhat = ypred_prob[:, 1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytrain, yhat)

gmeans = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

ypred_new_threshold = (ypred_prob[:,1]>thresholds[ix]).astype(int)
evaluate_model(ytrain,ypred_new_threshold,ypred_prob)

Best Threshold=0.482907, G-Mean=0.968
ROC-AUC score of the model: 0.9957325694404952
Accuracy of the model: 0.9743818992442124

Classification report: 
              precision    recall  f1-score   support

           0       1.00      0.97      0.99    837946
           1       0.04      0.96      0.08       914

    accuracy                           0.97    838860
   macro avg       0.52      0.97      0.53    838860
weighted avg       1.00      0.97      0.99    838860


Confusion matrix: 
[[816492  21454]
 [    36    878]]



### Modelo CatBoost

In [80]:
clf_7 = Pipeline(steps=[
    ('preprocesador', preprocessor),
    ('clasificador', CatBoostClassifier(random_state=0, task_type="GPU"))])

In [81]:
#Training
#clf_7.fit(xtrain, ytrain)

In [82]:
#keep model with pickle
#with open('./datos/catboost.pickle', 'wb') as f:
    #pickle.dump(clf_7, f)

In [83]:
#Skip fit and run from this cell
with open('./datos/catboost.pickle', 'rb') as f:
    clf_7 = pickle.load(f)

In [84]:
#predictions
ypred = clf_7.predict(xtest)
ypred_prob = clf_7.predict_proba(xtest)
evaluate_model(ytest,ypred,ypred_prob)

ROC-AUC score of the model: 0.999371417679563
Accuracy of the model: 0.9998283384593377

Classification report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    209487
           1       1.00      0.84      0.91       228

    accuracy                           1.00    209715
   macro avg       1.00      0.92      0.96    209715
weighted avg       1.00      1.00      1.00    209715


Confusion matrix: 
[[209487      0]
 [    36    192]]



In [85]:
#threshold adjust

# keep probabilities for the positive outcome only
yhat = ypred_prob[:, 1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytest, yhat)

gmeans = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

ypred_new_threshold = (ypred_prob[:,1]>thresholds[ix]).astype(int)
evaluate_model(ytest,ypred_new_threshold,ypred_prob)

Best Threshold=0.002027, G-Mean=0.989
ROC-AUC score of the model: 0.999371417679563
Accuracy of the model: 0.9862909186276614

Classification report: 
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    209487
           1       0.07      0.99      0.14       228

    accuracy                           0.99    209715
   macro avg       0.54      0.99      0.56    209715
weighted avg       1.00      0.99      0.99    209715


Confusion matrix: 
[[206615   2872]
 [     3    225]]



In [86]:
#F2 score
fbeta_score(ytest, ypred, beta=2, average='macro')

0.9347654244490238

In [87]:
#check the overfitting

ypred = clf_7.predict(xtrain)
ypred_prob = clf_7.predict_proba(xtrain)

# keep probabilities for the positive outcome only
yhat = ypred_prob[:, 1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytrain, yhat)

gmeans = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

ypred_new_threshold = (ypred_prob[:,1]>thresholds[ix]).astype(int)
evaluate_model(ytrain,ypred_new_threshold,ypred_prob)

Best Threshold=0.003693, G-Mean=0.987
ROC-AUC score of the model: 0.9975156272114192
Accuracy of the model: 0.9923264907135875

Classification report: 
              precision    recall  f1-score   support

           0       1.00      0.99      1.00    837946
           1       0.12      0.98      0.22       914

    accuracy                           0.99    838860
   macro avg       0.56      0.99      0.61    838860
weighted avg       1.00      0.99      1.00    838860


Confusion matrix: 
[[831526   6420]
 [    17    897]]



### Modelo GLM

In [88]:
glm = Pipeline(steps=[
    ('preprocesador', preprocessor),
    ('clasificador', LinearRegression())])

In [89]:
#glm.fit(xtrain, ytrain)

In [90]:
# keep model with pickle
#with open('./datos/glm.pickle', 'wb') as f:
    #pickle.dump(glm, f)

In [91]:
#Skip fit and run from this cell
with open('./datos/glm.pickle', 'rb') as f:
    glm = pickle.load(f)

In [92]:
#predictions
ypred = glm.predict(xtest)
mean_squared_error(ytest, ypred)

0.000941666669492876