Author: Judit Lozano Gondolbeu

### Libraries

In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from dython.nominal import associations
from collections import Counter
from sklearn.linear_model import Lasso
import warnings
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgbm
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from explainerdashboard import InlineExplainer
from explainerdashboard import ClassifierExplainer, ExplainerDashboard

warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5000)

### Ejemplo de optimización de parámetros para múltiples modelos

El gasto computaciónal es muy grande al realizar una búsqueda de los mejores parámetros para varios modelos. 

Por ese motivo, mostraremos un ejemplo de como se llevaría a cabo este ejercicio con una muestra más pequeña y con unos datos aleatorios.

In [12]:
#with open('../data/train_ridge.pickle', 'wb') as f:
    #pickle.dump([X_train, y_train, X_test, y_test],f)

In [13]:
with open('../data/train_ridge.pickle', 'rb') as f:
    X_train, y_train, X_test, y_test = pickle.load(f)

In [14]:
import random

my_list = [0]*700 + [1]*300
random.shuffle(my_list)

fabricado = pd.DataFrame(my_list, columns=['random'])
fabricado['random'].value_counts()

0    700
1    300
Name: random, dtype: int64

In [15]:
X_train_mini = X_train[0:1000]

In [16]:
y_train_mini = fabricado['random']

In [17]:
X_test_mini = X_test[0:1000]

In [18]:
y_test_mini = fabricado['random']

### Modelos

In [23]:
dic = {
    'glm': {
        'model': LogisticRegression(),
        'params': {
            'C': [100, 10, 1.0, 0.1, 0.01],
            'penalty': ['l2'],
            'solver': ['newton-cg', 'lbfgs', 'liblinear']
        }    
    },
    'lss': {
        'model': Lasso(),
        'params': {
            'alpha': [1, 0.1, 0.01]
        }
    },
    
    'rf': {
        'model': RandomForestClassifier(),
        'params': {
            'criterion':['entropy', 'gini'],
            'bootstrap': [True, False],
            'max_depth': [10, 20, 30, 40, 50, None],
            'min_samples_leaf': [1, 2, 4, 8, 16],
            'min_samples_leaf': [4, 6, 8],
            'min_samples_split': [5, 7,10],
        }
    },
    
    'nb': {
        'model': GaussianNB(),
        'params': {
            'var_smoothing': np.logspace(0,-9, num=5)
        }
    },
    
    'xgb': {
        'model': xgb.XGBClassifier(),
        'params': {
            'max_depth': [3,6, 10, None],
            'min_child_weight': [3,6, None],
            'subsample': [0.6, 0.8, 1]
        }
    },
    
    'lgbm': {
        'model': lgbm.LGBMClassifier(),
        'params':{
            'class_weight': ['balance',None],
            'learning_rate': [0.1, 0.01],
            'min_split_gain': [0,5,15],
            'n_estimators': [100, 200, 1000],
            'num_leaves': [6, 31, 40],
            }
    },
    'sv': {
      'model': SVC(),
      'params': {
          'C': [0.1, 1, 10],
          'gamma': [1, 0.1, 0.01],
          'kernel': ['rbf', 'linear']
        }
    },
    'ad': {
        'model': AdaBoostClassifier(),
        'params':{
            'n_estimators':[10, 50, 100, 500, None],
            'learning_rate':[0.0001, 0.001, 0.01, 0.1, 1.0]

        }
    }  
    
}

In [26]:
modelres = {}

for i in dic:
    res = {}
    
    cv = GridSearchCV(dic[i]['model'], dic[i]['params'], verbose=2)
    
    res['cv_model'] = cv
    
    cv.fit(X_train_mini, y_train_mini)

    res['best_params'] = cv.best_params_

    res['score'] = cv.score(X_test_mini, y_test_mini)
    
    res['roc_auc_score'] = roc_auc_score(y_true= y_test_mini, y_score=cv.predict(X_test_mini))
    
    modelres[i] = res

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV] END ................C=100, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END ................C=100, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END ................C=100, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END ................C=100, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END ................C=100, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END ....................C=100, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END ....................C=100, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END ....................C=100, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END ....................C=100, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END ....................C=100, penalty=l2, solver=lbfgs; total time=   0.0s
[CV] END ................C=100, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ................C=100, penalty=l2, solv

In [27]:
df_res= pd.DataFrame(modelres)
df_res= df_res.T

In [28]:
df_res

Unnamed: 0,cv_model,best_params,score,roc_auc_score
glm,"GridSearchCV(estimator=LogisticRegression(),\n...","{'C': 0.01, 'penalty': 'l2', 'solver': 'newton...",0.7,0.5
lss,"GridSearchCV(estimator=Lasso(), param_grid={'a...",{'alpha': 1},0.0,0.5
rf,GridSearchCV(estimator=RandomForestClassifier(...,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.7,0.5
nb,"GridSearchCV(estimator=GaussianNB(),\n ...",{'var_smoothing': 1.0},0.678,0.496667
xgb,GridSearchCV(estimator=XGBClassifier(base_scor...,"{'max_depth': 3, 'min_child_weight': None, 'su...",0.628,0.494286
lgbm,"GridSearchCV(estimator=LGBMClassifier(),\n ...","{'class_weight': None, 'learning_rate': 0.1, '...",0.7,0.5
sv,"GridSearchCV(estimator=SVC(),\n pa...","{'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}",0.7,0.5
ad,"GridSearchCV(estimator=AdaBoostClassifier(),\n...","{'learning_rate': 0.001, 'n_estimators': 500}",0.698,0.498571


Se podría modificar la tabla e ir añadiendo criterios de información segun quisieramos.

A partir de ahí generaríamos las matrices de confusion y gráficas que representes la curva ROC, Lift y de ganancia. Nuestro modelo ganador sería el que mejor se ajuste a los criterios de información que nosotros estableciésemos.

En este caso, como la muestra es muy pequeña todos los modelos puntúan parecido. Otro parámetro para considerar a la hora de elegir un modelo sería el coste computacional.

### Ejemplo de un dashboard interactivo con un dataset pequeño:

In [36]:
model = df_res['cv_model'][5] #ejemplo con LGBMClassifier(),

In [37]:
explainer = ClassifierExplainer(model, X_test_mini, y_test_mini)

Note: shap values for shap='kernel' normally get calculated against X_background, but paramater X_background=None, so setting X_background=shap.sample(X, 50)...
Generating self.shap_explainer = shap.KernelExplainer(model, X, link='identity')


In [64]:
InlineExplainer(explainer).shap.dependence()
InlineExplainer(explainer).shap.overview()