In [1]:
import pandas            as pd
import numpy             as np
import matplotlib.pyplot as plt
import joblib            as jb
import seaborn           as sns

from sklearn.metrics                  import roc_auc_score
from sklearn.metrics                  import average_precision_score
from sklearn.model_selection          import GridSearchCV
from sklearn.model_selection          import RandomizedSearchCV
from sklearn.model_selection          import StratifiedKFold

from lightgbm                         import LGBMClassifier
from sklearn.ensemble                 import RandomForestClassifier
from sklearn.naive_bayes              import GaussianNB
from sklearn.neighbors                import KNeighborsClassifier
from sklearn.svm                      import SVC

%matplotlib inline

In [2]:
# Carregar os dataSet com as mehores features

X_train             = jb.load("../dados/treino/X_trainFinal.pkl.z")
y_train             = jb.load("../dados/treino/y_trainFinal.pkl.z")
X_test              = jb.load("../dados/teste/X_testeFinal.pkl.z")
y_test              = jb.load("../dados/teste/y_testeFinal.pkl.z")

# X_train.info()

# Random Forest (Parametros)

In [None]:
# Alimentar uma variavel com os possiveis valores atribuidos a cada parametro
grid_param = {'n_estimators':     [500, 800, 1500, 2000, 5000],
             'max_features':      ['auto','sqrt','log2'],
             'max_depth':         [10,24,32,64,128],
             'min_samples_split': [2,50,100,200,500],
             'min_samples_leaf':  [5,10,15,20,50]}
# Setup inicial do metodo de busca informando o modelo que sera utilizado
res_prm = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=1),
                             param_distributions=grid_param,
                             n_iter=500,
                             cv=5,
                            verbose=2,
                            random_state=42,n_jobs=6)
# Iniciar o trabalho de treinamento(fit) do modelo com cada combinação de parametro com objetivo
# de encontrar a melhor combinação.
# res_prm.fit(X_train, y_train) #descomentar esta linha para procurar os parametros
print(res_prm.best_params_)
'''
 {'n_estimators': 1500,
 'min_samples_split': 2,
 'min_samples_leaf': 5,
 'max_features': 'sqrt',
 'max_depth': 64}
'''

# # Random Forest (Treinamento)

In [3]:
# Utilizar o melhor parametro
mdl_rf = RandomForestClassifier(n_estimators=1500,
                                random_state=1,
                                max_depth=64,
                                max_features='sqrt',
                                min_samples_leaf=5, 
                                class_weight="balanced",
                                n_jobs=6,
                                min_samples_split=2)
mdl_rf.fit(X_train, y_train)
p_rf = mdl_rf.predict_proba(X_test)[:, 1]
print('Average:'      , average_precision_score(y_test, p_rf))
print('roc auc Score:', roc_auc_score(y_test, p_rf))

Average: 0.9410726637585005
roc auc Score: 0.8721428759020675


# LGBM

In [4]:
params = [0.01, 5, 2, 1, 1, 2000]
lr = params[0]
max_depth = params[1]
min_child_samples = params[2]
subsample = params[3]
colsample_bytree = params[4]
n_estimators = params[5]
mdl_lgbm = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth,
                          max_depth=max_depth,min_child_samples=min_child_samples,
                          subsample=subsample, colsample_bytree=colsample_bytree,
                          bagging_freq=1,n_estimators=n_estimators,
                          random_state=20,class_weight="balanced",n_jobs=6)
mdl_lgbm.fit(X_train, y_train)
p_lgbm = mdl_lgbm.predict_proba(X_test)[:, 1]
print('Average:'      ,average_precision_score(y_test, p_lgbm) )
print('roc:'          ,roc_auc_score(y_test, p_lgbm) )

Average: 0.9643243455221865
roc: 0.9251570017416255


# Naive Bayes (Gauss)

In [5]:
gnb = GaussianNB()
gnb.fit(
    X_train,
    y_train
)
p_nb = gnb.predict_proba(X_test)[:, 1]
print('Average:'      , average_precision_score(y_test, p_nb))
print('roc auc Score:', roc_auc_score(y_test, p_nb))
print('NB Score:'     , gnb.score(X_test, y_test) )

Average: 0.7216729326800552
roc auc Score: 0.5832553990934625
NB Score: 0.7098773935803829


# KNN

In [6]:
knn   = KNeighborsClassifier()
#Buscar melhores parametros
# k_list         = list(range(1,31))
# p              = list(range(1,10))
# weights        = ['uniform', 'distance']
# algorithm      = ['auto', 'ball_tree', 'kd_tree']
# leaf_size      = list(range(10,40))
# parametros     = dict(n_neighbors=k_list,
#                       p=p,
#                       weights=weights,
#                       algorithm=algorithm,
#                       leaf_size=leaf_size)
# grid = GridSearchCV(knn, parametros, cv=5, scoring='accuracy', n_jobs=6)
# grid.fit(X_train, y_train)
# grid = grid.best_params_
grid = {'algorithm':   'auto',
        'leaf_size':   28,
        'n_neighbors': 22,
        'p': 1,
        'weights': 'distance'}

# Utiliza os melhores parametros
knn   = KNeighborsClassifier(n_neighbors=grid['n_neighbors'], 
                             p=grid['p'],
                             weights=grid['weights'],
                             algorithm=grid['algorithm'],
                             leaf_size=grid['leaf_size'])
knn.fit( X_train, y_train)
p_knn = knn.predict_proba(X_test)[:, 1]
print('Average:'      , average_precision_score(y_test, p_knn))
print('roc auc Score:', roc_auc_score(y_test, p_knn))
print('KNN Score:'    , knn.score(X_test, y_test) )

Average: 0.8266299483788582
roc auc Score: 0.7179321514311299
KNN Score: 0.7346741975478716


# SVM

In [7]:
svclassifier = SVC(kernel='poly', gamma=1)
svclassifier.fit(X_train, y_train)
p_svn = svclassifier.predict(X_test)
p_svnScore = svclassifier.score(X_test, y_test)
print('Average:'      , average_precision_score(y_test, p_svn ))
print('Score Mean:', roc_auc_score(y_test, p_svn ))

Average: 0.6974789915966386
Score Mean: 0.5


# Geral

In [8]:
p = (p_rf + p_lgbm + p_nb + p_knn + p_svn ) / 5
average = pd.DataFrame({"RF":     [average_precision_score(y_test, p_rf)],
              "LGBM":   [average_precision_score(y_test, p_lgbm)],
              "NB":     [average_precision_score(y_test, p_nb)],
              "SVN":    [average_precision_score(y_test, p_svn)],
              "KNN":    [average_precision_score(y_test, p_knn)],
              "GERAL":  [average_precision_score(y_test, p)]})
roc = pd.DataFrame({"RF":     [roc_auc_score(y_test, p_rf)],
              "LGBM":   [roc_auc_score(y_test, p_lgbm)],
              "NB":     [roc_auc_score(y_test, p_nb)],
              "SVN":    [roc_auc_score(y_test, p_svn)],
              "KNN":    [roc_auc_score(y_test, p_knn)],
              "GERAL":  [roc_auc_score(y_test, p)]})
print(average), print(roc)

         RF      LGBM        NB       SVN      KNN     GERAL
0  0.941073  0.964324  0.721673  0.697479  0.82663  0.944403
         RF      LGBM        NB  SVN       KNN     GERAL
0  0.872143  0.925157  0.583255  0.5  0.717932  0.882449


(None, None)

# Salvar Modelos

In [9]:
jb.dump(mdl_lgbm     , "../modelos_treinados/LGBM/lgbm.pkl.z")
jb.dump(mdl_rf       , "../modelos_treinados/randomForest/random_forest.pkl.z")
jb.dump(gnb          , "../modelos_treinados/naiveBayes/naiveBayes.pkl.z")
jb.dump(knn          , "../modelos_treinados/KNN/knn.pkl.z")
jb.dump(svclassifier , "../modelos_treinados/SVM/SVM.pkl.z")

['../modelos_treinados/SVM/SVM.pkl.z']