# Códigos para realizar classificação
Com diferentes opções de validação (Kfold, leave-one-out e holdout)
* Regressão Logística
* Naive Bayes
* KNN
* Árvore de Decisão
* Random Forest
* Gradient Boosting
* MLP
* SVM

In [30]:
import numpy as np
import pandas as pd
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
sns.set_context('notebook')

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV
#import lightgbm as lgb

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import make_scorer, f1_score, roc_auc_score, auc, roc_curve, precision_score, recall_score, classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold, KFold, LeaveOneOut
from sklearn.neighbors import DistanceMetric
from sklearn import datasets

from scipy import stats

from time import time

# Gerando base de dados (iris)
Base com resposta com 3 níveis

In [2]:
iris = datasets.load_iris()

dados = pd.DataFrame(iris.data)
dados.columns = iris.feature_names
dados['TARGET'] = iris.target

X = dados.drop('TARGET',axis=1)
Y = dados['TARGET']

dados.head(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),TARGET
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


# Gerando base de dados (breast_cancer)
Base com resposta com 2 níveis

In [3]:
cancer = datasets.load_breast_cancer()

dados2 = pd.DataFrame(cancer.data)
dados2.columns = cancer.feature_names
dados2['TARGET'] = cancer.target

X_2 = dados2.drop('TARGET',axis=1)
Y_2 = dados2['TARGET']

dados2.head(5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,TARGET
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


# Definindo funções para diferentes tipos de validação
* K-folds
* Leave one out (kfolds com k=n)
* Holdout

In [4]:
# Kfolds / Leave one out para resposta com 2 níveis

def model_classif_cv (model, X, y, cv, metrics):

    first = True

    for train_index, test_index in cv.split(X):
        #print(train_index)
        model2 = model.fit(X.iloc[train_index],y[train_index])
        
        pred_train = model2.predict(X.iloc[train_index])
        pred_test = model2.predict(X.iloc[test_index])
        
        prob_train = model2.predict_proba(X.iloc[train_index])
        prob_test = model2.predict_proba(X.iloc[test_index])

        prob1_train = pd.DataFrame(prob_train).iloc[:,1]
        prob1_test = pd.DataFrame(prob_test).iloc[:,1]

        prob0_train = pd.DataFrame(prob_train).iloc[:,0]
        prob0_test = pd.DataFrame(prob_test).iloc[:,0]
        
        y_train = y[train_index]
        y_test = y[test_index]
        
        train_results = pd.concat([y_train.reset_index(drop=True), prob1_train], axis = 1)
        test_results = pd.concat([y_test.reset_index(drop=True), prob1_test], axis = 1)
        train_results.columns = ['y_train', 'prob1']
        test_results.columns = ['y_test', 'prob1']
            
        first_metric = True
        for metric in metrics:
            name_metric = metric.__name__
            
            if metric == roc_auc_score:
                m_tr = metric(y_train, prob1_train)
                m_te = metric(y_test, prob1_test)
                
            else:
                m_tr = metric(y_train, pred_train)
                m_te = metric(y_test, pred_test)
            
            m_tr_te = {
                str(name_metric) +'- 1.Treino': [m_tr],
                str(name_metric) +'- 2.Teste' : [m_te]
            }
            resultados_aux = pd.DataFrame(m_tr_te)
            
            if first_metric == True:
                results_folds = resultados_aux
                first_metric = False
            else:
                results_folds = pd.concat([results_folds, resultados_aux], axis = 1)    
                
        train_prob1_True1 = train_results[train_results.y_train == 1][['prob1']]
        train_prob1_True0 = train_results[train_results.y_train == 0][['prob1']]
        
        test_prob1_True1 = test_results[test_results.y_test == 1][['prob1']]
        test_prob1_True0 = test_results[test_results.y_test == 0][['prob1']]
        
        m_tr_te_ks = {
                    'ks - 1.Treino': [stats.ks_2samp(train_prob1_True1.prob1, train_prob1_True0.prob1).statistic],
                    'ks - 2.Teste' : [stats.ks_2samp(test_prob1_True1.prob1, test_prob1_True0.prob1).statistic]
        }  
                       
        resultados_aux = pd.DataFrame(m_tr_te_ks)
        results_folds = pd.concat([results_folds, resultados_aux], axis = 1)    
    
        if first == True:
            results = results_folds
            first = False
        else:
            results = pd.concat([results, results_folds], axis = 0)    

    results.index = range(cv.get_n_splits(X))
    results_mean = np.transpose(pd.DataFrame(results.mean(), columns=['mean']))
    results = pd.concat([results, results_mean], axis = 0)

    return results

In [5]:
# Kfolds / Leave one out para resposta com mais de 2 níveis
# Usamos apenas:
#accuracy_score, f1_score, precision_score, recall_score
# é necessário definir o parâmetro average (av), pois o default é binary
# micro, macro ou weighted

def model_classif_cv_m (model, X, y, cv, metrics, av):

    first = True

    for train_index, test_index in cv.split(X):
        #print(train_index)
        model2 = model.fit(X.iloc[train_index],y[train_index])
        
        pred_train = model2.predict(X.iloc[train_index])
        pred_test = model2.predict(X.iloc[test_index])
        
        prob_train = model2.predict_proba(X.iloc[train_index])
        prob_test = model2.predict_proba(X.iloc[test_index])

        y_train = y[train_index]
        y_test = y[test_index]
        
                  
        first_metric = True
        for metric in metrics:
            name_metric = metric.__name__
            
            if metric == accuracy_score:
                m_tr = metric(y_train, pred_train)
                m_te = metric(y_test, pred_test)
                
            else:
                m_tr = metric(y_train, pred_train, average = av)
                m_te = metric(y_test, pred_test, average = av)
            
            m_tr_te = {
                str(name_metric) +'- 1.Treino': [m_tr],
                str(name_metric) +'- 2.Teste' : [m_te]
            }
            resultados_aux = pd.DataFrame(m_tr_te)
            
            if first_metric == True:
                results_folds = resultados_aux
                first_metric = False
            else:
                results_folds = pd.concat([results_folds, resultados_aux], axis = 1)    
   
    
        if first == True:
            results = results_folds
            first = False
        else:
            results = pd.concat([results, results_folds], axis = 0)    

    results.index = range(cv.get_n_splits(X))
    results_mean = np.transpose(pd.DataFrame(results.mean(), columns=['mean']))
    results = pd.concat([results, results_mean], axis = 0)

    return results

In [None]:
# Validação holdout resposta binária

def model_classif_holdout (clf, X_train, y_train, X_test, y_test, metrics):
    
    clf2 = clf.fit(X_train, y_train)
       
    pred_train = clf2.predict(X_train)
    pred_test = clf2.predict(X_test)

    prob_train = clf2.predict_proba(X_train)
    prob_test = clf2.predict_proba(X_test)

    prob1_train = pd.DataFrame(prob_train).iloc[:,1]
    prob1_test = pd.DataFrame(prob_test).iloc[:,1]
    
    prob0_train = pd.DataFrame(prob_train).iloc[:,0]
    prob0_test = pd.DataFrame(prob_test).iloc[:,0]
    
    train_results = pd.concat([y_train, prob1_train], axis = 1)
    test_results = pd.concat([y_test, prob1_test], axis = 1)
    train_results.columns = ['y_train', 'prob1']
    test_results.columns = ['y_test', 'prob1']
    
    first_metric = True
    
    for metric in metrics:
            name_metric = metric.__name__
            
            if metric == roc_auc_score:
                m_tr = metric(y_train, prob1_train)
                m_te = metric(y_test, prob1_test)

            else:
                m_tr = metric(y_train, pred_train)
                m_te = metric(y_test, pred_test)
            
            m_tr_te = {
                '1.Treino': [m_tr],
                '2.Teste' : [m_te]
            }
            
            resultados_aux = pd.DataFrame(m_tr_te, index = [str(name_metric)])
            #print(resultados_aux)
            if first_metric == True:
                results_folds = resultados_aux
                first_metric = False
            else:
                results_folds = pd.concat([results_folds, resultados_aux], axis = 0)    
    
    train_prob1_True1 = train_results[train_results.y_train == 1][['prob1']]
    train_prob1_True0 = train_results[train_results.y_train == 0][['prob1']]
    
    test_prob1_True1 = test_results[test_results.y_test == 1][['prob1']]
    test_prob1_True0 = test_results[test_results.y_test == 0][['prob1']]
    
    
    m_tr_te_ks = {
                '1.Treino': [stats.ks_2samp(train_prob1_True1.prob1, train_prob1_True0.prob1).statistic],
                '2.Teste' : [stats.ks_2samp(test_prob1_True1.prob1, test_prob1_True0.prob1).statistic]
    }
    
    
    resultados_aux = pd.DataFrame(m_tr_te_ks, index = ['KS'])
    results_folds = pd.concat([results_folds, resultados_aux], axis = 0)    
    
    
    return results_folds

In [None]:
# Validação holdout resposta multiclass

def model_classif_holdout_m (clf, X_train, y_train, X_test, y_test, metrics, av):
    
    clf2 = clf.fit(X_train, y_train)
       
    pred_train = clf2.predict(X_train)
    pred_test = clf2.predict(X_test)

    prob_train = clf2.predict_proba(X_train)
    prob_test = clf2.predict_proba(X_test)
    
    first_metric = True
    
    for metric in metrics:
            name_metric = metric.__name__
            
            if metric == accuracy_score:
                m_tr = metric(y_train, pred_train)
                m_te = metric(y_test, pred_test)

            else:
                m_tr = metric(y_train, pred_train, average = av)
                m_te = metric(y_test, pred_test, average = av)
            
            m_tr_te = {
                '1.Treino': [m_tr],
                '2.Teste' : [m_te]
            }
            
            resultados_aux = pd.DataFrame(m_tr_te, index = [str(name_metric)])
            #print(resultados_aux)
            if first_metric == True:
                results_folds = resultados_aux
                first_metric = False
            else:
                results_folds = pd.concat([results_folds, resultados_aux], axis = 0)    
    
    train_prob1_True1 = train_results[train_results.y_train == 1][['prob1']]
    train_prob1_True0 = train_results[train_results.y_train == 0][['prob1']]
    
    test_prob1_True1 = test_results[test_results.y_test == 1][['prob1']]
    test_prob1_True0 = test_results[test_results.y_test == 0][['prob1']]
    
    
    m_tr_te_ks = {
                '1.Treino': [stats.ks_2samp(train_prob1_True1.prob1, train_prob1_True0.prob1).statistic],
                '2.Teste' : [stats.ks_2samp(test_prob1_True1.prob1, test_prob1_True0.prob1).statistic]
    }
    
    
    resultados_aux = pd.DataFrame(m_tr_te_ks, index = ['KS'])
    results_folds = pd.concat([results_folds, resultados_aux], axis = 0)    
    
    
    return results_folds

# Regressão logística

In [6]:
LR = LogisticRegression(penalty='l2', 
                                     dual=False, 
                                     tol=0.0001, 
                                     C=1.0, 
                                     fit_intercept=True, 
                                     intercept_scaling=1, 
                                     class_weight=None, 
                                     random_state=None, 
                                     solver='liblinear', 
                                     max_iter=100, 
                                     multi_class='ovr', 
                                     verbose=0, 
                                     warm_start=False, 
                                     n_jobs=1)

LR.fit(X_2,Y_2)
y_pred = LR.predict(X_2)
accuracy_score(Y_2, y_pred)

0.95957820738137078

In [7]:
# resposta binaria
metrics  = [f1_score, precision_score , recall_score,  accuracy_score, roc_auc_score]


cv_kfold = KFold(10, random_state=12)
cv_loo = LeaveOneOut()

model_classif_cv(LR, X_2, Y_2, cv_kfold, metrics)

Unnamed: 0,f1_score- 1.Treino,f1_score- 2.Teste,precision_score- 1.Treino,precision_score- 2.Teste,recall_score- 1.Treino,recall_score- 2.Teste,accuracy_score- 1.Treino,accuracy_score- 2.Teste,roc_auc_score- 1.Treino,roc_auc_score- 2.Teste,ks - 1.Treino,ks - 2.Teste
0,0.974138,0.758621,0.968571,0.611111,0.979769,1.0,0.964844,0.877193,0.994446,0.998024,0.935197,0.978261
1,0.966049,0.958904,0.960123,0.921053,0.97205,1.0,0.957031,0.947368,0.995374,0.992208,0.931154,0.925974
2,0.964561,0.958904,0.954268,0.945946,0.975078,0.972222,0.955078,0.947368,0.994422,0.996032,0.933454,0.944444
3,0.962179,0.966667,0.954955,0.935484,0.969512,1.0,0.951172,0.964912,0.994184,0.997537,0.923383,0.965517
4,0.966767,0.965517,0.958084,0.965517,0.97561,0.965517,0.957031,0.964912,0.994085,0.998768,0.925106,0.965517
5,0.966614,0.966292,0.958991,0.977273,0.974359,0.955556,0.958984,0.947368,0.995994,0.987037,0.930513,0.916667
6,0.96378,0.987952,0.959248,0.97619,0.968354,1.0,0.955078,0.982456,0.994139,0.996951,0.930896,0.95122
7,0.966825,0.976744,0.95625,1.0,0.977636,0.954545,0.958984,0.964912,0.994782,1.0,0.940196,1.0
8,0.965079,0.964706,0.958991,1.0,0.971246,0.931818,0.957031,0.947368,0.994702,0.991259,0.931976,0.931818
9,0.96519,0.976744,0.959119,0.976744,0.971338,0.976744,0.957115,0.964286,0.994815,0.996422,0.932129,0.953488


Naive Bayes

In [8]:
NB = GaussianNB(priors=None)

NB.fit(X,Y)
y_pred = NB.predict(X)
accuracy_score(Y, y_pred)

0.95999999999999996

In [9]:
# resposta binaria
metrics  = [f1_score, precision_score , recall_score,  accuracy_score, roc_auc_score]


cv_kfold = KFold(10, random_state=12)
cv_loo = LeaveOneOut()

model_classif_cv(NB, X_2, Y_2, cv_kfold, metrics)

Unnamed: 0,f1_score- 1.Treino,f1_score- 2.Teste,precision_score- 1.Treino,precision_score- 2.Teste,recall_score- 1.Treino,recall_score- 2.Teste,accuracy_score- 1.Treino,accuracy_score- 2.Teste,roc_auc_score- 1.Treino,roc_auc_score- 2.Teste,ks - 1.Treino,ks - 2.Teste
0,0.961484,0.785714,0.949296,0.647059,0.973988,1.0,0.947266,0.894737,0.989815,0.998024,0.903162,0.978261
1,0.960123,0.888889,0.948485,0.864865,0.97205,0.914286,0.949219,0.859649,0.989703,0.963636,0.907257,0.885714
2,0.954128,0.929577,0.936937,0.942857,0.971963,0.916667,0.941406,0.912281,0.989773,0.968254,0.909396,0.876984
3,0.956522,0.935484,0.941003,0.878788,0.972561,1.0,0.943359,0.929825,0.988584,1.0,0.89316,1.0
4,0.953383,0.949153,0.940653,0.933333,0.966463,0.965517,0.939453,0.947368,0.987921,0.995074,0.90403,0.931034
5,0.949843,0.977778,0.929448,0.977778,0.971154,0.977778,0.9375,0.964912,0.989199,0.985185,0.904103,0.894444
6,0.950311,0.987952,0.932927,0.97619,0.968354,1.0,0.9375,0.982456,0.988294,0.995427,0.905386,0.9375
7,0.95,0.977273,0.929664,0.977273,0.971246,0.977273,0.9375,0.964912,0.988681,0.996503,0.897266,0.977273
8,0.949686,0.964706,0.934985,1.0,0.964856,0.931818,0.9375,0.947368,0.989869,0.994755,0.903656,0.977273
9,0.951487,0.977273,0.935385,0.955556,0.968153,1.0,0.939571,0.964286,0.988829,0.996422,0.896105,0.976744


In [10]:
# resposta multiclass
# av = micro, macro ou weighted
metrics  = [f1_score, precision_score , recall_score,  accuracy_score]


cv_kfold = KFold(10, random_state=12)
cv_loo = LeaveOneOut()
av = 'micro'

model_classif_cv_m(NB, X, Y, cv_kfold, metrics, av)

Unnamed: 0,f1_score- 1.Treino,f1_score- 2.Teste,precision_score- 1.Treino,precision_score- 2.Teste,recall_score- 1.Treino,recall_score- 2.Teste,accuracy_score- 1.Treino,accuracy_score- 2.Teste
0,0.955556,1.0,0.955556,1.0,0.955556,1.0,0.955556,1.0
1,0.955556,1.0,0.955556,1.0,0.955556,1.0,0.955556,1.0
2,0.955556,1.0,0.955556,1.0,0.955556,1.0,0.955556,1.0
3,0.962963,0.933333,0.962963,0.933333,0.962963,0.933333,0.962963,0.933333
4,0.962963,0.933333,0.962963,0.933333,0.962963,0.933333,0.962963,0.933333
5,0.962963,0.866667,0.962963,0.866667,0.962963,0.866667,0.962963,0.866667
6,0.962963,1.0,0.962963,1.0,0.962963,1.0,0.962963,1.0
7,0.962963,0.866667,0.962963,0.866667,0.962963,0.866667,0.962963,0.866667
8,0.97037,0.866667,0.97037,0.866667,0.97037,0.866667,0.97037,0.866667
9,0.962963,1.0,0.962963,1.0,0.962963,1.0,0.962963,1.0


# KNN

In [11]:
KNN = KNeighborsClassifier(n_neighbors=5, 
                           weights='uniform', 
                           leaf_size=30,
                           p=2, 
                           metric='minkowski', 
                           metric_params=None, 
                           n_jobs=1)

KNN.fit(X,Y)
y_pred = KNN.predict(X)
accuracy_score(Y, y_pred)

0.96666666666666667

In [12]:
# resposta binaria
metrics  = [f1_score, precision_score , recall_score,  accuracy_score, roc_auc_score]


cv_kfold = KFold(10, random_state=12)
cv_loo = LeaveOneOut()

model_classif_cv(KNN, X_2, Y_2, cv_kfold, metrics)

Unnamed: 0,f1_score- 1.Treino,f1_score- 2.Teste,precision_score- 1.Treino,precision_score- 2.Teste,recall_score- 1.Treino,recall_score- 2.Teste,accuracy_score- 1.Treino,accuracy_score- 2.Teste,roc_auc_score- 1.Treino,roc_auc_score- 2.Teste,ks - 1.Treino,ks - 2.Teste
0,0.970043,0.666667,0.957746,0.5,0.982659,1.0,0.958984,0.807018,0.991965,0.969368,0.892298,0.865613
1,0.961948,0.944444,0.943284,0.918919,0.981366,0.971429,0.951172,0.929825,0.992489,0.954545,0.892416,0.84026
2,0.956923,0.944444,0.945289,0.944444,0.968847,0.944444,0.945312,0.929825,0.990809,0.937831,0.884344,0.849206
3,0.958209,0.9,0.938596,0.870968,0.978659,0.931034,0.945312,0.894737,0.992138,0.964901,0.891437,0.860837
4,0.958209,0.983051,0.938596,0.966667,0.978659,1.0,0.945312,0.982456,0.990497,0.999384,0.879242,0.965517
5,0.954331,0.967033,0.93808,0.956522,0.971154,0.977778,0.943359,0.947368,0.991226,0.953704,0.894103,0.894444
6,0.958009,0.963855,0.941896,0.952381,0.974684,0.97561,0.947266,0.947368,0.991297,0.958841,0.881813,0.85061
7,0.952681,0.966292,0.94081,0.955556,0.964856,0.977273,0.941406,0.947368,0.990303,0.948427,0.871707,0.846154
8,0.957614,0.939759,0.941358,1.0,0.974441,0.886364,0.947266,0.912281,0.993016,0.964161,0.8968,0.886364
9,0.956113,0.976744,0.941358,0.976744,0.971338,0.976744,0.945419,0.964286,0.991374,0.953488,0.889239,0.899821


In [13]:
# resposta multiclass
# av = micro, macro ou weighted
metrics  = [f1_score, precision_score , recall_score,  accuracy_score]


cv_kfold = KFold(10, random_state=12)
cv_loo = LeaveOneOut()
av = 'micro'

model_classif_cv_m(KNN, X, Y, cv_kfold, metrics, av)

Unnamed: 0,f1_score- 1.Treino,f1_score- 2.Teste,precision_score- 1.Treino,precision_score- 2.Teste,recall_score- 1.Treino,recall_score- 2.Teste,accuracy_score- 1.Treino,accuracy_score- 2.Teste
0,0.962963,1.0,0.962963,1.0,0.962963,1.0,0.962963,1.0
1,0.962963,1.0,0.962963,1.0,0.962963,1.0,0.962963,1.0
2,0.962963,1.0,0.962963,1.0,0.962963,1.0,0.962963,1.0
3,0.962963,1.0,0.962963,1.0,0.962963,1.0,0.962963,1.0
4,0.985185,0.8,0.985185,0.8,0.985185,0.8,0.985185,0.8
5,0.977778,0.866667,0.977778,0.866667,0.977778,0.866667,0.977778,0.866667
6,0.962963,1.0,0.962963,1.0,0.962963,1.0,0.962963,1.0
7,0.977778,0.866667,0.977778,0.866667,0.977778,0.866667,0.977778,0.866667
8,0.97037,0.8,0.97037,0.8,0.97037,0.8,0.97037,0.8
9,0.97037,1.0,0.97037,1.0,0.97037,1.0,0.97037,1.0


# Árvore de decisão

In [14]:
DT = DecisionTreeClassifier(criterion='gini', 
                            splitter='best', 
                            max_depth=10, 
                            min_samples_split=2, 
                            min_samples_leaf=1, 
                            min_weight_fraction_leaf=0.0, 
                            max_features=None, 
                            random_state=None, 
                            max_leaf_nodes=None, 
                            min_impurity_decrease=0.0, 
                            min_impurity_split=None, 
                            class_weight=None, 
                            presort=False)

DT.fit(X,Y)
y_pred = DT.predict(X)
accuracy_score(Y, y_pred)

1.0

In [15]:
# resposta binaria
metrics  = [f1_score, precision_score , recall_score,  accuracy_score, roc_auc_score]


cv_kfold = KFold(10, random_state=12)
cv_loo = LeaveOneOut()

model_classif_cv(DT, X_2, Y_2, cv_kfold, metrics)

Unnamed: 0,f1_score- 1.Treino,f1_score- 2.Teste,precision_score- 1.Treino,precision_score- 2.Teste,recall_score- 1.Treino,recall_score- 2.Teste,accuracy_score- 1.Treino,accuracy_score- 2.Teste,roc_auc_score- 1.Treino,roc_auc_score- 2.Teste,ks - 1.Treino,ks - 2.Teste
0,1.0,0.814815,1.0,0.6875,1.0,1.0,1.0,0.912281,1.0,0.945652,1.0,0.891304
1,1.0,0.929577,1.0,0.916667,1.0,0.942857,1.0,0.912281,1.0,0.903247,1.0,0.806494
2,1.0,0.901408,1.0,0.914286,1.0,0.888889,1.0,0.877193,1.0,0.873016,1.0,0.746032
3,1.0,0.965517,1.0,0.965517,1.0,0.965517,1.0,0.964912,1.0,0.964901,1.0,0.929803
4,1.0,0.912281,1.0,0.928571,1.0,0.896552,1.0,0.912281,1.0,0.912562,1.0,0.825123
5,1.0,0.989011,1.0,0.978261,1.0,1.0,1.0,0.982456,1.0,0.958333,1.0,0.916667
6,1.0,0.936709,1.0,0.973684,1.0,0.902439,1.0,0.912281,1.0,0.91997,1.0,0.839939
7,1.0,0.964706,1.0,1.0,1.0,0.931818,1.0,0.947368,1.0,0.965909,1.0,0.931818
8,1.0,0.976744,1.0,1.0,1.0,0.954545,1.0,0.964912,1.0,0.977273,1.0,0.954545
9,1.0,0.939759,1.0,0.975,1.0,0.906977,1.0,0.910714,1.0,0.915027,1.0,0.830054


In [16]:
# resposta multiclass
# av = micro, macro ou weighted
metrics  = [f1_score, precision_score , recall_score,  accuracy_score]


cv_kfold = KFold(10, random_state=12)
cv_loo = LeaveOneOut()
av = 'micro'

model_classif_cv_m(DT, X, Y, cv_kfold, metrics, av)

Unnamed: 0,f1_score- 1.Treino,f1_score- 2.Teste,precision_score- 1.Treino,precision_score- 2.Teste,recall_score- 1.Treino,recall_score- 2.Teste,accuracy_score- 1.Treino,accuracy_score- 2.Teste
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,0.933333,1.0,0.933333,1.0,0.933333,1.0,0.933333
5,1.0,0.866667,1.0,0.866667,1.0,0.866667,1.0,0.866667
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,1.0,0.866667,1.0,0.866667,1.0,0.866667,1.0,0.866667
8,1.0,0.733333,1.0,0.733333,1.0,0.733333,1.0,0.733333
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Random forest

In [17]:
RF = RandomForestClassifier(n_estimators=10, 
                            criterion='gini', 
                            max_depth=None, 
                            min_samples_split=2, 
                            min_samples_leaf=1, 
                            min_weight_fraction_leaf=0.0, 
                            max_features='auto', 
                            max_leaf_nodes=10, 
                            min_impurity_decrease=0.0, 
                            min_impurity_split=None, 
                            bootstrap=True, 
                            oob_score=False, 
                            n_jobs=1, 
                            random_state=None, 
                            verbose=0, 
                            warm_start=False, 
                            class_weight=None)

RF.fit(X,Y)
y_pred = RF.predict(X)
accuracy_score(Y, y_pred)

0.98666666666666669

In [18]:
# resposta binaria
metrics  = [f1_score, precision_score , recall_score,  accuracy_score, roc_auc_score]


cv_kfold = KFold(10, random_state=12)
cv_loo = LeaveOneOut()

model_classif_cv(RF, X_2, Y_2, cv_kfold, metrics)

Unnamed: 0,f1_score- 1.Treino,f1_score- 2.Teste,precision_score- 1.Treino,precision_score- 2.Teste,recall_score- 1.Treino,recall_score- 2.Teste,accuracy_score- 1.Treino,accuracy_score- 2.Teste,roc_auc_score- 1.Treino,roc_auc_score- 2.Teste,ks - 1.Treino,ks - 2.Teste
0,0.985755,0.785714,0.97191,0.647059,1.0,1.0,0.980469,0.894737,0.996318,1.0,0.964099,1.0
1,0.989214,0.929577,0.981651,0.916667,0.996894,0.942857,0.986328,0.912281,0.996069,0.980519,0.975842,0.914286
2,0.987616,0.931507,0.981538,0.918919,0.993769,0.944444,0.984375,0.912281,0.998858,0.958333,0.971832,0.869048
3,0.993939,0.966667,0.987952,0.935484,1.0,1.0,0.992188,0.964912,0.999022,1.0,0.980647,1.0
4,0.98494,0.983051,0.973214,0.966667,0.996951,1.0,0.980469,0.982456,0.995303,1.0,0.96474,1.0
5,0.988871,0.989011,0.981073,0.978261,0.996795,1.0,0.986328,0.982456,0.994968,0.992593,0.97859,0.916667
6,0.979463,0.97561,0.977918,0.97561,0.981013,0.97561,0.974609,0.964912,0.997958,0.989329,0.960604,0.91311
7,0.992026,0.988764,0.990446,0.977778,0.99361,1.0,0.990234,0.982456,0.997423,0.998252,0.978535,0.977273
8,0.9856,0.988506,0.987179,1.0,0.984026,0.977273,0.982422,0.982456,0.996364,0.998252,0.97078,0.977273
9,0.982622,0.977273,0.974922,0.955556,0.990446,1.0,0.978558,0.964286,0.994767,1.0,0.96532,1.0


In [19]:
# resposta multiclass
# av = micro, macro ou weighted
metrics  = [f1_score, precision_score , recall_score,  accuracy_score]


cv_kfold = KFold(10, random_state=12)
cv_loo = LeaveOneOut()
av = 'micro'

model_classif_cv_m(RF, X, Y, cv_kfold, metrics, av)

Unnamed: 0,f1_score- 1.Treino,f1_score- 2.Teste,precision_score- 1.Treino,precision_score- 2.Teste,recall_score- 1.Treino,recall_score- 2.Teste,accuracy_score- 1.Treino,accuracy_score- 2.Teste
0,0.992593,1.0,0.992593,1.0,0.992593,1.0,0.992593,1.0
1,0.992593,1.0,0.992593,1.0,0.992593,1.0,0.992593,1.0
2,0.985185,1.0,0.985185,1.0,0.985185,1.0,0.985185,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,0.933333,1.0,0.933333,1.0,0.933333,1.0,0.933333
5,0.992593,0.866667,0.992593,0.866667,0.992593,0.866667,0.992593,0.866667
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,0.992593,0.866667,0.992593,0.866667,0.992593,0.866667,0.992593,0.866667
8,0.992593,0.8,0.992593,0.8,0.992593,0.8,0.992593,0.8
9,0.992593,1.0,0.992593,1.0,0.992593,1.0,0.992593,1.0


# Gradient boosting

In [20]:
GB = GradientBoostingClassifier(loss='deviance', 
                                learning_rate=0.01, 
                                n_estimators=10, 
                                subsample=1.0, 
                                criterion='friedman_mse', 
                                min_samples_split=2, 
                                min_samples_leaf=1, 
                                min_weight_fraction_leaf=0.0, 
                                max_depth=3, 
                                min_impurity_decrease=0.0, 
                                min_impurity_split=None, 
                                init=None, 
                                random_state=None, 
                                max_features=None, 
                                verbose=0, 
                                max_leaf_nodes=None, 
                                warm_start=False, 
                                presort='auto')


GB.fit(X,Y)
y_pred = GB.predict(X)
accuracy_score(Y, y_pred)

0.99333333333333329

In [21]:
# resposta binaria
metrics  = [f1_score, precision_score , recall_score,  accuracy_score, roc_auc_score]


cv_kfold = KFold(10, random_state=12)
cv_loo = LeaveOneOut()

model_classif_cv(GB, X_2, Y_2, cv_kfold, metrics)

Unnamed: 0,f1_score- 1.Treino,f1_score- 2.Teste,precision_score- 1.Treino,precision_score- 2.Teste,recall_score- 1.Treino,recall_score- 2.Teste,accuracy_score- 1.Treino,accuracy_score- 2.Teste,roc_auc_score- 1.Treino,roc_auc_score- 2.Teste,ks - 1.Treino,ks - 2.Teste
0,0.806527,0.323529,0.675781,0.192982,1.0,1.0,0.675781,0.192982,0.99408,0.98913,0.942893,0.978261
1,0.772182,0.76087,0.628906,0.614035,1.0,1.0,0.628906,0.614035,0.972687,0.879221,0.922524,0.761039
2,0.770708,0.774194,0.626953,0.631579,1.0,1.0,0.626953,0.631579,0.98793,0.908069,0.936178,0.821429
3,0.780952,0.674419,0.640625,0.508772,1.0,1.0,0.640625,0.508772,0.987092,0.998768,0.953473,0.965517
4,0.780952,0.674419,0.640625,0.508772,1.0,1.0,0.640625,0.508772,0.993704,0.985222,0.931866,0.929803
5,0.757282,0.882353,0.609375,0.789474,1.0,1.0,0.609375,0.789474,0.994792,0.953704,0.95859,0.916667
6,0.763285,0.836735,0.617188,0.719298,1.0,1.0,0.617188,0.719298,0.981723,0.928354,0.938776,0.839939
7,0.758788,0.871287,0.611328,0.77193,1.0,1.0,0.611328,0.77193,0.993779,1.0,0.96712,1.0
8,0.758788,0.871287,0.611328,0.77193,1.0,1.0,0.611328,0.77193,0.991796,0.998252,0.976705,0.977273
9,0.759371,0.868687,0.612086,0.767857,1.0,1.0,0.612086,0.767857,0.98995,0.949016,0.948404,0.846154


In [22]:
# resposta multiclass
# av = micro, macro ou weighted
metrics  = [f1_score, precision_score , recall_score,  accuracy_score]


cv_kfold = KFold(10, random_state=12)
cv_loo = LeaveOneOut()
av = 'micro'

model_classif_cv_m(GB, X, Y, cv_kfold, metrics, av)

Unnamed: 0,f1_score- 1.Treino,f1_score- 2.Teste,precision_score- 1.Treino,precision_score- 2.Teste,recall_score- 1.Treino,recall_score- 2.Teste,accuracy_score- 1.Treino,accuracy_score- 2.Teste
0,0.992593,1.0,0.992593,1.0,0.992593,1.0,0.992593,1.0
1,0.992593,1.0,0.992593,1.0,0.992593,1.0,0.992593,1.0
2,0.992593,1.0,0.992593,1.0,0.992593,1.0,0.992593,1.0
3,0.977778,1.0,0.977778,1.0,0.977778,1.0,0.977778,1.0
4,0.985185,0.866667,0.985185,0.866667,0.985185,0.866667,0.985185,0.866667
5,0.985185,0.8,0.985185,0.8,0.985185,0.8,0.985185,0.8
6,0.985185,1.0,0.985185,1.0,0.985185,1.0,0.985185,1.0
7,1.0,0.866667,1.0,0.866667,1.0,0.866667,1.0,0.866667
8,0.985185,0.6,0.985185,0.6,0.985185,0.6,0.985185,0.6
9,0.985185,1.0,0.985185,1.0,0.985185,1.0,0.985185,1.0


# MLP

In [23]:
MLP = MLPClassifier(hidden_layer_sizes=(5, ), 
                    activation='relu', 
                    solver='adam', 
                    alpha=0.0001, 
                    batch_size='auto', 
                    learning_rate='constant', 
                    learning_rate_init=0.001, 
                    power_t=0.5, max_iter=200, 
                    shuffle=True, 
                    random_state=None, 
                    tol=0.0001, 
                    verbose=False, 
                    warm_start=False, 
                    momentum=0.9, 
                    nesterovs_momentum=True, 
                    early_stopping=False, 
                    validation_fraction=0.1, 
                    beta_1=0.9, 
                    beta_2=0.999, 
                    epsilon=1e-08)

MLP.fit(X,Y)
y_pred = MLP.predict(X)
accuracy_score(Y, y_pred)



0.33333333333333331

In [24]:
# resposta binaria
metrics  = [f1_score, precision_score , recall_score,  accuracy_score, roc_auc_score]


cv_kfold = KFold(10, random_state=12)
cv_loo = LeaveOneOut()

model_classif_cv(MLP, X_2, Y_2, cv_kfold, metrics)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Unnamed: 0,f1_score- 1.Treino,f1_score- 2.Teste,precision_score- 1.Treino,precision_score- 2.Teste,recall_score- 1.Treino,recall_score- 2.Teste,accuracy_score- 1.Treino,accuracy_score- 2.Teste,roc_auc_score- 1.Treino,roc_auc_score- 2.Teste,ks - 1.Treino,ks - 2.Teste
0,0.0,0.0,0.0,0.0,0.0,0.0,0.324219,0.807018,0.959712,0.950593,0.819103,0.869565
1,0.0,0.0,0.0,0.0,0.0,0.0,0.275391,0.263158,0.302697,0.283117,0.472573,0.538961
2,0.0,0.0,0.0,0.0,0.0,0.0,0.373047,0.368421,0.962388,0.952381,0.796855,0.829365
3,0.890411,0.794521,0.808458,0.659091,0.990854,1.0,0.84375,0.736842,0.874818,0.821429,0.739528,0.642857
4,0.935484,0.966667,0.90113,0.935484,0.972561,1.0,0.914062,0.964912,0.89316,0.964286,0.789502,0.964286
5,0.911076,0.955556,0.887538,0.955556,0.935897,0.955556,0.888672,0.929825,0.872524,0.911111,0.759487,0.833333
6,0.763285,0.836735,0.617188,0.719298,1.0,1.0,0.617188,0.719298,0.389241,0.439024,0.221519,0.121951
7,0.758788,0.871287,0.611328,0.77193,1.0,1.0,0.611328,0.77193,0.5,0.5,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.388672,0.22807,0.921155,0.926573,0.705444,0.777972
9,0.929448,0.965517,0.89645,0.954545,0.964968,0.976744,0.910331,0.946429,0.969625,0.957066,0.824905,0.846154


In [27]:
# resposta multiclass
# av = micro, macro ou weighted
metrics  = [f1_score, precision_score , recall_score,  accuracy_score]


cv_kfold = KFold(10, random_state=12)
cv_loo = LeaveOneOut()
av = 'micro'

model_classif_cv_m(MLP, X, Y, cv_kfold, metrics, av)



Unnamed: 0,f1_score- 1.Treino,f1_score- 2.Teste,precision_score- 1.Treino,precision_score- 2.Teste,recall_score- 1.Treino,recall_score- 2.Teste,accuracy_score- 1.Treino,accuracy_score- 2.Teste
0,0.62963,1.0,0.62963,1.0,0.62963,1.0,0.62963,1.0
1,0.57037,0.0,0.57037,0.0,0.57037,0.0,0.57037,0.0
2,0.007407,0.0,0.007407,0.0,0.007407,0.0,0.007407,0.0
3,0.607407,1.0,0.607407,1.0,0.607407,1.0,0.607407,1.0
4,0.37037,0.0,0.37037,0.0,0.37037,0.0,0.37037,0.0
5,0.740741,0.0,0.740741,0.0,0.740741,0.0,0.740741,0.0
6,0.696296,0.333333,0.696296,0.333333,0.696296,0.333333,0.696296,0.333333
7,0.62963,1.0,0.62963,1.0,0.62963,1.0,0.62963,1.0
8,0.259259,1.0,0.259259,1.0,0.259259,1.0,0.259259,1.0
9,0.725926,0.066667,0.725926,0.066667,0.725926,0.066667,0.725926,0.066667


# SVM

In [36]:
SVM = LinearSVC(penalty='l2', 
                loss='squared_hinge', 
                dual=True, 
                tol=0.0001, 
                C=1.0, 
                multi_class='ovr', 
                fit_intercept=True, 
                intercept_scaling=1, 
                class_weight=None, 
                verbose=0, 
                random_state=None, 
                max_iter=1000)

SVM.fit(X_2,Y_2, sample_weight=None)
y_pred = SVM.predict(X_2)

ac = accuracy_score(Y_2, y_pred)
f1 = f1_score(Y_2, y_pred)
pr = precision_score(Y_2, y_pred)
rc = recall_score(Y_2, y_pred)
roc = roc_auc_score(Y_2, y_pred)

print("Acurácia: {:.2f}".format(ac))
print("f1-Score: {:.2f}".format(f1))
print("Precision: {:.2f}".format(pr))
print("Recall: {:.2f}".format(rc))
print("ROC_AUC_Score: {:.2f}".format(roc))

Acurácia: 0.92
f1-Score: 0.94
Precision: 0.89
Recall: 0.99
ROC_AUC_Score: 0.89


In [37]:
# resposta binaria
metrics  = [f1_score, precision_score , recall_score,  accuracy_score, roc_auc_score]


cv_kfold = KFold(10, random_state=12)
cv_loo = LeaveOneOut()

model_classif_cv(SVM,X_2, Y_2, cv_kfold, metrics)

AttributeError: 'LinearSVC' object has no attribute 'predict_proba'

In [33]:
# resposta multiclass
# av = micro, macro ou weighted
metrics  = [f1_score, precision_score , recall_score,  accuracy_score]


cv_kfold = KFold(10, random_state=12)
cv_loo = LeaveOneOut()
av = 'micro'

model_classif_cv_m(SVM, X, Y, cv_kfold, metrics, av)

AttributeError: 'LinearSVC' object has no attribute 'predict_proba'