# Utilizando GridSearchCV para selecionar os melhores parametros

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, recall_score, precision_score

In [2]:
transacoes = pd.read_csv('creditcard.csv')

### Seprando X e Y

In [3]:
X = transacoes.drop('Class',axis=1)
y = transacoes.Class

In [4]:
# Retirando temporariamente os warnings do nosso código
import warnings
warnings.filterwarnings('ignore')

### Treino e teste

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

# Podemos ajustar manualmente os hiperparametros para nosso modelo

Faremos um for para testar cada parametro do nosso modelo e selecionar os melhores

Testando 'newton-cg','lbfgs','liblinear' para o parametro SOLVER e os valores [1,100,1000] para C

In [6]:
# Treino e teste
X_train2, X_valid, y_train2, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0, stratify=y_train)


# Parametros
C = [0.001,100,1000]
solver = ['newton-cg','lbfgs','liblinear']


# For para testar cada parametro
for i in C:
    for j in solver:
        clf_RL3 = LogisticRegression(random_state=42,
                                solver=j,C=i
                                ).fit(X_train2, y_train2)
        y_pred_RL3 = clf_RL3.predict(X_valid)
        print(i,j,recall_score(y_valid,y_pred_RL3))

0.001 newton-cg 0.5217391304347826
0.001 lbfgs 0.4782608695652174
0.001 liblinear 0.4927536231884058
100 newton-cg 0.6086956521739131
100 lbfgs 0.7246376811594203
100 liblinear 0.6376811594202898
1000 newton-cg 0.6086956521739131
1000 lbfgs 0.6956521739130435
1000 liblinear 0.6376811594202898


# Podemos automatizar essa seleção utilizando o `GridSearchCV`

In [7]:
from sklearn.model_selection import GridSearchCV

### Definindo os parâmetros que queremos testar

In [8]:
parametros = {
    'C': [0.001,100,1000],
    'solver': ['newton-cg','lbfgs','liblinear'],
}

### Selecionando o modelo: Regressão Logística 

In [9]:
LogReg = LogisticRegression(random_state=42)

### Criando um novo classificador usando os parâmetros que escolhemos anteriormente

In [10]:
clf_GS = GridSearchCV(LogReg, parametros,
                     scoring='recall')

### Fazendo o fit 

In [11]:
clf_GS = clf_GS.fit(X,y)

## Visualizando os melhores parâmetros definidos pelo GridSearchCV

In [12]:
 clf_GS.best_params_

{'C': 100, 'solver': 'lbfgs'}

## Usando esse modelo para fazer as previsões

In [13]:
y_pred_GS = clf_GS.predict(X_test)

## Analisando atraves das metricas

In [14]:
# Matriz de confusão
confusion_matrix(y_test,y_pred_GS)

array([[85248,    47],
       [   46,   102]], dtype=int64)

In [15]:
# Recall
recall_score(y_test,y_pred_GS)

0.6891891891891891

In [16]:
# Precisão
precision_score(y_test,y_pred_GS)

0.6845637583892618

## Podemos tambem visualizar tudo que foi feito

In [17]:
resultados = pd.DataFrame(clf_GS.cv_results_)
resultados

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,38.523537,3.229972,0.060618,0.011167,0.001,newton-cg,"{'C': 0.001, 'solver': 'newton-cg'}",0.787879,0.555556,0.336735,0.571429,0.244898,0.499299,0.191203,7
1,6.314146,0.230303,0.045564,0.006993,0.001,lbfgs,"{'C': 0.001, 'solver': 'lbfgs'}",0.919192,0.707071,0.387755,0.438776,0.0,0.490559,0.311288,8
2,5.273272,0.495216,0.053852,0.010792,0.001,liblinear,"{'C': 0.001, 'solver': 'liblinear'}",0.949495,0.69697,0.357143,0.44898,0.0,0.490517,0.320439,9
3,44.078208,6.094464,0.056382,0.013138,100.0,newton-cg,"{'C': 100, 'solver': 'newton-cg'}",0.777778,0.636364,0.438776,0.683673,0.469388,0.601196,0.128821,5
4,6.309558,0.258179,0.043804,0.010036,100.0,lbfgs,"{'C': 100, 'solver': 'lbfgs'}",0.949495,0.818182,0.561224,0.755102,0.316327,0.680066,0.220784,1
5,7.372427,1.667136,0.051397,0.010567,100.0,liblinear,"{'C': 100, 'solver': 'liblinear'}",0.919192,0.787879,0.44898,0.612245,0.336735,0.621006,0.213186,3
6,44.331805,5.571147,0.053859,0.011171,1000.0,newton-cg,"{'C': 1000, 'solver': 'newton-cg'}",0.79798,0.636364,0.438776,0.673469,0.469388,0.603195,0.133355,4
7,6.275025,0.290212,0.055558,0.005905,1000.0,lbfgs,"{'C': 1000, 'solver': 'lbfgs'}",0.959596,0.808081,0.591837,0.622449,0.204082,0.637209,0.254168,2
8,6.763013,1.073813,0.055834,0.013691,1000.0,liblinear,"{'C': 1000, 'solver': 'liblinear'}",0.919192,0.787879,0.459184,0.612245,0.183673,0.592435,0.256984,6


### Também é possível adicionar novas métricas no scoring 
#### Como, por exemplo, adicionar a precisão além do recall

In [18]:
# Criando o classificador
clf_RL3 = GridSearchCV(LogReg, parametros,
                       scoring=['recall','precision'],
                       refit='recall'
                      )

In [20]:
clf_RL3 = clf_RL3.fit(X_train, y_train)

In [21]:
# Avaliando o novo modelo
y_pred_RL3 = clf_RL3.predict(X_test)

confusion_matrix(y_test,y_pred_RL3)

array([[85246,    49],
       [   57,    91]], dtype=int64)