In [3]:
import pandas as pd
import numpy as np 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve, auc, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TunedThresholdClassifierCV
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
import sys
import joblib

sys.path.append('../src')

from lib_aux import *
from modeloClassificacao import ModeloClassificacao

In [4]:
df_train = pd.read_parquet("../data/SOT/base_tratada_treino_sem_outliers.parquet")
df_test = pd.read_parquet("../data/SOT/base_tratada_teste.parquet")
df_val = pd.read_parquet("../data/SOT/base_validacao_nao_normalizada.parquet")

X_train = df_train.drop(columns=["id", "safra", "y"])
y_train = df_train["y"]
X_test = df_test.drop(columns=["id", "safra", "y"])
y_test = df_test["y"]

scaler = joblib.load("../artifacts/scaler.pkl")

X_total = pd.concat([X_train, X_test])
X_total = scaler.inverse_transform(X_total)
y_total = pd.concat([y_train, y_test])

X_val = df_val.drop(columns=["id", "safra", "y"])
y_val = df_val["y"]

In [7]:
modelo = ModeloClassificacao()

modelo.fit(X_total, y_total)
y_pred = modelo.predict(X_val)
y_proba = modelo.predict_proba(X_val)

print("Relatório de classificação: \n", classification_report(y_val, y_pred))
print("Acurácia: ", accuracy_score(y_val, y_pred))
print("Recall: ", recall_score(y_val, y_pred))
print("F1 Score: ", f1_score(y_val, y_pred))
# print("ROC AUC: ", roc_auc_score(y_val, y_proba))
print("Matriz de confusão: \n", confusion_matrix(y_val, y_pred))

Relatório de classificação: 
               precision    recall  f1-score   support

           0       0.76      0.68      0.72       509
           1       0.50      0.61      0.55       277

    accuracy                           0.65       786
   macro avg       0.63      0.64      0.63       786
weighted avg       0.67      0.65      0.66       786

Acurácia:  0.6513994910941476
Recall:  0.6064981949458483
F1 Score:  0.5508196721311476
Matriz de confusão: 
 [[344 165]
 [109 168]]




In [8]:
joblib.dump(modelo, "../artifacts/modelo_teste.pkl")

['../artifacts/modelo_teste.pkl']

In [None]:
modelo = ModeloClassificacao()

X_total_df = pd.DataFrame(X_total, columns=X_train.columns)

X_completo = pd.concat([X_total_df, X_val])
y_completo = pd.concat([y_total, y_val])

modelo.fit(X_completo, y_completo)
joblib.dump(modelo, "../artifacts/modelo.pkl")

['../artifacts/modelo.pkl']

In [16]:
y_pred = modelo.predict(X_completo)
y_proba = modelo.predict_proba(X_completo)

In [17]:
prob_1 = y_proba[:, 1]
prob_1

array([0.18127899, 0.46135249, 0.82589675, ..., 0.7177042 , 0.36237621,
       0.38413972], shape=(10312,))

In [25]:
df_pred = pd.concat([df_train[["id", "safra", "y"]],
          df_test[["id", "safra", "y"]],
           df_val[["id", "safra", "y"]]])

df_pred["y_pred"] = y_pred
df_pred["y_proba"] = prob_1

In [26]:
df_pred

Unnamed: 0,id,safra,y,y_pred,y_proba
0,1.0,201404,0,0,0.181279
1,2.0,201407,0,0,0.461352
2,3.0,201405,0,1,0.825897
3,5.0,201403,1,1,0.633286
4,6.0,201405,0,1,0.708000
...,...,...,...,...,...
781,10679.0,201412,1,0,0.451155
782,10691.0,201412,0,0,0.245052
783,10699.0,201412,0,1,0.717704
784,10706.0,201412,0,0,0.362376


In [27]:
df_pred.to_parquet("../data/SPEC/base_predita_completa.parquet", index = False)