## Regresión logística

In [20]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import json
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

In [43]:
# Directory that holds the pre-split feature and target spreadsheets.
BASE_PATH = "../data/processed"

# Training feature files, one per preprocessing variant. Order matters:
# position i here is paired with position i of TEST_PATHS when the loaded
# datasets are indexed later on.
TRAIN_PATHS = [
    "X_train_con_outliers.xlsx",
    "X_train_sin_outliers.xlsx",
    "X_train_con_outliers_norm.xlsx",
    "X_train_sin_outliers_norm.xlsx",
    "X_train_con_outliers_scal.xlsx",
    "X_train_sin_outliers_scal.xlsx",
]
TRAIN_DATASETS = [
    pd.read_excel(f"{BASE_PATH}/{file_name}") for file_name in TRAIN_PATHS
]

# Test feature files, listed in the same variant order as TRAIN_PATHS.
TEST_PATHS = [
    "X_test_con_outliers.xlsx",
    "X_test_sin_outliers.xlsx",
    "X_test_con_outliers_norm.xlsx",
    "X_test_sin_outliers_norm.xlsx",
    "X_test_con_outliers_scal.xlsx",
    "X_test_sin_outliers_scal.xlsx",
]
TEST_DATASETS = [
    pd.read_excel(f"{BASE_PATH}/{file_name}") for file_name in TEST_PATHS
]

# Targets for both splits; pd.read_excel returns each of them as a DataFrame.
y_train = pd.read_excel(f"{BASE_PATH}/y_train.xlsx")
y_test = pd.read_excel(f"{BASE_PATH}/y_test.xlsx")

In [44]:
# Baseline comparison: fit one default LogisticRegression per preprocessing
# variant and record its train/test accuracy.

# y_train is a DataFrame (it is loaded with pd.read_excel), but scikit-learn
# expects a 1-D target. Flatten it once up front so every fit below does not
# raise a DataConversionWarning about a column-vector y.
# NOTE(review): assumes y_train holds a single target column - confirm.
y_train_1d = y_train.values.ravel()

results = []
# TRAIN_DATASETS and TEST_DATASETS list the same variants in the same order,
# so iterate over them in lockstep.
for X_train_variant, X_test_variant in zip(TRAIN_DATASETS, TEST_DATASETS):
    model = LogisticRegression(random_state=42)
    model.fit(X_train_variant, y_train_1d)
    y_pred_train = model.predict(X_train_variant)
    y_pred_test = model.predict(X_test_variant)

    results.append(
        {
            "train": accuracy_score(y_train, y_pred_train),
            "test": accuracy_score(y_test, y_pred_test),
        }
    )

# Bare expression so the notebook displays the collected scores.
results

[{'train': 0.90197328476017, 'test': 0.8920592520641087},
 {'train': 0.9006071645415907, 'test': 0.8942447790189413},
 {'train': 0.9109593199757134, 'test': 0.9038368139873725},
 {'train': 0.9091074681238616, 'test': 0.9039582321515298},
 {'train': 0.9115361262902246, 'test': 0.9027440505099563},
 {'train': 0.9093199757134184, 'test': 0.9039582321515298}]

In [45]:
# Position (within TRAIN_DATASETS / TEST_DATASETS) of the preprocessing
# variant used for hyperparameter tuning.
# NOTE(review): presumably picked from the baseline accuracies above - confirm
# the selection criterion.
best_dataset = 4

# Search space explored by the grid search. The solver is pinned to
# "liblinear" so that both the "l1" and "l2" penalties can be tried.
hyperparams = dict(
    penalty=["l1", "l2"],
    tol=[0.0001, 0.001, 0.1],
    fit_intercept=[True, False],
    solver=["liblinear"],
)

# Exhaustive search over the grid, scored by accuracy with 5-fold CV.
model = LogisticRegression(random_state=42)
grid = GridSearchCV(
    estimator=model,
    param_grid=hyperparams,
    scoring="accuracy",
    cv=5,
)
# Bare expression so the notebook displays the configured search object.
grid


In [46]:
# Run the cross-validated search on the selected dataset variant.
# y_train is a DataFrame (loaded with pd.read_excel); flatten it to the 1-D
# target scikit-learn expects, otherwise a DataConversionWarning about a
# column-vector y is emitted for the fits performed during the search.
# NOTE(review): assumes y_train holds a single target column - confirm.
grid.fit(TRAIN_DATASETS[best_dataset], y_train.values.ravel())

In [47]:
# Best estimator found by the grid search.
final_model = grid.best_estimator_

# Predict on both splits of the selected variant (train first, then test).
y_pred_train, y_pred_test = (
    final_model.predict(split[best_dataset])
    for split in (TRAIN_DATASETS, TEST_DATASETS)
)

In [48]:
# Record the tuned model's accuracy after the baseline entries, together
# with the hyperparameters the grid search selected.
tuned_scores = {
    "train": accuracy_score(y_train, y_pred_train),
    "test": accuracy_score(y_test, y_pred_test),
    "best_params": grid.best_params_,
}
results.append(tuned_scores)
# Bare expression so the notebook displays the full list of scores.
results

[{'train': 0.90197328476017, 'test': 0.8920592520641087},
 {'train': 0.9006071645415907, 'test': 0.8942447790189413},
 {'train': 0.9109593199757134, 'test': 0.9038368139873725},
 {'train': 0.9091074681238616, 'test': 0.9039582321515298},
 {'train': 0.9115361262902246, 'test': 0.9027440505099563},
 {'train': 0.9093199757134184, 'test': 0.9039582321515298},
 {'train': 0.9115361262902246,
  'test': 0.9028654686741137,
  'best_params': {'fit_intercept': True,
   'penalty': 'l2',
   'solver': 'liblinear',
   'tol': 0.0001}}]

In [49]:
# Persist the tuned estimator as a binary pickle.
with open("../models/logreg_best_model.pkl", mode="wb") as model_file:
    pickle.dump(final_model, model_file)

# Persist every collected score record as indented JSON.
with open("../models/final_results.json", mode="w") as results_file:
    json.dump(results, results_file, indent=4)
