In [6]:
import pandas as pd

import optuna
from optuna.exceptions import TrialPruned

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv(r"E:\Github\Reto-3006C-equipo5\retro\M4_Reto\Data\train_clean.csv")

ya que el unico modelo que vimos que disminuia el overfitting era la regresion logistica, es el que vamos a utilizar para hacer optimizacion de hiperparametros con optuna

In [3]:
X = df.drop(columns = ["Survived"])
y = df["Survived"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
def objective(trial):
    # Definición de hiperparámetros
    C = trial.suggest_float('C', 1e-5, 10, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    solver = trial.suggest_categorical('solver', ['lbfgs', 'saga', 'newton-cg', 'sag'])
    fit_intercept = trial.suggest_categorical('fit_intercept', [True, False])
    class_weight = trial.suggest_categorical('class_weight', [None, 'balanced'])
    tol = trial.suggest_float('tol', 1e-6, 1e-3, log=True)
    max_iter = trial.suggest_int('max_iter', 100, 2000, step=100)

    # Verificar combinaciones de hiperparámetros inválidas
    if solver in ['lbfgs', 'newton-cg', 'sag'] and penalty == 'l1':
        raise TrialPruned()

    if solver == 'saga' and penalty == 'none':
        raise TrialPruned()

    # Definir el modelo
    model = LogisticRegression(
        C=C,
        penalty=penalty,
        solver=solver,
        fit_intercept=fit_intercept,
        class_weight=class_weight,
        tol=tol,
        max_iter=max_iter,
        random_state=42
    )

    # Entrenar el modelo y calcular accuracy
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

# Crear el estudio
study = optuna.create_study(direction='maximize')

# Ejecutar el estudio
study.optimize(objective, n_trials=10000)

# Mostrar los mejores hiperparámetros
trial = study.best_trial
print("Best hyperparameters: {}".format(trial.params))

[32m[I 2023-09-12 17:53:55,594][0m A new study created in memory with name: no-name-11e7a0a9-31d5-4200-9d45-76552191c39e[0m
[32m[I 2023-09-12 17:53:55,628][0m Trial 0 pruned. [0m
[32m[I 2023-09-12 17:53:55,926][0m Trial 1 finished with value: 0.7921348314606742 and parameters: {'C': 0.010838663554615331, 'penalty': 'l2', 'solver': 'lbfgs', 'fit_intercept': False, 'class_weight': 'balanced', 'tol': 2.5897987605920543e-05, 'max_iter': 1400}. Best is trial 1 with value: 0.7921348314606742.[0m
[32m[I 2023-09-12 17:53:56,097][0m Trial 2 finished with value: 0.7696629213483146 and parameters: {'C': 3.5811962063430207, 'penalty': 'l2', 'solver': 'sag', 'fit_intercept': False, 'class_weight': None, 'tol': 2.2601126932416496e-06, 'max_iter': 1800}. Best is trial 1 with value: 0.7921348314606742.[0m
[32m[I 2023-09-12 17:53:56,110][0m Trial 3 finished with value: 0.7303370786516854 and parameters: {'C': 0.00025752345902240587, 'penalty': 'l2', 'solver': 'saga', 'fit_intercept': Fals

Best hyperparameters: {'C': 0.012641235436873175, 'penalty': 'l2', 'solver': 'saga', 'fit_intercept': False, 'class_weight': 'balanced', 'tol': 2.614384807577375e-05, 'max_iter': 1200}


se prueba modelo con los hiperparametros optimizados

In [8]:
model = LogisticRegression(
    C=trial.params['C'],
    penalty=trial.params['penalty'],
    solver=trial.params['solver'],
    fit_intercept=trial.params['fit_intercept'],
    class_weight=trial.params['class_weight'],
    tol=trial.params['tol'],
    max_iter=trial.params['max_iter'],
    random_state=42
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred_train = model.predict(X_train)

print("Accuracy: {}".format(accuracy_score(y_test, y_pred)))

confusion_matrix(y_test, y_pred)

Accuracy: 0.797752808988764


array([[88, 18],
       [18, 54]], dtype=int64)