In [13]:
import pandas as pd

import optuna

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("/Users/davidesquer/Documents/Github/Reto-3006C-equipo5/retro/M4_Reto/Data/train_clean.csv")

ya que el unico modelo que vimos que disminuia el overfitting era la regresion logistica, es el que vamos a utilizar para hacer optimizacion de hiperparametros con optuna

In [10]:
X = df.drop(columns = ["Survived"])
y = df["Survived"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
def objective(trial):
    # aqui definimos los hiperparametros que queremos optimizar
    C = trial.suggest_float('C', 1e-5, 10, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    solver = trial.suggest_categorical('solver', ['lbfgs', 'saga', 'newton-cg', 'sag'])
    fit_intercept = trial.suggest_categorical('fit_intercept', [True, False])
    class_weight = trial.suggest_categorical('class_weight', [None, 'balanced'])
    tol = trial.suggest_float('tol', 1e-6, 1e-3, log=True)
    max_iter = trial.suggest_int('max_iter', 100, 2000, step=100)

    # algunos de los solvers no soportan penalizacion l1, asi que cuando se use uno de esos no se usara penalizacion l1
    if solver in ['lbfgs', 'newton-cg', 'sag'] and penalty == 'l1':
        return None

    # definimos el modelo con los hiperparametros que queremos optimizar
    model = LogisticRegression(
        C=C,
        penalty=penalty,
        solver=solver,
        fit_intercept=fit_intercept,
        class_weight=class_weight,
        tol=tol,
        max_iter=max_iter,
        random_state=42
    )

    # se entrena el modelo y se calcula el accuracy
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

# se crea el estudio de optuna
study = optuna.create_study(direction='maximize')

# se corre el estudio
study.optimize(objective, n_trials=10000)

# se obtienen los resultados
trial = study.best_trial
print("Best hyperparameters: {}".format(trial.params))

[I 2023-09-03 19:15:51,022] A new study created in memory with name: no-name-c65e001b-cb97-4ae8-b70c-99d064efb30e
[I 2023-09-03 19:15:51,192] Trial 0 finished with value: 0.7303370786516854 and parameters: {'C': 0.0768283441790923, 'penalty': 'l1', 'solver': 'saga', 'fit_intercept': False, 'class_weight': None, 'tol': 0.000371700823561875, 'max_iter': 400}. Best is trial 0 with value: 0.7303370786516854.
[I 2023-09-03 19:15:51,227] Trial 1 finished with value: 0.7247191011235955 and parameters: {'C': 0.002066430830712905, 'penalty': 'l2', 'solver': 'newton-cg', 'fit_intercept': False, 'class_weight': 'balanced', 'tol': 0.00014665440790988707, 'max_iter': 1100}. Best is trial 0 with value: 0.7303370786516854.
[I 2023-09-03 19:15:51,358] Trial 2 finished with value: 0.7247191011235955 and parameters: {'C': 0.04081707369586023, 'penalty': 'l2', 'solver': 'saga', 'fit_intercept': False, 'class_weight': None, 'tol': 6.640545814524666e-06, 'max_iter': 400}. Best is trial 0 with value: 0.7303

Best hyperparameters: {'C': 2.7399744574633367, 'penalty': 'l2', 'solver': 'lbfgs', 'fit_intercept': True, 'class_weight': None, 'tol': 0.00020503644013261026, 'max_iter': 700}


se prueba modelo con los hiperparametros optimizados

In [22]:
model = LogisticRegression(
    C=trial.params['C'],
    penalty=trial.params['penalty'],
    solver=trial.params['solver'],
    fit_intercept=trial.params['fit_intercept'],
    class_weight=trial.params['class_weight'],
    tol=trial.params['tol'],
    max_iter=trial.params['max_iter'],
    random_state=42
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred_train = model.predict(X_train)

print("Accuracy: {}".format(accuracy_score(y_test, y_pred)))

confusion_matrix(y_test, y_pred)

Accuracy: 0.7752808988764045


array([[88, 18],
       [22, 50]])