In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
import pickle

In [6]:
def algoritmo(data_path="../data/df_concat_a.csv", test_size=0.20, random_state=16):
    """Fit a one-hot encoder and a logistic-regression model on the training CSV.

    The CSV is loaded, the ``sk_id_curr`` and ``target`` columns are removed
    from the predictors, ``object``-dtype columns are one-hot encoded, a
    hold-out partition is split off, and only the remaining training partition
    is oversampled with SMOTE before fitting the model.

    Parameters
    ----------
    data_path : str or path-like, default "../data/df_concat_a.csv"
        Location of the CSV file to train on. It must contain the columns
        ``sk_id_curr`` and ``target``.
    test_size : float, default 0.20
        Fraction of rows held out of training (passed to ``train_test_split``).
    random_state : int, default 16
        Seed used for both the split and SMOTE.

    Returns
    -------
    tuple
        ``(reg, encoder)``: the fitted ``LogisticRegression`` and the fitted
        ``OneHotEncoder``. The model expects the non-categorical columns first,
        followed by the encoder's output columns.
    """
    # Load the training dataset
    df = pd.read_csv(data_path)
    # "Unnamed: 0" is presumably a leftover index column from an earlier export
    # (TODO confirm); tolerate its absence instead of raising KeyError.
    df = df.drop(columns="Unnamed: 0", errors="ignore")

    # Target and predictor variables
    X = df.drop(columns=["sk_id_curr", "target"])
    y = df["target"]

    # One-hot encode the categorical (object-dtype) predictors
    cat_features = X.select_dtypes(include=['object']).columns
    encoder = OneHotEncoder(drop='first', sparse_output=False)
    X_encoded = pd.DataFrame(
        encoder.fit_transform(X[cat_features]),
        columns=encoder.get_feature_names_out(cat_features),
        index=X.index,  # keep rows aligned with X (and y) for the concat below
    )
    X = pd.concat([X.drop(columns=cat_features), X_encoded], axis=1)

    # Split BEFORE oversampling: SMOTE synthesises rows by interpolating between
    # existing ones, so resampling first would leak hold-out information into
    # the training set. Stratify so the minority class keeps its proportion.
    # The hold-out partition is not used further inside this function.
    X_train, _X_holdout, y_train, _y_holdout = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y)

    # Oversample the training partition only
    smote = SMOTE(random_state=random_state)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

    # Instantiate and fit the logistic regression
    reg = LogisticRegression(max_iter=1000).fit(X_train_balanced, y_train_balanced)

    return reg, encoder

# Train the model and fit the encoder
modelo, encoder = algoritmo()

# Persist the model and the encoder as pickle files, one file per artifact
for pkl_path, artifact in (('modelo_logistico.pkl', modelo),
                           ('encoder.pkl', encoder)):
    with open(pkl_path, 'wb') as out_file:
        pickle.dump(artifact, out_file)