In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the processed training table from disk
file_path = "../data/processed/train_data_processed_imputeKnn_scale.csv"
data = pd.read_csv(file_path)

# Quick look: overall size and the first rows
print("Dimensiones del dataset:", data.shape)
print(data.head())

# Every column except the target is treated as a model feature
target_col = "target"
features = [name for name in data.columns if name != target_col]

# Integer-encode the string (object-dtype) feature columns.
# Each fitted encoder is stored per column so the mapping stays available afterwards.
label_encoders = {}
object_columns = data.select_dtypes(include=['object']).columns
for col in object_columns:
    if col not in features:
        continue  # only feature columns are encoded; the target is left as-is
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le


# Feature matrix (X) and label vector (y)
X, y = data[features], data[target_col]

# Hold out 20% of the rows; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Conjunto de entrenamiento:", X_train.shape)
print("Conjunto de prueba:", X_test.shape)

# LightGBM hyper-parameters: gradient-boosted trees, regression objective, RMSE metric
params = dict(
    objective="regression",
    metric="rmse",
    boosting_type="gbdt",
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,  # -1 leaves tree depth unlimited
    lambda_l1=0.1,  # L1 regularisation
    lambda_l2=0.2,  # L2 regularisation
    feature_fraction=0.8,  # fraction of columns sampled per tree
    bagging_fraction=0.8,  # fraction of rows sampled when bagging
    bagging_freq=5,  # bagging is performed every 5 iterations
    verbosity=-1,
    seed=42,
)

# Convert X and y to NumPy arrays so each fold can be selected by positional index
X_array = X.values
y_array = y.values

# Number of consecutive rounds without improvement on the validation fold
# after which training stops. Without early stopping, `model.best_iteration`
# carries no information and every fold trains the full 5000 rounds.
EARLY_STOPPING_ROUNDS = 100

# K-Fold cross-validation; shuffled with a fixed seed so the folds are reproducible
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores = []

for fold, (train_index, val_index) in enumerate(kf.split(X_array)):
    print(f"Fold {fold + 1}")

    # Fold-local names so the X_train / y_train produced by train_test_split
    # are not overwritten by the loop.
    X_fold_train, X_val = X_array[train_index], X_array[val_index]
    y_fold_train, y_val = y_array[train_index], y_array[val_index]

    lgb_train = lgb.Dataset(X_fold_train, y_fold_train)
    lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_val],
        num_boost_round=5000,  # upper bound; early stopping normally ends training sooner
        callbacks=[
            lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=False)
        ],
    )

    # Score the held-out fold using the best iteration found by early stopping
    y_pred = model.predict(X_val, num_iteration=model.best_iteration)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores.append(rmse)
    print(f"Fold {fold + 1} RMSE: {rmse:.4f}")

# Average RMSE across the folds
mean_rmse = np.mean(rmse_scores)
print(f"RMSE promedio después de Cross-Validation: {mean_rmse:.4f}")

# NOTE: after the loop, `model` is the booster trained on the last fold only;
# the later cells that predict and pickle `model` use that booster.

ModuleNotFoundError: No module named 'lightgbm'

In [26]:
# Load the processed submission features (model input) and the raw submission
# file (source of the identifier columns written to the output).
submission_data_path = "../data/processed/submission_data_processed_imputeKnn_scale.csv"  # processed features used for prediction
submission_data = pd.read_csv(submission_data_path)
original_submission_data = pd.read_csv("../data/raw/submission_data.csv")

# Encode categorical columns with the mapping learned on the TRAINING data.
# Re-fitting the encoders here (fit_transform) would assign different integer
# codes than the ones the model was trained on and would also overwrite the
# stored encoders, so the fitted `classes_` are used read-only instead.
UNSEEN_CATEGORY_CODE = -1  # code for categories that never appeared in training
categorical_columns = submission_data.select_dtypes(include=['object']).columns
for col in categorical_columns:
    if col in label_encoders:
        # Position in classes_ is exactly the code LabelEncoder.transform returns
        code_by_label = {
            label: code for code, label in enumerate(label_encoders[col].classes_)
        }
        submission_data[col] = (
            submission_data[col]
            .astype(str)
            .map(code_by_label)  # unseen labels become NaN here
            .fillna(UNSEEN_CATEGORY_CODE)
            .astype(int)
        )
submission_data = submission_data.fillna(0)

# Select the same feature columns, in the same order, as used for training
X_submission = submission_data[features].values

# Predict with the trained model.
# Assumes rows of the processed and raw submission files are aligned one-to-one — TODO confirm
original_submission_data['prediction'] = model.predict(X_submission)

# Keep only the output columns and write the predictions file
original_submission_data = original_submission_data[['date', 'cluster_nl', 'prediction']]
original_submission_data.to_csv("../data/processed/submission_data_predictions_imputeKnn_scale.csv", index=False)

print("Archivo de predicciones creado: submission.csv")

Archivo de predicciones creado: submission.csv


In [27]:
import pickle

# Serialise the trained model with pickle into the models/weights folder
model_path = "../models/weights/model_imputeKnn_imputeKnn_scale.pkl"
with open(model_path, mode='wb') as handle:
    # Binary mode is required: pickle writes bytes
    pickle.dump(model, handle)

print(f"Model saved to {model_path}")

Model saved to ../models/weights/model_imputeKnn_imputeKnn_scale.pkl
