In [6]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the preprocessed training set from disk.
file_path = "../data/processed/train_data_processed_imputeKnn_scale.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

# First look at the data: overall (rows, columns) shape, then the first five rows.
print("Dimensiones del dataset:", data.shape)
print(data.head(n=5))

# Name of the column to predict; every other column is used as a feature.
target_col = "target"
features = [col for col in data.columns if col != target_col]

# Label-encode the text (object-dtype) feature columns so the feature matrix is
# fully numeric. Values are cast to str first, so missing entries become
# ordinary string categories instead of breaking the encoder. Each fitted
# encoder is kept in `label_encoders`, keyed by column name, so the same
# mapping can be re-applied or inverted later.
label_encoders = {}
object_cols = [col for col in features if data[col].dtype == object]
for col in object_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le


# Split into the feature frame (X) and the target series (y).
X = data[features]
y = data[target_col]

# NumPy versions of X and y; the cross-validation loop indexes these by
# integer position.
X_array = X.to_numpy()
y_array = y.to_numpy()

# Booster hyperparameters passed to xgb.train for every fold.
params = dict(
    objective="reg:squarederror",  # squared-error regression loss
    learning_rate=0.05,            # shrinkage applied to each boosting step
    max_depth=6,                   # maximum depth of each tree
    min_child_weight=1,            # minimum sum of instance weight needed in a child
    subsample=0.8,                 # fraction of rows sampled per boosting round
    colsample_bytree=0.8,          # fraction of columns sampled per tree
    reg_alpha=0.1,                 # L1 regularisation term on weights
    reg_lambda=0.2,                # L2 regularisation term on weights
    seed=42,                       # fixed random seed
)

# 5-fold cross-validation. Splits are shuffled with a fixed random_state so the
# folds are the same on every run.
# NOTE(review): each fold's validation part is used both for early stopping and
# for scoring, so the reported RMSE is likely somewhat optimistic.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores = []

for fold, (train_index, val_index) in enumerate(kf.split(X_array)):
    print(f"Fold {fold + 1}")

    # Positional split of the arrays into this fold's training and validation parts.
    X_train, X_val = X_array[train_index], X_array[val_index]
    y_train, y_val = y_array[train_index], y_array[val_index]

    # Wrap both parts in DMatrix, the data container consumed by xgb.train.
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    # Early stopping monitors the LAST entry of `evals`, so the validation set
    # must stay last in this list.
    evals = [(dtrain, 'train'), (dval, 'eval')]

    # Train with a large round budget and stop once the validation metric has
    # not improved for 50 consecutive rounds.
    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=5000,
        evals=evals,
        early_stopping_rounds=50,
        verbose_eval=False
    )

    # xgb.train returns the booster from the last round, not the best one, so
    # prediction is explicitly limited to the trees up to the best iteration.
    y_pred = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores.append(rmse)
    print(f"Fold {fold + 1} RMSE: {rmse:.4f}")

# Average the per-fold RMSE values into a single cross-validation score.
mean_rmse = np.mean(rmse_scores)
print(f"RMSE promedio después de Cross-Validation: {mean_rmse:.4f}")


Dimensiones del dataset: (118917, 18)
        brand  che_pc_usd  che_perc_gdp corporation       country launch_date  \
0  BRAND_354E   -0.861595     -0.069532   CORP_D524  COUNTRY_88A3  2014-06-01   
1  BRAND_626D    1.012675      0.542469   CORP_01C7  COUNTRY_8B47  2014-06-01   
2  BRAND_45D9   -0.861595     -0.069532   CORP_39F7  COUNTRY_88A3  2014-06-01   
3  BRAND_D724    0.780386      1.333155   CORP_711A  COUNTRY_445D  2014-06-01   
4  BRAND_4887    0.626762      1.359908   CORP_443D  COUNTRY_D8B0  2014-06-01   

         date       drug_id                indication  insurance_perc_che  \
0  2014-06-01  DRUG_ID_8795              ['IND_C3B6']            1.150233   
1  2014-06-01  DRUG_ID_E66E  ['IND_1590', 'IND_ECAC']           -0.845488   
2  2014-06-01  DRUG_ID_F272              ['IND_B2EF']            1.150233   
3  2014-06-01  DRUG_ID_1D4E              ['IND_BAFB']           -1.332507   
4  2014-06-01  DRUG_ID_AA88              ['IND_3F31']            1.483735   

   populatio

KeyboardInterrupt: 