In [4]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder

# Cargar el dataset
file_path = "../data/processed/train_data_processed_imputeKnn_scale.csv"
data = pd.read_csv(file_path)

# Inspección inicial
print("Dimensiones del dataset:", data.shape)
print(data.head())

# Identificar la variable objetivo y las características
target_col = "target"
features = [col for col in data.columns if col != target_col and col != 'therapeutic_area']  # Excluir 'therapeutic_area'

# Codificar columnas categóricas
label_encoders = {}  # Almacenar los codificadores para poder revertir la codificación si es necesario
for col in features:
    if data[col].dtype == 'object':  # Si la columna es de tipo objeto (categórica)
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
        label_encoders[col] = le  # Guardar el codificador para futuras referencias

# Crear un diccionario para almacenar los modelos entrenados por cada área
models_by_area = {}
rmse_by_area = {}

# Asegúrate de que 'therapeutic_area' es una columna que representa las diferentes áreas terapéuticas
therapeutic_areas = data['therapeutic_area'].unique()  # Asumiendo que tienes una columna 'therapeutic_area'

# Entrenamiento de modelos para cada área terapéutica
for area in therapeutic_areas:
    print(f"\nEntrenando modelo para el área terapéutica: {area}")
    
    # Filtrar los datos por área terapéutica
    area_data = data[data['therapeutic_area'] == area]
    X_area = area_data[features]
    y_area = area_data[target_col]
    
    # Dividir en entrenamiento y validación
    X_train, X_val, y_train, y_val = train_test_split(X_area, y_area, test_size=0.2, random_state=42)

    # Definir el modelo XGBoost
    xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

    # Definir el espacio de búsqueda de hiperparámetros
    param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'n_estimators': [50, 100, 200],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'gamma': [0, 0.1, 0.2],
        'min_child_weight': [1, 5, 10]
    }

    # Realizar la búsqueda de hiperparámetros con validación cruzada
    grid_search = GridSearchCV(estimator=xgb_model,
                               param_grid=param_grid,
                               scoring='neg_mean_squared_error', 
                               cv=3, n_jobs=-1, verbose=2)
    
    grid_search.fit(X_train, y_train)

    # Mostrar los mejores parámetros
    print("Mejores hiperparámetros para el área:", area)
    print(grid_search.best_params_)

    # Usar el mejor modelo encontrado
    best_xgb_model = grid_search.best_estimator_

    # Realizar predicciones en el conjunto de validación
    y_pred = best_xgb_model.predict(X_val)
    
    # Calcular el RMSE
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_by_area[area] = rmse
    
    # Almacenar el modelo entrenado
    models_by_area[area] = best_xgb_model
    
    print(f"RMSE para el área {area}: {rmse:.4f}")

# Resumen del RMSE por área terapéutica
print("\nResumen de RMSE por área terapéutica:")
for area, rmse in rmse_by_area.items():
    print(f"Área {area}: RMSE = {rmse:.4f}")

# Opcional: Almacenar los modelos entrenados
# Si necesitas guardar los modelos entrenados en disco
import joblib

for area, model in models_by_area.items():
    model_filename = f"xgb_model_{area}.joblib"
    joblib.dump(model, model_filename)
    print(f"Modelo para el área {area} guardado como {model_filename}")


Dimensiones del dataset: (118917, 18)
        brand  che_pc_usd  che_perc_gdp corporation       country launch_date  \
0  BRAND_354E   -0.861595     -0.069532   CORP_D524  COUNTRY_88A3  2014-06-01   
1  BRAND_626D    1.012675      0.542469   CORP_01C7  COUNTRY_8B47  2014-06-01   
2  BRAND_45D9   -0.861595     -0.069532   CORP_39F7  COUNTRY_88A3  2014-06-01   
3  BRAND_D724    0.780386      1.333155   CORP_711A  COUNTRY_445D  2014-06-01   
4  BRAND_4887    0.626762      1.359908   CORP_443D  COUNTRY_D8B0  2014-06-01   

         date       drug_id                indication  insurance_perc_che  \
0  2014-06-01  DRUG_ID_8795              ['IND_C3B6']            1.150233   
1  2014-06-01  DRUG_ID_E66E  ['IND_1590', 'IND_ECAC']           -0.845488   
2  2014-06-01  DRUG_ID_F272              ['IND_B2EF']            1.150233   
3  2014-06-01  DRUG_ID_1D4E              ['IND_BAFB']           -1.332507   
4  2014-06-01  DRUG_ID_AA88              ['IND_3F31']            1.483735   

   populatio

  _data = np.array(data, dtype=dtype, copy=copy,


Mejores hiperparámetros para el área: THER_AREA_980E
{'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}
RMSE para el área THER_AREA_980E: 0.1058

Entrenando modelo para el área terapéutica: THER_AREA_96D7
Fitting 3 folds for each of 972 candidates, totalling 2916 fits
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.5s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.5s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.5s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=1.0; total time=   0.5s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_dept

  _data = np.array(data, dtype=dtype, copy=copy,


Mejores hiperparámetros para el área: THER_AREA_96D7
{'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}
RMSE para el área THER_AREA_96D7: 0.2141

Entrenando modelo para el área terapéutica: THER_AREA_6CEE
Fitting 3 folds for each of 972 candidates, totalling 2916 fits
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=1.0; total time=   0.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_dept

  _data = np.array(data, dtype=dtype, copy=copy,


Mejores hiperparámetros para el área: THER_AREA_644A
{'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.2, 'max_depth': 7, 'min_child_weight': 5, 'n_estimators': 100, 'subsample': 1.0}
RMSE para el área THER_AREA_644A: 0.1060

Entrenando modelo para el área terapéutica: THER_AREA_66C5
Fitting 3 folds for each of 972 candidates, totalling 2916 fits
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.7s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.7s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.7s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=1.0; total time=   0.6s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_dept

  _data = np.array(data, dtype=dtype, copy=copy,


Mejores hiperparámetros para el área: THER_AREA_66C5
{'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 1.0}
RMSE para el área THER_AREA_66C5: 0.1169

Entrenando modelo para el área terapéutica: THER_AREA_CD59
Fitting 3 folds for each of 972 candidates, totalling 2916 fits
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=1.0; total time=   0.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_dept

  _data = np.array(data, dtype=dtype, copy=copy,


Mejores hiperparámetros para el área: THER_AREA_CD59
{'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.2, 'max_depth': 7, 'min_child_weight': 10, 'n_estimators': 100, 'subsample': 0.8}
RMSE para el área THER_AREA_CD59: 0.2467

Entrenando modelo para el área terapéutica: THER_AREA_22ED
Fitting 3 folds for each of 972 candidates, totalling 2916 fits
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=1.0; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_dep

  _data = np.array(data, dtype=dtype, copy=copy,


[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=1.0; total time=   0.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=1.0; total time=   0.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.4s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.4s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=1.0; total time=   0.4s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8; total time

  _data = np.array(data, dtype=dtype, copy=copy,


Mejores hiperparámetros para el área: THER_AREA_8E53
{'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 200, 'subsample': 0.8}
RMSE para el área THER_AREA_8E53: 0.0814

Entrenando modelo para el área terapéutica: THER_AREA_4BA5
Fitting 3 folds for each of 972 candidates, totalling 2916 fits
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=1.0; total time=   0.2s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_dept

  _data = np.array(data, dtype=dtype, copy=copy,


Mejores hiperparámetros para el área: THER_AREA_4BA5
{'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 5, 'n_estimators': 100, 'subsample': 0.8}
RMSE para el área THER_AREA_4BA5: 0.0641

Entrenando modelo para el área terapéutica: THER_AREA_051D
Fitting 3 folds for each of 972 candidates, totalling 2916 fits
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=1.0; total time=   0.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_dept

  _data = np.array(data, dtype=dtype, copy=copy,


Mejores hiperparámetros para el área: THER_AREA_051D
{'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.2, 'max_depth': 5, 'min_child_weight': 10, 'n_estimators': 200, 'subsample': 1.0}
RMSE para el área THER_AREA_051D: 0.1504

Entrenando modelo para el área terapéutica: THER_AREA_032C
Fitting 3 folds for each of 972 candidates, totalling 2916 fits
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=0.8; total time=   0.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=50, subsample=1.0; total time=   0.3s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_dep

  _data = np.array(data, dtype=dtype, copy=copy,


Mejores hiperparámetros para el área: THER_AREA_032C
{'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 5, 'n_estimators': 200, 'subsample': 0.8}
RMSE para el área THER_AREA_032C: 0.0618

Resumen de RMSE por área terapéutica:
Área THER_AREA_980E: RMSE = 0.1058
Área THER_AREA_96D7: RMSE = 0.2141
Área THER_AREA_6CEE: RMSE = 0.2194
Área THER_AREA_644A: RMSE = 0.1060
Área THER_AREA_66C5: RMSE = 0.1169
Área THER_AREA_CD59: RMSE = 0.2467
Área THER_AREA_22ED: RMSE = 0.0023
Área THER_AREA_645F: RMSE = 0.0186
Área THER_AREA_8E53: RMSE = 0.0814
Área THER_AREA_4BA5: RMSE = 0.0641
Área THER_AREA_051D: RMSE = 0.1504
Área THER_AREA_032C: RMSE = 0.0618
Modelo para el área THER_AREA_980E guardado como xgb_model_THER_AREA_980E.joblib
Modelo para el área THER_AREA_96D7 guardado como xgb_model_THER_AREA_96D7.joblib
Modelo para el área THER_AREA_6CEE guardado como xgb_model_THER_AREA_6CEE.joblib
Modelo para el área THER_AREA_644A guardado como xgb_model_THER_AR