In [None]:
pip install catboost

In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool, cv

# Load the historical training data (semicolon-delimited CSV).
train_df = pd.read_csv("train.csv", sep=";")

# Columns deliberately excluded from the feature set; names that are not
# present in the file are skipped instead of raising.
cols_to_drop = ["image_embedding", "num_stores", "num_sizes", "weekly_demand"]
train_df = train_df.drop(columns=cols_to_drop, errors="ignore")

# Target vector and feature matrix ("ID" is an identifier, not a feature).
y_train = train_df["Production"]
X_train = train_df.drop(columns=["ID", "Production"])

# Every object-dtype feature column is handed to CatBoost as categorical.
# This is determined before the NaN fill below.
categorical_cols = list(X_train.select_dtypes(include=["object"]).columns)

# Replace missing values with 0 in every column (categorical ones included).
X_train = X_train.fillna(0)

# CatBoost training pool: features, label and the categorical column names.
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_cols)

# CatBoost hyper-parameter grid search.
# This estimator only drives the search; the final model is constructed
# separately from the winning parameters.
model = CatBoostRegressor(
    loss_function="RMSE",
    random_seed=42,
    verbose=False
)

# A previous run of this search (about 60 minutes) selected:
# {'depth': 10, 'learning_rate': 0.05, 'iterations': 900}
# NOTE(review): each of those values is the largest candidate in its list,
# so the optimum may lie outside this grid -- consider extending the ranges.
param_grid = {
    "depth": [4, 6, 8, 10],
    "learning_rate": [0.01, 0.03, 0.05],
    "iterations": [300, 600, 900]
}

print("\n===== Ejecutando Grid Search =====\n")
# refit=False: by default grid_search re-trains this estimator on the whole
# dataset with the best parameters. That model is never used, because a new
# estimator is built from `best_params` and fitted right after the search,
# so the extra full-size fit is skipped.
grid_result = model.grid_search(
    param_grid,
    X=train_pool,
    cv=5,
    partition_random_seed=42,
    refit=False,
    verbose=False
)

# Best parameter combination found by the search.
best_params = grid_result["params"]

print("\n===== MEJORES PARÁMETROS =====")
print(best_params)
print("====================================\n")

# Final training: a fresh regressor configured with the parameters selected
# by the grid search, fitted on the full training pool.
print("Entrenando modelo final con parámetros óptimos...\n")

model = CatBoostRegressor(
    loss_function="RMSE",
    random_seed=42,
    verbose=200,  # progress is logged every 200 iterations
    **best_params,
)
model.fit(train_pool)

# Load the test CSV (semicolon-delimited) and discard the columns whose name
# starts with "Unnamed".
test_df = pd.read_csv("test.csv", sep=";")
is_unnamed = test_df.columns.str.contains("^Unnamed")
test_df = test_df.loc[:, ~is_unnamed]

# Remove the same excluded columns as in training (absent ones are skipped).
test_df = test_df.drop(columns=cols_to_drop, errors="ignore")

# Align with the training features: any training column that the test file
# lacks is created and filled with 0.
missing_cols = [col for col in X_train.columns if col not in test_df.columns]
for col in missing_cols:
    test_df[col] = 0

# Same columns, same order and same NaN fill as the training matrix.
X_test = test_df[X_train.columns]
X_test = X_test.fillna(0)

# Predict the target for every row of the test set.
preds = model.predict(X_test)

# Build the submission table. Predictions are rounded to the nearest integer
# before the cast: a bare astype(int) truncates toward zero, which would bias
# every prediction downward.
# NOTE(review): negative predictions are written as-is; if "Production" can
# never be negative, consider clipping at 0 -- confirm with the data owner.
output_df = pd.DataFrame({
    "ID": test_df["ID"],
    "Production": np.rint(preds).astype(int)
})

# Write the submission file without the DataFrame index.
output_df.to_csv("predicciones_finales.csv", index=False)
print("Archivo 'predicciones_finales.csv' generado con éxito.")



===== Ejecutando Grid Search =====


bestTest = 16556.32628
bestIteration = 299


bestTest = 13825.32633
bestIteration = 299


bestTest = 12622.25968
bestIteration = 299


bestTest = 14724.25141
bestIteration = 599


bestTest = 12234.61617
bestIteration = 599


bestTest = 11091.96182
bestIteration = 599


bestTest = 13704.71757
bestIteration = 899


bestTest = 11374.38037
bestIteration = 899


bestTest = 10251.55673
bestIteration = 899


bestTest = 14993.37453
bestIteration = 299


bestTest = 11767.58417
bestIteration = 299


bestTest = 10611.09428
bestIteration = 299


bestTest = 12914.47456
bestIteration = 599


bestTest = 10023.25278
bestIteration = 599


bestTest = 8947.785838
bestIteration = 599


bestTest = 11799.22848
bestIteration = 899


bestTest = 9063.306303
bestIteration = 899


bestTest = 8124.931182
bestIteration = 899


bestTest = 13509.31038
bestIteration = 299


bestTest = 10202.12536
bestIteration = 299


bestTest = 8738.384754
bestIteration = 299


bestTest = 11320.