In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor

# Load the historical data (semicolon-delimited CSV) and discard the columns
# that are not used for modelling. errors="ignore" tolerates any of the
# listed columns being absent from the file.
cols_to_drop = ["image_embedding", "num_stores", "num_sizes", "weekly_demand"]
train_df = pd.read_csv("train.csv", sep=";").drop(columns=cols_to_drop, errors="ignore")

# "Production" is the target and "ID" is excluded from the features.
non_feature_cols = ["ID", "Production"]

# Every remaining object-dtype column is treated as a categorical feature.
categorical_cols = [
    col
    for col in train_df.select_dtypes(include=["object"]).columns
    if col not in non_feature_cols
]

# Split into target and feature matrix, replacing missing values with 0.
# NOTE(review): fillna(0) is applied to the whole frame, so missing values in
# the object (categorical) columns also become the integer 0 - confirm this
# is the intended "missing" category.
y_train = train_df["Production"]
X_train = train_df.drop(columns=non_feature_cols).fillna(0)

# Train a CatBoost regressor on the prepared features.
# verbose=200 prints a progress line every 200 iterations.
hyperparams = dict(
    iterations=600,
    learning_rate=0.05,
    depth=8,
    loss_function="RMSE",
    verbose=200,
)
model = CatBoostRegressor(**hyperparams)
# categorical_cols names the columns CatBoost must handle as categories.
model.fit(X_train, y_train, cat_features=categorical_cols)

# Load the test set (semicolon-delimited, like train.csv) and clean its columns.
test_df = pd.read_csv("test.csv", sep=";")
# Drop stray columns whose name starts with "Unnamed" (pandas gives that name
# to columns with a blank header).
test_df = test_df.loc[:, ~test_df.columns.str.startswith("Unnamed")]
# Drop the same unused columns that were removed from the training data.
test_df = test_df.drop(columns=cols_to_drop, errors="ignore")

# Features the model was trained on but that test.csv lacks are filled with 0
# so prediction can proceed. They are reported instead of being padded
# silently, because a missing feature usually signals a schema problem.
missing_cols = [col for col in X_train.columns if col not in test_df.columns]
if missing_cols:
    print(f"AVISO: columnas ausentes en test.csv, rellenadas con 0: {missing_cols}")
for col in missing_cols:
    test_df[col] = 0

# Select the training features in training order and fill missing values with
# 0, as was done for the training features.
X_test = test_df[X_train.columns].fillna(0)

# Predict production for every test row.
preds = model.predict(X_test)

# Round to the nearest integer: a bare astype(int) truncates toward zero,
# which biases positive predictions downward (e.g. 1234.9 -> 1234).
# NOTE(review): an RMSE regressor is not constrained to non-negative outputs;
# clip at 0 here if Production can never be negative - confirm.
output_df = pd.DataFrame({
    "ID": test_df["ID"],
    "Production": np.rint(preds).astype(int),
})

# Write the submission file and confirm to the user.
output_df.to_csv("predicciones_finales.csv", index=False)
print("Archivo 'predicciones_finales.csv' generado con éxito")

0:	learn: 33529.1671874	total: 118ms	remaining: 1m 10s
200:	learn: 9594.7610794	total: 22.7s	remaining: 45.1s
400:	learn: 7707.1968637	total: 46.1s	remaining: 22.9s
599:	learn: 6762.2804778	total: 1m 9s	remaining: 0us
Archivo 'predicciones_finales.csv' generado con éxito


In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool, cv

# ======================
# 1-2. Load the historical data and drop the unused columns
# ======================
# The CSV is semicolon-delimited. errors="ignore" tolerates any of the listed
# columns being absent from the file.
cols_to_drop = ["image_embedding", "num_stores", "num_sizes", "weekly_demand"]
train_df = pd.read_csv("train.csv", sep=";").drop(columns=cols_to_drop, errors="ignore")

# ======================
# 3. Identify the categorical variables
# ======================
# "Production" is the target and "ID" is excluded from the features; every
# other object-dtype column is treated as a categorical feature.
non_feature_cols = ["ID", "Production"]
categorical_cols = [
    col
    for col in train_df.select_dtypes(include=["object"]).columns
    if col not in non_feature_cols
]

# ======================
# 4-5. Split X / y and fill NaN in train
# ======================
# NOTE(review): fillna(0) is applied to the whole frame, so missing values in
# the object (categorical) columns also become the integer 0 - confirm this
# is the intended "missing" category.
y_train = train_df["Production"]
X_train = train_df.drop(columns=non_feature_cols).fillna(0)

# ======================
# NEW: cross-validation before the final training
# ======================
# Hyperparameters evaluated by the cross-validation; verbose=False keeps
# CatBoost from logging individual iterations.
params = dict(
    iterations=600,
    learning_rate=0.05,
    depth=8,
    loss_function="RMSE",
    verbose=False,
)

# Bundle features, target and categorical column names for CatBoost.
train_pool = Pool(X_train, y_train, cat_features=categorical_cols)

# 5 shuffled folds; partition_random_seed pins how rows are split into folds.
cv_settings = dict(fold_count=5, shuffle=True, partition_random_seed=42)
cv_results = cv(pool=train_pool, params=params, **cv_settings)

# Show only the last row, i.e. the metrics after the final iteration.
print(
    "\n===== Resultados Cross Validation =====",
    cv_results.iloc[-1:],
    "=======================================\n",
    sep="\n",
)

# ======================
# 6. Train the final CatBoost model
# ======================
# Build the final model from the same `params` dict that was cross-validated,
# so the trained model cannot drift from the configuration the CV score
# describes. Only the logging differs: a progress line every 200 iterations.
final_params = {**params, "verbose": 200}
model = CatBoostRegressor(**final_params)
# categorical_cols names the columns CatBoost must handle as categories.
model.fit(X_train, y_train, cat_features=categorical_cols)

# ======================
# 7. Load the test CSV and clean its columns
# ======================
test_df = pd.read_csv("test.csv", sep=";")
# Drop stray columns whose name starts with "Unnamed" (pandas gives that name
# to columns with a blank header).
test_df = test_df.loc[:, ~test_df.columns.str.startswith("Unnamed")]
# Drop the same unused columns that were removed from the training data.
test_df = test_df.drop(columns=cols_to_drop, errors="ignore")

# ======================
# 8. Detect missing feature columns and add them filled with 0
# ======================
# Missing features are reported instead of being padded silently, because a
# feature absent from test.csv usually signals a schema problem.
missing_cols = [col for col in X_train.columns if col not in test_df.columns]
if missing_cols:
    print(f"AVISO: columnas ausentes en test.csv, rellenadas con 0: {missing_cols}")
for col in missing_cols:
    test_df[col] = 0

# ======================
# 9. Prepare X_test
# ======================
# Select the training features in training order and fill missing values with
# 0, as was done for the training features.
X_test = test_df[X_train.columns].fillna(0)

# ======================
# 10. Predict production
# ======================
preds = model.predict(X_test)

# ======================
# 11. Write the predictions CSV
# ======================
# Round to the nearest integer: a bare astype(int) truncates toward zero,
# which biases positive predictions downward (e.g. 1234.9 -> 1234).
# NOTE(review): an RMSE regressor is not constrained to non-negative outputs;
# clip at 0 here if Production can never be negative - confirm.
output_df = pd.DataFrame({
    "ID": test_df["ID"],
    "Production": np.rint(preds).astype(int),
})

output_df.to_csv("predicciones_finales.csv", index=False)
print("Archivo 'predicciones_finales.csv' generado con éxito")


Training on fold [0/5]

bestTest = 6892.544722
bestIteration = 599

Training on fold [1/5]

bestTest = 7115.657388
bestIteration = 599

Training on fold [2/5]

bestTest = 7001.183925
bestIteration = 599

Training on fold [3/5]

bestTest = 7002.118362
bestIteration = 599

Training on fold [4/5]

bestTest = 7061.515106
bestIteration = 599


===== Resultados Cross Validation =====
     iterations  test-RMSE-mean  test-RMSE-std  train-RMSE-mean  \
599         599       7014.6039      83.136779      6900.893326   

     train-RMSE-std  
599       54.634538  

0:	learn: 0.8288366	total: 150ms	remaining: 1m 29s
200:	learn: 0.7643297	total: 24.1s	remaining: 47.9s
400:	learn: 0.7566511	total: 48s	remaining: 23.8s
599:	learn: 0.7534590	total: 1m 11s	remaining: 0us
Archivo 'predicciones_finales.csv' generado con éxito
