In [None]:
# Ridge
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Load the semicolon-separated training data and normalise the header names
# (strip surrounding whitespace so later column lookups match exactly).
df = pd.read_csv("train.csv", sep=";")
df.columns = df.columns.str.strip()

# Per-week columns get pivoted into one feature per week; every other column
# that is not explicitly excluded is treated as a per-ID constant.
vars_semanales = ["num_week_iso", "weekly_sales", "weekly_demand"]
excluded = set(vars_semanales) | {"ID", "image_embedding", "num_stores", "num_sizes"}
vars_fijas = [name for name in df.columns if name not in excluded]

# One entry per ID: its fixed attributes (taken from the first row of the
# group) plus the list of its weekly records.
matriz = [
    {
        "ID": id_val,
        "fijos": group.iloc[0][vars_fijas].to_dict(),
        "semanales": group[vars_semanales].to_dict(orient='records'),
    }
    for id_val, group in df.groupby("ID")
]


def _flatten_entry(item):
    """Return one wide row for ``item``: ID, fixed attributes, per-week values.

    Each weekly record contributes ``week_<n>_weekly_sales`` and
    ``week_<n>_weekly_demand`` keys, where ``<n>`` is the record's
    ``num_week_iso`` cast to ``int``. If a week number repeats within the
    same ID, the later record overwrites the earlier one.
    """
    fila = {"ID": item["ID"], **item["fijos"]}
    for semana in item["semanales"]:
        w = int(semana["num_week_iso"])
        fila[f"week_{w}_weekly_sales"] = semana["weekly_sales"]
        fila[f"week_{w}_weekly_demand"] = semana["weekly_demand"]
    return fila


rows = [_flatten_entry(item) for item in matriz]

# From here on ``df`` is the wide, one-row-per-ID table.
df = pd.DataFrame(rows)

# Integer-encode every non-numeric column (values are stringified first).
# NOTE: the same encoder object is refit for each column, so after the loop
# it only remembers the classes of the last column processed.
non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns
label_encoder = LabelEncoder()
for col in non_numeric_cols:
    as_text = df[col].astype(str)
    df[col] = label_encoder.fit_transform(as_text)

# Weeks that an ID has no record for show up as NaN in the wide table;
# they (and any other remaining NaNs) are replaced with 0.
df = df.fillna(value=0)

# Features / target: "Production" is the target, "ID" is dropped as a feature.
X = df.drop(["ID", "Production"], axis=1)
y = df["Production"]

# Hold out 20% of the IDs for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE(review): features are fed to Ridge unstandardised; the L2 penalty is
# scale-sensitive, so consider standardising — confirm whether this is intended.
model = Ridge(alpha=1.0).fit(X_train, y_train)

# Evaluate on the held-out split and show the first five predictions.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE:", mse)

preview = pd.DataFrame({"y_true": y_test.values[:5], "y_pred": y_pred[:5]})
print(preview)


MSE: 97594098.88017324
   y_true        y_pred
0    6336  10376.551868
1   18945  15816.306780
2    4069   8708.771633
3   36534  44442.148128
4    9650  12315.321144
