In [4]:
# -------------------------
# RandomForest completo: entrenamiento + predicción
# -------------------------

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

# -------------------------
# 1️⃣ Leer train.csv
df_train = pd.read_csv("train.csv", sep=";")
df_train.columns = df_train.columns.str.strip()  # limpiar espacios

# -------------------------
# 2️⃣ Preprocesamiento train
# Convertir variables no numéricas
non_numeric_cols_train = df_train.select_dtypes(exclude=[np.number]).columns
for col in non_numeric_cols_train:
    df_train[col] = df_train[col].astype(str)  # asegurar que sean strings
df_train = pd.get_dummies(df_train, drop_first=True)  # dummies para categóricas

# Separar X e y
y = df_train["Production"]
X = df_train.drop(columns=["ID", "Production"])

# Rellenar posibles NaNs
X = X.fillna(0)

# -------------------------
# 3️⃣ Dividir entrenamiento/prueba
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -------------------------
# 4️⃣ Entrenar RandomForest
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)
model.fit(X_train, y_train)

# -------------------------
# 5️⃣ Evaluar en validación
y_pred_val = model.predict(X_val)
mse = mean_squared_error(y_val, y_pred_val)
print("MSE en validación:", mse)

# -------------------------
# 6️⃣ Leer test.csv
df_test = pd.read_csv("test.csv", sep=";")
df_test.columns = df_test.columns.str.strip()

# -------------------------
# 7️⃣ Preprocesamiento test
# Convertir variables no numéricas
non_numeric_cols_test = df_test.select_dtypes(exclude=[np.number]).columns
for col in non_numeric_cols_test:
    df_test[col] = df_test[col].astype(str)

# Crear dummies
df_test = pd.get_dummies(df_test, drop_first=True)

# Asegurarse de que test tenga mismas columnas que entrenamiento
for col in X_train.columns:
    if col not in df_test.columns:
        df_test[col] = 0  # columnas faltantes

# Reordenar columnas exactamente como X_train
X_test = df_test[X_train.columns].copy()

# Rellenar NaNs por si acaso
X_test = X_test.fillna(0)

# -------------------------
# 8️⃣ Predecir Production para test.csv
y_pred_test = model.predict(X_test)

# -------------------------
# 9️⃣ Guardar resultados
output = pd.DataFrame({
    "ID": df_test.index,  # si tienes columna ID, usa df_test["ID"]
    "Production_pred": y_pred_test
})
output.to_csv("predicciones_test.csv", index=False)

# -------------------------
# 10️⃣ Mostrar primeras filas
print(output.head(10))


MSE en validación: 4395163.908155117


  df_test[col] = 0  # columnas faltantes
  df_test[col] = 0  # columnas faltantes
  df_test[col] = 0  # columnas faltantes
  df_test[col] = 0  # columnas faltantes
  df_test[col] = 0  # columnas faltantes
  df_test[col] = 0  # columnas faltantes
  df_test[col] = 0  # columnas faltantes
  df_test[col] = 0  # columnas faltantes
  df_test[col] = 0  # columnas faltantes
  df_test[col] = 0  # columnas faltantes
  df_test[col] = 0  # columnas faltantes
  df_test[col] = 0  # columnas faltantes
  df_test[col] = 0  # columnas faltantes
  df_test[col] = 0  # columnas faltantes
  df_test[col] = 0  # columnas faltantes
  df_test[col] = 0  # columnas faltantes
  df_test[col] = 0  # columnas faltantes
  df_test[col] = 0  # columnas faltantes
  df_test[col] = 0  # columnas faltantes
  df_test[col] = 0  # columnas faltantes
  df_test[col] = 0  # columnas faltantes
  df_test[col] = 0  # columnas faltantes
  df_test[col] = 0  # columnas faltantes
  df_test[col] = 0  # columnas faltantes
  df_test[col] =

   ID  Production_pred
0   0          5555.12
1   1         14914.95
2   2         24786.26
3   3          3093.93
4   4          2257.88
5   5         44730.21
6   6         30967.67
7   7         29818.86
8   8         99147.77
9   9          1269.75
