In [1]:
# RandomForest
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Leer CSV
df = pd.read_csv("train.csv", sep=";")
df.columns = df.columns.str.strip()  # limpiar posibles espacios

# Preprocesamiento
# Convertir variables no numéricas
non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns
df[non_numeric_cols] = df[non_numeric_cols].astype(str)
df = pd.get_dummies(df, drop_first=True)

# Separar X e y
y = df["Production"]
X = df.drop(columns=["ID", "Production"])

# Rellenar posibles NaNs
X = X.fillna(0)

# Dividir en entrenamiento y prueba 
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Entrenar Random Forest
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)
model.fit(X_train, y_train)

# Predecir y evaluar 
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE:", mse)

# Mostrar comparación
comp = pd.DataFrame({"y_true": y_test, "y_pred": y_pred}).reset_index(drop=True)
print(comp.head(10))


MSE: 52795635.099433996
   y_true        y_pred
0   32252  26380.551304
1   41758  42626.322117
2    4896   7872.656336
3   36534  27999.587037
4   27242  28093.298453
5   11426  12921.771930
6   43022  54749.918494
7    6619   6239.349126
8    7177   8798.800767
9   52052  49179.494352


In [3]:
# Entrenamiento
df_train = pd.read_csv("train.csv", sep=";")
df_train.columns = df_train.columns.str.strip()
y = df_train["Production"]
X = df_train.drop(columns=["ID", "Production"])
X = pd.get_dummies(X, drop_first=True)
X = X.fillna(0)
y = y.fillna(0)

from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X, y)

# Guardamos X_train y columnas para usar en test
X_train = X.copy()


import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

# Leer modelo entrenado
# (Suponemos que ya entrenaste `model` con X_train y y_train)

# Leer CSV de test
df_test = pd.read_csv("test.csv", sep=";")
df_test.columns = df_test.columns.str.strip()  # limpiar posibles espacios

# Preprocesamiento: convertir variables no numéricas
non_numeric_cols_test = df_test.select_dtypes(exclude=[np.number]).columns
for col in non_numeric_cols_test:
    df_test[col] = LabelEncoder().fit_transform(df_test[col].astype(str))

# Asegurarse de que test tenga las mismas columnas que X_train
for col in X_train.columns:
    if col not in df_test.columns:
        df_test[col] = 0  # agregar columnas que faltan

# Reordenar columnas exactamente como en X_train
X_test = df_test[X_train.columns].copy()

# Predecir Production
y_pred_test = model.predict(X_test)

# Guardar resultados
output = pd.DataFrame({
    "ID": df_test["ID"],  # o df_test["ID"] si existe columna ID
    "Production_pred": y_pred_test
})

output.to_csv("predicciones_test.csv", index=False)

# Mostrar primeras filas
print(output.head(10))


  df_test[col] = 0  # agregar columnas que faltan
  df_test[col] = 0  # agregar columnas que faltan
  df_test[col] = 0  # agregar columnas que faltan
  df_test[col] = 0  # agregar columnas que faltan
  df_test[col] = 0  # agregar columnas que faltan
  df_test[col] = 0  # agregar columnas que faltan
  df_test[col] = 0  # agregar columnas que faltan
  df_test[col] = 0  # agregar columnas que faltan
  df_test[col] = 0  # agregar columnas que faltan
  df_test[col] = 0  # agregar columnas que faltan
  df_test[col] = 0  # agregar columnas que faltan
  df_test[col] = 0  # agregar columnas que faltan
  df_test[col] = 0  # agregar columnas que faltan
  df_test[col] = 0  # agregar columnas que faltan
  df_test[col] = 0  # agregar columnas que faltan
  df_test[col] = 0  # agregar columnas que faltan
  df_test[col] = 0  # agregar columnas que faltan
  df_test[col] = 0  # agregar columnas que faltan
  df_test[col] = 0  # agregar columnas que faltan
  df_test[col] = 0  # agregar columnas que faltan


    ID  Production_pred
0   90          4654.59
1   16         16551.39
2   65         25584.92
3  138          3060.17
4  166          2334.60
5  252         45984.39
6  234         27956.25
7  306         28978.99
8  274         54631.74
9  268           897.37


In [11]:
# Guardar resultados
output = pd.DataFrame({
    "ID": df_test["ID"],  # o df_test["ID"] si existe columna ID
    "Production": y_pred_test.astype(int)
})

output.to_csv("predicciones_test.csv", index=False)

# Mostrar primeras filas
print(output.head(10))


    ID  Production
0   90        4654
1   16       16551
2   65       25584
3  138        3060
4  166        2334
5  252       45984
6  234       27956
7  306       28978
8  274       54631
9  268         897


In [5]:
output_df=pd.read_csv("predicciones_finales.csv")
output2_df=pd.read_csv("predicciones_test.csv")


In [10]:
output2_df.head

<bound method NDFrame.head of          ID  Production_pred
0        90             4654
1        16            16551
2        65            25584
3       138             3060
4       166             2334
...     ...              ...
2245  12697            38051
2246  12742            27172
2247  12714            23258
2248  12762            17711
2249  12724             3883

[2250 rows x 2 columns]>