In [4]:
# =====================================================
# 0. Instalar dependencias
# =====================================================
!pip install -q lightgbm scikit-learn pandas

from google.colab import files
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# =====================================================
# 1. SUBIR ARCHIVOS
# =====================================================
print("üìå Sube train.csv")
files.upload()

print("üìå Sube test.csv")
files.upload()

print("üìå Sube submission_example.csv")
files.upload()

# =====================================================
# 2. Cargar archivos
# =====================================================
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample = pd.read_csv("submission_example.csv")

print("Shapes -> train:", train.shape, " test:", test.shape, " sample:", sample.shape)

# =====================================================
# 3. Preparar columnas
# =====================================================
TARGET = "RENDIMIENTO_GLOBAL"

# Guardar nombres reales del target
target_original_values = train[TARGET].astype(str).unique()

# Mapear target a n√∫meros (LightGBM lo exige)
class_mapping = {c: i for i, c in enumerate(sorted(target_original_values))}
inverse_mapping = {i: c for c, i in class_mapping.items()}

train["target_num"] = train[TARGET].astype(str).map(class_mapping)

# Separar variables
X = train.drop(columns=[TARGET, "target_num"])
y = train["target_num"]

# =====================================================
# 4. Unir train+test para transformar igual
# =====================================================
all_data = pd.concat([X, test], axis=0, ignore_index=True)

# Detectar tipos
num_cols = all_data.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = all_data.select_dtypes(include=["object"]).columns.tolist()

# =====================================================
# 5. Imputar y codificar
# =====================================================
for c in num_cols:
    all_data[c] = all_data[c].fillna(all_data[c].median())

for c in cat_cols:
    all_data[c] = all_data[c].fillna("MISSING").astype(str)
    all_data[c] = all_data[c].factorize()[0]

# Reconstruir matrices finales
X_proc = all_data.iloc[:len(X)].reset_index(drop=True)
test_proc = all_data.iloc[len(X):].reset_index(drop=True)

# =====================================================
# 6. Split train/valid
# =====================================================
X_train, X_val, y_train, y_val = train_test_split(
    X_proc, y, test_size=0.2, random_state=42, stratify=y
)

# =====================================================
# 7. Entrenar LightGBM correctamente
# =====================================================
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val)

params = {
    "objective": "multiclass",
    "num_class": len(class_mapping),
    "metric": "multi_logloss",
    "learning_rate": 0.05,
    "num_leaves": 31,
    "seed": 42,
    "verbosity": -1
}

model = lgb.train(
    params,
    train_data,
    num_boost_round=500,
    valid_sets=[train_data, val_data],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=50)
    ]
)

# =====================================================
# 8. Predicci√≥n final
# =====================================================
preds = model.predict(test_proc, num_iteration=model.best_iteration)
pred_labels = np.argmax(preds, axis=1)

# Convertir a etiquetas reales
final_pred = [inverse_mapping[i] for i in pred_labels]

# =====================================================
# 9. Crear submission.csv
# =====================================================
id_col = None
for candidate in ["ID", "id", "Id"]:
    if candidate in test.columns:
        id_col = candidate
        break
if id_col is None:
    raise ValueError("No encontr√© columna ID en test.csv")

submission = pd.DataFrame({
    id_col: test[id_col],
    "RENDIMIENTO_GLOBAL": final_pred
})

submission.to_csv("submission.csv", index=False)
print("‚úîÔ∏è submission.csv creado correctamente")
print(submission.head())

# =====================================================
# 10. Descargar archivo
# =====================================================
files.download("submission.csv")



üìå Sube train.csv


üìå Sube test.csv


üìå Sube submission_example.csv


Shapes -> train: (692500, 21)  test: (296786, 20)  sample: (296786, 2)
Training until validation scores don't improve for 50 rounds
[50]	training's multi_logloss: 1.2565	valid_1's multi_logloss: 1.2595
[100]	training's multi_logloss: 1.23133	valid_1's multi_logloss: 1.23582
[150]	training's multi_logloss: 1.21774	valid_1's multi_logloss: 1.22392
[200]	training's multi_logloss: 1.2097	valid_1's multi_logloss: 1.21773
[250]	training's multi_logloss: 1.20405	valid_1's multi_logloss: 1.21424
[300]	training's multi_logloss: 1.19921	valid_1's multi_logloss: 1.21169
[350]	training's multi_logloss: 1.19512	valid_1's multi_logloss: 1.21002
[400]	training's multi_logloss: 1.19115	valid_1's multi_logloss: 1.20839
[450]	training's multi_logloss: 1.18759	valid_1's multi_logloss: 1.20714
[500]	training's multi_logloss: 1.18425	valid_1's multi_logloss: 1.20616
Did not meet early stopping. Best iteration is:
[500]	training's multi_logloss: 1.18425	valid_1's multi_logloss: 1.20616
‚úîÔ∏è submission.csv

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>