In [7]:
# ===========================================
#  Modelo_LGBM_Sencillo_v1
#  Clean, short, functional code for the competition
# ===========================================

!pip install -q lightgbm

import pandas as pd
import numpy as np
from google.colab import files
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import warnings
warnings.filterwarnings("ignore")

# ---------------------------
# 1. Upload files
# ---------------------------
# Ask for each CSV in turn through the google.colab upload helper. The reads
# below expect the uploads to be available as "train.csv" and "test.csv".
for prompt in ("ðŸ“Œ Sube train.csv", "ðŸ“Œ Sube test.csv"):
    print(prompt)
    files.upload()

# Load both files from the working directory (train first, then test).
train, test = (pd.read_csv(path) for path in ("train.csv", "test.csv"))

# Detect the ID column: the first test column whose name equals "id" in any
# letter case; when there is none, fall back to the first column of test.
id_col = next(
    (name for name in test.columns if name.lower() == "id"),
    test.columns[0],
)

# ---------------------------
# 2. Prepare data
# ---------------------------
TARGET = "RENDIMIENTO_GLOBAL"

# Give every distinct target label an integer code, numbered in order of first
# appearance, and keep both directions of the lookup:
#   inv_map: code -> label (used later to decode predictions)
#   y_map:   label -> code (used to encode the training target)
inv_map = dict(enumerate(train[TARGET].unique()))
y_map = {label: code for code, label in inv_map.items()}

train["target_num"] = train[TARGET].map(y_map)

# y is the encoded target; X is every other column of train.
y = train["target_num"]
X = train.drop(columns=[TARGET, "target_num"])

# Stack the train features on top of the test rows so that both parts receive
# exactly the same encoding.
full = pd.concat([X, test], axis=0, ignore_index=True)

# Factorize text columns into integer codes and impute every other column.
# NOTE(review): every column of `full` is processed, so an identifier column,
# if present, is encoded and used like any other feature - confirm intended.
for col in full.columns:
    # Check the dtype with is_string_dtype instead of `== "object"`: it still
    # matches object columns, and it also matches pandas' dedicated string
    # dtype, which would otherwise fall through to median() and fail on text.
    if pd.api.types.is_string_dtype(full[col].dtype):
        # Missing values become their own "MISSING" category before coding.
        full[col] = full[col].fillna("MISSING").astype(str).factorize()[0]
    else:
        # The median is taken over train and test rows together.
        full[col] = full[col].fillna(full[col].median())

# Split back into the original row ranges: train rows first, test rows after.
X = full.iloc[:len(train)]
test_proc = full.iloc[len(train):]

# ---------------------------
# 3. Train with StratifiedKFold
# ---------------------------
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# oof holds the predicted class code of each train row, filled in by the fold
# in which that row was held out; pred_test accumulates the fold-averaged class
# probabilities for the test rows.
oof = np.zeros(len(train))
pred_test = np.zeros((len(test), len(inv_map)))

# The LightGBM settings are the same for every fold, so build them once.
params = {
    "objective": "multiclass",
    "num_class": len(inv_map),
    "metric": "multi_logloss",
    "learning_rate": 0.05,
    "num_leaves": 31,
    "seed": 42,
}

print("\nEntrenando modelo...")

for train_idx, val_idx in skf.split(X, y):
    X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    dtr = lgb.Dataset(X_tr, label=y_tr)
    dva = lgb.Dataset(X_val, label=y_val)

    # Up to 300 rounds, stopping early once the validation score has not
    # improved for 30 rounds; per-round evaluation logging is switched off.
    model = lgb.train(
        params,
        dtr,
        num_boost_round=300,
        valid_sets=[dva],
        callbacks=[
            lgb.early_stopping(stopping_rounds=30),
            lgb.log_evaluation(period=0),
        ],
    )

    # Out-of-fold prediction: most probable class for each held-out row.
    val_proba = model.predict(X_val)
    oof[val_idx] = val_proba.argmax(axis=1)

    # Each fold contributes an equal share to the averaged test probabilities.
    pred_test += model.predict(test_proc) / N_SPLITS

print("\nOOF Accuracy:", accuracy_score(y, oof))

# ---------------------------
# 4. Final predictions
# ---------------------------
# Take the most probable class for each test row and decode it back to its
# original label through inv_map.
final_labels = [inv_map[code] for code in np.argmax(pred_test, axis=1)]

# Two-column submission: the test identifiers and the predicted labels.
submission = pd.DataFrame({id_col: test[id_col], TARGET: final_labels})

# Write the CSV without the index, report it, and hand it to the Colab
# download helper.
submission.to_csv("submission_simple.csv", index=False)
print("\nâœ” Archivo 'submission_simple.csv' generado correctamente.")
files.download("submission_simple.csv")


ðŸ“Œ Sube train.csv


Saving train.csv to train.csv
ðŸ“Œ Sube test.csv


Saving test.csv to test.csv

Entrenando modelo...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.124867 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1517
[LightGBM] [Info] Number of data points in the train set: 554000, number of used features: 20
[LightGBM] [Info] Start training from score -1.395033
[LightGBM] [Info] Start training from score -1.387096
[LightGBM] [Info] Start training from score -1.371986
[LightGBM] [Info] Start training from score -1.391216
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[300]	valid_0's multi_logloss: 1.21116
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051189 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[Light

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>