<a href="https://colab.research.google.com/github/jda-21/AI4ENG/blob/main/07%20-%20modelo%20ensemble%20LightGBM%20y%20Catboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install lightgbm catboost tqdm

import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Input files: the training and test CSVs read by this notebook.
DATA_PATH = "train_clean_unicode.csv"
TEST_PATH = "test_clean_unicode.csv"

print("Cargando TRAIN:", DATA_PATH)
df = pd.read_csv(DATA_PATH)
print(df.shape)

TARGET_COL = "RENDIMIENTO_GLOBAL_NUM"
# Raise explicitly instead of using `assert`: assertions are stripped when
# Python runs with -O, which would let a missing target slip through.
if TARGET_COL not in df.columns:
    raise KeyError("No se encuentra el target.")

# Validate the labels BEFORE casting to int. Casting first would truncate a
# fractional value (e.g. 1.7 -> 1) and silently keep it as a valid class.
# Missing targets are dropped here too, because NaN never matches `isin`.
VALID_CLASSES = [1, 2, 3, 4]
target_values = pd.to_numeric(df[TARGET_COL])
valid_rows = target_values.isin(VALID_CLASSES)
df = df.loc[valid_rows].copy()
df[TARGET_COL] = target_values.loc[valid_rows].astype(int)

print("\nConteo de clases:")
print(df[TARGET_COL].value_counts().sort_index())


# Feature matrix: every numeric column of `df` except the target.
numeric_cols = df.select_dtypes(include=[np.number]).columns
feature_cols = [col for col in numeric_cols if col != TARGET_COL]

X = df[feature_cols].copy()
y = df[TARGET_COL].copy()

# Fill missing values with each column's median, then cast to float32.
X = X.fillna(X.median()).astype(np.float32)

print("Shapes -> X:", X.shape, "| y:", y.shape)


# Hold out 20% of the rows for validation, stratified on the target so the
# class proportions are preserved; the seed is fixed for reproducibility.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

print("Train:", X_train.shape, "| Val:", X_val.shape)


# LightGBM multiclass classifier.
lgbm = LGBMClassifier(
    objective="multiclass",
    num_class=4,
    boosting_type="gbdt",
    learning_rate=0.05,
    n_estimators=2000,
    max_depth=-1,
    num_leaves=63,
    # sklearn-API name for `min_data_in_leaf`; using the native name avoids
    # passing the same setting twice (explicit alias + wrapper default).
    min_child_samples=50,
    reg_lambda=1.0,
    subsample=0.8,
    # Row subsampling is only applied when the bagging frequency is non-zero;
    # with the default of 0, `subsample=0.8` was silently ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)

print("\nEntrenando LightGBM…")
lgbm.fit(X_train, y_train)


print("\nPredicción LightGBM…")
# Rows per predict_proba call; the later prediction loops reuse this value.
batch = 20000

# Predict in batches so tqdm can report progress.
lgbm_probs_list = [
    lgbm.predict_proba(X_val.iloc[start:start + batch])
    for start in tqdm(range(0, len(X_val), batch))
]

probs_lgbm = np.vstack(lgbm_probs_list)
# Map probability columns to labels through the fitted `classes_` instead of
# assuming "column index + 1", which is only right when the labels are
# exactly 1..K with none missing.
pred_lgbm = np.asarray(lgbm.classes_)[probs_lgbm.argmax(axis=1)]

acc_lgbm = accuracy_score(y_val, pred_lgbm)
f1_lgbm = f1_score(y_val, pred_lgbm, average="macro")

print(f"LightGBM → acc={acc_lgbm:.4f} | f1_macro={f1_lgbm:.4f}")

# Train CatBoost

print("\nEntrenando CatBoost…")

# Hyperparameters are collected in one mapping and unpacked into the
# constructor; verbose=100 prints a progress line every 100 iterations.
catboost_params = {
    "loss_function": "MultiClass",
    "depth": 6,
    "learning_rate": 0.05,
    "l2_leaf_reg": 5,
    "iterations": 640,
    "random_seed": 42,
    "verbose": 100,
}
cat = CatBoostClassifier(**catboost_params)

cat.fit(X_train, y_train)

print("\nPredicción CatBoost…")

# Predict in batches (size `batch`, defined with the LightGBM prediction
# step) so tqdm can report progress.
cat_probs_list = [
    cat.predict_proba(X_val.iloc[start:start + batch])
    for start in tqdm(range(0, len(X_val), batch))
]

probs_cat = np.vstack(cat_probs_list)
# Map probability columns to labels through the fitted `classes_` instead of
# assuming "column index + 1", which is only right when the labels are
# exactly 1..K with none missing.
pred_cat = np.asarray(cat.classes_)[probs_cat.argmax(axis=1)]

acc_cat = accuracy_score(y_val, pred_cat)
f1_cat = f1_score(y_val, pred_cat, average="macro")

print(f"CatBoost → acc={acc_cat:.4f} | f1_macro={f1_cat:.4f}")

# Ensemble
# =========================================================
# Averaging the two probability matrices column by column is only meaningful
# when both models order their classes identically, so verify that first.
ensemble_classes = np.asarray(lgbm.classes_)
if not np.array_equal(ensemble_classes, np.asarray(cat.classes_)):
    raise ValueError(
        "LightGBM y CatBoost no coinciden en el orden de las clases."
    )

# Simple unweighted average of the two models' class probabilities.
probs_ens = (probs_lgbm + probs_cat) / 2
# Look the label up in `classes_` rather than assuming "column index + 1".
pred_ens = ensemble_classes[probs_ens.argmax(axis=1)]

acc_ens = accuracy_score(y_val, pred_ens)
f1_ens = f1_score(y_val, pred_ens, average="macro")

print(f"\nEnsemble → acc={acc_ens:.4f} | f1_macro={f1_ens:.4f}")


print("\nCargando TEST:", TEST_PATH)
test_df = pd.read_csv(TEST_PATH)

# Use exactly the training feature columns, in training order. Selecting
# "every numeric column of the test file" can add, drop or reorder columns
# relative to what the models were fitted on.
feature_cols = list(X.columns)
missing_cols = [col for col in feature_cols if col not in test_df.columns]
if missing_cols:
    raise ValueError(f"Faltan columnas en TEST: {missing_cols}")

# Impute with the training medians so test rows are preprocessed like the
# training rows (filling with 0 gave the models values they were not
# trained on). `X` is already median-imputed, which leaves each column's
# median essentially unchanged, so recomputing it here is equivalent.
train_medians = X.median()
test_df_num = test_df[feature_cols].fillna(train_medians).astype(np.float32)

# Averaging probabilities column by column requires both models to share the
# same class order; check before the (slow) prediction loop.
class_labels = np.asarray(lgbm.classes_)
if not np.array_equal(class_labels, np.asarray(cat.classes_)):
    raise ValueError(
        "LightGBM y CatBoost no coinciden en el orden de las clases."
    )

# Batched prediction with a progress bar.
print("\nPredicción FINAL (ensemble)…")
final_probs_list = []
for start in tqdm(range(0, len(test_df_num), batch)):
    part = test_df_num.iloc[start:start + batch]
    # Unweighted average of both models' class probabilities.
    final_probs_list.append(
        (lgbm.predict_proba(part) + cat.predict_proba(part)) / 2
    )

final_probs = np.vstack(final_probs_list)
# Look the label up in `classes_` rather than assuming "column index + 1".
final_pred = class_labels[final_probs.argmax(axis=1)]

# Submission file: one predicted class per test ID.
SUBMISSION_PATH = "submission_ensemble_colab.csv"
sub = pd.DataFrame({
    "ID": test_df["ID"],
    "RENDIMIENTO_GLOBAL": final_pred,
})

sub.to_csv(SUBMISSION_PATH, index=False)
print(f"\n✔ Archivo '{SUBMISSION_PATH}' listo para descargar.")

Cargando TRAIN: train_clean_unicode.csv
(692500, 47)

Conteo de clases:
RENDIMIENTO_GLOBAL_NUM
1    172987
2    172275
3    171619
4    175619
Name: count, dtype: int64
Shapes -> X: (692500, 46) | y: (692500,)
Train: (554000, 46) | Val: (138500, 46)

Entrenando LightGBM…
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.050998 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1465
[LightGBM] [Info] Number of data points in the train set: 554000, number of used features: 42
[LightGBM] [Info] Start training from score -1.387089
[LightGBM] [Info] Start training from score -1.391216
[LightGBM] [Info] Start training from score -1.395033
[LightGBM] [Info] Start training from score -1.371993

Predicción LightGBM…


  0%|          | 0/7 [00:00<?, ?it/s]



 14%|█▍        | 1/7 [00:16<01:41, 16.84s/it]



 29%|██▊       | 2/7 [00:32<01:20, 16.10s/it]



 43%|████▎     | 3/7 [00:47<01:02, 15.70s/it]



 57%|█████▋    | 4/7 [01:02<00:46, 15.45s/it]



 71%|███████▏  | 5/7 [01:18<00:31, 15.62s/it]



 86%|████████▌ | 6/7 [01:37<00:16, 16.63s/it]



100%|██████████| 7/7 [01:52<00:00, 16.08s/it]


LightGBM → acc=0.4283 | f1_macro=0.4170

Entrenando CatBoost…
0:	learn: 1.3778589	total: 346ms	remaining: 3m 41s
100:	learn: 1.2543426	total: 34.5s	remaining: 3m 4s
200:	learn: 1.2393714	total: 1m 8s	remaining: 2m 28s
300:	learn: 1.2304583	total: 1m 42s	remaining: 1m 55s
400:	learn: 1.2236402	total: 2m 15s	remaining: 1m 21s
500:	learn: 1.2193172	total: 2m 49s	remaining: 47.2s
600:	learn: 1.2160638	total: 3m 22s	remaining: 13.1s
639:	learn: 1.2149019	total: 3m 35s	remaining: 0us

Predicción CatBoost…


100%|██████████| 7/7 [00:00<00:00, 21.36it/s]


CatBoost → acc=0.4228 | f1_macro=0.4080

Ensemble → acc=0.4295 | f1_macro=0.4160

Cargando TEST: test_clean_unicode.csv

Predicción FINAL (ensemble)…


  0%|          | 0/15 [00:00<?, ?it/s]



  7%|▋         | 1/15 [00:17<04:09, 17.85s/it]



 13%|█▎        | 2/15 [00:33<03:38, 16.80s/it]



 20%|██        | 3/15 [00:50<03:21, 16.79s/it]



 27%|██▋       | 4/15 [01:09<03:11, 17.42s/it]



 33%|███▎      | 5/15 [01:26<02:54, 17.49s/it]



 40%|████      | 6/15 [01:45<02:40, 17.79s/it]



 47%|████▋     | 7/15 [02:04<02:26, 18.27s/it]



 53%|█████▎    | 8/15 [02:22<02:08, 18.36s/it]



 60%|██████    | 9/15 [02:37<01:43, 17.25s/it]



 67%|██████▋   | 10/15 [02:52<01:22, 16.59s/it]



 73%|███████▎  | 11/15 [03:07<01:03, 15.89s/it]



 80%|████████  | 12/15 [03:22<00:47, 15.83s/it]



 87%|████████▋ | 13/15 [03:41<00:33, 16.76s/it]



 93%|█████████▎| 14/15 [03:56<00:16, 16.30s/it]



100%|██████████| 15/15 [04:09<00:00, 16.64s/it]



✔ Archivo 'submission_ensemble_colab.csv' listo para descargar.
