
# 🚀 03c — XGBoost & LightGBM (Regressão)

Testamos **XGBoost** e **LightGBM** para prever `IMDB_Rating`.

- Entrada: `../data/movies.csv` (caminho relativo a este notebook)
- Saídas: `../models/imdb_model_xgb.pkl` e `../models/imdb_model_lgbm.pkl` (cada modelo só é salvo se a respectiva biblioteca estiver instalada)
- Observação: estes *boosters* lidam bem com **matrizes esparsas**, então podemos manter `OneHotEncoder` e `TF‑IDF` esparsos.
- Se você não tiver `xgboost`/`lightgbm` instalados, instale:
  ```bash
  pip install xgboost lightgbm
  ```


In [2]:

# Imports
import warnings
warnings.filterwarnings("ignore")

import re, os, pickle, importlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Optional boosters: probe for each library without importing it, so the
# notebook still runs (and skips the matching sections) when one is missing.
# `importlib.util` is a submodule; a bare `import importlib` does not guarantee
# it is loaded, so import it explicitly before calling `find_spec`.
import importlib.util

have_xgb = importlib.util.find_spec("xgboost") is not None
have_lgb = importlib.util.find_spec("lightgbm") is not None

if have_xgb:
    import xgboost as xgb
if have_lgb:
    import lightgbm as lgb



## 1) Carregar e preparar dados


In [3]:

# Location of the raw dataset, given relative to the notebook's working directory.
DATA_PATH = "../data/movies.csv"
df = pd.read_csv(DATA_PATH)
# Quick sanity check: (rows, columns) of the frame that was just loaded.
print("Shape:", df.shape)

def parse_runtime_to_minutes(runtime):
    """Convert a runtime value to whole minutes.

    Parameters
    ----------
    runtime : Any
        Missing values (as judged by ``pd.isna``), plain ``int``/``float``
        numbers, or anything whose string form contains digits
        (e.g. ``"142 min"``).

    Returns
    -------
    int or float
        The runtime as an ``int``. ``np.nan`` is returned for missing input,
        for non-positive numbers, and for text that contains no digits.
    """
    if pd.isna(runtime):
        return np.nan

    # Numeric input: truncate to an integer, rejecting zero and negatives.
    if isinstance(runtime, (int, float)):
        if runtime > 0:
            return int(runtime)
        return np.nan

    # Anything else: take the first run of digits found in its string form.
    match = re.search(r"(\d+)", str(runtime))
    if match is None:
        return np.nan
    return int(match.group(1))

def parse_gross_to_number(gross):
    """Convert a box-office gross value to a float.

    Parameters
    ----------
    gross : Any
        ``None``, a plain ``int``/``float``, or a value whose string form is a
        number, optionally using "," and/or "." as thousands separators
        (e.g. ``"28,341,469"`` or ``"1.234.567"``).

    Returns
    -------
    float
        The parsed amount, or ``np.nan`` when the input is ``None``, empty,
        or not a recognisable non-negative number. Numeric ``NaN`` input is
        passed through as ``nan``.
    """
    if gross is None:
        return np.nan
    if isinstance(gross, (int, float)):
        return float(gross)

    # Commas are always treated as thousands separators.
    s = str(gross).strip().replace(",", "")
    if s == "":
        return np.nan

    # Dots are thousands separators only when every group after the first has
    # exactly three digits ("1.234.567"). Stripping dots unconditionally would
    # silently inflate decimal strings such as "1234.5" into 12345.
    if re.fullmatch(r"\d{1,3}(\.\d{3})+", s):
        return float(s.replace(".", ""))

    # Plain integer or decimal number.
    if re.fullmatch(r"\d+(\.\d+)?", s):
        return float(s)

    return np.nan

# Build the modelling frame in one step; `assign` returns a new DataFrame, so
# the raw `df` is left untouched.
dfc = df.assign(
    Runtime_min=df["Runtime"].apply(parse_runtime_to_minutes),
    Gross_num=df["Gross"].apply(parse_gross_to_number),
    # Callable form so it reads the Gross_num column created just above.
    Gross_log1p=lambda frame: np.log1p(frame["Gross_num"]),
    # Unparseable years become <NA> in a nullable integer column.
    Released_Year_int=pd.to_numeric(df["Released_Year"], errors="coerce").astype("Int64"),
    # The text vectorizer needs strings, so missing overviews become "".
    Overview=df["Overview"].fillna(""),
)


Shape: (999, 16)



## 2) Cenário e colunas


In [4]:

SCENARIO = "post_release"   # "post_release" | "early_stage"
use_text = True             # text can be kept as a sparse feature

# Feature groups by type.
num_base = ["No_of_Votes", "Runtime_min", "Released_Year_int"]
cat_cols = ["Genre", "Certificate", "Director", "Star1", "Star2", "Star3", "Star4"]
text_col = "Overview"

if SCENARIO == "post_release":
    num_cols = num_base + ["Meta_score", "Gross_log1p"]
elif SCENARIO == "early_stage":
    # Copy, so extending num_cols (see the optional line below) never mutates num_base.
    num_cols = list(num_base)
    # Optional: num_cols += ["Meta_score"]
else:
    # Fail loudly on a typo instead of silently using the early-stage feature set.
    raise ValueError(
        f"Unknown SCENARIO {SCENARIO!r}; expected 'post_release' or 'early_stage'"
    )

# Target and feature matrix, followed by a fixed-seed 80/20 hold-out split.
y = dfc["IMDB_Rating"].astype(float)
X = dfc[num_cols + cat_cols + [text_col]].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Train:", X_train.shape, " Test:", X_test.shape)


Train: (799, 13)  Test: (200, 13)



## 3) Pré-processamento (esparso) + Modelos
- Numéricas → `SimpleImputer(median)`
- Categóricas → `SimpleImputer(most_frequent)` + `OneHotEncoder` (esparso)
- Texto → `TfidfVectorizer(max_features=12000, ngram_range=(1,2), stop_words="english")` (esparso)


In [5]:

# Numeric columns: fill missing values with the column median.
num_pipe = SimpleImputer(strategy="median")

# Categorical columns: fill with the most frequent value, then one-hot encode.
# Categories unseen during fit are ignored at transform time.
cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))  # sparse by default
])

transformers = [
    ("num", num_pipe, num_cols),
    ("cat", cat_pipe, cat_cols),
]

# Honour the `use_text` switch: previously the TF-IDF branch was always added,
# so setting the flag to False had no effect. When it is off, the text column
# is discarded by `remainder="drop"`.
if use_text:
    transformers.append(
        ("txt", TfidfVectorizer(max_features=12000, ngram_range=(1,2), stop_words="english"), text_col)
    )

pre = ColumnTransformer(
    transformers=transformers,
    remainder="drop"
)



## 4) XGBoost


In [6]:

if have_xgb:
    # Baseline XGBoost regressor with fixed hyperparameters and a fixed seed.
    xgb_est = xgb.XGBRegressor(
        n_estimators=800,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        tree_method="hist"
    )
    # Preprocessing and estimator in a single pipeline, fitted on the training split.
    xgb_pipe = Pipeline([("pre", pre), ("est", xgb_est)])
    xgb_pipe.fit(X_train, y_train)

    # Evaluate on the hold-out split.
    ypx = xgb_pipe.predict(X_test)
    # RMSE is computed as sqrt(MSE): the `squared=False` shortcut was deprecated
    # in scikit-learn 1.4 and removed in 1.6, whereas this form works everywhere.
    rmse = np.sqrt(mean_squared_error(y_test, ypx))
    mae  = mean_absolute_error(y_test, ypx)
    r2   = r2_score(y_test, ypx)
    print(f"XGBoost — RMSE: {rmse:.3f} | MAE: {mae:.3f} | R²: {r2:.3f}")
else:
    print("xgboost não instalado. Rode: pip install xgboost")


xgboost não instalado. Rode: pip install xgboost



### Grid Search (pequeno) — XGBoost


In [7]:

if have_xgb:
    # Small grid: 2 values for each of 5 parameters = 32 candidates, each
    # evaluated with 3-fold CV on the training split.
    param_grid = {
        "est__n_estimators": [400, 800],
        "est__max_depth": [4, 6],
        "est__learning_rate": [0.05, 0.1],
        "est__subsample": [0.8, 1.0],
        "est__colsample_bytree": [0.7, 1.0]
    }
    # NOTE(review): both the search and the estimator request n_jobs=-1, which
    # may oversubscribe CPU cores — consider setting one of them to 1.
    gsx = GridSearchCV(xgb_pipe, param_grid, cv=3, scoring="neg_root_mean_squared_error", n_jobs=-1, verbose=1)
    gsx.fit(X_train, y_train)

    # Score the best pipeline found by the search on the hold-out split.
    best_xgb = gsx.best_estimator_
    ypb = best_xgb.predict(X_test)
    # RMSE is computed as sqrt(MSE): the `squared=False` shortcut was deprecated
    # in scikit-learn 1.4 and removed in 1.6, whereas this form works everywhere.
    rmse = np.sqrt(mean_squared_error(y_test, ypb))
    mae  = mean_absolute_error(y_test, ypb)
    r2   = r2_score(y_test, ypb)
    print("Best XGB params:", gsx.best_params_)
    print(f"XGB (tuned) — RMSE: {rmse:.3f} | MAE: {mae:.3f} | R²: {r2:.3f}")



## 5) LightGBM


In [8]:

if have_lgb:
    # Baseline LightGBM regressor with fixed hyperparameters and a fixed seed.
    lgb_est = lgb.LGBMRegressor(
        n_estimators=1200,
        num_leaves=63,
        learning_rate=0.05,
        subsample=0.8,
        # LightGBM ignores `subsample` unless bagging is switched on with
        # subsample_freq > 0 (the default 0 disables it); resample every iteration.
        subsample_freq=1,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    )
    # Preprocessing and estimator in a single pipeline, fitted on the training split.
    lgb_pipe = Pipeline([("pre", pre), ("est", lgb_est)])
    lgb_pipe.fit(X_train, y_train)

    # Evaluate on the hold-out split.
    ypl = lgb_pipe.predict(X_test)
    # RMSE is computed as sqrt(MSE): the `squared=False` shortcut was deprecated
    # in scikit-learn 1.4 and removed in 1.6, whereas this form works everywhere.
    rmse = np.sqrt(mean_squared_error(y_test, ypl))
    mae  = mean_absolute_error(y_test, ypl)
    r2   = r2_score(y_test, ypl)
    print(f"LightGBM — RMSE: {rmse:.3f} | MAE: {mae:.3f} | R²: {r2:.3f}")
else:
    print("lightgbm não instalado. Rode: pip install lightgbm")


lightgbm não instalado. Rode: pip install lightgbm



### Grid Search (pequeno) — LightGBM


In [9]:

if have_lgb:
    # Small grid: 2 values for each of 5 tuned parameters = 32 candidates, each
    # evaluated with 3-fold CV on the training split.
    param_grid = {
        "est__n_estimators": [800, 1200],
        "est__num_leaves": [31, 63],
        "est__learning_rate": [0.05, 0.1],
        "est__subsample": [0.8, 1.0],
        # Pinned (single value, adds no candidates): LightGBM only applies
        # `subsample` when subsample_freq > 0, so without this the subsample
        # axis above would compare identical models.
        "est__subsample_freq": [1],
        "est__colsample_bytree": [0.8, 1.0]
    }
    # NOTE(review): both the search and the estimator request n_jobs=-1, which
    # may oversubscribe CPU cores — consider setting one of them to 1.
    gsl = GridSearchCV(lgb_pipe, param_grid, cv=3, scoring="neg_root_mean_squared_error", n_jobs=-1, verbose=1)
    gsl.fit(X_train, y_train)

    # Score the best pipeline found by the search on the hold-out split.
    best_lgb = gsl.best_estimator_
    ypb = best_lgb.predict(X_test)
    # RMSE is computed as sqrt(MSE): the `squared=False` shortcut was deprecated
    # in scikit-learn 1.4 and removed in 1.6, whereas this form works everywhere.
    rmse = np.sqrt(mean_squared_error(y_test, ypb))
    mae  = mean_absolute_error(y_test, ypb)
    r2   = r2_score(y_test, ypb)
    print("Best LGB params:", gsl.best_params_)
    print(f"LGB (tuned) — RMSE: {rmse:.3f} | MAE: {mae:.3f} | R²: {r2:.3f}")



## 6) Salvar os melhores (se existirem)


In [10]:

# Ensure the output directory exists before writing any model file.
os.makedirs("../models", exist_ok=True)

if have_xgb:
    with open("../models/imdb_model_xgb.pkl", "wb") as f:
        # Prefer the tuned pipeline when the grid-search cell has been run;
        # otherwise fall back to the baseline pipeline.
        if 'best_xgb' in globals():
            xgb_to_save = best_xgb
        else:
            xgb_to_save = xgb_pipe
        pickle.dump(xgb_to_save, f)
    print("Modelo XGBoost salvo em models/imdb_model_xgb.pkl")

if have_lgb:
    with open("../models/imdb_model_lgbm.pkl", "wb") as f:
        # Same preference as above: tuned pipeline first, baseline otherwise.
        if 'best_lgb' in globals():
            lgb_to_save = best_lgb
        else:
            lgb_to_save = lgb_pipe
        pickle.dump(lgb_to_save, f)
    print("Modelo LightGBM salvo em models/imdb_model_lgbm.pkl")
