# 04 - Model Training
LightGBM + Grid Search + Threshold Optimization


## 1. Cargar Dataset

In [ ]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from sklearn import set_config
from lightgbm import LGBMClassifier

# Location of the modeling CSV, relative to the notebook's working directory.
DATA_PATH = "../data/processed/spotify_clean_modeling.csv"

# Read the whole file into memory; the trailing expression shows
# (rows, columns) as the cell output.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.shape

## 2. Separar X / y y preprocesamiento

In [ ]:
# Features are every column except the target (`is_hit`) and `popularity`.
# NOTE(review): `popularity` is presumably excluded because the target was
# derived from it (leakage) -- confirm against the data-prep notebook.
X = df.drop(columns=["is_hit", "popularity"])
y = df["is_hit"]

# Make sklearn transformers return DataFrames so column names are kept
# through the ColumnTransformer.
set_config(transform_output="pandas")

# Treat every numeric or boolean dtype as a numeric feature. Matching only
# float64/int64 would leave bool/int32/float32 columns in neither list, and
# the ColumnTransformer below silently drops unlisted columns (its default
# is remainder="drop").
numeric_cols = X.select_dtypes(include=["number", "bool"]).columns.tolist()
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()

# One-hot encode the categorical columns and pass numeric ones through
# untouched. Dense output is required because pandas output does not
# support sparse matrices; categories unseen during fit encode as all zeros.
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols),
    ("num", "passthrough", numeric_cols),
])

# Cell output: how many columns fall in each group.
len(numeric_cols), len(categorical_cols)

## 3. Train / Validation / Test Split

In [ ]:
# First cut: hold out 20% of the rows as the test set, stratified on the
# target so the class ratio is kept.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

# Second cut: 12.5% of the remaining 80% becomes the validation set, i.e.
# 10% of all rows, leaving 70% for model fitting.
X_trainModel, X_val, y_trainModel, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.125,
    random_state=42,
    stratify=y_train,
)

# Cell output: shapes of the three feature partitions.
X_trainModel.shape, X_val.shape, X_test.shape

## 4. Pipeline + Grid Search

In [ ]:
# Chain preprocessing and the classifier so the encoder is re-fitted inside
# every cross-validation fold instead of once on all training rows.
pipeline = Pipeline([
    ("preprocess", preprocessor),
    (
        "model",
        LGBMClassifier(
            random_state=42,
            n_jobs=-1,
            verbose=-1,
            # LightGBM ignores `subsample` while `subsample_freq` is 0 (the
            # default). Re-sample rows at every iteration so the 0.8 value
            # in the grid below actually takes effect.
            subsample_freq=1,
        ),
    ),
])

# 3 * 2 * 2 * 2 * 3 = 72 parameter combinations; with cv=3 that is 216 fits
# plus the final refit of the best combination.
param_grid = {
    "model__num_leaves": [31, 63, 95],
    "model__learning_rate": [0.03, 0.015],
    "model__n_estimators": [800, 1500],
    "model__min_child_samples": [20, 50],
    # Weight applied to the positive class to counter class imbalance.
    "model__scale_pos_weight": [8, 9.5, 12],
    "model__subsample": [0.8],
    "model__colsample_bytree": [0.8],
}

# NOTE(review): both the estimator and the search use n_jobs=-1, which can
# oversubscribe the CPU (every parallel fit spawning its own full thread
# pool). Consider n_jobs=1 on the model during the search -- confirm on the
# target machine before changing.
grid = GridSearchCV(pipeline, param_grid, scoring="f1", cv=3, verbose=1, n_jobs=-1)

# The search only sees X_trainModel; the validation and test partitions stay
# untouched.
grid.fit(X_trainModel, y_trainModel)

# Cell output: winning parameters and their mean cross-validated F1.
grid.best_params_, grid.best_score_

## 5. Evaluación Final

In [ ]:
# Pipeline refitted by the grid search with the best parameter combination.
best_model = grid.best_estimator_

# Score the held-out test set: positive-class probabilities for ROC AUC,
# and hard labels from predict() (not the threshold tuned in a later
# section) for accuracy and F1.
y_proba = best_model.predict_proba(X_test)[:, 1]
y_pred = best_model.predict(X_test)

# Cell output: (accuracy, F1, ROC AUC) on the test set.
(
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred),
    roc_auc_score(y_test, y_proba),
)

## 6. Threshold Tuning

In [ ]:
# Positive-class probabilities on the validation partition, which was not
# used to fit the model.
y_proba_val = best_model.predict_proba(X_val)[:, 1]

# 30 evenly spaced candidate cut-offs between 0.05 and 0.95.
thresholds = np.linspace(0.05, 0.95, 30)

# F1 on the validation set for each candidate: a row is labelled positive
# when its probability is at or above the cut-off.
f1_vals = [
    f1_score(y_val, (y_proba_val >= cutoff).astype(int))
    for cutoff in thresholds
]

# Keep the cut-off with the highest F1 (the first one on ties, as argmax
# returns the earliest maximum) and show it as the cell output.
best_t = thresholds[np.argmax(f1_vals)]
best_t

## 7. Guardar modelo

In [ ]:
# Create the output directory first: the dump fails with FileNotFoundError
# on a fresh checkout where ../models does not exist yet.
os.makedirs("../models", exist_ok=True)

# Persist the full fitted pipeline (preprocessing + classifier).
# NOTE(review): the tuned threshold `best_t` is not stored here; a consumer
# that should apply it needs to receive it separately.
joblib.dump(best_model, "../models/lightgbm_best.pkl")