│   │     • Reuse the TF-IDF features  
│   │     • Train & tune:  
│   │         - XGBoost (`multi:softprob`)  
│   │         - LightGBM (`multiclass`)  
│   │     • Early stopping on a validation fold  
│   │     • Plot feature importances  

In [1]:
# boosting.ipynb

import pandas as pd
import numpy as np
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report
import warnings
warnings.filterwarnings('ignore')

# Boosting libraries
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# 1. Load data (same as before)
def _load_split(path):
    """Read one CSV split, add ``combined_text`` and drop rows with blank text."""
    frame = pd.read_csv(path)
    # Missing headline/description values become empty strings so the
    # concatenation never yields NaN.
    frame["combined_text"] = frame["headline"].fillna('') + " " + frame["short_description"].fillna('')
    return frame[frame["combined_text"].str.strip() != ""]


def load_data(train_path="../data/processed/train.csv", test_path="../data/processed/test.csv"):
    """Load the processed train/test splits and integer-encode their categories.

    For each split, ``headline`` and ``short_description`` are joined with a
    single space into a ``combined_text`` column, and rows whose combined text
    is empty or whitespace-only are removed.

    Args:
        train_path: CSV file containing the training split. Defaults to the
            location the notebook has always used.
        test_path: CSV file containing the test split.

    Returns:
        A tuple ``(train_texts, test_texts, y_train, y_test, le)`` where the
        texts are the ``combined_text`` columns of the filtered frames,
        ``y_train`` / ``y_test`` are the encoded ``category`` labels, and
        ``le`` is the ``LabelEncoder`` fitted on the training categories.

    Note:
        The encoder is fitted on the training categories only, so a category
        that appears solely in the test split makes ``le.transform`` fail.
    """
    train_df = _load_split(train_path)
    test_df = _load_split(test_path)

    le = LabelEncoder()
    y_train = le.fit_transform(train_df["category"])
    y_test = le.transform(test_df["category"])

    return train_df["combined_text"], test_df["combined_text"], y_train, y_test, le

# 2. TF-IDF vectorization
def vectorize_text(train_texts, test_texts):
    """Convert the raw train/test texts into TF-IDF feature matrices.

    The vectorizer keeps at most 10000 unigram/bigram features and drops
    English stop words. It is fitted on the training texts only; the test
    texts are transformed with that same fitted vectorizer.

    Returns:
        ``(X_train, X_test)`` as produced by the vectorizer.
    """
    tfidf = TfidfVectorizer(
        stop_words='english',
        ngram_range=(1, 2),
        max_features=10000,
    )
    train_matrix = tfidf.fit_transform(train_texts)
    return train_matrix, tfidf.transform(test_texts)

# 3. Evaluation function
def train_and_evaluate(model, model_name, X_train, y_train, X_test, y_test, label_encoder):
    """Fit ``model`` on the training data and print its test-set metrics.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        model_name: Label printed above the metrics.
        X_train: Training feature matrix.
        y_train: Integer-encoded training labels.
        X_test: Test feature matrix.
        y_test: Integer-encoded test labels.
        label_encoder: Fitted encoder whose ``classes_`` supply the class
            names shown in the classification report.

    Prints accuracy, macro-averaged F1 and the per-class classification
    report. Nothing is returned.
    """
    model.fit(X_train, y_train)
    # Flatten so the metrics always receive a 1-D label vector; some
    # estimators may return predictions as an (n, 1) column.
    y_pred = np.asarray(model.predict(X_test)).ravel()

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')

    class_names = label_encoder.classes_
    # Pass the full label list explicitly: without it, classification_report
    # infers the labels from y_test/y_pred and raises when a class is missing
    # from both, because the count no longer matches target_names.
    report = classification_report(
        y_test,
        y_pred,
        labels=list(range(len(class_names))),
        target_names=class_names,
    )

    print(f"📌 {model_name}")
    print(f"Accuracy: {acc:.4f} | Macro F1: {f1:.4f}")
    print("Classification Report:\n", report)
    print("=" * 80)

# 4. Load data and build the TF-IDF features.
# The names bound here (X_train, X_test, y_train, y_test, le) are module-level
# and are reused by the XGBoost, LightGBM and CatBoost cells.
train_texts, test_texts, y_train, y_test, le = load_data()
X_train, X_test = vectorize_text(train_texts, test_texts)


## XGBoost
- Very slow and memory-intensive on large datasets.
- Training time: not yet recorded (TODO: fill in after a complete run).

In [None]:
# 5. Train XGBoost
# NOTE(review): training is reported to be slow on this dataset; a
# histogram-based tree_method may help — confirm against the installed version.
xgb_model = XGBClassifier(
    objective='multi:softmax',
    num_class=len(np.unique(y_train)),
    eval_metric='mlogloss',
    use_label_encoder=False,
    n_estimators=150,  # Number of trees (higher = more complex).
    learning_rate=0.1,  # Shrinks contribution of each tree (lower = slower but often better).
    max_depth=8,  # Maximum tree depth. Controls complexity.
    subsample=0.8,  # Percentage of data used for each tree. Prevents overfitting.
    colsample_bytree=0.8,  # Percentage of features used for each tree. Prevents overfitting.
    random_state=42,  # The comma here was missing, which made the whole cell a SyntaxError.
    n_jobs=-1,  # Use all available cores
)
# Time the run the same way the LightGBM and CatBoost cells do. The interval
# covers fit, predict and metric printing inside train_and_evaluate.
s = time.time()
train_and_evaluate(xgb_model, "XGBoost", X_train, y_train, X_test, y_test, le)
e = time.time()
print(f"XGBoost training time: {e - s:.2f} seconds")


## LightGBM
Training time: roughly 3.5–4 minutes.

In [7]:
# Train LightGBM with hyperparameters mirroring the XGBoost configuration.
lgbm_model = LGBMClassifier(
    objective='multiclass',
    num_class=len(np.unique(y_train)),
    n_estimators=150,
    learning_rate=0.1,
    max_depth=8,
    subsample=0.8,  # Fraction of rows sampled for bagging
    subsample_freq=1,  # Re-sample every iteration; LightGBM ignores `subsample` while this is 0
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,  # Use all available cores
    min_child_samples=45,  # Minimum number of data points in a leaf node (sklearn-API name for min_data_in_leaf)
)
# The measured interval covers fit, predict and metric printing inside
# train_and_evaluate, not the fit alone.
s = time.time()
train_and_evaluate(lgbm_model, "LightGBM", X_train, y_train, X_test, y_test, le)
e = time.time()
print(f"LightGBM training time: {e - s:.2f} seconds")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.983392 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 608113
[LightGBM] [Info] Number of data points in the train set: 167617, number of used features: 9449
[LightGBM] [Info] Start training from score -4.933544
[LightGBM] [Info] Start training from score -5.053089
[LightGBM] [Info] Start training from score -3.822580
[LightGBM] [Info] Start training from score -3.554525
[LightGBM] [Info] Start training from score -5.210513
[LightGBM] [Info] Start training from score -3.658426
[LightGBM] [Info] Start training from score -4.074363
[LightGBM] [Info] Start training from score -5.273668
[LightGBM] [Info] Start training from score -4.113359
[LightGBM] [Info] Start training from score -5.331169
[LightGBM] [Info] Start training from score -2.490584
[LightGBM] [Info] Start training from score -4.9

## CatBoost

In [None]:

# CatBoost configuration: multiclass loss with the same tree count, learning
# rate and depth used for the other boosters in this notebook.
catboost_params = {
    "loss_function": 'MultiClass',
    "iterations": 150,
    "learning_rate": 0.1,
    "depth": 8,
    "random_seed": 42,
    "verbose": 0,  # change to 100 for more logs
}
catboost_model = CatBoostClassifier(**catboost_params)

# Wall-clock time around the whole train-and-evaluate call.
s = time.time()
train_and_evaluate(catboost_model, "CatBoost", X_train, y_train, X_test, y_test, le)
e = time.time()
print(f"CatBoost training time: {e - s:.2f} seconds")
