In [1]:
# ==== Basic Python / Data Handling ====
import os
import re
import json
import numpy as np
import pandas as pd

# ==== Scikit-Learn Utilities ====
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import LabelEncoder


# Matplotlib
import matplotlib.pyplot as plt


# ==== PyTorch ====
import torch
from torch.utils.data import Dataset, DataLoader

# ==== HuggingFace Transformers ====
from transformers import (
    CamembertTokenizer,
    CamembertForSequenceClassification,
    TrainingArguments,
    Trainer
)
from transformers import CamembertModel


# ==== LightGBM (Metadata Model) ====
import lightgbm as lgb

# ==== Progress Bar (optional but recommended) ====
from tqdm import tqdm

# ==== Warnings (optional) ====
import warnings
warnings.filterwarnings("ignore")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_jsonl(path):
    df = pd.read_json(path, lines=True)

    # --- Clean text ---
    def clean_text(t):
        if pd.isna(t):
            return ""
        t = str(t).replace("\n", " ").strip()
        return re.sub(r"\s+", " ", t)

    df["text"] = df["text"].apply(clean_text)

    # --- Text features Extraction---
    df["has_url"] = df["text"].str.contains(r"http[s]?://", regex=True).astype(int)
    df["num_hashtags"] = df["text"].str.count(r"#")
    df["has_hashtag"] = (df["num_hashtags"] > 0).astype(int)
    df["num_mentions"] = df["text"].str.count(r"@")

    emoji_pattern = r"[\U0001F600-\U0001F64F]"
    df["num_emojis"] = df["text"].str.count(emoji_pattern)

    df["text_len"] = df["text"].str.len()
    df["num_caps"] = df["text"].str.count(r"[A-Z]")
    df["num_exclam"] = df["text"].str.count(r"!")
    df["num_question"] = df["text"].str.count(r"\?")
    df["elongated_words"] = df["text"].str.contains(r"(.)\1\1+").astype(int)

    df["emoji_density"] = df["num_emojis"] / (df["text_len"] + 1)
    df["punct_ratio"] = (df["num_exclam"] + df["num_question"]) / (df["text_len"] + 1)


    df["user_created_at"] = df["user"].apply(
    lambda u: u.get("created_at", None) if isinstance(u, dict) else None)

    df['hour'] = pd.to_datetime(df['created_at'], errors='coerce').dt.hour.fillna(-1).astype(int)
    df['day_of_week'] = pd.to_datetime(df['created_at'], errors='coerce').dt.dayofweek.fillna(-1).astype(int)
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['is_business_hours'] = df['hour'].between(9, 17).astype(int)
    def part_of_day(h):
        if h == -1: return -1
        if 5 <= h < 12:  return 0   # morning
        if 12 <= h < 17: return 1   # afternoon
        if 17 <= h < 21: return 2   # evening (prime time)
        return 3                    # night

    df['part_of_day'] = df['hour'].apply(part_of_day)
    df['month'] = pd.to_datetime(df['created_at'], errors='coerce').dt.month.fillna(-1).astype(int)







    # --- Promo words ---
    promo_words = [
    # English
    "check out", "giveaway", "subscribe", "new video", "follow me", "promo",
    "discount", "code", "link in bio", "limited offer", "free shipping",
    "new post", "sale", "deal", "official store", "use my code", "partnership",
    "affiliate", "sponsored", "collab", "win", "contest", "flash sale",
    "click here", "buy now", "shop now", "big announcement", "launch",
    "premiere", "live now", "join now", "special offer",

    # French
    "nouvelle vidéo", "abonnez", "abonnez-vous", "nouveau post", "concours",
    "gagnez", "offre", "code promo", "réduction", "promo", "découvrez",
    "lien en bio", "regardez", "suivez-moi", "partagez", "inscrivez-vous",
    "expédition gratuite", "vente", "soldes", "offre limitée", "nouvelle offre",
    "partenariat", "collaboration", "sponsorisé", "gagnez maintenant",
    "cliquez ici", "achetez maintenant", "boutique officielle", "tirage au sort",
    "mise en ligne", "lancement", "live maintenant"
    ]

    # Binary: tweet contains ANY promo word
    df["promo_words"] = df["text"].str.lower().apply(
        lambda t: any(w in t for w in promo_words)
    ).astype(int)

    # Count: how many promo words appear in the tweet
    df["num_promo_words"] = df["text"].str.lower().apply(
        lambda t: sum(w in t for w in promo_words)
    ).astype(int)

    df["statuses"] = df["user"].apply(
        lambda u: u.get("statuses_count", -1) if isinstance(u, dict) else -1
    )

    df["account_age_days"] = (pd.to_datetime(df["created_at"]) -
                          pd.to_datetime(df["user_created_at"], errors="coerce")).dt.days.fillna(-1)
    df["tweet_frequency"] = df["statuses"] / (df["account_age_days"] + 1)
    

    print("Loaded:", path)
    print("Rows:", len(df))
    return df


In [3]:
train_path = "data/Kaggle2025/train.jsonl"


df = load_jsonl(train_path)

from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
    stratify=df["label"]
)

print("Train size:", len(train_df))
print("Val size:", len(val_df))


Loaded: data/Kaggle2025/train.jsonl
Rows: 154914
Train size: 139422
Val size: 15492


In [4]:
# Present working
print("Using CUDA:", torch.cuda.is_available())
print("Device count:", torch.cuda.device_count())
print("Current device:", torch.cuda.current_device() if torch.cuda.is_available() else "CPU")
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

MODEL_NAME = "camembert-base"

tokenizer = CamembertTokenizer.from_pretrained(MODEL_NAME)
camembert = CamembertModel.from_pretrained(MODEL_NAME,
                                           output_hidden_states = True)
camembert.to("cuda")
camembert.eval()

Using CUDA: True
Device count: 1
Current device: 0
GPU name: NVIDIA RTX 4000 Ada Generation


CamembertModel(
  (embeddings): CamembertEmbeddings(
    (word_embeddings): Embedding(32005, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): CamembertEncoder(
    (layer): ModuleList(
      (0-11): 12 x CamembertLayer(
        (attention): CamembertAttention(
          (self): CamembertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): CamembertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
 

In [5]:
class EmbeddingDataset(Dataset):
    def __init__(self, df, text_col="text"):
        self.texts = df[text_col].tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=160,
            return_tensors="pt"
        )
        return {k: v.squeeze(0) for k, v in enc.items()}




def mean_pooling(hidden_state, attention_mask):
    # hidden_state: (B, L, H)
    # attention_mask: (B, L)
    mask = attention_mask.unsqueeze(-1)       # (B, L, 1)
    masked_hidden = hidden_state * mask       # zero-out PAD tokens

    summed = masked_hidden.sum(dim=1)         # sum over tokens
    counts = mask.sum(dim=1).clamp(min=1)     # avoid division by zero

    return summed / counts                    # (B, H)


def last_four_layers_pooling(hidden_states, attention_mask):

    layers = hidden_states[-4:]  # list of 4 tensors

    pooled = []
    for hs in layers:
        pooled.append(mean_pooling(hs, attention_mask))  # each is (B, H)

    stacked = torch.stack(pooled, dim=0)  # (4, B, H)

    # weights: more weight for deeper layers
    weights = torch.tensor([1.0, 2.0, 3.0, 4.0], device=stacked.device)
    weights = weights / weights.sum()                # normalize
    weights = weights.view(4, 1, 1)                  # (4, 1, 1)

    return (stacked * weights).sum(dim=0)

    

def extract_embeddings(df, batch_size=32):
    ds = EmbeddingDataset(df)
    dl = DataLoader(ds, batch_size=batch_size, shuffle=False)

    all_embeddings = []

    for batch in dl:
        input_ids = batch["input_ids"].to("cuda")
        attn_mask = batch["attention_mask"].to("cuda")

        with torch.no_grad():
            outputs = camembert(
                input_ids=input_ids,
                attention_mask=attn_mask,
                output_hidden_states=True,
                return_dict=True
            )

            # use last 4 layers pooling
            hidden_states = outputs.hidden_states
            sentence_emb = last_four_layers_pooling(hidden_states, attn_mask)

        all_embeddings.append(sentence_emb.cpu())

    return torch.cat(all_embeddings, dim=0).numpy()



emb_train = extract_embeddings(train_df)
emb_val   = extract_embeddings(val_df)

print(emb_train.shape)   # (N_train, 768)
print(emb_val.shape)     # (N_val, 768)


(139422, 768)
(15492, 768)


In [6]:
# np.save("notebooks/emb_train.npy", emb_train)
# np.save("notebooks/emb_val.npy", emb_val)


In [7]:
def flatten_record(prefix, obj, out_dict):
    if isinstance(obj, dict):
        for k, v in obj.items():
            new_key = f"{prefix}_{k}" if prefix else k
            flatten_record(new_key, v, out_dict)
    elif isinstance(obj, list):
        out_dict[prefix + "_len"] = len(obj)
    else:
        out_dict[prefix] = obj


def extract_features_from_row(row):
    feature_dict = {}
    for col in row.index:
        if col in ["text", "label"]:  # skip text + label
            continue
        flatten_record(col, row[col], feature_dict)
    return feature_dict


def build_metadata_dataframe(df):
    records = []
    for _, row in df.iterrows():
        records.append(extract_features_from_row(row))
    return pd.DataFrame(records)


meta_train = build_metadata_dataframe(train_df)
meta_val   = build_metadata_dataframe(val_df)


meta_val = meta_val.reindex(columns=meta_train.columns, fill_value=-1)


print("Train metadata shape:", meta_train.shape)
print("Val metadata shape:", meta_val.shape)

Train metadata shape: (139422, 214)
Val metadata shape: (15492, 214)


In [8]:
# Remove constants
constant_cols = meta_train.columns[meta_train.nunique() <= 1].tolist()
meta_train.drop(columns=constant_cols, inplace=True)
meta_val.drop(columns=constant_cols, inplace=True)

# Remove very sparse
sparse_cols = meta_train.columns[meta_train.isna().mean() > 0.80].tolist()
meta_train.drop(columns=sparse_cols, inplace=True)
meta_val.drop(columns=sparse_cols, inplace=True)

# Remove datetime
datetime_cols = meta_train.select_dtypes(include=["datetime64[ns]", "datetime64[ns, UTC]"]).columns
meta_train.drop(columns=datetime_cols, inplace=True)
meta_val.drop(columns=datetime_cols, inplace=True)

# Handle object columns
object_cols = meta_train.select_dtypes(include=["object"]).columns

keep = []
drop = []

for col in object_cols:
    if meta_train[col].nunique() <= 20:
        keep.append(col)
    else:
        drop.append(col)

# Label encode small-categorical columns
for col in keep:
    le = LabelEncoder()
    meta_train[col] = le.fit_transform(meta_train[col].astype(str))
    meta_val[col]   = le.transform(meta_val[col].astype(str))

# Drop high-cardinality object columns
meta_train.drop(columns=drop, inplace=True)
meta_val.drop(columns=drop, inplace=True)

# Convert to numeric + fill NA
meta_train = meta_train.apply(pd.to_numeric, errors="ignore").fillna(-1)
meta_val   = meta_val.apply(pd.to_numeric, errors="ignore").fillna(-1)

print("Final shapes:", meta_train.shape, meta_val.shape)


Final shapes: (139422, 79) (15492, 79)


In [9]:
X_train = np.hstack([emb_train, meta_train.values])
X_val   = np.hstack([emb_val, meta_val.values])

y_train = train_df["label"].values
y_val   = val_df["label"].values

print(X_train.shape, X_val.shape)


(139422, 847) (15492, 847)


In [10]:
from xgboost import XGBClassifier

xgb_best = None
xgb_best_acc = 0

xgb_grid = [

    {"n_estimators": 600,  "max_depth": 6, "learning_rate": 0.03},
    {"n_estimators": 600, "max_depth": 8, "learning_rate": 0.02},
    {"n_estimators": 900, "max_depth": 6, "learning_rate": 0.02},
    {"n_estimators": 900, "max_depth": 6, "learning_rate": 0.03},
    {"n_estimators": 900, "max_depth": 8, "learning_rate": 0.05},
    {"n_estimators": 1200, "max_depth": 8, "learning_rate": 0.03},
    {"n_estimators": 1200, "max_depth": 6, "learning_rate": 0.02},
    {"n_estimators": 1200, "max_depth": 8, "learning_rate": 0.05},

]


for params in xgb_grid:
    model = XGBClassifier(
        n_estimators=params["n_estimators"],
        max_depth=params["max_depth"],
        learning_rate=params["learning_rate"],
        subsample=0.9,
        colsample_bytree=0.9,
        eval_metric="logloss",
        
        tree_method="gpu_hist",
        predictor="gpu_predictor",
        gpu_id=0)

    model.fit(X_train, y_train)
    preds = model.predict_proba(X_val)[:, 1]
    acc = ((preds >= 0.5).astype(int) == y_val).mean()

    print(f"XGB {params} → acc={acc:.4f}")

    if acc > xgb_best_acc:
        xgb_best_acc = acc
        xgb_best = model

preds_xgb = xgb_best.predict_proba(X_val)[:, 1]
print("Best XGBoost accuracy:", xgb_best_acc)


XGB {'n_estimators': 600, 'max_depth': 6, 'learning_rate': 0.03} → acc=0.8430
XGB {'n_estimators': 600, 'max_depth': 8, 'learning_rate': 0.02} → acc=0.8449
XGB {'n_estimators': 900, 'max_depth': 6, 'learning_rate': 0.02} → acc=0.8442
XGB {'n_estimators': 900, 'max_depth': 6, 'learning_rate': 0.03} → acc=0.8426
XGB {'n_estimators': 900, 'max_depth': 8, 'learning_rate': 0.05} → acc=0.8479
XGB {'n_estimators': 1200, 'max_depth': 8, 'learning_rate': 0.03} → acc=0.8470
XGB {'n_estimators': 1200, 'max_depth': 6, 'learning_rate': 0.02} → acc=0.8435
XGB {'n_estimators': 1200, 'max_depth': 8, 'learning_rate': 0.02} → acc=0.8468
Best XGBoost accuracy: 0.8478569584301575


In [11]:
from lightgbm import LGBMClassifier

lgb_best = None 
lgb_best_acc = 0

lgb_grid = [
    {"num_leaves": 64, "learning_rate": 0.05, "feature_fraction": 0.8},
    {"num_leaves": 128, "learning_rate": 0.03, "feature_fraction": 0.9},
    {"num_leaves": 128, "learning_rate": 0.02, "feature_fraction": 0.8},
    {"num_leaves": 256, "learning_rate": 0.02, "feature_fraction": 0.9},
    {"num_leaves": 256, "learning_rate": 0.03, "feature_fraction": 0.8},
    {"num_leaves": 256, "learning_rate": 0.03, "feature_fraction": 1.0},
    {"num_leaves": 512, "learning_rate": 0.02, "feature_fraction": 0.9},
    {"num_leaves": 512, "learning_rate": 0.015, "feature_fraction": 0.8},

]

for params in lgb_grid:
    model = LGBMClassifier(
    device = 'gpu',gpu_platform_id=0,
    gpu_device_id=0,
        num_leaves=params["num_leaves"],
        learning_rate=params["learning_rate"],
        n_estimators=1000,
        feature_fraction=params["feature_fraction"],
        bagging_fraction=0.9,
        bagging_freq=3,
        objective="binary"
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="logloss",
    )

    preds = model.predict_proba(X_val)[:, 1]
    acc = ((preds >= 0.5).astype(int) == y_val).mean()

    print(f"LGBM {params} → acc={acc:.4f}")

    if acc > lgb_best_acc:
        lgb_best_acc = acc
        lgb_best = model

preds_lgb = lgb_best.predict_proba(X_val)[:, 1]
print("Best LightGBM accuracy:", lgb_best_acc)


[LightGBM] [Info] Number of positive: 65016, number of negative: 74406
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 203103
[LightGBM] [Info] Number of data points in the train set: 139422, number of used features: 847
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: NVIDIA RTX 4000 Ada Generation, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 818 dense feature groups (109.03 MB) transferred to GPU in 0.013977 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466325 -> initscore=-0.134903
[LightGBM] [Info] Start training from score -0.134903
LGBM {'num_leaves': 64, 'learning_rate': 0.05, 'feature_fraction': 0.8} → acc=0.8444
[LightGBM] [Info] Number of positive: 65016, number of negative: 74406
[LightGBM] [Info] This is the GPU tra

In [14]:
from catboost import CatBoostClassifier

cat_best = None
cat_best_acc = 0

cat_grid = [
    {"iterations": 1200, "depth": 6, "learning_rate": 0.02},
    {"iterations": 1200, "depth": 6, "learning_rate": 0.05},
    {"iterations": 1200, "depth": 8, "learning_rate": 0.02},
    {"iterations": 1200, "depth": 8, "learning_rate": 0.05},
    {"iterations": 1500, "depth": 6, "learning_rate": 0.02},
    {"iterations": 1500, "depth": 6, "learning_rate": 0.05},
    {"iterations": 1500, "depth": 8, "learning_rate": 0.02},
    {"iterations": 1500, "depth": 8, "learning_rate": 0.05},
]

for params in cat_grid:
    model = CatBoostClassifier(
        iterations=params["iterations"],
        depth=params["depth"],
        learning_rate=params["learning_rate"],
        loss_function="Logloss",
        verbose=False,
    )

    model.fit(X_train, y_train)
    preds = model.predict_proba(X_val)[:, 1]
    acc = ((preds >= 0.5).astype(int) == y_val).mean()

    print(f"CatBoost {params} → acc={acc:.4f}")

    if acc > cat_best_acc:
        cat_best_acc = acc
        cat_best = model

preds_cat = cat_best.predict_proba(X_val)[:, 1]
print("Best CatBoost accuracy:", cat_best_acc)


CatBoost {'iterations': 1200, 'depth': 6, 'learning_rate': 0.02} → acc=0.8391
CatBoost {'iterations': 1200, 'depth': 6, 'learning_rate': 0.05} → acc=0.8425
CatBoost {'iterations': 1200, 'depth': 8, 'learning_rate': 0.02} → acc=0.8410
CatBoost {'iterations': 1200, 'depth': 8, 'learning_rate': 0.05} → acc=0.8428
CatBoost {'iterations': 1500, 'depth': 6, 'learning_rate': 0.02} → acc=0.8391
CatBoost {'iterations': 1500, 'depth': 6, 'learning_rate': 0.05} → acc=0.8416
CatBoost {'iterations': 1500, 'depth': 8, 'learning_rate': 0.02} → acc=0.8410
CatBoost {'iterations': 1500, 'depth': 8, 'learning_rate': 0.05} → acc=0.8431
Best CatBoost accuracy: 0.8430802995094242


In [15]:
p_final = (preds_xgb + preds_lgb + preds_cat) / 3
acc_final = ((p_final >= 0.5).astype(int) == y_val).mean()

print("Final Ensemble Accuracy:", acc_final)


Final Ensemble Accuracy: 0.8490833978827782


In [17]:

full_df = df.copy()   # df was the train.jsonl loaded at the start
print("Full training size:", len(full_df))

emb_full = extract_embeddings(full_df)
# np.save("emb_full.npy", emb_full)

test_df = load_jsonl("data/Kaggle2025/kaggle_test.jsonl")

emb_test = extract_embeddings(test_df)
# np.save("emb_test.npy", emb_test)




Loaded: data/Kaggle2025/kaggle_test.jsonl
Rows: 103380


In [18]:
# ==========================================================
# 1) Build metadata for FULL TRAINING SET + TEST SET
# ==========================================================
meta_full = build_metadata_dataframe(full_df)
meta_test = build_metadata_dataframe(test_df)


# ==========================================================
# 2) Apply SAME cleaning rules learned from meta_train
# ==========================================================

# (A) Drop constant columns
meta_full = meta_full.drop(columns=constant_cols, errors="ignore")
meta_test = meta_test.drop(columns=constant_cols, errors="ignore")

# (B) Drop sparse columns
meta_full = meta_full.drop(columns=sparse_cols, errors="ignore")
meta_test = meta_test.drop(columns=sparse_cols, errors="ignore")

# (C) Drop datetime columns
meta_full = meta_full.drop(columns=datetime_cols, errors="ignore")
meta_test = meta_test.drop(columns=datetime_cols, errors="ignore")

# (D) Drop same high-cardinality object columns
meta_full = meta_full.drop(columns=drop, errors="ignore")
meta_test = meta_test.drop(columns=drop, errors="ignore")


# ==========================================================
# 3) Label encode the SAME small-category columns (keep)
# ==========================================================

for col in keep:
    le = LabelEncoder()
    
    # Fit on TRAIN metadata only
    le.fit(meta_train[col].astype(str))
    
    known_classes = set(le.classes_)
    
    # Replace unseen categories with a universal placeholder
    meta_full[col] = meta_full[col].astype(str).apply(
        lambda x: x if x in known_classes else '___UNKNOWN___'
    )
    meta_test[col] = meta_test[col].astype(str).apply(
        lambda x: x if x in known_classes else '___UNKNOWN___'
    )
    
    # Add placeholder to encoder classes
    le.fit(list(known_classes) + ['___UNKNOWN___'])
    
    # Transform safely
    meta_full[col] = le.transform(meta_full[col].astype(str))
    meta_test[col] = le.transform(meta_test[col].astype(str))



# ==========================================================
# 4) Convert to numeric and fill missing
# ==========================================================
meta_full = meta_full.apply(pd.to_numeric, errors="ignore").fillna(-1)
meta_test = meta_test.apply(pd.to_numeric, errors="ignore").fillna(-1)


# ==========================================================
# 5) Force FULL + TEST columns to match EXACTLY meta_train columns
# ==========================================================

# Add missing columns
for col in meta_train.columns:
    if col not in meta_full.columns:
        meta_full[col] = 0
    if col not in meta_test.columns:
        meta_test[col] = 0

# Drop extra columns
extra_cols_full = [c for c in meta_full.columns if c not in meta_train.columns]
meta_full = meta_full.drop(columns=extra_cols_full, errors="ignore")

extra_cols_test = [c for c in meta_test.columns if c not in meta_train.columns]
meta_test = meta_test.drop(columns=extra_cols_test, errors="ignore")

# Reorder columns to match EXACT training order
meta_full = meta_full[meta_train.columns]
meta_test = meta_test[meta_train.columns]


# ==========================================================
# 6) Final matrices for full training + test prediction
# ==========================================================
X_full = np.hstack([emb_full, meta_full.values])
y_full = full_df["label"].values

X_test_final = np.hstack([emb_test, meta_test.values])

print("X_full:", X_full.shape)
print("X_test_final:", X_test_final.shape)


X_full: (154914, 847)
X_test_final: (103380, 847)


In [19]:
best_xgb_params = {
    "n_estimators":  900 ,
    "max_depth":     8 ,
    "learning_rate": 0.05
}

best_lgb_params = {
    "num_leaves":      512 ,
    "learning_rate":   0.015 ,
    "feature_fraction": 0.8
}

best_cat_params = {
    "iterations":    1500 ,
    "depth":         8 ,
    "learning_rate": 0.05
}


In [20]:
# Final Training XGBoost

from xgboost import XGBClassifier

xgb_final = XGBClassifier(
    n_estimators=best_xgb_params["n_estimators"],
    max_depth=best_xgb_params["max_depth"],
    learning_rate=best_xgb_params["learning_rate"],
    subsample=0.9,
    colsample_bytree=0.9,
    eval_metric="logloss",
    tree_method="hist"
)

xgb_final.fit(X_full, y_full)
pred_xgb_test = xgb_final.predict_proba(X_test_final)[:, 1]




In [21]:
# Final Training LGBM Classifier

from lightgbm import LGBMClassifier

lgb_final = LGBMClassifier(
    num_leaves=best_lgb_params["num_leaves"],
    learning_rate=best_lgb_params["learning_rate"],
    feature_fraction=best_lgb_params["feature_fraction"],
    n_estimators=1000,
    bagging_fraction=0.9,
    bagging_freq=3,
    objective="binary"
)

lgb_final.fit(X_full, y_full)
pred_lgb_test = lgb_final.predict_proba(X_test_final)[:, 1]


[LightGBM] [Info] Number of positive: 72240, number of negative: 82674
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.119199 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 203073
[LightGBM] [Info] Number of data points in the train set: 154914, number of used features: 836
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466323 -> initscore=-0.134911
[LightGBM] [Info] Start training from score -0.134911


In [22]:
# Final Training Catboost Classifier


from catboost import CatBoostClassifier

cat_final = CatBoostClassifier(
    iterations=best_cat_params["iterations"],
    depth=best_cat_params["depth"],
    learning_rate=best_cat_params["learning_rate"],
    loss_function="Logloss",
    verbose=False
)

cat_final.fit(X_full, y_full)
pred_cat_test = cat_final.predict_proba(X_test_final)[:, 1]


In [23]:
final_probs = (pred_xgb_test + pred_lgb_test + pred_cat_test) / 3
final_preds = (final_probs >= 0.5).astype(int)


In [24]:
submission = pd.DataFrame({
    "ID": test_df["challenge_id"],
    "Prediction": final_preds
})

submission.to_csv("final_submission.csv", index=False)
print("Saved final_submission.csv")
    

Saved final_submission.csv
