# Evaluation of Multi-label Baselines on BoVW Features

This notebook trains and evaluates multiple baselines for multi-label genre classification on precomputed BoVW features. It performs per-class threshold tuning on a validation split, applies calibration where needed, produces confusion-style per-class summaries, and aggregates a unified results table.

Data:
- CSV: data/dataset2.csv (metadata and genre labels)
- NPZ: data/bovw_vectors_kmeans_1000.npz (bovw_vectors, kmeans_centers, filenames)

Outline:
1) Load data and align features/labels by filename
2) Evaluation utilities: F1 (micro/macro), Jaccard, Hamming, mAP, ROC-AUC, LRAP, top-k
3) Confusion-style utilities: per-class TP/FP/FN/TN and micro totals
4) Validation split and per-class threshold tuning by F1
5) Models: LinearSVC (raw, calibrated), SGD(log), Ridge (raw, calibrated), Logistic (saga), scikit-learn MLP, PyTorch MLP, improved Torch BigMLP; optionally XGBoost (OVR)
6) Unified summary and artifact saving

In [None]:
# Load Data and Align Features/Labels
import os
import json
import time
import numpy as np
import pandas as pd
from pathlib import Path

DATA_DIR = Path('/kaggle/input/wikiart-sift-pca-uint8')
CSV_PATH = DATA_DIR / 'dataset.csv'
NPZ_PATH = DATA_DIR / 'bovw_vectors_kmeans_1000.npz'

print('Loading CSV:', CSV_PATH)
df = pd.read_csv(CSV_PATH)
print('CSV shape:', df.shape)

print('Loading NPZ:', NPZ_PATH)
npz = np.load(NPZ_PATH, allow_pickle=True)
X_bovw = npz['bovw_vectors']
centers = npz['kmeans_centers']
filenames_npz = npz['filenames'].astype(str)
print('BoVW:', X_bovw.shape, 'Centers:', centers.shape, 'Files:', len(filenames_npz))

# Build labels
NON_LABEL_COLS = {'Unnamed: 0','filename','width','height','genre_count','subset'}
label_cols = [c for c in df.columns if c not in NON_LABEL_COLS]
Y_all = df[label_cols].astype(int).values

# Sanity checks
label_counts = Y_all.sum(axis=1)
print('Label count per sample: min', label_counts.min(), 'max', label_counts.max())
assert label_counts.min() >= 1 and label_counts.max() <= 2

# Align features order by filename
fname_to_idx = {fn: i for i, fn in enumerate(filenames_npz)}
indices = df['filename'].map(fname_to_idx)
assert indices.isna().sum() == 0, 'CSV filenames not found in NPZ.'
X_all = X_bovw[indices.astype(int).values]

# Train/Test split from CSV
is_train = df['subset'].str.lower().eq('train').values
is_test = df['subset'].str.lower().eq('test').values
X_train, Y_train = X_all[is_train], Y_all[is_train]
X_test, Y_test = X_all[is_test], Y_all[is_test]
print('Train:', X_train.shape, Y_train.shape, 'Test:', X_test.shape, Y_test.shape)

Loading CSV: /kaggle/input/wikiart-sift-pca-uint8/dataset.csv
CSV shape: (78204, 33)
Loading NPZ: /kaggle/input/wikiart-sift-pca-uint8/bovw_vectors_kmeans_1000.npz
BoVW: (79998, 1000) Centers: (1000, 64) Files: 79998
Label count per sample: min 1 max 2
Train: (63365, 1000) (63365, 27) Test: (14839, 1000) (14839, 27)


In [None]:
# Evaluation Utilities (metrics, mAP, ROC, LRAP)
from sklearn.metrics import (
    f1_score, jaccard_score, hamming_loss,
    average_precision_score, roc_auc_score,
    label_ranking_average_precision_score, coverage_error, label_ranking_loss
)

def evaluate_multilabel(Y_true, scores, threshold=0.5, label_names=None):
    Y_pred = (scores >= threshold).astype(int)
    metrics = {
        'f1_micro': f1_score(Y_true, Y_pred, average='micro', zero_division=0),
        'f1_macro': f1_score(Y_true, Y_pred, average='macro', zero_division=0),
        'jaccard_micro': jaccard_score(Y_true, Y_pred, average='micro', zero_division=0),
        'hamming_loss': hamming_loss(Y_true, Y_pred),
    }
    try:
        ap_per_class = average_precision_score(Y_true, scores, average=None)
        metrics['mAP_macro'] = float(np.nanmean(ap_per_class))
    except Exception:
        ap_per_class = np.full(Y_true.shape[1], np.nan)
        metrics['mAP_macro'] = np.nan
    try:
        metrics['roc_auc_micro'] = roc_auc_score(Y_true, scores, average='micro')
        metrics['roc_auc_macro'] = roc_auc_score(Y_true, scores, average='macro')
    except Exception:
        metrics['roc_auc_micro'] = np.nan
        metrics['roc_auc_macro'] = np.nan
    try:
        metrics['lrap'] = label_ranking_average_precision_score(Y_true, scores)
        metrics['coverage_error'] = coverage_error(Y_true, scores)
        metrics['label_ranking_loss'] = label_ranking_loss(Y_true, scores)
    except Exception:
        metrics['lrap'] = np.nan
        metrics['coverage_error'] = np.nan
        metrics['label_ranking_loss'] = np.nan

    if label_names is None:
        label_names = [f'class_{i}' for i in range(Y_true.shape[1])]
    ap_df = pd.DataFrame({'label': label_names, 'AP': ap_per_class})
    ap_df = ap_df.sort_values('AP', ascending=False, na_position='last').reset_index(drop=True)
    return metrics, ap_df, Y_pred

# Ranking/top-k utility

def topk_multilabel_accuracy(Y_true, scores, k):
    idx = np.argsort(-scores, axis=1)[:, :k]
    hits = []
    Y_true_bool = (Y_true == 1)
    for i in range(Y_true.shape[0]):
        hits.append(Y_true_bool[i, idx[i]].any())
    return float(np.mean(hits))

In [None]:
# Confusion-Matrix Style Utilities for Multilabel
from sklearn.metrics import multilabel_confusion_matrix, precision_score, recall_score


def per_class_confusion(Y_true, Y_pred, label_names):
    mcm = multilabel_confusion_matrix(Y_true, Y_pred)
    rows = []
    for j, cm in enumerate(mcm):
        tn, fp, fn, tp = cm.ravel()
        support = int((Y_true[:, j] == 1).sum())
        prec = precision_score(Y_true[:, j], Y_pred[:, j], zero_division=0)
        rec = recall_score(Y_true[:, j], Y_pred[:, j], zero_division=0)
        f1 = 0.0 if (prec + rec) == 0 else 2 * prec * rec / (prec + rec)
        rows.append({
            'label': label_names[j], 'TP': int(tp), 'FP': int(fp), 'FN': int(fn), 'TN': int(tn),
            'precision': float(prec), 'recall': float(rec), 'f1': float(f1), 'support': support
        })
    return pd.DataFrame(rows).sort_values('f1', ascending=False).reset_index(drop=True)


def micro_confusion_totals(Y_true, Y_pred):
    mcm = multilabel_confusion_matrix(Y_true, Y_pred)
    tn = mcm[:, 0, 0].sum()
    fp = mcm[:, 0, 1].sum()
    fn = mcm[:, 1, 0].sum()
    tp = mcm[:, 1, 1].sum()
    return {'TP': int(tp), 'FP': int(fp), 'FN': int(fn), 'TN': int(tn)}

In [None]:
# Validation Split for Per-Class Threshold Tuning
from sklearn.model_selection import train_test_split

# Stratify by number of labels (1 vs 2)
label_counts_train = Y_train.sum(axis=1)
strata = (label_counts_train >= 2).astype(int)
X_tr, X_val, Y_tr, Y_val = train_test_split(
    X_train, Y_train, test_size=0.15, random_state=42, stratify=strata
)

print('Sub-train:', X_tr.shape, 'Validation:', X_val.shape)

from sklearn.metrics import precision_recall_curve

def tune_thresholds_by_f1(Y_true, scores, min_pos=1):
    n_classes = Y_true.shape[1]
    thresholds = np.zeros(n_classes, dtype=float)
    f1_best = np.full(n_classes, np.nan)
    for j in range(n_classes):
        y = Y_true[:, j]
        s = scores[:, j]
        if y.sum() < min_pos:
            thresholds[j] = 0.5
            f1_best[j] = np.nan
            continue
        p, r, t = precision_recall_curve(y, s)
        f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
        idx = int(np.nanargmax(f1))
        thr = 0.5 if idx == 0 else t[idx - 1]
        thresholds[j] = thr
        f1_best[j] = f1[idx]
    return thresholds, f1_best

Sub-train: (53860, 1000) Validation: (9505, 1000)


In [6]:
# Helper to evaluate a model given validation/test scores

def evaluate_with_thresholds(Y_val, scores_val, Y_test, scores_test, label_names, model_name, extras=None):
    thr, f1_val = tune_thresholds_by_f1(Y_val, scores_val)
    # metrics on test with tuned thresholds
    metrics, ap_df, Y_pred = evaluate_multilabel(Y_test, scores_test, threshold=thr.reshape(1, -1), label_names=label_names)
    per_cls_df = per_class_confusion(Y_test, Y_pred, label_names)
    micro_totals = micro_confusion_totals(Y_test, Y_pred)
    topk = {
        'top1_acc': topk_multilabel_accuracy(Y_test, scores_test, k=1),
        'top2_acc': topk_multilabel_accuracy(Y_test, scores_test, k=2),
        'top3_acc': topk_multilabel_accuracy(Y_test, scores_test, k=3),
    }
    out = {
        'model': model_name,
        'metrics': metrics,
        'ap': ap_df,
        'per_class': per_cls_df.assign(threshold=thr),
        'micro_totals': micro_totals,
        'topk': topk,
        'thresholds': thr,
    }
    if extras:
        out.update(extras)
    return out

In [None]:
# LinearSVC OVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler, Normalizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

results = {}

# Raw LinearSVC 
svm_raw = Pipeline([
    ('scale', MaxAbsScaler()),
    ('clf', OneVsRestClassifier(LinearSVC(C=1.0, class_weight='balanced', dual=False, random_state=42), n_jobs=-1))
])
start = time.time()
svm_raw.fit(X_tr, Y_tr)
train_time = time.time() - start
val_scores_raw = svm_raw.named_steps['clf'].decision_function(svm_raw.named_steps['scale'].transform(X_val))
test_scores_raw = svm_raw.named_steps['clf'].decision_function(svm_raw.named_steps['scale'].transform(X_test))
res_raw = evaluate_with_thresholds(Y_val, val_scores_raw, Y_test, test_scores_raw, label_cols, 'LinearSVC_raw', extras={'train_time_sec': train_time, 'calibrated': False})
results['LinearSVC_raw'] = res_raw
print('LinearSVC raw done in {:.1f}s'.format(train_time))

# Calibrated LinearSVC 
lsvc = LinearSVC(C=1.0, class_weight='balanced', dual=False, random_state=42)
try:
    calibrated_base = CalibratedClassifierCV(estimator=lsvc, method='sigmoid', cv=3)
except TypeError:
    calibrated_base = CalibratedClassifierCV(base_estimator=lsvc, method='sigmoid', cv=3)
svm_cal = Pipeline([
    ('scale', MaxAbsScaler()),
    ('clf', OneVsRestClassifier(calibrated_base, n_jobs=-1))
])
start = time.time()
svm_cal.fit(X_tr, Y_tr)
train_time = time.time() - start
val_proba_cal = svm_cal.predict_proba(X_val)
test_proba_cal = svm_cal.predict_proba(X_test)
res_cal = evaluate_with_thresholds(Y_val, val_proba_cal, Y_test, test_proba_cal, label_cols, 'LinearSVC_calibrated', extras={'train_time_sec': train_time, 'calibrated': True})
results['LinearSVC_calibrated'] = res_cal
print('LinearSVC calibrated done in {:.1f}s'.format(train_time))

  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.

LinearSVC raw done in 368.5s


  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.

LinearSVC calibrated done in 731.1s


In [None]:
# SGDClassifier(log) OVR
from sklearn.linear_model import SGDClassifier

sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))

sgd = Pipeline([
    ('scale', MaxAbsScaler()),
    ('clf', OneVsRestClassifier(SGDClassifier(loss='log_loss', alpha=1e-4, max_iter=20, tol=1e-3, random_state=42), n_jobs=-1))
])
start = time.time()
sgd.fit(X_tr, Y_tr)
train_time = time.time() - start
val_margins = sgd.named_steps['clf'].decision_function(sgd.named_steps['scale'].transform(X_val))
val_probas = sigmoid(val_margins)
test_margins = sgd.named_steps['clf'].decision_function(sgd.named_steps['scale'].transform(X_test))
test_probas = sigmoid(test_margins)
res_sgd = evaluate_with_thresholds(Y_val, val_probas, Y_test, test_probas, label_cols, 'SGD_log_OVR', extras={'train_time_sec': train_time, 'calibrated': False})
results['SGD_log_OVR'] = res_sgd
print('SGD(log) done in {:.1f}s'.format(train_time))

  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)


SGD(log) done in 19.6s


In [None]:
# RidgeClassifier OVR (raw + calibrated)
from sklearn.linear_model import RidgeClassifier

ridge_raw = Pipeline([
    ('scale', MaxAbsScaler()),
    ('clf', OneVsRestClassifier(RidgeClassifier(alpha=1.0, class_weight='balanced', random_state=42), n_jobs=-1))
])
start = time.time()
ridge_raw.fit(X_tr, Y_tr)
train_time = time.time() - start
val_scores_ridge = ridge_raw.named_steps['clf'].decision_function(ridge_raw.named_steps['scale'].transform(X_val))
test_scores_ridge = ridge_raw.named_steps['clf'].decision_function(ridge_raw.named_steps['scale'].transform(X_test))
res_ridge_raw = evaluate_with_thresholds(Y_val, val_scores_ridge, Y_test, test_scores_ridge, label_cols, 'Ridge_OVR_raw', extras={'train_time_sec': train_time, 'calibrated': False})
results['Ridge_OVR_raw'] = res_ridge_raw
print('Ridge raw done in {:.1f}s'.format(train_time))

# Calibrated Ridge via CalibratedClassifierCV 
ridge_base = RidgeClassifier(alpha=1.0, class_weight='balanced', random_state=42)
try:
    ridge_cal_base = CalibratedClassifierCV(estimator=ridge_base, method='sigmoid', cv=3)
except TypeError:
    ridge_cal_base = CalibratedClassifierCV(base_estimator=ridge_base, method='sigmoid', cv=3)

ridge_cal = Pipeline([
    ('scale', MaxAbsScaler()),
    ('clf', OneVsRestClassifier(ridge_cal_base, n_jobs=-1))
])
start = time.time()
ridge_cal.fit(X_tr, Y_tr)
train_time = time.time() - start
val_proba_ridge_cal = ridge_cal.predict_proba(X_val)
test_proba_ridge_cal = ridge_cal.predict_proba(X_test)
res_ridge_cal = evaluate_with_thresholds(Y_val, val_proba_ridge_cal, Y_test, test_proba_ridge_cal, label_cols, 'Ridge_OVR_calibrated', extras={'train_time_sec': train_time, 'calibrated': True})
results['Ridge_OVR_calibrated'] = res_ridge_cal
print('Ridge calibrated done in {:.1f}s'.format(train_time))

  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.

Ridge raw done in 17.0s


  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.

Ridge calibrated done in 39.2s


In [None]:
# LogisticRegression(saga) OVR
from sklearn.linear_model import LogisticRegression

logreg = Pipeline([
    ('l2norm', Normalizer(norm='l2')),
    ('scale', MaxAbsScaler()),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='saga', C=1.0, class_weight='balanced', penalty='l2', max_iter=100, n_jobs=-1, random_state=42), n_jobs=-1))
])
start = time.time()
logreg.fit(X_tr, Y_tr)
train_time = time.time() - start
val_proba_logreg = logreg.predict_proba(X_val)
test_proba_logreg = logreg.predict_proba(X_test)
res_logreg = evaluate_with_thresholds(Y_val, val_proba_logreg, Y_test, test_proba_logreg, label_cols, 'LogReg_saga_OVR', extras={'train_time_sec': train_time, 'calibrated': False})
results['LogReg_saga_OVR'] = res_logreg
print('LogisticRegression(saga) done in {:.1f}s'.format(train_time))

  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)


LogisticRegression(saga) done in 855.0s


In [None]:
# scikit-learn MLP 
from sklearn.neural_network import MLPClassifier

mlp = Pipeline([
    ('scale', MaxAbsScaler()),
    ('clf', MLPClassifier(hidden_layer_sizes=(512,256), activation='relu', alpha=1e-4, batch_size=512,
                          learning_rate_init=1e-3, max_iter=40, early_stopping=True, n_iter_no_change=3,
                          random_state=42, verbose=False))
])
start = time.time()
mlp.fit(X_tr, Y_tr)
train_time = time.time() - start
proba_val = mlp.named_steps['clf'].predict_proba(mlp.named_steps['scale'].transform(X_val))
if isinstance(proba_val, list):
    proba_val = np.column_stack([p[:,1] for p in proba_val])
proba_test = mlp.named_steps['clf'].predict_proba(mlp.named_steps['scale'].transform(X_test))
if isinstance(proba_test, list):
    proba_test = np.column_stack([p[:,1] for p in proba_test])
res_mlp = evaluate_with_thresholds(Y_val, proba_val, Y_test, proba_test, label_cols, 'MLP_sklearn', extras={'train_time_sec': train_time, 'calibrated': False})
results['MLP_sklearn'] = res_mlp
print('MLP(sklearn) done in {:.1f}s'.format(train_time))

  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)


MLP(sklearn) done in 58.7s


In [None]:
# PyTorch MLP 
try:
    import torch
    import torch.nn as nn
    from torch.utils.data import TensorDataset, DataLoader
    torch_available = True
except Exception as e:
    print('PyTorch not available')
    torch_available = False

if torch_available:
    if torch.backends.mps.is_available():
        device = torch.device('mps')
    elif torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    Xtr_t = torch.from_numpy(X_tr.astype(np.float32))
    Ytr_t = torch.from_numpy(Y_tr.astype(np.float32))
    Xval_t = torch.from_numpy(X_val.astype(np.float32))
    Xte_t = torch.from_numpy(X_test.astype(np.float32))

    train_ds = TensorDataset(Xtr_t, Ytr_t)
    train_loader = DataLoader(train_ds, batch_size=1024, shuffle=True, num_workers=0)

    class TorchMLP(nn.Module):
        def __init__(self, in_dim=1000, num_classes=27):
            super().__init__()
            self.net = nn.Sequential(
                nn.Linear(in_dim, 512), nn.ReLU(), nn.Dropout(0.3),
                nn.Linear(512, 256), nn.ReLU(), nn.Dropout(0.3),
                nn.Linear(256, num_classes)
            )
        def forward(self, x):
            return self.net(x)

    model = TorchMLP(in_dim=X_tr.shape[1], num_classes=Y_tr.shape[1]).to(device)
    pos_counts = Y_tr.sum(axis=0).astype(np.float32)
    neg_counts = Y_tr.shape[0] - pos_counts
    pos_weight = torch.from_numpy((neg_counts / np.maximum(pos_counts, 1e-6))).to(device)

    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

    epochs = 8
    start = time.time()
    model.train()
    for ep in range(epochs):
        running = 0.0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad(set_to_none=True)
            logits = model(xb)
            loss = criterion(logits, yb)
            loss.backward()
            optimizer.step()
            running += loss.item() * xb.size(0)
        print(f'Epoch {ep+1}/{epochs} - loss: {running/len(train_loader.dataset):.4f}')
    train_time = time.time() - start

    model.eval()
    with torch.no_grad():
        def batched_logits(Xt, bs=2048):
            out = []
            for i in range(0, Xt.size(0), bs):
                out.append(model(Xt[i:i+bs].to(device)).cpu())
            return torch.cat(out, dim=0)
        val_logits = batched_logits(Xval_t)
        test_logits = batched_logits(Xte_t)
        val_proba = torch.sigmoid(val_logits).numpy()
        test_proba = torch.sigmoid(test_logits).numpy()

    res_torch = evaluate_with_thresholds(Y_val, val_proba, Y_test, test_proba, label_cols, 'Torch_MLP', extras={'train_time_sec': train_time, 'calibrated': False})
    results['Torch_MLP'] = res_torch
    print('Torch MLP done in {:.1f}s'.format(train_time))

Using device: cuda
Epoch 1/8 - loss: 1.2325
Epoch 2/8 - loss: 1.0675
Epoch 3/8 - loss: 0.9883
Epoch 4/8 - loss: 0.9382
Epoch 5/8 - loss: 0.9032
Epoch 6/8 - loss: 0.8771
Epoch 7/8 - loss: 0.8602
Epoch 8/8 - loss: 0.8410


  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.where((p + r) > 0, 2 * p * r / (p + r), 0.0)
  f1 = np.

Torch MLP done in 5.6s


In [None]:
# XGBoost
try:
    from xgboost import XGBClassifier
    print("Training XGBoost OVR (GPU=%s)..." % gpu_available)
    xgb_base = XGBClassifier(n_estimators=800, learning_rate=0.05,
                             use_label_encoder=False,
                             tree_method='gpu_hist' if gpu_available else 'hist',
                             predictor='gpu_predictor' if gpu_available else 'auto',
                             verbosity=0, n_jobs=8)
    xgb_ovr = Pipeline([('scale', MaxAbsScaler()), ('clf', OVR(xgb_base, n_jobs=-1))])
    t0 = time.time()
    xgb_ovr.fit(X_tr, Y_tr)
    t_xgb = time.time() - t0
    val_proba_xgb = xgb_ovr.predict_proba(X_val)
    test_proba_xgb = xgb_ovr.predict_proba(X_test)
    res_xgb = evaluate_with_thresholds(Y_val, val_proba_xgb, Y_test, test_proba_xgb, label_cols, 'XGBoost_OVR', extras={'train_time_sec': t_xgb})
    results['XGBoost_OVR'] = res_xgb
    print('XGBoost done in {:.1f}s'.format(t_xgb))
except Exception as e:
    print('XGBoost skipped:', e)



Training XGBoost OVR (GPU=True)...



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_metho

XGBoost done in 776.4s


In [None]:
# Improved Torch MLP with better architecture and training
try:
    import torch
    import torch.nn as nn
    from torch.utils.data import TensorDataset, DataLoader

    if torch.cuda.is_available():
        device = torch.device('cuda')
    elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
        device = torch.device('mps')
    else:
        device = torch.device('cpu')
    print('Torch device:', device)

    # Prepare tensors and dataloaders
    Xtr_t = torch.from_numpy(X_tr.astype(np.float32))
    Ytr_t = torch.from_numpy(Y_tr.astype(np.float32))
    Xval_t = torch.from_numpy(X_val.astype(np.float32))
    Yval_t = torch.from_numpy(Y_val.astype(np.float32))            
    Xte_t = torch.from_numpy(X_test.astype(np.float32))
    Yte_t = torch.from_numpy(Y_test.astype(np.float32))          

    bs = 1024
    train_ds = TensorDataset(Xtr_t, Ytr_t)
    train_loader = DataLoader(train_ds, batch_size=bs, shuffle=True, num_workers=4 if device.type=='cuda' else 0, pin_memory=(device.type=='cuda'))

    class BigMLP(nn.Module):
        def __init__(self, in_dim=1000, n_classes=27):
            super().__init__()
            self.net = nn.Sequential(
                nn.Linear(in_dim, 2048), nn.ReLU(), nn.Dropout(0.4),
                nn.Linear(2048, 1024), nn.ReLU(), nn.Dropout(0.4),
                nn.Linear(1024, 512), nn.ReLU(), nn.Dropout(0.3),
                nn.Linear(512, n_classes)
            )
        def forward(self, x):
            return self.net(x)

    model = BigMLP(in_dim=X_tr.shape[1], n_classes=Y_tr.shape[1]).to(device)
    pos_counts = Y_tr.sum(axis=0).astype(np.float32)
    neg_counts = Y_tr.shape[0] - pos_counts
    pos_weight = torch.from_numpy((neg_counts / np.maximum(pos_counts, 1e-6))).to(device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=2, verbose=False)

    # move validation tensors to device 
    Xval_device = Xval_t.to(device)
    Yval_device = Yval_t.to(device)

    epochs = 25
    model.train()
    t0 = time.time()
    for ep in range(epochs):
        running = 0.0
        for xb, yb in train_loader:
            xb = xb.to(device, non_blocking=(device.type=='cuda'))
            yb = yb.to(device, non_blocking=(device.type=='cuda'))
            optimizer.zero_grad()
            logits = model(xb)
            loss = criterion(logits, yb)
            loss.backward()
            optimizer.step()
            running += loss.item() * xb.size(0)
        # validate for scheduler
        model.eval()
        with torch.no_grad():
            val_logits = model(Xval_device)                       
            val_loss = float(criterion(val_logits, Yval_device).item())  
        scheduler.step(val_loss)
        model.train()
        print(f'Epoch {ep+1}/{epochs} loss={running/len(train_loader.dataset):.4f} val_loss={val_loss:.4f}')
    t_torch = time.time() - t0

    # eval 
    model.eval()
    def batched_sigmoid_predict(X_np, bs=2048):
        out = []
        Xt = torch.from_numpy(X_np.astype(np.float32))
        for i in range(0, Xt.size(0), bs):
            b = Xt[i:i+bs].to(device)
            with torch.no_grad():
                out.append(torch.sigmoid(model(b)).cpu())
        return torch.cat(out, dim=0).numpy()

    val_proba_big = batched_sigmoid_predict(X_val)
    test_proba_big = batched_sigmoid_predict(X_test)

    res_big = evaluate_with_thresholds(Y_val, val_proba_big, Y_test, test_proba_big, label_cols, 'Torch_BigMLP', extras={'train_time_sec': t_torch})
    results['Torch_BigMLP'] = res_big
    print('Torch BigMLP done in {:.1f}s'.format(t_torch))
except Exception as e:
    print('Improved Torch MLP skipped:', e)

Torch device: cuda




Epoch 1/25 loss=1.1948 val_loss=1.0762
Epoch 2/25 loss=1.0340 val_loss=1.0090
Epoch 3/25 loss=0.9642 val_loss=0.9380
Epoch 4/25 loss=0.9044 val_loss=0.8902
Epoch 5/25 loss=0.8619 val_loss=0.8777
Epoch 6/25 loss=0.8308 val_loss=0.8703
Epoch 7/25 loss=0.8002 val_loss=0.8706
Epoch 8/25 loss=0.7850 val_loss=0.8413
Epoch 9/25 loss=0.7679 val_loss=0.8646
Epoch 10/25 loss=0.7477 val_loss=0.8608
Epoch 11/25 loss=0.7388 val_loss=0.8466
Epoch 12/25 loss=0.7047 val_loss=0.8388
Epoch 13/25 loss=0.6864 val_loss=0.8444
Epoch 14/25 loss=0.6759 val_loss=0.8807
Epoch 15/25 loss=0.6674 val_loss=0.8795
Epoch 16/25 loss=0.6494 val_loss=0.8714
Epoch 17/25 loss=0.6445 val_loss=0.8908
Epoch 18/25 loss=0.6346 val_loss=0.8986
Epoch 19/25 loss=0.6240 val_loss=0.9179
Epoch 20/25 loss=0.6221 val_loss=0.9103
Epoch 21/25 loss=0.6169 val_loss=0.9192
Epoch 22/25 loss=0.6130 val_loss=0.9304
Epoch 23/25 loss=0.6107 val_loss=0.9275
Epoch 24/25 loss=0.6089 val_loss=0.9326
Epoch 25/25 loss=0.6065 val_loss=0.9384
Torch Big

In [None]:
# summary across models
import pandas as pd

summary_rows = []
for name, res in results.items():
    m = res['metrics']
    topk = res.get('topk', {})
    row = {
        'model': name,
        'f1_micro': m.get('f1_micro'),
        'f1_macro': m.get('f1_macro'),
        'jaccard_micro': m.get('jaccard_micro'),
        'hamming_loss': m.get('hamming_loss'),
        'mAP_macro': m.get('mAP_macro'),
        'roc_auc_micro': m.get('roc_auc_micro'),
        'roc_auc_macro': m.get('roc_auc_macro'),
        'lrap': m.get('lrap'),
        'coverage_error': m.get('coverage_error'),
        'label_ranking_loss': m.get('label_ranking_loss'),
        'top1_acc': topk.get('top1_acc'),
        'top2_acc': topk.get('top2_acc'),
        'top3_acc': topk.get('top3_acc'),
        'calibrated': res.get('calibrated', False),
        'train_time_sec': res.get('train_time_sec', None),
    }
    summary_rows.append(row)

summary_df = pd.DataFrame(summary_rows).set_index('model').sort_values('f1_micro', ascending=False)
summary_df

Unnamed: 0_level_0,f1_micro,f1_macro,jaccard_micro,hamming_loss,mAP_macro,roc_auc_micro,roc_auc_macro,lrap,coverage_error,label_ranking_loss,top1_acc,top2_acc,top3_acc,calibrated,train_time_sec
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
XGBoost_OVR,0.363049,0.326968,0.221784,0.058599,0.305768,0.892879,0.871754,0.523127,4.020824,0.114426,0.344295,0.514118,0.623829,False,776.354267
MLP_sklearn,0.356916,0.319454,0.217224,0.059236,0.288211,0.892415,0.869138,0.519476,4.034167,0.114977,0.336411,0.510547,0.622616,False,58.681356
SGD_log_OVR,0.331819,0.297997,0.198911,0.06573,0.249041,0.887448,0.852059,0.50499,4.112946,0.118062,0.318283,0.49161,0.608869,False,19.560748
Torch_BigMLP,0.330274,0.300509,0.197801,0.07358,0.244508,0.880271,0.873534,0.459033,4.305681,0.125673,0.257025,0.439854,0.566615,False,19.390507
Ridge_OVR_calibrated,0.311096,0.269048,0.1842,0.073532,0.210998,0.881392,0.836462,0.488774,4.266527,0.12385,0.299279,0.473549,0.591617,True,39.181445
LinearSVC_calibrated,0.307533,0.255514,0.181707,0.07188,0.202766,0.877844,0.8333,0.483159,4.339848,0.126643,0.29409,0.465463,0.585956,True,731.073826
Ridge_OVR_raw,0.306017,0.261162,0.18065,0.073028,0.205482,0.849896,0.834689,0.445342,5.067929,0.154426,0.261001,0.419233,0.533459,False,16.964864
LinearSVC_raw,0.300429,0.245187,0.176767,0.074521,0.190402,0.861454,0.829804,0.439118,4.714873,0.141058,0.244289,0.406833,0.534672,False,368.510962
Torch_MLP,0.289545,0.255508,0.16928,0.079088,0.193293,0.837108,0.846803,0.398863,5.319563,0.164432,0.203046,0.365793,0.483321,False,5.570176
LogReg_saga_OVR,0.266077,0.229103,0.153454,0.091895,0.162454,0.820685,0.820585,0.297821,5.733338,0.180191,0.087877,0.21612,0.354943,False,855.01248
