In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from nltk import data
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.auto import tqdm

Data loading

In [2]:
sns.set_theme("notebook", "whitegrid")

In [10]:
df = pd.read_csv("multi_label_dataset.csv")

In [4]:
df.head()

Unnamed: 0,File,Text,adulting-101,big-read,commentary,gen-y-speaks,gen-z-speaks,singapore,voices,world
0,5_smart_ways_to_stretch_your_dollar_with_GrabF...,app new feature serve convenience value whethe...,1,1,1,1,1,1,1,1
1,Adulting_101_People_around_me_are_job_hopping_...,adulthood invigorating stage life young people...,1,1,1,1,1,1,1,1
2,Airbnb_bans_security_cameras_inside_guest_home...,san francisco airbnb monday march 11 said bann...,1,1,1,1,1,1,1,1
3,As_Swiftonomics_sweeps_through_Singapore_small...,singapore swiftonomics swept singapore america...,1,1,1,1,1,1,1,1
4,As_it_happened_Pritam_Singh_pleads_not_guilty_...,singapore leader opposition pritam singh charg...,1,1,1,1,1,1,1,1


In [5]:
for root, _, files in os.walk("./articles"):
    for file in files:
        if file.endswith(".txt"):
            with open(os.path.join(root, file), "r", encoding="utf-8") as f:
                text = f.read()
                df.loc[df["File"] == file, "Text"] = text
                df.loc[df["File"] == file, "fp"] = os.path.join(root, file)

In [11]:
vectorizer = TfidfVectorizer(tokenizer=word_tokenize)
X = vectorizer.fit_transform(df["Text"])
vectorizer.get_feature_names_out()



array(['0', '00', '000', ..., 'à', '白鹿视频', '陳奕迅'], dtype=object)

In [12]:
X.shape

(110, 8915)

In [13]:
y = df[df.columns[2:]].to_numpy()

In [14]:
y

array([[1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 0, 1, 1, 0, 1, 1, 0],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [0, 1, 1, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 0, 0, 0, 1, 1, 0],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 1, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [0, 1, 0, 0, 1, 0, 0, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1,

Multilabel Classification

In [108]:
from typing import Callable

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, coverage_error, f1_score,
                             label_ranking_average_precision_score,
                             label_ranking_loss, multilabel_confusion_matrix,
                             precision_score, recall_score, roc_auc_score)
from sklearn.neighbors import KNeighborsClassifier

In [17]:
NUM_FOLDS = 5

In [19]:
mskf = MultilabelStratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)

In [122]:
def train_and_eval(
    mskf: MultilabelStratifiedKFold, model_class: Callable, X: np.ndarray, y: np.ndarray
) -> dict[str, float]:
    """Train and evaluate a model using MultilabelStratifiedKFold cross-validation.

    Args:
        mskf (MultilabelStratifiedKFold): MultilabelStratifiedKFold object
        model_class (Callable): Class of the model to be trained
        X (np.ndarray): Dataset of input features
        y (np.ndarray): Dataset of target labels

    Returns:
        dict[str, float]: Dictionary of metrics
    """
    print(f"Model type: {model_class.__name__}")

    # Initialize lists to store metrics
    accs, lraps, f1s, lrls, precs, recs, cov_errs, aurocs = (
        [],
        [],
        [],
        [],
        [],
        [],
        [],
        [],
    )
    y_preds, y_probs, y_tests = [], [], []

    for i, (train_idx, test_idx) in enumerate(mskf.split(X, y)):
        # Split the dataset into training and testing sets
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Initialize and fit the model to training data
        model = model_class()
        model.fit(X_train, y_train)

        # Retrieve the predicted probabilities and labels
        y_prob = np.array(model.predict_proba(X_test))
        y_prob = y_prob[:, :, 1].reshape(len(y_test), -1)
        y_pred = model.predict(X_test)

        # Calculate metrics
        acc = accuracy_score(y_test, y_pred)
        cov_error = coverage_error(y_test, y_prob)
        f1 = f1_score(y_test, y_pred, average="micro")
        lrap = label_ranking_average_precision_score(y_test, y_prob)
        lrl = label_ranking_loss(y_test, y_prob)
        prec = precision_score(y_test, y_pred, average="micro")
        rec = recall_score(y_test, y_pred, average="micro")
        ap_chance_level = y_test.sum() / y_test.size # Chance level for AP
        mlm = multilabel_confusion_matrix(y_test, y_pred)
        auroc = roc_auc_score(y_test, y_prob, average="micro")

        # Append metrics to lists
        accs.append(acc)
        lraps.append(lrap)
        f1s.append(f1)
        lrls.append(lrl)
        precs.append(prec)
        recs.append(rec)
        cov_errs.append(cov_error)
        y_preds.append(y_pred)
        y_probs.append(y_prob)
        y_tests.append(y_test)
        aurocs.append(auroc)

        # Print metrics
        print(f"Fold: {i + 1}")
        print(
            f"acc: {acc:.4f}",
            f"lrap: {lrap:.4f} / {ap_chance_level:.4f}",
            f"f1: {f1:.4f}",
            f"lrl: {lrl:.4f}",
            f"prec: {prec:.4f}",
            f"rec: {rec:.4f}",
            f"cov_err: {cov_error:.4f}",
            f"auroc: {auroc:.4f}",
            sep="\n",
            end="\n\n",
        )

    # Return metrics as a dictionary
    return {
        "acc": accs,
        "lrap": lraps,
        "f1": f1s,
        "lrl": lrls,
        "prec": precs,
        "rec": recs,
        "cov_err": cov_errs,
        "y_pred": y_preds,
        "y_test": y_tests,
        "y_prob": y_probs,
        "auroc": aurocs,
        "mlm": mlm,
    }

In [123]:
results = {}
for model in [RandomForestClassifier, KNeighborsClassifier]:
    res = train_and_eval(mskf, model, X, y)
    results[model.__name__]= res

for model, res in results.items():
    print("\n\n")
    print("=" * 80)
    print(f"model: {model}")
    print(
        f"acc: {np.mean(res['acc']):.4f} +/- {np.std(res['acc']):.4f}",
        f"lrap: {np.mean(res['lrap']):.4f} +/- {np.std(res["lrap"]):.4f} / {y.sum() / y.size:.4f}",
        f"f1: {np.mean(res['f1']):.4f} +/- {np.std(res['f1']):.4f}",
        f"lrl: {np.mean(res['lrl']):.4f} +/- {np.std(res['lrl']):.4f}",
        f"prec: {np.mean(res['prec']):.4f} +/- {np.std(res['prec']):.4f}",
        f"rec: {np.mean(res['rec']):.4f} +/- {np.std(res['rec']):.4f}",
        f"cov_err: {np.mean(res['cov_err']):.4f} +/- {np.std(res['cov_err']):.4f}",
        f"auroc: {np.mean(res['auroc']):.4f} +/- {np.std(res['auroc']):.4f}",
        "=" * 80,
        sep="\n",
        end="\n\n",
    )

best_model = max(results.items(), key=lambda x: x[1]["f1"])
print(f"Best model: {best_model[0]}, F1: {np.mean(best_model[1]['f1']):.4f}")

Model type: RandomForestClassifier
Fold: 1
acc: 0.7273
lrap: 0.8663 / 0.8352
f1: 0.9000
lrl: 0.1381
prec: 0.8324
rec: 0.9796
cov_err: 7.4545
auroc: 0.7101

Fold: 2
acc: 0.6818
lrap: 0.8834 / 0.8352
f1: 0.9125
lrl: 0.1634
prec: 0.8439
rec: 0.9932
cov_err: 7.7273
auroc: 0.4986

Fold: 3
acc: 0.6818
lrap: 0.8433 / 0.8295
f1: 0.8896
lrl: 0.1989
prec: 0.8246
rec: 0.9658
cov_err: 7.9091
auroc: 0.4427

Fold: 4
acc: 0.6957
lrap: 0.8535 / 0.7880
f1: 0.8869
lrl: 0.1486
prec: 0.7967
rec: 1.0000
cov_err: 7.3913
auroc: 0.4925

Fold: 5
acc: 0.8095
lrap: 0.8969 / 0.8869
f1: 0.9401
lrl: 0.1272
prec: 0.8869
rec: 1.0000
cov_err: 7.9048
auroc: 0.5639

Model type: KNeighborsClassifier
Fold: 1
acc: 0.6364
lrap: 0.8426 / 0.8352
f1: 0.8931
lrl: 0.2182
prec: 0.8304
rec: 0.9660
cov_err: 7.6364
auroc: 0.4514

Fold: 2
acc: 0.6364
lrap: 0.8236 / 0.8352
f1: 0.9034
lrl: 0.2616
prec: 0.8333
rec: 0.9864
cov_err: 8.0000
auroc: 0.5195

Fold: 3
acc: 0.5909
lrap: 0.8315 / 0.8295
f1: 0.8718
lrl: 0.2636
prec: 0.8193
rec: 0.