In [32]:
import os
import re
from functools import partial

import torch
import numpy as np
import pandas as pd

from datasets import Dataset, Value, ClassLabel, Features
from transformers import pipeline, AutoTokenizer
from tqdm.auto import tqdm

from dataset.textdataset import ArticleDataset
from dataset.transformers_dataset import load_data, get_dict

In [2]:
df = load_data("multi_label_dataset.csv", "./articles")

Loading articles: 0file [00:00, ?file/s]

Loading files: 0file [00:00, ?file/s]

Loading files:   0%|          | 0/59 [00:00<?, ?file/s]

Loading files:   0%|          | 0/58 [00:00<?, ?file/s]

Loading files:   0%|          | 0/59 [00:00<?, ?file/s]

Loading files:   0%|          | 0/58 [00:00<?, ?file/s]

Loading files:   0%|          | 0/51 [00:00<?, ?file/s]

Loading files:   0%|          | 0/58 [00:00<?, ?file/s]

Loading files:   0%|          | 0/58 [00:00<?, ?file/s]

Loading files:   0%|          | 0/51 [00:00<?, ?file/s]

In [13]:
def get_dict(df):
    dataset = {}
    for _, row in df.iterrows():
        targets = row[2:]
        labels = df.columns[2:][targets == 1]
        targets = [df.columns[2:].get_loc(label) for label in labels]
        labels = list(map(lambda x: x.replace("-", " "), labels))
        if dataset.get("text") is None:
            dataset["text"] = [row["Text"]]
            dataset["binary_targets"] = [targets]
            dataset["labels"] = [labels]
        else:
            dataset["text"].append(row["Text"])
            dataset["binary_targets"].append(targets)
            dataset["labels"].append(labels)
    return dataset

In [55]:
# find examples that are not every label
df[df.iloc[:, 2:].sum(axis=1) < len(df.columns[2:])]

Unnamed: 0,File,Text,adulting-101,big-read,commentary,gen-y-speaks,gen-z-speaks,singapore,voices,world
7,Chinese_content_creator_uses_AI_to_resurrect_C...,sit well netizens music producer used artifici...,1,0,1,1,0,1,1,0
19,Court_finds_employee_wrongfully_dismissed_due_...,SINGAPORE: A district court has awarded the fo...,0,1,1,0,0,0,0,0
21,Distance_based_charging_for_motorists_would_be...,SINGAPORE: Introducing a distance- or usage-ba...,1,1,0,0,0,1,1,0
23,Energy_stick_inhalers_gaining_popularity_among...,"SINGAPORE: “Energy stick” inhalers, which are ...",0,0,0,1,0,0,0,0
27,Families_and_enthusiasts_in_Singapore_switch_f...,singapore father five mr zulkifli aziz 38 cons...,0,1,0,0,1,0,0,1
31,Gen_Z_Speaks_How_Taylor_Swift_week_in_Singapor...,never heard taylor swift song life must left h...,1,0,0,1,0,0,0,1
33,Gen_Zen_My_husband_spends_a_lot_of_time_rottin...,increasingly people becoming aware importance ...,0,1,0,0,1,0,0,0
36,Hong_Kong_stars_Joey_Yung_Charlene_Choi_and_Gi...,cantopop star joey yung charlene choi gillian ...,1,1,1,0,1,1,1,1
39,How_a_scammer_used_forged_emails_to_convince_h...,singapore judge called heartrending case elder...,1,1,1,1,1,0,0,0
41,How_to_avoid_a_midlife_crisis_and_celebrate_th...,new york midlife according american psychologi...,0,0,1,0,0,0,0,0


In [33]:
dataset = Dataset.from_dict(
    get_dict(df),
    features=Features(
        {
            "text": Value("string"),
            "binary_targets": ClassLabel(num_classes=8, names=list(range(8))),
            "labels": ClassLabel(names=df.columns[2:].tolist()),
        }
    ),
)

TypeError: '<=' not supported between instances of 'int' and 'list'

In [None]:
train, test = dataset.train_test_split(test_size=0.2)

In [28]:
dataset.features

{'text': Value(dtype='string', id=None),
 'binary_targets': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [30]:
dataset[0]

{'text': 'The app’s new features serve up both convenience and value, whether you’re looking to dine in or take out.\nHave you noticed that prices are creeping up – and that your wallet is taking a hit? From transport and daily essentials to dining out, the cost of living is steadily rising, leaving both consumers and businesses feeling the pinch.\nWith food being a significant household expenditure in Singapore, Grab is taking a proactive approach to ease your financial burden without compromising your food cravings. This includes reducing delivery fees and presenting value-for-money dine-in deals, all aimed at making daily life a bit more affordable.\n“As an everyday superapp, it’s important that we find ways to create lower-priced services for consumers who want to be more prudent with their spending while still enjoying the convenience of on-demand services,” said Mr Tay Chuen Jein, head of deliveries at Grab Singapore.\nWhether you’re gearing up for a night out or opting for a cos

In [29]:
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    dtype=torch.bfloat16,
    device="cuda",
    fp16=True
)
candidate_labels = list(map(lambda x: x.replace("-", " "), df.columns[2:]))

In [40]:
def tokenize_text(instance, tokenizer):
    return tokenizer(instance["text"], truncation=True)

In [45]:
dataset[0]

{'text': 'The app’s new features serve up both convenience and value, whether you’re looking to dine in or take out.\nHave you noticed that prices are creeping up – and that your wallet is taking a hit? From transport and daily essentials to dining out, the cost of living is steadily rising, leaving both consumers and businesses feeling the pinch.\nWith food being a significant household expenditure in Singapore, Grab is taking a proactive approach to ease your financial burden without compromising your food cravings. This includes reducing delivery fees and presenting value-for-money dine-in deals, all aimed at making daily life a bit more affordable.\n“As an everyday superapp, it’s important that we find ways to create lower-priced services for consumers who want to be more prudent with their spending while still enjoying the convenience of on-demand services,” said Mr Tay Chuen Jein, head of deliveries at Grab Singapore.\nWhether you’re gearing up for a night out or opting for a cos

In [50]:
results = classifier(dataset["text"], candidate_labels=candidate_labels, multi_label=True)

In [76]:
from sklearn.metrics import coverage_error, label_ranking_average_precision_score, label_ranking_loss, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer

In [82]:
results[1]

{'sequence': 'Adulthood is an invigorating stage of life as young people join the workforce, take on more responsibilities and set their sights on the future. But its many facets — from managing finances and buying a home to achieving work-life balance — can be overwhelming.\nIn this series, TODAY’s journalists help young Singaporeans navigate this stage of their lives and learn something themselves in the process.\nSINGAPORE — I have a friend who is earning at least 30 per cent more than me, even though we work in the same industry.\nWe catch up once or twice a year, and every time we do, I can\'t help but wonder: How did that happen?\xa0\nI suspect it has a lot to do with the fact that he has changed jobs many times since we both graduated. Almost every time I meet him, he has a new job, and each time he moves to a new company, he negotiates a higher salary.\nJob-hopping is quite common among people my age, with many advocates saying it is a good way to quickly raise your salary and 

In [67]:
mlb = MultiLabelBinarizer(classes=candidate_labels)
sample_labels = df[df.columns[2:]].apply(lambda x: list(df.columns[2:][x == 1]), axis=1)

In [62]:
label_to_dashed_labels = {label: label.replace(" ", "-") for label in candidate_labels}
dashed_lables_to_labels = {label.replace(" ", "-"): label for label in candidate_labels}

In [66]:
mlb.classes

['adulting 101',
 'big read',
 'commentary',
 'gen y speaks',
 'gen z speaks',
 'singapore',
 'voices',
 'world']

In [68]:
mlb.fit(sample_labels)

In [70]:
def get_scores(results, mlb):
    for result in results:
        score = result["scores"]
        labels = result["labels"]
        scores = [score[labels.index(label)] if label in labels else 0 for label in mlb.classes]
        yield scores

In [71]:
y_scores = np.array([score for score in get_scores(results, mlb)])

In [72]:
y_true = np.array(df[df.columns[2:]])

In [73]:
y_true.shape, y_scores.shape

((110, 8), (110, 8))

In [74]:
y_true.ndim, y_scores.ndim

(2, 2)

In [78]:
cov_error = coverage_error(y_true, y_scores)
ap = label_ranking_average_precision_score(y_true, y_scores)
rank_loss = label_ranking_loss(y_true, y_scores)
acc = accuracy_score(y_true, y_scores > 0.5)

print(
    f"coverage error: {cov_error}",
    f"average precision: {ap}",
    f"ranking loss: {rank_loss}",
    f"accuracy: {acc}",
    sep="\n",
)

coverage error: 7.5
average precision: 0.8847203153988866
ranking loss: 0.12537878787878787
accuracy: 0.21818181818181817


In [86]:
((y_scores > 0.5) == y_true).mean()

0.6929824561403509