In [17]:
import os
import re

import torch
import numpy as np

from transformers import pipeline
from tqdm.auto import tqdm

from dataset.textdataset import ArticleDataset

In [60]:
# Discover the category folders, load the zero-shot classifier, and index every
# non-empty article by file name together with all categories it appears under.
ARTICLES_DIR = "./articles"

# Keep only real category folders: a stray file or a hidden folder such as
# .ipynb_checkpoints would otherwise become a bogus label (or make the inner
# os.listdir call fail). Sorting makes the label order reproducible, because
# os.listdir returns entries in arbitrary order.
categories = sorted(
    entry
    for entry in os.listdir(ARTICLES_DIR)
    if not entry.startswith(".") and os.path.isdir(os.path.join(ARTICLES_DIR, entry))
)
# Folder names are hyphenated; the classifier receives the space-separated form.
candidate_labels = [category.replace("-", " ") for category in categories]

# NOTE: the former `fp16=True` argument was removed. It is not a documented
# `pipeline()` parameter and it contradicted the bfloat16 precision that is
# already selected through `dtype`.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    dtype=torch.bfloat16,
    device="cuda",
)
results = []

# file name -> {"categories": [every folder containing it], "fp": first path seen}
mapping = {}

for category in tqdm(categories, desc="Category", leave=True, position=0):
    category_dir = os.path.join(ARTICLES_DIR, category)
    for file in tqdm(sorted(os.listdir(category_dir)), desc="File", leave=False, position=1):
        file_path = os.path.join(category_dir, file)
        with open(file_path, "r", encoding="utf-8") as f:
            article = f.read()
        # Skip articles that are empty once lower-cased and stripped of punctuation.
        if len(re.sub(r"[^\w\s]", "", article.lower())) == 0:
            continue
        # The same file name can live in several category folders: remember the
        # first path and accumulate every category as a true label.
        entry = mapping.setdefault(file, {"categories": [], "fp": file_path})
        entry["categories"].append(category)

Category:   0%|          | 0/8 [00:00<?, ?it/s]

File:   0%|          | 0/59 [00:00<?, ?it/s]

File:   0%|          | 0/58 [00:00<?, ?it/s]

File:   0%|          | 0/59 [00:00<?, ?it/s]

File:   0%|          | 0/58 [00:00<?, ?it/s]

File:   0%|          | 0/51 [00:00<?, ?it/s]

File:   0%|          | 0/58 [00:00<?, ?it/s]

File:   0%|          | 0/58 [00:00<?, ?it/s]

File:   0%|          | 0/51 [00:00<?, ?it/s]

In [61]:
# Classify every unique article against all candidate labels (multi-label mode)
# and record (file name, true categories, predicted labels, predicted scores).
results = []  # reset here so re-running this cell never appends duplicate rows
for file, properties in tqdm(mapping.items(), desc="Article", leave=True, position=0):
    with open(properties["fp"], "r", encoding="utf-8") as f:
        article = f.read()
    # The file handle is closed before the (slow) model call below.
    text = re.sub(r"[^\w\s]", "", article.lower())
    if len(text) == 0:
        continue
    result = classifier(text, candidate_labels=candidate_labels, multi_label=True)
    results.append((file, properties["categories"], result["labels"], result["scores"]))

Category:   0%|          | 0/57 [00:00<?, ?it/s]



In [78]:
from sklearn.metrics import coverage_error, label_ranking_average_precision_score, label_ranking_loss
from sklearn.preprocessing import MultiLabelBinarizer

In [62]:
# Inspect the first record: (file name, true category folders, predicted labels, scores).
results[0]

('5_smart_ways_to_stretch_your_dollar_with_GrabFood.txt',
 ['adulting-101',
  'big-read',
  'commentary',
  'gen-y-speaks',
  'gen-z-speaks',
  'singapore',
  'voices',
  'world'],
 ['commentary',
  'big read',
  'voices',
  'singapore',
  'world',
  'gen z speaks',
  'gen y speaks',
  'adulting 101'],
 [0.997778058052063,
  0.9974443316459656,
  0.9972078204154968,
  0.9966781735420227,
  0.9966431856155396,
  0.9958672523498535,
  0.9957605600357056,
  0.9948845505714417])

In [63]:
# Map each space-separated candidate label back to its hyphenated folder name.
reverse_labels = dict(zip(candidate_labels, categories))

In [64]:
# Learn the label vocabulary from the true category lists of all indexed articles.
mlb = MultiLabelBinarizer()
mlb.fit([entry["categories"] for entry in mapping.values()])

In [65]:
# Show the classes learned by the binarizer.
mlb.classes_

array(['adulting-101', 'big-read', 'commentary', 'gen-y-speaks',
       'gen-z-speaks', 'singapore', 'voices', 'world'], dtype=object)

In [66]:
# Build one score vector per classified article, with columns laid out in the
# exact order of mlb.classes_ so they line up with the binarized true labels.
# Loop variables are deliberately NOT named `y_true`: doing so would overwrite
# the notebook-level `y_true` label matrix whenever this cell is re-run.
y_scores = []
for _, _, predicted_labels, predicted_scores in results:
    # Predicted labels are space-separated; translate them back to folder names.
    score_by_category = {
        reverse_labels[label]: score
        for label, score in zip(predicted_labels, predicted_scores)
    }
    y_scores.append([score_by_category[category] for category in mlb.classes_])

y_scores[0]

[0.9948845505714417,
 0.9974443316459656,
 0.997778058052063,
 0.9957605600357056,
 0.9958672523498535,
 0.9966781735420227,
 0.9972078204154968,
 0.9966431856155396]

In [67]:
# Binarize the true categories row-by-row from `results` (not from `mapping`)
# so that row i of y_true always describes the same article as row i of the
# score matrix, even if an article was skipped during classification.
y_true = mlb.transform([true_categories for _, true_categories, _, _ in results])
y_true

array([[1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 0, 0, 0, 1, 1, 0],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 0, 1, 1, 0],
       [1, 1, 1, 1, 0, 1, 1, 0],
       [1, 1, 1, 1, 1, 1, 1, 1],
       [1,

In [74]:
# Stack the per-article score vectors into a 2-D (articles x classes) array.
# np.vstack replaces np.row_stack, which is a deprecated alias in NumPy 2.0.
y_scores = np.vstack([np.asarray(row) for row in y_scores])
y_scores

array([[0.99488455, 0.99744433, 0.99777806, 0.99576056, 0.99586725,
        0.99667817, 0.99720782, 0.99664319],
       [0.97387147, 0.97091997, 0.96090406, 0.957735  , 0.95742565,
        0.97552443, 0.96059352, 0.93828017],
       [0.22627474, 0.87190437, 0.80760914, 0.54452765, 0.51152682,
        0.27524859, 0.83669001, 0.74927747],
       [0.63530296, 0.50191396, 0.60294002, 0.56132275, 0.57596254,
        0.54324049, 0.54456139, 0.45332053],
       [0.20020068, 0.80608612, 0.62634921, 0.49116912, 0.41430649,
        0.07633096, 0.88697147, 0.82320684],
       [0.16623895, 0.96001887, 0.72953594, 0.4144958 , 0.309607  ,
        0.99787211, 0.59777731, 0.68387413],
       [0.98842859, 0.99370241, 0.99521601, 0.98103905, 0.98233175,
        0.98947614, 0.99258089, 0.97959948],
       [0.99224591, 0.99070704, 0.99133879, 0.98189002, 0.98167872,
        0.98865271, 0.99063605, 0.98533589],
       [0.72156578, 0.97581249, 0.90271699, 0.48369277, 0.39166099,
        0.91368169, 0.941778

In [75]:
# Compare the shapes of the label matrix and the score matrix before scoring.
y_true.shape, y_scores.shape

((57, 8), (57, 8))

In [77]:
# Check the number of dimensions of both arrays.
y_true.ndim, y_scores.ndim

(2, 2)

In [83]:
# Multi-label ranking metrics computed from the true label matrix and the
# classifier scores.
# NOTE(review): most rows of y_true shown above are all ones, so these ranking
# metrics may carry little signal on this data set - confirm before relying on them.
rank_loss = label_ranking_loss(y_true, y_scores)
ap = label_ranking_average_precision_score(y_true, y_scores)
cov_error = coverage_error(y_true, y_scores)

print(
    f"coverage error: {cov_error}\n"
    f"average precision: {ap}\n"
    f"ranking loss: {rank_loss}"
)

coverage error: 7.56140350877193
average precision: 0.908004037872459
ranking loss: 0.10734126984126985


In [86]:
# Element-wise accuracy: fraction of (article, class) cells where thresholding
# the score at 0.5 reproduces the true 0/1 label.
np.equal(y_scores > 0.5, y_true).mean()

0.6929824561403509