In [21]:
import polars as pl
import torch
import os
import numpy as np
import random
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import EvalPrediction
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, hamming_loss


# Notebook-wide configuration: a single seed shared by every RNG in use and
# the number of rows kept from the dataset.
random_state = 42
dataset_size = 50_000

# Seed Python's, PyTorch's and NumPy's global generators with the same value.
random.seed(random_state)
torch.manual_seed(random_state)
np.random.seed(random_state)

In [2]:
# Shell out to nvidia-smi to list the GPUs visible to this session.
!nvidia-smi

Fri May  9 11:59:57 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   42C    P8             10W /   70W |       3MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

### Read dataset

In [3]:
# Load the articles and keep only the first `dataset_size` rows.
df = (
    pl.read_parquet("/kaggle/input/nlp-project/medium_articles.parquet")
    .head(dataset_size)
)
# Display the resulting row count.
df.select(pl.len())

len
u32
50000


### Fit multilabel binarizer from sklearn

In [4]:
# Turn every article's tag collection into a multi-hot row (one column per
# distinct tag seen during fitting), stored as float32.
mb = MultiLabelBinarizer()
labels = mb.fit_transform(df["tags"]).astype(np.float32)
# Peek at the first encoded row.
labels[0]

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [5]:
# Materialise the article bodies as a plain Python list.
texts = df["text"].to_list()
# Show the first article as a sanity check.
texts[0]

'Photo by Josh Riemer on Unsplash\n\nMerry Christmas and Happy Holidays, everyone!\n\nWe just wanted everyone to know how much we appreciate everyone and how thankful we are for all our readers and writers here. We wouldn’t be anywhere without you, so thank you all for bringing informative, vulnerable, and important pieces that destigmatize mental illness and mental health.\n\nWithout further ado, here are ten of our top stories from last week, all of which were curated:\n\n“Just as the capacity to love and inspire is universal so is the capacity to hate and discourage. Irrespective of gender, race, age or religion none of us are exempt from aggressive proclivities. Those who are narcissistically disordered, and accordingly repress deep seated feelings of inferiority with inflated delusions of grandeur and superiority, are more prone to aggression and violence. They infiltrate our interactions in myriad environments from home, work, school and the cyber world. Hence, bullying does not 

### Split into train, validation and test sets

In [6]:
# Step 1: hold out 20% of all samples as the test set.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=random_state,
)

# Step 2: carve 25% of the remaining samples out as the validation set,
# which yields an overall 60/20/20 train/validation/test split.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.25,
    random_state=random_state,
)

In [7]:
# Sizes of the train / validation / test splits, in that order.
tuple(len(split) for split in (train_texts, val_texts, test_texts))

(30000, 10000, 10000)

### Define DistilBERT model

In [10]:
# Load the pretrained tokenizer and a DistilBERT sequence classifier from the
# same checkpoint; the classifier gets one output per column of `labels` and
# is configured for multi-label classification.
checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(
    checkpoint,
    problem_type="multi_label_classification",
    num_labels=labels.shape[1],
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Define custom dataset class

In [15]:
class TextTagsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing raw texts with multi-hot tag vectors.

    Texts are tokenized lazily in ``__getitem__`` and truncated/padded to a
    fixed length, so every sample has identically shaped tensors.

    Args:
        texts: Indexable collection of documents; each item is passed through
            ``str()`` before tokenization.
        labels: Indexable collection of label vectors, aligned with ``texts``.
        tokenizer: Tokenizer called as
            ``tokenizer(text, truncation=..., padding=..., max_length=...,
            return_tensors='pt')``. The annotation is quoted so the class can
            be defined without the concrete tokenizer class in scope.
        max_len: Requested sequence length. If the tokenizer exposes an
            integer ``model_max_length`` smaller than this value, the length
            is clamped to that limit (with a warning), because longer
            sequences cannot be consumed by the corresponding model.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length, or if
            ``max_len`` is not positive.
    """

    def __init__(self, texts: np.ndarray, labels: np.ndarray, tokenizer: "DistilBertTokenizer", max_len: int = 128):
        import warnings  # local import keeps this cell self-contained

        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        if max_len < 1:
            raise ValueError(f"max_len must be a positive integer, got {max_len}")

        # Never ask for more tokens than the tokenizer says the model accepts.
        model_limit = getattr(tokenizer, "model_max_length", None)
        if isinstance(model_limit, int) and 0 < model_limit < max_len:
            warnings.warn(
                f"max_len={max_len} exceeds the tokenizer's model_max_length={model_limit}; "
                f"using {model_limit} instead."
            )
            max_len = model_limit

        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx: int):
        """Tokenize sample ``idx`` and return the tensors for one example.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (1-D, the batch
            dimension added by ``return_tensors='pt'`` is removed) and
            ``labels`` as a float32 tensor.
        """
        text = str(self.texts[idx])
        # Force float32: the multi-label (BCE-with-logits) loss needs
        # floating-point targets even if the caller stored integer labels.
        label = torch.tensor(self.labels[idx], dtype=torch.float32)

        encoding = self.tokenizer(text, truncation=True, padding="max_length", max_length=self.max_len, return_tensors='pt')

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': label
        }

In [16]:
# DistilBERT only has 512 position embeddings, so inputs longer than 512
# tokens (the previous value was 528) cannot be fed to the model. Use the
# largest supported length for all three splits.
MAX_SEQ_LEN = 512

train_dataset = TextTagsDataset(train_texts, train_labels, tokenizer, max_len=MAX_SEQ_LEN)
val_dataset = TextTagsDataset(val_texts, val_labels, tokenizer, max_len=MAX_SEQ_LEN)
test_dataset = TextTagsDataset(test_texts, test_labels, tokenizer, max_len=MAX_SEQ_LEN)

### Define metrics for multi-label text classification

In [20]:
def multi_labels_metrics(predictions: np.ndarray, labels: np.ndarray, threshold: float = 0.3):
    """Compute macro ROC AUC, Hamming loss and macro F1 from multi-label logits.

    Args:
        predictions: Raw model logits; expected to be 2-D (samples x labels).
        labels: Binary ground-truth indicator matrix with the same shape.
        threshold: Probability cut-off used to binarize the sigmoid outputs
            for the threshold-dependent metrics (F1 and Hamming loss).

    Returns:
        dict with keys ``"roc_auc"``, ``"hamming_loss"`` and ``"f1"``.
        ``"roc_auc"`` is the unweighted mean of the per-label AUCs over the
        labels that have both a positive and a negative sample in ``labels``;
        it is ``nan`` when no label qualifies.
    """
    # Logits -> probabilities, then straight to NumPy so all later indexing
    # and comparisons happen on one array type.
    probs = torch.sigmoid(torch.as_tensor(predictions, dtype=torch.float32)).numpy()
    y_true = np.asarray(labels)
    y_pred = (probs >= threshold).astype(int)

    # zero_division=0 keeps the score of labels that are never predicted at 0
    # (the default outcome) without emitting a warning for every such label.
    f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)
    hamming = hamming_loss(y_true, y_pred)

    # ROC AUC is a ranking metric: it must be fed the continuous scores, not
    # the thresholded 0/1 decisions. It is also undefined for a label whose
    # ground truth contains a single class, so those columns are skipped.
    rankable = np.flatnonzero(y_true.min(axis=0) != y_true.max(axis=0))
    if rankable.size:
        roc_auc = float(np.mean([roc_auc_score(y_true[:, j], probs[:, j]) for j in rankable]))
    else:
        roc_auc = float("nan")

    metrics = {
        "roc_auc": roc_auc,
        "hamming_loss": hamming,
        "f1": f1
    }

    return metrics

def compute_metrics(p: EvalPrediction):
    """Adapter between the Trainer's ``EvalPrediction`` and ``multi_labels_metrics``.

    When ``p.predictions`` is a tuple, only its first element is used as the
    predictions; otherwise ``p.predictions`` is used directly.
    """
    logits = p.predictions
    if isinstance(logits, tuple):
        logits = logits[0]

    return multi_labels_metrics(predictions=logits, labels=p.label_ids)

### Train model

In [23]:
# Training configuration: 5 epochs, batches of 8 per device for both training
# and evaluation, output under ./results, with save_steps=1000 and at most
# 2 checkpoints retained.
args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_steps=1000,
    save_total_limit=2,
)

# Wire the model, the train/validation datasets and the metric callback together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model on train_dataset using the settings in `args`.
trainer.train()



<IPython.core.display.Javascript object>

In [None]:
# Evaluate on the dataset the Trainer was given as eval_dataset (val_dataset),
# reporting the scores from compute_metrics; test_dataset is not evaluated here.
trainer.evaluate()