## Train Multiclass Classifier: BERT

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM, TrainingArguments, Trainer
from datasets import Dataset, load_dataset, load_from_disk, concatenate_datasets
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from collections import Counter
import random
import numpy as np
import torch
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
topics = ["cannabis", "energie", "kinder"]

## Load Dataset

**Map class-names to class-ids:**

In [3]:
id_to_class = {0: "other",1: "cannabis", 2: "energie", 3: "kinder"}
class_to_id = {"other": 0, "cannabis": 1, "energie": 2, "kinder": 3}

In [4]:
MAX_CONTENT_LENGTH = 384
file_path = f"../data/tmp/processed_dataset_multiclass_chunkified_{MAX_CONTENT_LENGTH}"
dataset = load_from_disk(file_path)

#dataset = dataset["valid"]

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'token_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'chunk_id', 'url_path'],
        num_rows: 15828
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'token_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'chunk_id', 'url_path'],
        num_rows: 816025
    })
    valid: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'token_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'chunk_id', 'url_path'],
        num_rows: 827
    })
})

## Load Model

In [6]:
model_path = "../models/bert_multiclass_model_buff_filtered"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path).eval()

In [7]:
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = torch.nn.DataParallel(model)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Using 2 GPUs!


DataParallel(
  (module): XLMRobertaForSequenceClassification(
    (roberta): XLMRobertaModel(
      (embeddings): XLMRobertaEmbeddings(
        (word_embeddings): Embedding(250002, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): XLMRobertaEncoder(
        (layer): ModuleList(
          (0-11): 12 x XLMRobertaLayer(
            (attention): XLMRobertaAttention(
              (self): XLMRobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): XLMRobertaSelfOutput(


## Prepare Dataset

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'token_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'chunk_id', 'url_path'],
        num_rows: 15828
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'token_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'chunk_id', 'url_path'],
        num_rows: 816025
    })
    valid: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'token_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'chunk_id', 'url_path'],
        num_rows: 827
    })
})

In [15]:
seen_urls = set()

dataset["test"] = dataset["test"].filter(lambda example: example['view_url'] not in seen_urls and not seen_urls.add(example['view_url']), num_proc=16)

Filter (num_proc=16): 100%|██████████| 816025/816025 [00:05<00:00, 147142.53 examples/s]


In [16]:
dataset["test"]

Dataset({
    features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'token_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'chunk_id', 'url_path'],
    num_rows: 153543
})

In [20]:
def clear_text(examples):
    examples["text"] = ""  # Set all entries to an empty string
    return examples

# Apply the function to the desired dataset split (e.g., 'train')
dataset = dataset.map(clear_text)

Map: 100%|██████████| 15828/15828 [00:04<00:00, 3948.38 examples/s]
Map: 100%|██████████| 153543/153543 [00:33<00:00, 4575.03 examples/s]
Map: 100%|██████████| 827/827 [00:00<00:00, 4973.65 examples/s]


In [21]:
# Tokenize the text
def tokenize_function(examples):
    return tokenizer(examples["url_path"], examples["text"], padding="max_length", truncation=True)

dataset = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 15828/15828 [00:03<00:00, 4240.16 examples/s]
Map: 100%|██████████| 153543/153543 [00:40<00:00, 3798.52 examples/s]
Map: 100%|██████████| 827/827 [00:00<00:00, 3903.77 examples/s]


In [23]:
#dataset["test"][0]

## Get Predictions

In [24]:
def predict_batch(batch):
    """ Perform prediction on a batch of samples in a multiclass setting. """
    
    # Ensure input tensors are on the correct device
    input_ids = torch.tensor(batch['input_ids']).to(device)
    attention_mask = torch.tensor(batch['attention_mask']).to(device)
    
    # Perform prediction
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    
    # Extract probabilities for all classes and predicted classes
    batch['probas'] = predictions.cpu().tolist()  # Move results back to CPU and convert to list
    batch['preds'] = torch.argmax(predictions, dim=-1).cpu().tolist()

    return batch


In [25]:
dataset = dataset.map(predict_batch, batched=True, batch_size=512)

Map: 100%|██████████| 15828/15828 [02:37<00:00, 100.28 examples/s]
Map: 100%|██████████| 153543/153543 [24:44<00:00, 103.44 examples/s]
Map: 100%|██████████| 827/827 [00:08<00:00, 102.89 examples/s]


In [27]:
# id = 100
# print(dataset[id]["preds"])
# print(dataset[id]["probas"])

In [28]:
dataset.save_to_disk(file_path + "_preds_url_only")

Saving the dataset (0/1 shards):   0%|          | 0/15828 [00:00<?, ? examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 15828/15828 [00:00<00:00, 131310.51 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 153543/153543 [00:00<00:00, 228134.87 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 827/827 [00:00<00:00, 87495.95 examples/s]


In [11]:
# # Assuming labels and preds are lists or arrays containing the true labels and predicted labels respectively
# accuracy = accuracy_score(labels, preds)
# precision_per_class = precision_score(labels, preds, average=None)
# recall_per_class = recall_score(labels, preds, average=None)
# f1_per_class = f1_score(labels, preds, average=None)

# print("Overall Accuracy: {:.2f}%".format(accuracy * 100))
# print("Precision per class: {}".format(np.round(precision_per_class, 2)))
# print("Recall per class: {}".format(np.round(recall_per_class, 2)))
# print("F1 Score per class: {}".format(np.round(f1_per_class, 2)))