In [None]:
# installs
!pip install transformers torch datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cu

In [None]:
# imports
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from datasets import load_dataset
from datasets import Dataset
import pandas as pd
import csv

In [None]:
# access Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
ds_path = '/content/drive/MyDrive/metaU/pb_by_sentence.csv'

In [None]:
dataset = load_dataset("csv", data_files=ds_path)

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
train_testvalid = dataset["train"].train_test_split(test_size=0.2)

In [None]:
train_dataset = train_testvalid['train']
test_dataset = train_testvalid['test']

In [None]:
print(f"Sizes of datasets - train: {len(train_dataset)}, test: {len(test_dataset)}")

Sizes of datasets - train: 1179, test: 295


In [None]:
labels = [label for label in train_dataset.features.keys() if label not in ['statement']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['left-leaning', 'right-leaning', 'neutral']

In [None]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(examples):
  # take a batch of texts
  text = examples["statement"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()

  return encoding

In [None]:
encoded_train_dataset = dataset.map(preprocess_data, batched=True, remove_columns=train_dataset.column_names)

Map:   0%|          | 0/1474 [00:00<?, ? examples/s]

In [None]:
encoded_test_dataset = dataset.map(preprocess_data, batched=True, remove_columns=test_dataset.column_names)

Map:   0%|          | 0/1474 [00:00<?, ? examples/s]

In [None]:
example = encoded_train_dataset['train'][0]
print(example.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [None]:
tokenizer.decode(example['input_ids'])

'[CLS] this is not satire. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [None]:
example['labels']

[0.0, 1.0, 0.0]

In [None]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

['right-leaning']

In [None]:
encoded_train_dataset.set_format("torch")

In [None]:
encoded_test_dataset.set_format("torch")

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
batch_size = 8
metric_name = "eval_train_f1"

In [None]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
)

In [None]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

In [None]:
encoded_train_dataset['train'][0]['labels'].type()

'torch.FloatTensor'

In [None]:
encoded_train_dataset['train']['input_ids'][0]

tensor([  101,  2023,  2003,  2025, 18312,  1012,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [None]:
outputs = model(input_ids=encoded_train_dataset['train']['input_ids'][0].unsqueeze(0), labels=encoded_train_dataset['train'][0]['labels'].unsqueeze(0))
outputs

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


SequenceClassifierOutput(loss=tensor(0.6026, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[ 0.1268,  0.5797, -0.1859]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_train_dataset["train"],
    eval_dataset=encoded_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Train Loss,Train F1,Train Roc Auc,Train Accuracy
1,No log,No log,0.46302,0.70245,0.775271,0.590231
2,No log,No log,0.246844,0.917437,0.936228,0.894844
3,0.426000,No log,0.112431,0.980074,0.986092,0.97422
4,0.426000,No log,0.051099,0.991186,0.993555,0.989145
5,0.426000,No log,0.037341,0.995929,0.996947,0.995929


TrainOutput(global_step=925, training_loss=0.2735088286528716, metrics={'train_runtime': 434.1387, 'train_samples_per_second': 16.976, 'train_steps_per_second': 2.131, 'total_flos': 484786472163840.0, 'train_loss': 0.2735088286528716, 'epoch': 5.0})

In [None]:
trainer.evaluate()

{'eval_train_loss': 0.03734087944030762,
 'eval_train_f1': 0.9959294436906377,
 'eval_train_roc_auc': 0.9969470827679784,
 'eval_train_accuracy': 0.9959294436906377,
 'eval_train_runtime': 10.4737,
 'eval_train_samples_per_second': 140.733,
 'eval_train_steps_per_second': 17.663,
 'epoch': 5.0}

In [None]:
text = "this is a commercial."

encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

outputs = trainer.model(**encoding)

In [None]:
logits = outputs.logits
logits.shape

torch.Size([1, 3])

In [None]:
# # apply sigmoid + threshold

# sigmoid = torch.nn.Sigmoid()
# probs = sigmoid(logits.squeeze().cpu())
# predictions = np.zeros(probs.shape)
# predictions[np.where(probs >= 0.5)] = 1
# # turn predicted id's into actual label names
# predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
# print(predicted_labels)


import numpy as np
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())

# Convert probabilities to percentages
percentages = probs.detach().numpy() * 100

# If you need to display labels with their associated percentages
predicted_labels_with_percentages = [(id2label[idx], percentages[idx]) for idx in range(len(percentages))]

# Print the labels with their corresponding percentage scores
for label, percentage in predicted_labels_with_percentages:
    print(f"{label}: {percentage:.2f}%")

left-leaning: 50.22%
right-leaning: 1.65%
neutral: 54.28%


In [None]:
model_save_path = "/content/drive/MyDrive/metaU/bert-finetuned-model"
tokenizer_save_path = "/content/drive/MyDrive/metaU/bert-finetuned-tokenizer"

In [None]:
model.save_pretrained(model_save_path)

In [None]:
tokenizer.save_pretrained(tokenizer_save_path)

('/content/drive/MyDrive/metaU/bert-finetuned-tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/metaU/bert-finetuned-tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/metaU/bert-finetuned-tokenizer/vocab.txt',
 '/content/drive/MyDrive/metaU/bert-finetuned-tokenizer/added_tokens.json',
 '/content/drive/MyDrive/metaU/bert-finetuned-tokenizer/tokenizer.json')