# Model Training

### Imports

In [None]:
# Imports
import os
import pandas as pd
import numpy as np
# from functions import *

# Locations (relative to this notebook) for datasets, figures and checkpoints
save_data = '../data'
save_images = '../images'
save_model = '../model'

# The train / test CSV files sit directly inside the data directory
data_path_train = os.path.join(save_data, 'train.csv')
data_path_test = os.path.join(save_data, 'test.csv')

# Report which of the expected locations exist. Nothing is created or read
# here; a missing path is skipped silently.
for _path, _message in (
    (save_data, 'Data: Save point initialized.'),
    (data_path_train, 'Train loaded'),
    (data_path_test, 'Test loaded\n'),
    (save_images, 'Images: Save point initialized.'),
    (save_model, 'Model: Save point initialized.'),
):
    if os.path.exists(_path):
        print(_message)

In [None]:
# More imports
import gc
import pickle

import torch
from datasets import Dataset
from tqdm.auto import tqdm
# The cells below use AutoTokenizer, TrainingArguments and Trainer, so they are
# imported explicitly here (transformers exposes no `Training` symbol).
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Start from a clean slate: collect unreachable Python objects, then ask
# PyTorch to release the CUDA memory it is caching.
gc.collect()
torch.cuda.empty_cache()

### Data

In [None]:
# Read the train / test splits from their CSV paths (train first, then test)
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (data_path_train, data_path_test)
)

# Cell output: (rows, columns) of each split
df_train.shape, df_test.shape

In [None]:
# Wrap each pandas frame in a HuggingFace `Dataset`
train, test = map(Dataset.from_pandas, (df_train, df_test))

# Cell output: summary of both datasets
train, test

### Model

In [None]:
# Load Model
model_name = "FacebookAI/roberta-base"

# Choose the device up front so that the model placement and the message
# printed below always agree, and so the cell also runs without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# A single tokenizer instance is enough for the whole notebook.
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    # Fall back to the end-of-sequence token so padding is always possible
    tokenizer.pad_token = tokenizer.eos_token

# Pretrained encoder with a 3-class sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                           num_labels=3)
model.to(device)

print(f"\n=== === === Model loaded on: {device} === === ===\n")

# Switch to training mode; as the last expression this also echoes the model
model.train()

In [None]:
# Tokenize
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook tokenizer.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``; its output is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

# Run the tokenizer over both splits in batches (train first, then test)
train, test = (
    _split.map(tokenize_function, batched=True) for _split in (train, test)
)

# Expose only the model inputs and the label column, formatted as torch tensors
_torch_columns = ["input_ids", "attention_mask", "label"]
for _split in (train, test):
    _split.set_format(type="torch", columns=_torch_columns)

### Training

Evaluate

In [None]:
# Custom Metrics
import evaluate
from scipy.special import softmax
from scipy.stats import entropy

# Load evaluation metrics
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
# NOTE(review): cross-entropy is computed with NumPy below instead of
# `evaluate.load("log_loss")`; there does not appear to be a canonical
# `log_loss` metric in `evaluate` -- confirm before reinstating it.


def _mean_cross_entropy(probs, labels):
    """Return the mean negative log-probability of the true class.

    Probabilities are clipped to the dtype's machine epsilon so that a
    probability that underflowed to zero yields a large finite loss
    rather than ``inf``.
    """
    eps = np.finfo(probs.dtype).eps
    true_class_probs = probs[np.arange(len(labels)), labels]
    return float(-np.mean(np.log(np.clip(true_class_probs, eps, 1.0))))


# Function
def compute_metrics(eval_pred):
    """Compute classification metrics from raw model outputs.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The last axis of ``logits``
            indexes the classes; ``labels`` holds integer class ids.

    Returns:
        dict: ``"accuracy"``, weighted ``"f1"``, mean ``"cross_entropy"`` and
        mean ``"kl_divergence"`` between the one-hot labels and the predicted
        distribution.
    """
    logits, labels = eval_pred
    labels = np.asarray(labels).astype(int)  # needed for fancy indexing below
    probs = softmax(logits, axis=-1)  # logits to probs
    predictions = np.argmax(probs, axis=-1)  # Get class preds

    # KL divergence needs a full target distribution, hence one-hot labels.
    # With one-hot targets the per-sample value equals -log p(true class).
    labels_one_hot = np.eye(probs.shape[-1])[labels]
    kl_divs = entropy(labels_one_hot, probs, axis=-1)  # one value per sample

    return {
        "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
        "f1": f1.compute(predictions=predictions, references=labels, average="weighted")["f1"],
        "cross_entropy": _mean_cross_entropy(probs, labels),
        "kl_divergence": float(np.mean(kl_divs)),  # Average KL-Divergence across all samples
    }

Train

In [None]:
# Collect unreachable Python objects and ask PyTorch to release its cached
# CUDA memory before the training cells run.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Sub-directory of `save_model` that is passed to TrainingArguments as output_dir
model_path = os.path.join(save_model, 'roberta-classifier-bias-base')

In [None]:
# Hyper-parameters and bookkeeping for the fine-tuning run
training_args = TrainingArguments(
    output_dir=model_path,
    report_to="tensorboard",

    # Phases enabled for this run
    do_train=True,
    do_eval=True,
    do_predict=True,

    # Training length and optimiser schedule
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,

    # Batching: 8 samples x 4 accumulation steps per optimiser update, per device
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,

    # Evaluate every 150 steps; log and checkpoint every 300 steps
    # NOTE(review): newer transformers releases reportedly rename
    # `evaluation_strategy` to `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=150,
    eval_accumulation_steps=4,
    logging_strategy="steps",
    logging_steps=300,
    save_strategy="steps",
    save_steps=300,

    # Checkpoint selection is driven by "f1", where larger is better
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,

    # NOTE(review): `trainer.train()` is called without a checkpoint argument
    # in this notebook; verify whether this flag alone actually resumes a run.
    resume_from_checkpoint=True,
)

In [None]:
# Wire the model, arguments, datasets and metric function into a Trainer.
# The test split is also used as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    compute_metrics=compute_metrics,
)
trainer.train()

# Model and tokenizer are saved side by side in one sub-directory of model_path
final_model_dir = os.path.join(model_path, 'roberta-political-bias-classifier')
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

In [None]:
# Run a final evaluation pass and lay the metrics out one per row
eval_results = trainer.evaluate()
metrics_row = pd.DataFrame(eval_results, index=["Value"])
model_performance = metrics_row.T

# Print the table in markdown form
markdown_table = model_performance.to_markdown()
print(markdown_table)