<a href="https://colab.research.google.com/github/larajakl/Computational-Linguistics/blob/main/Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments



In [39]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install accelerate --upgrade
!pip install optuna
!pip install optuna-integration[pytorch_lightning]



In [40]:
from datasets import load_dataset, DatasetDict
from transformers import DataCollatorWithPadding

from transformers import AutoTokenizer

from transformers import set_seed
from collections import Counter


In [41]:
dataset = load_dataset("mrjunos/depression-reddit-cleaned")

set_seed(24)

In [42]:
# Check distribution of labels in full dataset:

full_label_distribution = Counter(dataset['train']['label'])
print("Full dataset label distribution:", full_label_distribution)

Full dataset label distribution: Counter({0: 3900, 1: 3831})


In [43]:
# Just take the first n tokens for speed on CPU
def truncate(example):
    return {
        'text': " ".join(example['text'].split()[:100]),
        'label': example['label']
    }

# Random examples for train, validation and test
# Limit the dataset to the first 200 entries, JUST FOR NOW (ADAPT THESE LINES LATER)
subset_dataset = dataset['train'].shuffle(seed=24).select(range(200))
# Define the train/val/test split proportions:
train_ratio, val_ratio = 0.8, 0.1  # 80% train, 10% val, 10% test
# Shuffle the dataset once:
shuffled_dataset = subset_dataset.shuffle(seed=24)
# Compute the split indices:
total_size = len(shuffled_dataset)
train_end = int(train_ratio * total_size)
val_end = train_end + int(val_ratio * total_size)
# Create splits:
train = shuffled_dataset.select(range(train_end)).map(truncate)
val = shuffled_dataset.select(range(train_end, val_end)).map(truncate)
test = shuffled_dataset.select(range(val_end, total_size)).map(truncate)

# Print the sizes of the splits:
print(f"Train size: {len(train)}, Validation size: {len(val)}, Test size: {len(test)}")

dataset_dict = DatasetDict({
    "train": train,
    "val": val,
    "test": test
})

Train size: 160, Validation size: 20, Test size: 20


In [44]:
print(shuffled_dataset)

print(dataset_dict)

Dataset({
    features: ['text', 'label'],
    num_rows: 200
})
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 160
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 20
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 20
    })
})


In [None]:
# Model 1: Distil BERT cased
# cased models: they treat words like "Word" and "word" as separate tokens

tokenizer_distilbert = AutoTokenizer.from_pretrained("distilbert/distilbert-base-cased")

def tokenize_function_distilbert(examples):
    return tokenizer_distilbert(examples["text"], padding=True, truncation=True)

small_tokenized_dataset_distilbert = dataset_dict.map(tokenize_function_distilbert, batched=True, batch_size=16)
data_collator_distilbert = DataCollatorWithPadding(tokenizer=tokenizer_distilbert)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [None]:
print(small_tokenized_dataset_distilbert)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 160
    })
    val: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 20
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 20
    })
})


In [45]:
# Model 2: RoBERTa base
tokenizer_roberta = AutoTokenizer.from_pretrained("roberta-base")

def tokenize_function_roberta(examples):
    return tokenizer_roberta(examples["text"], padding=True, truncation=True)

# Apply the tokenize function to the dataset
small_tokenized_dataset_roberta = dataset_dict.map(tokenize_function_roberta, batched=True, batch_size=16)

# Create a data collator with padding
data_collator_roberta = DataCollatorWithPadding(tokenizer=tokenizer_roberta)

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [46]:
# Mounting Google Drive to store the checkpoints in Google Drive instead of my runtime:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [47]:
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification

In [None]:
# Training the Distil BERT cased model:

set_seed(24)

model_distilbert = AutoModelForSequenceClassification.from_pretrained('distilbert/distilbert-base-cased', num_labels=2) # 2 labels: depression/no depression
accuracy = evaluate.load("accuracy")

arguments_distilbert = TrainingArguments(
    output_dir="/content/drive/MyDrive/comp_ling_project_model_distilbert",
    per_device_train_batch_size=16, # adapt
    per_device_eval_batch_size=16, # adapt
    logging_steps=10, # because 8 times 16 is 128 - adapt for my project!!!
    num_train_epochs=5, # adapt
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5, # adapt
    weight_decay=0.01, # adapt
    load_best_model_at_end=True,
    report_to='none',
    seed=224
)

def compute_metrics(eval_pred):
    """Called at the end of validation. Gives accuracy"""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # calculates the accuracy
    return accuracy.compute(predictions=predictions, references=labels)


trainer_distilbert = Trainer(
    model=model_distilbert, # adapt for roberta later
    args=arguments_distilbert, # adapt for roberta later
    train_dataset=small_tokenized_dataset_distilbert['train'], # adapt for roberta later
    eval_dataset=small_tokenized_dataset_distilbert['val'], # change to test when you do your final evaluation! # adapt for roberta later
    processing_class=tokenizer_distilbert, # adapt for roberta later
    data_collator=data_collator_distilbert, # adapt for roberta later
    compute_metrics=compute_metrics
)

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
trainer_distilbert.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3762,0.138107,1.0
2,0.0702,0.020795,1.0
3,0.0166,0.008385,1.0
4,0.0084,0.005747,1.0
5,0.0067,0.005174,1.0


TrainOutput(global_step=50, training_loss=0.09560806900262833, metrics={'train_runtime': 779.9196, 'train_samples_per_second': 1.026, 'train_steps_per_second': 0.064, 'total_flos': 33626021227584.0, 'train_loss': 0.09560806900262833, 'epoch': 5.0})

In [50]:
# Training the RoBERTa base model:

set_seed(24)

model_roberta = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=2) # 2 labels: depression/no depression
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

arguments_roberta = TrainingArguments(
    output_dir="/content/drive/MyDrive/comp_ling_project_model_roberta3",
    per_device_train_batch_size=16, # adapt
    per_device_eval_batch_size=16, # adapt
    logging_steps=8, # because 8 times 16 is 128 - adapt for my project!!!
    num_train_epochs=5, # adapt
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5, # adapt
    weight_decay=0.01, # adapt
    load_best_model_at_end=True,
    report_to='none',
    seed=224
)

def compute_metrics(eval_pred):
    """Called at the end of validation."""
    logits, labels = eval_pred
    accuracy_metric = accuracy.compute(predictions=predictions, references=labels)
    precision_metric = precision.compute(predictions=predictions, references=labels)
    recall_metric = recall.compute(predictions=predictions, references=labels)
    f1_metric = f1.compute(predictions=predictions, references=labels)

    # Return the results
    return {
        "accuracy": accuracy_metric["accuracy"],
        "precision": precision_metric["precision"],
        "recall": recall_metric["recall"],
        "f1": f1_metric["f1"]
    }


trainer_roberta = Trainer(
    model=model_roberta,
    args=arguments_roberta,
    train_dataset=small_tokenized_dataset_roberta['train'],
    eval_dataset=small_tokenized_dataset_roberta['val'], # change to test when you do your final evaluation!
    processing_class=tokenizer_roberta,
    data_collator=data_collator_roberta,
    compute_metrics=compute_metrics
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
# to check the label distribution:
print("Train label distribution:", Counter(train['label']))
print("Validation label distribution:", Counter(val['label']))
print("Test label distribution:", Counter(test['label']))

Train label distribution: Counter({1: 87, 0: 73})
Validation label distribution: Counter({1: 11, 0: 9})
Test label distribution: Counter({1: 11, 0: 9})


In [52]:
trainer_roberta.train()

Epoch,Training Loss,Validation Loss


ValueError: Mismatch in the number of predictions (3) and references (20)

In [24]:
test_results = trainer_roberta.evaluate(small_tokenized_dataset_roberta['test'])
print(test_results)

{'eval_loss': 0.2556311786174774, 'eval_accuracy': 0.9, 'eval_runtime': 8.2372, 'eval_samples_per_second': 2.428, 'eval_steps_per_second': 0.243, 'epoch': 5.0}


In [26]:
predictions = trainer_roberta.predict(small_tokenized_dataset_roberta['test'])
predicted_labels = np.argmax(predictions.predictions, axis=-1)
true_labels = predictions.label_ids

for i, (true, pred) in enumerate(zip(true_labels, predicted_labels)):
    if true != pred:
        # Get the corresponding text for the misclassified sample
        misclassified_text = small_tokenized_dataset_roberta['test'][i]['text']
        print(f"Sample {i}:")
        print(f"True label = {true}, Predicted label = {pred}")
        print(f"Text: {misclassified_text}")
        print("-" * 50)

Sample 0:
True label = 1, Predicted label = 0
Text: what do you guy think will this finally change my life buspasfar mg day escitalopram 0mg bupropion 0
--------------------------------------------------
Sample 13:
True label = 1, Predicted label = 0
Text: mai asher 9 lynnestactia the guy look depressed depression is real
--------------------------------------------------


In [None]:
# In the following code cells, I use Optuna to test hyperparameters. New needed imports:
import optuna
import torch

In [None]:
# Optuna hyperparameter tuning:

In [None]:
# Testing:

metric = evaluate.load("accuracy")
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/XXXXXXXXXXXXX")

model_inputs = tokenizer(small_tokenized_dataset['test']['text'], padding=True, truncation=True, return_tensors='pt')
outputs = fine_tuned_model(**model_inputs, output_hidden_states=True)

predictions = torch.argmax(outputs.logits, dim=-1)
accuracy = metric.compute(predictions=predictions, references=small_tokenized_dataset['test']['label'])
print(accuracy)

In [23]:
# Visualisations:

!pip install bertviz transformers
!pip install bertviz

In [None]:
import os
from torch.utils.tensorboard import SummaryWriter
import re
import tensorflow as tf
import tensorboard as tb  #this is important to be able to store embeddings in a format so that I can use tensorflow visualiation

In [None]:
# Minimum 2 visualisations -> of ONE model!
# (Every change in hyperparameters is a new model.)
# Make 2 visualisations of the FINAL model only (in 2 checkpoints/layers)
# For your project, you need to use a separate test set for the visualisations.

In [None]:
# ADAPT FOR MY OWN DATASET: (look at tutorial 5 for further steps and details)

# the code associates our evaluation step with movie review and label

# for project: load different layers from different epochs to see how it changes!

path = "results_vis" # creates directory, can change it to drive location
layer=0 # sets layer i want to start from
if not os.path.exists(path):
  os.mkdir(path)

while layer in range(len(outputs['hidden_states'])):
  if not os.path.exists(path+'/layer_' + str(layer)):
    os.mkdir(path+'/layer_' + str(layer))

  example = 0
  tensors = []
  labels = []

  while example in range(len(outputs['hidden_states'][layer])):
    sp_token_position = 0
    for token in model_inputs['input_ids'][example]:
      if token != 101:
        sp_token_position += 1
      else:
        tensor = outputs['hidden_states'][layer][example][sp_token_position]
        tensors.append(tensor)
        break

    label = [small_tokenized_dataset['val']['text'][example],str(small_tokenized_dataset['val']['label'][example])]
    labels.append(label)
    example +=1

  writer=SummaryWriter(path+'/layer_' + str(layer))
  writer.add_embedding(torch.stack(tensors), metadata=labels, metadata_header=['Text','Emotion'])

  layer+=1