<a href="https://colab.research.google.com/github/larajakl/Computational-Linguistics/blob/main/Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install accelerate --upgrade
!pip install optuna
!pip install optuna-integration[pytorch_lightning]



In [2]:
from datasets import load_dataset, DatasetDict
from transformers import DataCollatorWithPadding

from transformers import AutoTokenizer

from transformers import set_seed
from collections import Counter


In [3]:
dataset = load_dataset("mrjunos/depression-reddit-cleaned")

set_seed(24)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# Check distribution of labels in full dataset:

full_label_distribution = Counter(dataset['train']['label'])
print("Full dataset label distribution:", full_label_distribution)

Full dataset label distribution: Counter({0: 3900, 1: 3831})


In [5]:
# Truncation function and shuffling and splitting of dataset:
def truncate(example):
    return {
        'text': " ".join(example['text'].split()[:300]),
        'label': example['label']
    }

# Random examples for train, validation and test
# Limit the dataset to the first 200 entries, JUST FOR NOW (ADAPT THESE LINES LATER)
subset_dataset = dataset['train'].shuffle(seed=24).select(range(1250))
# Define the train/val/test split proportions:
train_ratio, val_ratio = 0.7, 0.15  # 70% train, 15% val, 15% test
# Shuffle the dataset once:
shuffled_dataset = subset_dataset.shuffle(seed=24)
# Compute the split indices:
total_size = len(shuffled_dataset)
train_end = int(train_ratio * total_size)
val_end = train_end + int(val_ratio * total_size)
# Create splits:
train = shuffled_dataset.select(range(train_end)).map(truncate)
val = shuffled_dataset.select(range(train_end, val_end)).map(truncate)
test = shuffled_dataset.select(range(val_end, total_size)).map(truncate)

# Print the sizes of the splits:
print(f"Train size: {len(train)}, Validation size: {len(val)}, Test size: {len(test)}")

dataset_dict = DatasetDict({
    "train": train,
    "val": val,
    "test": test
})

Train size: 875, Validation size: 187, Test size: 188


In [6]:
# To check structure of dataset:
print(shuffled_dataset)
print(dataset_dict)

Dataset({
    features: ['text', 'label'],
    num_rows: 1250
})
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 875
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 187
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 188
    })
})


In [7]:
# Model 1: Tokenizer DistilBERT cased
tokenizer_distilbert = AutoTokenizer.from_pretrained("distilbert/distilbert-base-cased")

def tokenize_function_distilbert(examples):
    return tokenizer_distilbert(examples["text"], padding=True, truncation=True)

small_tokenized_dataset_distilbert = dataset_dict.map(tokenize_function_distilbert, batched=True, batch_size=16)
data_collator_distilbert = DataCollatorWithPadding(tokenizer=tokenizer_distilbert)

Map:   0%|          | 0/188 [00:00<?, ? examples/s]

In [8]:
print(small_tokenized_dataset_distilbert)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 875
    })
    val: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 187
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 188
    })
})


In [9]:
# Model 2: Tokenizer RoBERTa base
tokenizer_roberta = AutoTokenizer.from_pretrained("roberta-base")

def tokenize_function_roberta(examples):
    return tokenizer_roberta(examples["text"], padding=True, truncation=True)

# Apply the tokenize function to the dataset
small_tokenized_dataset_roberta = dataset_dict.map(tokenize_function_roberta, batched=True, batch_size=16)

# Create a data collator with padding
data_collator_roberta = DataCollatorWithPadding(tokenizer=tokenizer_roberta)

Map:   0%|          | 0/188 [00:00<?, ? examples/s]

In [10]:
# Mounting Google Drive to store the checkpoints in Google Drive instead of my runtime:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification

In [12]:
# Training the Distil BERT cased model:

set_seed(24)

model_distilbert = AutoModelForSequenceClassification.from_pretrained('distilbert/distilbert-base-cased', num_labels=2) # 2 labels: depression/no depression
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

arguments_distilbert = TrainingArguments(
    output_dir="/content/drive/MyDrive/distilbert",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    logging_steps=10,
    num_train_epochs=5,
    eval_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="eval_recall", # so that the model that performs best in the recall score is loaded at the end
    learning_rate=3e-5,
    weight_decay=0.1,
    load_best_model_at_end=True,
    report_to='none',
    seed=224
)

def compute_metrics(eval_pred):
    """Called at the end of validation."""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Debugging: Print shapes and values
    print(f"Logits shape: {logits.shape}")
    print(f"Labels: {labels}")
    print(f"Predictions: {predictions}")

    # Scikit-learn for metric computation
    acc = accuracy_score(labels, predictions)
    prec = precision_score(labels, predictions, average="binary", zero_division=0)
    rec = recall_score(labels, predictions, average="binary", zero_division=0)
    f1 = f1_score(labels, predictions, average="binary", zero_division=0)

    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1
    }


trainer_distilbert = Trainer(
    model=model_distilbert,
    args=arguments_distilbert,
    train_dataset=small_tokenized_dataset_distilbert['train'],
    eval_dataset=small_tokenized_dataset_distilbert['val'],
    processing_class=tokenizer_distilbert,
    data_collator=data_collator_distilbert,
    compute_metrics=compute_metrics
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
trainer_distilbert.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1194,0.129073,0.957219,0.978261,0.9375,0.957447
2,0.0539,0.121382,0.957219,0.968085,0.947917,0.957895
3,0.0464,0.135013,0.962567,0.968421,0.958333,0.963351
4,0.0412,0.155069,0.973262,1.0,0.947917,0.973262
5,0.0174,0.167772,0.962567,0.978495,0.947917,0.962963


Logits shape: (187, 2)
Labels: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1
 1 1 1 1 0 0 0 0 1 0 1 1 0 0 0 1 0 1 1 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 0 1 1
 0 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 1 1 0 1 0
 0 1 0 1 0 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 1 0 1 1 1 1 1 0 1 0 1 0 0 0 1 1 1
 1 1]
Predictions: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1
 1 1 1 1 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 0 1 1
 0 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 0 1 1 0 1 0
 0 1 0 1 1 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 0 0 1 1 1 1 1 0 1 0 1 0 0 0 0 1 1
 1 1]
Logits shape: (187, 2)
Labels: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1
 1 1 1 1 0 0 

TrainOutput(global_step=275, training_loss=0.09883374588056044, metrics={'train_runtime': 170.2127, 'train_samples_per_second': 25.703, 'train_steps_per_second': 1.616, 'total_flos': 391548016613724.0, 'train_loss': 0.09883374588056044, 'epoch': 5.0})

In [None]:
# To see which model was actually the best one (which checkpoint):
print(trainer_distilbert.state.best_model_checkpoint)

/content/drive/MyDrive/distilbert/checkpoint-165


In [13]:
# Training the RoBERTa base model:

set_seed(24)

model_roberta = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=2) # 2 labels: depression/no depression
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

arguments_roberta = TrainingArguments(
    output_dir="/content/drive/MyDrive/roberta",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    logging_steps=10,
    num_train_epochs=5,
    eval_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="eval_recall",
    learning_rate=3e-5,
    weight_decay=0.02,
    load_best_model_at_end=True,
    report_to='none',
    seed=224
)

def compute_metrics(eval_pred):
    """Called at the end of validation."""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Debugging: Print shapes and values
    print(f"Logits shape: {logits.shape}")
    print(f"Labels: {labels}")
    print(f"Predictions: {predictions}")

    # Scikit-learn for metric computation
    acc = accuracy_score(labels, predictions)
    prec = precision_score(labels, predictions, average="binary", zero_division=0)
    rec = recall_score(labels, predictions, average="binary", zero_division=0)
    f1 = f1_score(labels, predictions, average="binary", zero_division=0)

    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1
    }


trainer_roberta = Trainer(
    model=model_roberta,
    args=arguments_roberta,
    train_dataset=small_tokenized_dataset_roberta['train'],
    eval_dataset=small_tokenized_dataset_roberta['val'],
    processing_class=tokenizer_roberta,
    data_collator=data_collator_roberta,
    compute_metrics=compute_metrics
)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# To check the label distribution:
print("Train label distribution:", Counter(train['label']))
print("Validation label distribution:", Counter(val['label']))
print("Test label distribution:", Counter(test['label']))

Train label distribution: Counter({0: 442, 1: 433})
Validation label distribution: Counter({1: 96, 0: 91})
Test label distribution: Counter({1: 98, 0: 90})


In [None]:
trainer_roberta.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1423,0.154774,0.962567,0.989011,0.9375,0.962567
2,0.0823,0.169628,0.962567,1.0,0.927083,0.962162
3,0.0841,0.245844,0.957219,0.968085,0.947917,0.957895
4,0.0401,0.239554,0.957219,0.968085,0.947917,0.957895
5,0.0363,0.235422,0.957219,0.968085,0.947917,0.957895


Logits shape: (187, 2)
Labels: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1
 1 1 1 1 0 0 0 0 1 0 1 1 0 0 0 1 0 1 1 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 0 1 1
 0 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 1 1 0 1 0
 0 1 0 1 0 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 1 0 1 1 1 1 1 0 1 0 1 0 0 0 1 1 1
 1 1]
Predictions: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 1 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1
 1 1 1 1 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 0 1 1
 0 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 1 1 0 1 0
 0 1 0 1 0 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 0 0 1 1 1 1 1 0 1 0 1 0 0 0 0 1 1
 1 1]
Logits shape: (187, 2)
Labels: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1
 1 1 1 1 0 0 

TrainOutput(global_step=275, training_loss=0.10543111869218674, metrics={'train_runtime': 332.8467, 'train_samples_per_second': 13.144, 'train_steps_per_second': 0.826, 'total_flos': 734661566553360.0, 'train_loss': 0.10543111869218674, 'epoch': 5.0})

In [None]:
# To see which model was actually the best one (which checkpoint):
print(trainer_roberta.state.best_model_checkpoint)

/content/drive/MyDrive/roberta/checkpoint-165


In [None]:
'''
This code cell was used to fine-tune the weight decay hyperparameter, and does not need to be run again.
'''
# Fine-tuning weight decay for distilbert:
import tempfile

# Temporary directory for saving models during training
temp_dir = tempfile.mkdtemp()

model_distilbert = AutoModelForSequenceClassification.from_pretrained('distilbert/distilbert-base-cased', num_labels=2) # 2 labels: depression/no depression

weight_decay_values = [0.001, 0.005, 0.01, 0.02, 0.1]
for wd in weight_decay_values:
    model_distilbert = AutoModelForSequenceClassification.from_pretrained('distilbert/distilbert-base-cased', num_labels=2) # 2 labels: depression/no depression

    arguments_distilbert = TrainingArguments(
    output_dir=temp_dir,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    logging_steps=10,
    num_train_epochs=5,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    weight_decay=wd,
    load_best_model_at_end=False,
    report_to='none',
    seed=224,
    )

    trainer = Trainer(
    model=model_distilbert,
    args=arguments_distilbert,
    train_dataset=small_tokenized_dataset_distilbert['train'],
    eval_dataset=small_tokenized_dataset_distilbert['val'],
    processing_class=tokenizer_distilbert,
    data_collator=data_collator_distilbert,
    compute_metrics=compute_metrics
    )

    trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1593,0.159394,0.946524,0.967391,0.927083,0.946809
2,0.0561,0.137504,0.957219,0.968085,0.947917,0.957895
3,0.0531,0.160694,0.962567,0.978495,0.947917,0.962963
4,0.0232,0.205225,0.962567,0.978495,0.947917,0.962963
5,0.0119,0.20435,0.962567,0.978495,0.947917,0.962963


Logits shape: (187, 2)
Labels: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1
 1 1 1 1 0 0 0 0 1 0 1 1 0 0 0 1 0 1 1 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 0 1 1
 0 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 1 1 0 1 0
 0 1 0 1 0 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 1 0 1 1 1 1 1 0 1 0 1 0 0 0 1 1 1
 1 1]
Predictions: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 1 1
 1 1 1 1 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 0 1 1
 0 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 1 0 1 0
 0 1 0 1 1 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 0 0 1 1 1 1 1 0 1 0 1 0 0 0 0 1 1
 1 1]
Logits shape: (187, 2)
Labels: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1
 1 1 1 1 0 0 

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1974,0.173542,0.957219,0.978261,0.9375,0.957447
2,0.0615,0.124575,0.962567,0.978495,0.947917,0.962963
3,0.0446,0.146596,0.962567,0.978495,0.947917,0.962963
4,0.0263,0.19579,0.962567,0.978495,0.947917,0.962963
5,0.0136,0.199333,0.957219,0.968085,0.947917,0.957895


Logits shape: (187, 2)
Labels: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1
 1 1 1 1 0 0 0 0 1 0 1 1 0 0 0 1 0 1 1 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 0 1 1
 0 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 1 1 0 1 0
 0 1 0 1 0 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 1 0 1 1 1 1 1 0 1 0 1 0 0 0 1 1 1
 1 1]
Predictions: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1
 1 1 1 1 0 0 0 0 1 0 1 0 0 0 0 1 0 1 1 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 0 1 1
 0 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 1 0 1 0
 0 1 0 1 1 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 0 0 1 1 1 1 1 0 1 0 1 0 0 0 0 1 1
 1 1]
Logits shape: (187, 2)
Labels: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1
 1 1 1 1 0 0 

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1974,0.173635,0.957219,0.978261,0.9375,0.957447
2,0.0619,0.125119,0.962567,0.978495,0.947917,0.962963
3,0.0446,0.147395,0.957219,0.968085,0.947917,0.957895
4,0.0265,0.196028,0.962567,0.978495,0.947917,0.962963
5,0.0135,0.199559,0.957219,0.968085,0.947917,0.957895


Logits shape: (187, 2)
Labels: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1
 1 1 1 1 0 0 0 0 1 0 1 1 0 0 0 1 0 1 1 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 0 1 1
 0 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 1 1 0 1 0
 0 1 0 1 0 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 1 0 1 1 1 1 1 0 1 0 1 0 0 0 1 1 1
 1 1]
Predictions: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1
 1 1 1 1 0 0 0 0 1 0 1 0 0 0 0 1 0 1 1 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 0 1 1
 0 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 1 0 1 0
 0 1 0 1 1 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 0 0 1 1 1 1 1 0 1 0 1 0 0 0 0 1 1
 1 1]
Logits shape: (187, 2)
Labels: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1
 1 1 1 1 0 0 

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1974,0.17359,0.957219,0.978261,0.9375,0.957447
2,0.0617,0.125061,0.962567,0.978495,0.947917,0.962963
3,0.0443,0.146963,0.962567,0.978495,0.947917,0.962963
4,0.0265,0.196332,0.962567,0.978495,0.947917,0.962963
5,0.0135,0.199962,0.957219,0.968085,0.947917,0.957895


Logits shape: (187, 2)
Labels: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1
 1 1 1 1 0 0 0 0 1 0 1 1 0 0 0 1 0 1 1 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 0 1 1
 0 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 1 1 0 1 0
 0 1 0 1 0 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 1 0 1 1 1 1 1 0 1 0 1 0 0 0 1 1 1
 1 1]
Predictions: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1
 1 1 1 1 0 0 0 0 1 0 1 0 0 0 0 1 0 1 1 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 0 1 1
 0 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 1 0 1 0
 0 1 0 1 1 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 0 0 1 1 1 1 1 0 1 0 1 0 0 0 0 1 1
 1 1]
Logits shape: (187, 2)
Labels: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1
 1 1 1 1 0 0 

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1975,0.173638,0.957219,0.978261,0.9375,0.957447
2,0.0615,0.124508,0.962567,0.978495,0.947917,0.962963
3,0.0447,0.146848,0.957219,0.968085,0.947917,0.957895
4,0.0259,0.195251,0.962567,0.978495,0.947917,0.962963
5,0.0136,0.199201,0.957219,0.968085,0.947917,0.957895


Logits shape: (187, 2)
Labels: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1
 1 1 1 1 0 0 0 0 1 0 1 1 0 0 0 1 0 1 1 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 0 1 1
 0 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 1 1 0 1 0
 0 1 0 1 0 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 1 0 1 1 1 1 1 0 1 0 1 0 0 0 1 1 1
 1 1]
Predictions: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1
 1 1 1 1 0 0 0 0 1 0 1 0 0 0 0 1 0 1 1 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 0 1 1
 0 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 1 0 1 0
 0 1 0 1 1 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 0 0 1 1 1 1 1 0 1 0 1 0 0 0 0 1 1
 1 1]
Logits shape: (187, 2)
Labels: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1
 1 1 1 1 0 0 

In [None]:
'''
This code cell was used to fine-tune the weight decay hyperparameter, and does not need to be run again.
'''
# Fine-tuning weight decay RoBERTa:

model_roberta = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=2) # 2 labels: depression/no depression

weight_decay_values = [0.001, 0.005, 0.01, 0.02, 0.1]
for wd in weight_decay_values:
    model_roberta = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=2) # 2 labels: depression/no depression

    arguments_roberta = TrainingArguments(
    output_dir=temp_dir,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    logging_steps=10,
    num_train_epochs=5,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    weight_decay=wd,
    load_best_model_at_end=False,
    report_to='none',
    seed=224
    )

    trainer = Trainer(
    model=model_roberta,
    args=arguments_roberta,
    train_dataset=small_tokenized_dataset_roberta['train'],
    eval_dataset=small_tokenized_dataset_roberta['val'],
    processing_class=tokenizer_roberta,
    data_collator=data_collator_roberta,
    compute_metrics=compute_metrics
    )

    trainer.train()

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.124,0.147685,0.957219,0.978261,0.9375,0.957447
2,0.1024,0.21876,0.957219,0.978261,0.9375,0.957447
3,0.0889,0.149748,0.962567,0.989011,0.9375,0.962567
4,0.0468,0.210094,0.962567,1.0,0.927083,0.962162
5,0.0703,0.171725,0.967914,0.98913,0.947917,0.968085


Logits shape: (187, 2)
Labels: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1
 1 1 1 1 0 0 0 0 1 0 1 1 0 0 0 1 0 1 1 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 0 1 1
 0 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 1 1 0 1 0
 0 1 0 1 0 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 1 0 1 1 1 1 1 0 1 0 1 0 0 0 1 1 1
 1 1]
Predictions: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 1
 1 1 1 1 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 0 1 1
 0 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 1 1 0 1 0
 0 1 0 1 1 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 0 0 1 1 1 1 1 0 1 0 1 0 0 0 0 1 1
 1 1]
Logits shape: (187, 2)
Labels: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1
 1 1 1 1 0 0 

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1213,0.170164,0.941176,0.947368,0.9375,0.942408
2,0.0655,0.148758,0.973262,1.0,0.947917,0.973262
3,0.1048,0.141748,0.962567,0.978495,0.947917,0.962963
4,0.0493,0.207006,0.967914,0.98913,0.947917,0.968085
5,0.0308,0.216965,0.957219,0.968085,0.947917,0.957895


Logits shape: (187, 2)
Labels: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1
 1 1 1 1 0 0 0 0 1 0 1 1 0 0 0 1 0 1 1 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 0 1 1
 0 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 1 1 0 1 0
 0 1 0 1 0 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 1 0 1 1 1 1 1 0 1 0 1 0 0 0 1 1 1
 1 1]
Predictions: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 0 1 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 1 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 1
 1 1 1 1 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 0 1 1
 0 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 0 1 1 0 1 0
 0 1 0 1 1 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 0 0 1 1 1 1 1 0 1 0 1 0 0 0 0 1 1
 1 1]
Logits shape: (187, 2)
Labels: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1
 1 1 1 1 0 0 

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1471,0.134157,0.962567,0.989011,0.9375,0.962567
2,0.1314,0.215511,0.957219,0.978261,0.9375,0.957447
3,0.0822,0.184043,0.962567,1.0,0.927083,0.962162
4,0.0358,0.243897,0.962567,0.978495,0.947917,0.962963
5,0.0066,0.254545,0.962567,0.978495,0.947917,0.962963


Logits shape: (187, 2)
Labels: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1
 1 1 1 1 0 0 0 0 1 0 1 1 0 0 0 1 0 1 1 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 0 1 1
 0 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 1 1 0 1 0
 0 1 0 1 0 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 1 0 1 1 1 1 1 0 1 0 1 0 0 0 1 1 1
 1 1]
Predictions: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 1 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1
 1 1 1 1 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 0 1 1
 0 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 1 1 0 1 0
 0 1 0 1 0 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 0 0 1 1 1 1 1 0 1 0 1 0 0 0 0 1 1
 1 1]
Logits shape: (187, 2)
Labels: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1
 1 1 1 1 0 0 

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1722,0.130436,0.967914,1.0,0.9375,0.967742
2,0.1613,0.180014,0.962567,0.989011,0.9375,0.962567
3,0.1089,0.138797,0.967914,0.978723,0.958333,0.968421
4,0.0375,0.174659,0.967914,0.98913,0.947917,0.968085
5,0.0379,0.213773,0.957219,0.968085,0.947917,0.957895


Logits shape: (187, 2)
Labels: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1
 1 1 1 1 0 0 0 0 1 0 1 1 0 0 0 1 0 1 1 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 0 1 1
 0 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 1 1 0 1 0
 0 1 0 1 0 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 1 0 1 1 1 1 1 0 1 0 1 0 0 0 1 1 1
 1 1]
Predictions: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1
 1 1 1 1 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 0 1 1
 0 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 1 1 0 1 0
 0 1 0 1 0 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 0 0 1 1 1 1 1 0 1 0 1 0 0 0 0 1 1
 1 1]
Logits shape: (187, 2)
Labels: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1
 1 1 1 1 0 0 

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1786,0.141853,0.967914,1.0,0.9375,0.967742
2,0.0516,0.207029,0.962567,0.989011,0.9375,0.962567
3,0.0913,0.230908,0.946524,1.0,0.895833,0.945055
4,0.066,0.190382,0.957219,0.978261,0.9375,0.957447
5,0.0346,0.223656,0.962567,0.978495,0.947917,0.962963


Logits shape: (187, 2)
Labels: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1
 1 1 1 1 0 0 0 0 1 0 1 1 0 0 0 1 0 1 1 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 0 1 1
 0 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 1 1 0 1 0
 0 1 0 1 0 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 1 0 1 1 1 1 1 0 1 0 1 0 0 0 1 1 1
 1 1]
Predictions: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1
 1 1 1 1 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 0 1 1
 0 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 1 1 0 1 0
 0 1 0 1 0 1 0 1 1 1 1 0 1 0 1 1 0 1 0 1 0 0 1 1 1 1 1 0 1 0 1 0 0 0 0 1 1
 1 1]
Logits shape: (187, 2)
Labels: [0 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1
 1 1 1 1 0 0 

In [14]:
import os
import torch

In [15]:
# Testing the final DistilBERT model saved in Drive with Trainer.evaluate() (less memory intensive):

fine_tuned_model_distilbert = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/distilbert/checkpoint-165")

# Disable W&B logging
os.environ["WANDB_DISABLED"] = "true"

# Create a new Trainer instance for evaluation
new_trainer = Trainer(
    model=fine_tuned_model_distilbert,
    eval_dataset=small_tokenized_dataset_distilbert["test"],
    compute_metrics=compute_metrics,
)

# Evaluate the model
test_results = new_trainer.evaluate()
print(test_results)

# Get predictions and labels
predictions, labels, _ = new_trainer.predict(small_tokenized_dataset_distilbert["test"])

# Get the predicted class (most probable class)
predicted_classes = torch.argmax(torch.tensor(predictions), dim=-1)

# Find wrongly classified samples
wrongly_classified = []
for i in range(len(predicted_classes)):
    if predicted_classes[i] != labels[i]:
        wrongly_classified.append({
            "text": small_tokenized_dataset_distilbert["test"][i]["text"],
            "true_label": labels[i],
            "predicted_label": predicted_classes[i]
        })

# Output the wrongly classified samples
for example in wrongly_classified:
    print(f"Text: {example['text']}")
    print(f"True Label: {example['true_label']}")
    print(f"Predicted Label: {example['predicted_label']}")
    print("-" * 100)

# Evaluate the model on the entire test set
test_results = new_trainer.evaluate()
print(test_results)

# This gives me results for best DistilBERT model from Drive.

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Logits shape: (188, 2)
Labels: [1 1 0 0 0 1 0 1 0 0 1 1 0 0 0 1 1 0 1 0 1 0 1 1 1 0 1 1 0 0 0 1 0 1 1 0 1
 1 1 1 1 0 1 0 0 0 1 0 0 0 0 1 0 0 1 1 0 1 0 0 0 1 1 1 1 1 0 1 1 0 1 0 0 0
 0 0 1 0 0 1 1 1 0 0 0 0 1 0 1 0 1 1 0 0 1 1 1 1 1 0 1 0 0 0 1 1 1 1 1 1 0
 1 0 0 0 0 1 1 0 1 0 1 0 1 0 1 1 1 0 1 1 1 1 0 0 0 1 1 1 0 1 0 0 0 0 1 0 0
 1 1 1 1 1 1 1 0 0 0 1 1 1 0 1 1 0 0 1 1 0 0 0 1 1 1 1 0 0 1 1 1 1 0 1 0 0
 0 0 0]
Predictions: [1 0 0 0 0 1 0 1 0 0 1 1 0 0 0 1 1 0 1 0 1 0 1 1 1 0 1 0 0 0 0 1 0 1 1 0 1
 1 1 1 1 0 1 0 0 0 1 0 0 0 0 1 0 0 1 1 0 1 0 0 0 1 1 1 0 1 0 1 1 0 1 0 0 0
 0 0 1 0 0 1 1 1 0 0 0 0 1 0 1 0 1 1 0 0 1 1 1 1 1 0 1 0 0 0 1 1 1 1 1 1 0
 1 0 0 0 0 1 1 0 1 0 1 0 1 0 1 1 1 0 1 1 1 1 0 0 0 1 1 1 0 1 0 0 0 0 1 1 0
 1 1 1 1 1 1 1 0 0 0 1 1 1 0 0 1 0 0 1 1 0 0 0 1 1 1 1 0 0 1 1 1 1 0 1 0 0
 0 0 0]
{'eval_loss': 0.0935538038611412, 'eval_model_preparation_time': 0.0015, 'eval_accuracy': 0.973404255319149, 'eval_precision': 0.9894736842105263, 'eval_recall': 0.9591836734693877, 'eval_f1

Logits shape: (188, 2)
Labels: [1 1 0 0 0 1 0 1 0 0 1 1 0 0 0 1 1 0 1 0 1 0 1 1 1 0 1 1 0 0 0 1 0 1 1 0 1
 1 1 1 1 0 1 0 0 0 1 0 0 0 0 1 0 0 1 1 0 1 0 0 0 1 1 1 1 1 0 1 1 0 1 0 0 0
 0 0 1 0 0 1 1 1 0 0 0 0 1 0 1 0 1 1 0 0 1 1 1 1 1 0 1 0 0 0 1 1 1 1 1 1 0
 1 0 0 0 0 1 1 0 1 0 1 0 1 0 1 1 1 0 1 1 1 1 0 0 0 1 1 1 0 1 0 0 0 0 1 0 0
 1 1 1 1 1 1 1 0 0 0 1 1 1 0 1 1 0 0 1 1 0 0 0 1 1 1 1 0 0 1 1 1 1 0 1 0 0
 0 0 0]
Predictions: [1 0 0 0 0 1 0 1 0 0 1 1 0 0 0 1 1 0 1 0 1 0 1 1 1 0 1 0 0 0 0 1 0 1 1 0 1
 1 1 1 1 0 1 0 0 0 1 0 0 0 0 1 0 0 1 1 0 1 0 0 0 1 1 1 0 1 0 1 1 0 1 0 0 0
 0 0 1 0 0 1 1 1 0 0 0 0 1 0 1 0 1 1 0 0 1 1 1 1 1 0 1 0 0 0 1 1 1 1 1 1 0
 1 0 0 0 0 1 1 0 1 0 1 0 1 0 1 1 1 0 1 1 1 1 0 0 0 1 1 1 0 1 0 0 0 0 1 1 0
 1 1 1 1 1 1 1 0 0 0 1 1 1 0 0 1 0 0 1 1 0 0 0 1 1 1 1 0 0 1 1 1 1 0 1 0 0
 0 0 0]
{'eval_loss': 0.0935538038611412, 'eval_model_preparation_time': 0.0015, 'eval_accuracy': 0.973404255319149, 'eval_precision': 0.9894736842105263, 'eval_recall': 0.9591836734693877, 'eval_f1

In [16]:
# Testing the final RoBERTa model saved in Drive with Trainer.evaluate() (less memory intensive):

fine_tuned_model_roberta = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/roberta/checkpoint-165")

# Disable W&B logging
os.environ["WANDB_DISABLED"] = "true"

# Create a new Trainer instance for evaluation
new_trainer = Trainer(
    model=fine_tuned_model_roberta,
    eval_dataset=small_tokenized_dataset_roberta["test"],
    compute_metrics=compute_metrics,
)

# Evaluate the model
test_results = new_trainer.evaluate()
print(test_results)

# Get predictions and labels
predictions, labels, _ = new_trainer.predict(small_tokenized_dataset_roberta["test"])

# Get the predicted class (most probable class)
predicted_classes = torch.argmax(torch.tensor(predictions), dim=-1)

# Find wrongly classified samples
wrongly_classified = []
for i in range(len(predicted_classes)):
    if predicted_classes[i] != labels[i]:
        wrongly_classified.append({
            "text": small_tokenized_dataset_roberta["test"][i]["text"],
            "true_label": labels[i],
            "predicted_label": predicted_classes[i]
        })

# Output the wrongly classified samples
for example in wrongly_classified:
    print(f"Text: {example['text']}")
    print(f"True Label: {example['true_label']}")
    print(f"Predicted Label: {example['predicted_label']}")
    print("-" * 100)

# Evaluate the model on the entire test set
test_results = new_trainer.evaluate()
print(test_results)

# This gives me results for best RoBERTa model from Drive.

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Logits shape: (188, 2)
Labels: [1 1 0 0 0 1 0 1 0 0 1 1 0 0 0 1 1 0 1 0 1 0 1 1 1 0 1 1 0 0 0 1 0 1 1 0 1
 1 1 1 1 0 1 0 0 0 1 0 0 0 0 1 0 0 1 1 0 1 0 0 0 1 1 1 1 1 0 1 1 0 1 0 0 0
 0 0 1 0 0 1 1 1 0 0 0 0 1 0 1 0 1 1 0 0 1 1 1 1 1 0 1 0 0 0 1 1 1 1 1 1 0
 1 0 0 0 0 1 1 0 1 0 1 0 1 0 1 1 1 0 1 1 1 1 0 0 0 1 1 1 0 1 0 0 0 0 1 0 0
 1 1 1 1 1 1 1 0 0 0 1 1 1 0 1 1 0 0 1 1 0 0 0 1 1 1 1 0 0 1 1 1 1 0 1 0 0
 0 0 0]
Predictions: [1 0 0 0 0 1 0 1 0 0 1 1 0 0 0 1 1 0 1 0 1 0 1 1 1 0 1 0 0 0 0 1 0 1 1 0 1
 1 1 1 1 0 1 0 0 0 1 0 0 0 0 1 0 0 1 1 0 1 0 0 0 1 1 1 1 1 0 1 1 0 1 0 0 0
 0 0 1 0 0 1 1 1 0 0 0 0 1 0 1 0 1 1 0 0 1 1 1 1 1 0 1 0 0 0 1 1 1 1 1 1 0
 1 0 0 0 0 1 1 0 1 0 1 0 1 0 1 1 1 0 1 1 1 1 0 0 0 1 1 1 0 1 0 0 0 0 1 1 0
 1 1 1 1 1 1 1 0 0 0 1 1 1 0 0 1 0 0 1 1 0 0 0 1 1 1 1 0 0 1 1 1 1 0 1 0 0
 0 0 0]
{'eval_loss': 0.1415235996246338, 'eval_model_preparation_time': 0.0031, 'eval_accuracy': 0.9787234042553191, 'eval_precision': 0.9895833333333334, 'eval_recall': 0.9693877551020408, 'eval_f

Logits shape: (188, 2)
Labels: [1 1 0 0 0 1 0 1 0 0 1 1 0 0 0 1 1 0 1 0 1 0 1 1 1 0 1 1 0 0 0 1 0 1 1 0 1
 1 1 1 1 0 1 0 0 0 1 0 0 0 0 1 0 0 1 1 0 1 0 0 0 1 1 1 1 1 0 1 1 0 1 0 0 0
 0 0 1 0 0 1 1 1 0 0 0 0 1 0 1 0 1 1 0 0 1 1 1 1 1 0 1 0 0 0 1 1 1 1 1 1 0
 1 0 0 0 0 1 1 0 1 0 1 0 1 0 1 1 1 0 1 1 1 1 0 0 0 1 1 1 0 1 0 0 0 0 1 0 0
 1 1 1 1 1 1 1 0 0 0 1 1 1 0 1 1 0 0 1 1 0 0 0 1 1 1 1 0 0 1 1 1 1 0 1 0 0
 0 0 0]
Predictions: [1 0 0 0 0 1 0 1 0 0 1 1 0 0 0 1 1 0 1 0 1 0 1 1 1 0 1 0 0 0 0 1 0 1 1 0 1
 1 1 1 1 0 1 0 0 0 1 0 0 0 0 1 0 0 1 1 0 1 0 0 0 1 1 1 1 1 0 1 1 0 1 0 0 0
 0 0 1 0 0 1 1 1 0 0 0 0 1 0 1 0 1 1 0 0 1 1 1 1 1 0 1 0 0 0 1 1 1 1 1 1 0
 1 0 0 0 0 1 1 0 1 0 1 0 1 0 1 1 1 0 1 1 1 1 0 0 0 1 1 1 0 1 0 0 0 0 1 1 0
 1 1 1 1 1 1 1 0 0 0 1 1 1 0 0 1 0 0 1 1 0 0 0 1 1 1 1 0 0 1 1 1 1 0 1 0 0
 0 0 0]
{'eval_loss': 0.1415235996246338, 'eval_model_preparation_time': 0.0031, 'eval_accuracy': 0.9787234042553191, 'eval_precision': 0.9895833333333334, 'eval_recall': 0.9693877551020408, 'eval_f

In [17]:
# Visualisations:

!pip install bertviz transformers
!pip install bertviz

Collecting bertviz
  Downloading bertviz-1.4.0-py3-none-any.whl.metadata (19 kB)
Collecting boto3 (from bertviz)
  Downloading boto3-1.35.98-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<1.36.0,>=1.35.98 (from boto3->bertviz)
  Downloading botocore-1.35.98-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3->bertviz)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3->bertviz)
  Downloading s3transfer-0.10.4-py3-none-any.whl.metadata (1.7 kB)
Downloading bertviz-1.4.0-py3-none-any.whl (157 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m157.6/157.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading boto3-1.35.98-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.2/139.2 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.35.98-py3-none-any.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [18]:
from torch.utils.tensorboard import SummaryWriter
import re
import tensorflow as tf
import tensorboard as tb  #this is important to be able to store embeddings in a format so that I can use tensorflow visualiation

In [None]:
'''
This is the code cell where I run into the RAM issue. (and the one below for RoBERTa should be similar)
'''
# The following code saves the hidden states of all samples in the test set of all layers of this DistilBERT checkpoint:

# Load model:
fine_tuned_model_distilbert = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/distilbert/checkpoint-165")

model_inputs = tokenizer_distilbert(small_tokenized_dataset_distilbert['test']['text'], padding=True, truncation=True, return_tensors='pt')
outputs = fine_tuned_model_distilbert(**model_inputs, output_hidden_states=True)

# Code for visualisation:
path = "results_vis_distilbert" # creates directory, can change it to drive location
layer=0 # sets layer i want to start from
if not os.path.exists(path):
  os.mkdir(path)

while layer in range(len(outputs['hidden_states'])):
  if not os.path.exists(path+'/layer_' + str(layer)):
    os.mkdir(path+'/layer_' + str(layer))

  example = 0
  tensors = []
  labels = []

  while example in range(len(outputs['hidden_states'][layer])):
    sp_token_position = 0
    for token in model_inputs['input_ids'][example]:
      if token != tokenizer_distilbert.pad_token_id and token != tokenizer_distilbert.cls_token_id: # changed this code here a bit
        sp_token_position += 1
      else:
        tensor = outputs['hidden_states'][layer][example][sp_token_position]
        tensors.append(tensor)
        break

    label = [small_tokenized_dataset_distilbert['test']['text'][example],str(small_tokenized_dataset_distilbert['test']['label'][example])]
    labels.append(label)
    example +=1

  writer=SummaryWriter(path+'/layer_' + str(layer))
  writer.add_embedding(torch.stack(tensors), metadata=labels, metadata_header=['text','label']) # changed "Text" to "text" and "Emotion" to "label"

  layer+=1

In [None]:
# The following code saves the hidden states of all samples in the test set of all layers of this RoBERTa checkpoint:

# Load model:
fine_tuned_model_roberta = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/roberta/checkpoint-165")

model_inputs = tokenizer_roberta(small_tokenized_dataset_distilbert['test']['text'], padding=True, truncation=True, return_tensors='pt')
outputs = fine_tuned_model_roberta(**model_inputs, output_hidden_states=True)

# Code for visualisation:
path = "results_vis_roberta" # creates directory, can change it to drive location
layer=0 # sets layer i want to start from
if not os.path.exists(path):
  os.mkdir(path)

while layer in range(len(outputs['hidden_states'])):
  if not os.path.exists(path+'/layer_' + str(layer)):
    os.mkdir(path+'/layer_' + str(layer))

  example = 0
  tensors = []
  labels = []

  while example in range(len(outputs['hidden_states'][layer])):
    sp_token_position = 0
    for token in model_inputs['input_ids'][example]:
      if token != tokenizer_roberta.pad_token_id and token != tokenizer_roberta.cls_token_id: # changed this code here a bit
        sp_token_position += 1
      else:
        tensor = outputs['hidden_states'][layer][example][sp_token_position]
        tensors.append(tensor)
        break

    label = [small_tokenized_dataset_roberta['test']['text'][example],str(small_tokenized_dataset_roberta['test']['label'][example])]
    labels.append(label)
    example +=1

  writer=SummaryWriter(path+'/layer_' + str(layer))
  writer.add_embedding(torch.stack(tensors), metadata=labels, metadata_header=['text','label']) # changed "Text" to "text" and "Emotion" to "label"

  layer+=1