<a href="https://colab.research.google.com/github/larajakl/Computational-Linguistics/blob/main/trial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install accelerate --upgrade
!pip install optuna
!pip install optuna-integration[pytorch_lightning]

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
from datasets import load_dataset, DatasetDict
from transformers import DataCollatorWithPadding

from transformers import AutoTokenizer

from transformers import set_seed
from collections import Counter


In [None]:
# Hub identifier of the binary hate-speech corpus used throughout the notebook
hate_speech_dataset_id = "christinacdl/binary_hate_speech"
dataset = load_dataset(hate_speech_dataset_id)

# Fix the random seed right after loading
set_seed(24)

In [None]:
# Count how often each label occurs in the full training split
full_label_distribution = Counter(dataset['train']['label'])
print(f"Full dataset label distribution: {full_label_distribution}")

Full dataset label distribution: Counter({'NOT_OFF_HATEFUL_TOXIC': 15530, 'OFF_HATEFUL_TOXIC': 15530})


In [None]:
# Integer class id for each string label of the dataset
# (add more entries here if additional labels appear)
label_mapping = dict(
    NOT_OFF_HATEFUL_TOXIC=0,
    OFF_HATEFUL_TOXIC=1,
)

In [None]:
def map_labels(example):
    """Replace the string label of one example with its integer class id.

    The example is modified in place and returned. Labels are looked up in the
    module-level ``label_mapping``. An example whose label already is one of the
    integer ids is returned unchanged, so re-running the mapping cell on an
    already converted dataset does not fail.

    Raises:
        KeyError: if the label is neither a known label name nor a known id.
    """
    label = example['label']
    # Already converted (e.g. the cell was executed twice): nothing to do.
    if label not in label_mapping and label in label_mapping.values():
        return example
    example['label'] = label_mapping[label]  # unknown labels still raise KeyError
    return example

# Convert the string labels of every split to integer ids
dataset = dataset.map(function=map_labels)

In [None]:
# Inspect one example of the training split
first_train_example = dataset['train'][0]
print(first_train_example)

{'text': "She won't be there for long.", 'label': 0}


In [None]:
# Keep only the first words of each text for speed on CPU
def truncate(example, max_words=300):
    """Return a copy of ``example`` whose text is cut to at most ``max_words`` words.

    Words are whitespace-separated chunks (not tokenizer tokens). The kept words
    are re-joined with single spaces, so runs of whitespace are collapsed even
    when the text is shorter than the limit.

    Args:
        example: mapping with a 'text' string and a 'label'.
        max_words: maximum number of words to keep (default 300).

    Returns:
        A new dict with the shortened 'text' and the unchanged 'label'.
    """
    words = example['text'].split()
    return {
        'text': " ".join(words[:max_words]),
        'label': example['label'],
    }

# Random examples for train, validation and test.
# Only a small random subset is used, JUST FOR NOW (ADAPT SUBSET_SIZE LATER).
SUBSET_SIZE = 1250
SPLIT_SEED = 24
# Never request more rows than the train split holds (indices must stay in range for select()).
subset_size = min(SUBSET_SIZE, len(dataset['train']))
subset_dataset = dataset['train'].shuffle(seed=SPLIT_SEED).select(range(subset_size))
# Define the train/val/test split proportions:
train_ratio, val_ratio = 0.7, 0.15  # 70% train, 15% val, remaining rows (about 15%) test
# Shuffle the subset once more; kept so the split membership stays what it was before:
shuffled_dataset = subset_dataset.shuffle(seed=SPLIT_SEED)
# Compute the split indices:
total_size = len(shuffled_dataset)
train_end = int(train_ratio * total_size)
val_end = train_end + int(val_ratio * total_size)
# Create the three contiguous, non-overlapping splits and shorten their texts:
train = shuffled_dataset.select(range(train_end)).map(truncate)
val = shuffled_dataset.select(range(train_end, val_end)).map(truncate)
test = shuffled_dataset.select(range(val_end, total_size)).map(truncate)

# Sanity check: every row of the subset ends up in exactly one split.
assert len(train) + len(val) + len(test) == total_size, "splits must cover the whole subset"

# Print the sizes of the splits:
print(f"Train size: {len(train)}, Validation size: {len(val)}, Test size: {len(test)}")

dataset_dict = DatasetDict({
    "train": train,
    "val": val,
    "test": test
})

Map:   0%|          | 0/875 [00:00<?, ? examples/s]

Map:   0%|          | 0/187 [00:00<?, ? examples/s]

Map:   0%|          | 0/188 [00:00<?, ? examples/s]

Train size: 875, Validation size: 187, Test size: 188


In [None]:
# Show the shuffled subset followed by the train/val/test dictionary
print(shuffled_dataset, dataset_dict, sep="\n")

Dataset({
    features: ['text', 'label'],
    num_rows: 1250
})
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 875
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 187
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 188
    })
})


In [None]:
# Model 1: DistilBERT (cased)
# A cased model keeps capitalisation: "Word" and "word" are treated as separate tokens
distilbert_checkpoint = "distilbert/distilbert-base-cased"
tokenizer_distilbert = AutoTokenizer.from_pretrained(distilbert_checkpoint)

def tokenize_function_distilbert(examples):
    """Tokenize the 'text' field of a batch of examples with the DistilBERT tokenizer.

    Over-long texts are truncated. No padding is applied here: the notebook
    builds a DataCollatorWithPadding for this tokenizer, which pads each batch
    when it is assembled, so padding at tokenization time would only store
    extra pad tokens in the dataset.
    """
    return tokenizer_distilbert(examples["text"], truncation=True)

# Collator that pads the batches it assembles, using the DistilBERT tokenizer
data_collator_distilbert = DataCollatorWithPadding(tokenizer=tokenizer_distilbert)

# Tokenize all splits, 16 examples per call of the tokenize function
small_tokenized_dataset_distilbert = dataset_dict.map(
    tokenize_function_distilbert,
    batched=True,
    batch_size=16,
)

Map:   0%|          | 0/875 [00:00<?, ? examples/s]

Map:   0%|          | 0/187 [00:00<?, ? examples/s]

Map:   0%|          | 0/188 [00:00<?, ? examples/s]

In [None]:
# Show the columns and row counts of the tokenized DistilBERT splits
print(f"{small_tokenized_dataset_distilbert}")

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 875
    })
    val: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 187
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 188
    })
})


In [None]:
# Model 2: RoBERTa base
roberta_checkpoint = "roberta-base"
tokenizer_roberta = AutoTokenizer.from_pretrained(roberta_checkpoint)

def tokenize_function_roberta(examples):
    """Tokenize the 'text' field of a batch of examples with the RoBERTa tokenizer.

    Over-long texts are truncated. No padding is applied here: the notebook
    builds a DataCollatorWithPadding for this tokenizer, which pads each batch
    when it is assembled, so padding at tokenization time would only store
    extra pad tokens in the dataset.
    """
    return tokenizer_roberta(examples["text"], truncation=True)

# Data collator that pads the batches it assembles, using the RoBERTa tokenizer
data_collator_roberta = DataCollatorWithPadding(tokenizer=tokenizer_roberta)

# Run the tokenize function over every split, 16 examples per call
small_tokenized_dataset_roberta = dataset_dict.map(
    tokenize_function_roberta,
    batched=True,
    batch_size=16,
)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/875 [00:00<?, ? examples/s]

Map:   0%|          | 0/187 [00:00<?, ? examples/s]

Map:   0%|          | 0/188 [00:00<?, ? examples/s]

In [None]:
# Mount Google Drive so that checkpoints can be stored there instead of in the runtime
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification

In [None]:
# Training the DistilBERT cased model:

set_seed(24)

# Two output classes: 0 = NOT_OFF_HATEFUL_TOXIC, 1 = OFF_HATEFUL_TOXIC
model_distilbert = AutoModelForSequenceClassification.from_pretrained('distilbert/distilbert-base-cased', num_labels=2)
# Metric objects from the `evaluate` library
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

arguments_distilbert = TrainingArguments(
    output_dir="/content/drive/MyDrive/distilbert_trial",  # checkpoints go to the mounted Drive
    per_device_train_batch_size=16, # adapt
    per_device_eval_batch_size=8, # adapt
    logging_steps=10,
    num_train_epochs=5, # adapt
    # Evaluate and save once per epoch; both strategies are set to the same value.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5, # adapt
    weight_decay=0.01, # adapt
    load_best_model_at_end=True,
    report_to='none',
    seed=24  # same value as set_seed(24) above; was 224, presumably a typo
)

def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 at the end of validation.

    Args:
        eval_pred: a pair ``(logits, labels)``; the predicted class of each
            example is the argmax over the last axis of ``logits``.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # scikit-learn metrics; average="binary" scores a single positive class and
    # zero_division=0 yields 0 when a score's denominator is zero.
    binary_kwargs = {"average": "binary", "zero_division": 0}
    return {
        "accuracy": accuracy_score(labels, predictions),
        "precision": precision_score(labels, predictions, **binary_kwargs),
        "recall": recall_score(labels, predictions, **binary_kwargs),
        "f1": f1_score(labels, predictions, **binary_kwargs),
    }


# Assemble the Trainer for the DistilBERT run
trainer_distilbert = Trainer(
    # model and hyper-parameters
    model=model_distilbert,
    args=arguments_distilbert,
    # tokenizer and batch collation
    processing_class=tokenizer_distilbert,
    data_collator=data_collator_distilbert,
    # data: evaluate on 'val' while tuning (change to test when you do your final evaluation!)
    train_dataset=small_tokenized_dataset_distilbert['train'],
    eval_dataset=small_tokenized_dataset_distilbert['val'],
    # metrics reported after each evaluation
    compute_metrics=compute_metrics,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Run the fine-tuning; the returned training summary is shown as the cell's output
distilbert_train_output = trainer_distilbert.train()
distilbert_train_output

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.569,0.62094,0.668449,0.688525,0.494118,0.575342
2,0.4264,0.645869,0.647059,0.646154,0.494118,0.56
3,0.2606,0.921802,0.647059,0.661017,0.458824,0.541667
4,0.1047,1.139668,0.657754,0.636364,0.576471,0.604938
5,0.0644,1.255847,0.641711,0.6125,0.576471,0.593939


Logits shape: (187, 2)
Labels: [0 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 1 1 1 0 1 0 0 1 1 1 0 1 1 0 0 1 1 1 1 1
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 1 1 1 0 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 1 0 0 1 0 1 1 0 1 1 0 1 0 0 1 0 1 0 0 0 1 0 0 0 0 1 1 1 1 0
 1 0 1 1 1 0 1 1 0 0 1 0 1 1 1 0 1 1 0 0 1 0 0 1 0 0 0 1 1 1 1 1 0 0 0 0 0
 1 0 0 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 0 1 0 0
 0 1]
Predictions: [0 0 1 0 1 0 0 1 0 0 1 0 1 0 0 0 0 1 1 1 0 1 1 0 0 0 0 0 1 0 0 0 0 1 0 1 1
 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 1 1 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0
 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 0 0 0 1 1 0 0 1 1 0 0 0 0 1 0 1 1 0
 1 0 1 1 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 1 1 1 1 0 1 1 0 0
 0 0 1 0 0 0 0 0 0 1 0 1 0 0 1 0 1 1 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 1
 1 0]
Logits shape: (187, 2)
Labels: [0 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 1 1 1 0 1 0 0 1 1 1 0 1 1 0 0 1 1 1 1 1
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 1 1 1 0 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0
 0 0 0 0 1 0 

TrainOutput(global_step=275, training_loss=0.3080614731528542, metrics={'train_runtime': 113.3857, 'train_samples_per_second': 38.585, 'train_steps_per_second': 2.425, 'total_flos': 242087275748496.0, 'train_loss': 0.3080614731528542, 'epoch': 5.0})