<a href="https://colab.research.google.com/github/kaaath-i/emotion-classification-bert-models/blob/main/Final_Project_ZEILNHOFER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**VU Introduction to Computational Linguistics**

Katharina Zeilnhofer

## **Final Project**


**Task**: Sentiment Analysis - Emotion Classification

*Dataset* : https://huggingface.co/datasets/mteb/emotion

*Model 1* : https://huggingface.co/distilbert/distilbert-base-cased

*Model 2* : https://huggingface.co/FacebookAI/roberta-base

In [None]:
# Install the third-party packages this notebook relies on (shell installs, Colab style).
!pip install transformers
!pip install datasets
!pip install evaluate
# Note: `transformers` is listed a second time on this line, next to bertviz.
!pip install bertviz transformers
# Upgrade accelerate to the latest available release.
!pip install accelerate --upgrade

### Tokenizer

In [None]:
from transformers import AutoTokenizer

# Tokenizer for model 1 (distilbert-base-cased); printed to inspect its settings.
tokenizer1 = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="distilbert/distilbert-base-cased"
)
print(tokenizer1)

# Tokenizer for model 2 (roberta-base); printed to inspect its settings.
tokenizer2 = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="FacebookAI/roberta-base"
)
print(tokenizer2)

### Models

In [None]:
from transformers import AutoModelForSequenceClassification

# Model 1: distilbert-base-cased with a 6-label sequence-classification head.
model1 = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='distilbert/distilbert-base-cased',
    num_labels=6,
)

# Model 2: roberta-base with a 6-label sequence-classification head.
model2 = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='FacebookAI/roberta-base',
    num_labels=6,
)

### Dataset

In [None]:
from datasets import load_dataset, DatasetDict

# Load every split of the emotion dataset from the Hugging Face Hub.
ds = load_dataset(path="mteb/emotion")

def truncate(example):
    """Return a copy of ``example`` with the whitespace in ``text`` normalised.

    Despite its name this function does not shorten anything: ``text`` is
    split on whitespace and re-joined with single spaces, which strips
    leading/trailing whitespace and collapses internal runs. ``label`` and
    ``label_text`` are copied over unchanged; no other keys are returned.
    """
    normalised_text = " ".join(example['text'].split())
    return dict(
        text=normalised_text,
        label=example['label'],
        label_text=example['label_text'],
    )

# (source split in `ds`, number of examples kept) for each split of the small dataset.
_small_split_plan = {
    "train": ("train", 128),
    "val": ("validation", 32),
    "test": ("test", 32),
}

# Shuffle each source split with a fixed seed, keep the first N rows and
# normalise the text with `truncate`.
small_ds = DatasetDict({
    split_name: ds[source_split].shuffle(seed=24).select(range(n_rows)).map(truncate)
    for split_name, (source_split, n_rows) in _small_split_plan.items()
})

In [None]:
# Show the subsampled DatasetDict as the cell output.
small_ds

In [None]:
# Print the first ten training rows (slicing a split returns a dict of columns).
print(small_ds["train"][:10])

### Training #1: distilbert-base-cased

Recap of the tokenizer and model defined above:

```
tokenizer1 = AutoTokenizer.from_pretrained("distilbert/distilbert-base-cased")

model1 = AutoModelForSequenceClassification.from_pretrained('distilbert/distilbert-base-cased', num_labels=6)
```



In [None]:
from transformers import DataCollatorWithPadding

def tokenize_function1(examples):
    """Tokenize the ``text`` column of a batch with ``tokenizer1``.

    Padding and truncation are both enabled, so every sequence in the
    batch handed to this function comes back with the same length.
    """
    batch_texts = examples["text"]
    encoded_batch = tokenizer1(batch_texts, padding=True, truncation=True)
    return encoded_batch

# Tokenize every split of the small dataset, 16 examples per call.
small_tokenized_ds = small_ds.map(
    tokenize_function1,
    batched=True,
    batch_size=16,
)

# Collator that pads each training/evaluation batch with tokenizer1.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer1)

In [None]:
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification

# Accuracy metric object; compute_metrics calls its `.compute` method.
accuracy = evaluate.load("accuracy")

# Fine-tuning configuration for model 1, collected first and unpacked below.
_arguments1_config = dict(
    output_dir="sample_cl_trainer",
    seed=224,
    report_to='none',
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # logging, evaluation and checkpointing (evaluate and save once per epoch)
    logging_steps=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
arguments1 = TrainingArguments(**_arguments1_config)

def compute_metrics(eval_pred):
    """Compute accuracy for a ``(logits, labels)`` evaluation pair.

    The predicted class for each row is the index of its largest logit
    along the last axis; the result is whatever the module-level
    ``accuracy`` metric returns from ``compute``.
    """
    scores, gold_labels = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return accuracy.compute(
        predictions=predicted_classes,
        references=gold_labels,
    )

# Trainer for model 1: trains on the small train split, evaluates on the val split.
_trainer1_config = dict(
    model=model1,
    args=arguments1,
    # data
    train_dataset=small_tokenized_ds['train'],
    eval_dataset=small_tokenized_ds['val'],
    data_collator=data_collator,
    # tokenizer and metric hook
    processing_class=tokenizer1,
    compute_metrics=compute_metrics,
)
trainer1 = Trainer(**_trainer1_config)

In [None]:
# Fine-tune model 1; the returned training output is shown as the cell result.
trainer1.train()

Evaluation

In [None]:
# Reload the fine-tuned weights from disk. Prefer the checkpoint the Trainer
# recorded as best (arguments1 sets load_best_model_at_end=True), so this model
# matches the one trainer1 evaluates below. Fall back to the originally
# hard-coded checkpoint directory when no best checkpoint was recorded.
best_checkpoint1 = trainer1.state.best_model_checkpoint or "sample_cl_trainer/checkpoint-40"
fine_tuned_model1 = AutoModelForSequenceClassification.from_pretrained(best_checkpoint1)

# Evaluate the Trainer's own model on the held-out test split.
result1 = trainer1.evaluate(small_tokenized_ds['test'])
print(result1)

In [None]:
import evaluate
from transformers import AutoModelForSequenceClassification
import torch

metric = evaluate.load("accuracy")

# Tokenize the raw test texts as one padded batch of PyTorch tensors.
model_inputs = tokenizer1(list(small_tokenized_ds['test']['text']), padding=True, truncation=True, return_tensors='pt')

# Inference only: disable autograd so no graph is kept for the logits and
# the per-layer hidden states.
with torch.no_grad():
    outputs = fine_tuned_model1(**model_inputs, output_hidden_states=True)

# Predicted class = index of the largest logit per example.
predictions = torch.argmax(outputs.logits, dim=-1)
test_accuracy = metric.compute(predictions=predictions, references=small_tokenized_ds['test']['label'])
print(test_accuracy)

Visualization

In [None]:
from torch.utils.tensorboard import SummaryWriter
import re
import torch
import tensorflow as tf
import tensorboard as tb

In [None]:
# Reload the fine-tuned DistilBERT model: the checkpoint the Trainer recorded as
# best if there is one, otherwise the originally hard-coded checkpoint directory.
fine_tuned_model1 = AutoModelForSequenceClassification.from_pretrained(
    trainer1.state.best_model_checkpoint or "sample_cl_trainer/checkpoint-40"
)

# The visualisation uses the *validation* texts, tokenized as one padded batch.
model_inputs = tokenizer1(list(small_tokenized_ds['val']['text']), padding=True, truncation=True, return_tensors='pt')

# Inference only: no autograd graph is needed for the hidden states.
with torch.no_grad():
    outputs = fine_tuned_model1(**model_inputs, output_hidden_states=True)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only).
drive.mount('/content/drive')

In [None]:
import torch
import os

# Export the [CLS] hidden state of every validation example, layer by layer,
# in the TensorBoard embedding-projector format (one log directory per layer).
path = "results_vis"
os.makedirs(path, exist_ok=True)

# `outputs` / `model_inputs` were computed from the validation texts, so the
# projector metadata must come from the same split. (It was previously read
# from the test split, which attached the wrong text and label to each point.)
vis_split = small_tokenized_ds['val']
labels = [
    [text, str(label)]
    for text, label in zip(list(vis_split['text']), list(vis_split['label']))
]

# Locate the first [CLS] token in each input row once; the position does not
# depend on the layer. The tokenizer supplies the id instead of a literal 101.
cls_token_id = tokenizer1.cls_token_id
cls_positions = []
for example, token_ids in enumerate(model_inputs['input_ids']):
    hits = (token_ids == cls_token_id).nonzero(as_tuple=True)[0]
    if hits.numel() == 0:
        # Without this check embeddings and metadata would silently go out of sync.
        raise ValueError(f"No [CLS] token found in example {example}")
    cls_positions.append(int(hits[0]))

for layer, layer_states in enumerate(outputs['hidden_states']):
    layer_dir = path + '/layer_' + str(layer)
    os.makedirs(layer_dir, exist_ok=True)

    # One vector per example: the hidden state at that example's [CLS] position.
    tensors = [
        layer_states[example][position]
        for example, position in enumerate(cls_positions)
    ]

    # The context manager closes the writer so the event file is flushed.
    with SummaryWriter(layer_dir) as writer:
        writer.add_embedding(torch.stack(tensors), metadata=labels, metadata_header=['Text','Emotion'])

### Training #2: roberta-base

Recap of the tokenizer and model defined above:


```
tokenizer2 = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")

model2 = AutoModelForSequenceClassification.from_pretrained('FacebookAI/roberta-base', num_labels=6)
```



In [None]:
def tokenize_function2(examples):
    """Tokenize the ``text`` column of a batch with ``tokenizer2``.

    Padding and truncation are both enabled, so every sequence in the
    batch handed to this function comes back with the same length.
    """
    batch_texts = examples["text"]
    encoded_batch = tokenizer2(batch_texts, padding=True, truncation=True)
    return encoded_batch

# Re-tokenize every split of the small dataset with tokenizer2, 16 examples per
# call. This rebinds `small_tokenized_ds` and `data_collator` from model 1.
small_tokenized_ds = small_ds.map(
    tokenize_function2,
    batched=True,
    batch_size=16,
)

# Collator that pads each training/evaluation batch with tokenizer2.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer2)

In [None]:
import evaluate
# Accuracy metric object; compute_metrics calls its `.compute` method.
accuracy = evaluate.load("accuracy")

# Fine-tuning configuration for model 2, collected first and unpacked below.
_arguments2_config = dict(
    output_dir="sample_cl_trainer2",
    seed=224,
    report_to='none',
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # logging, evaluation and checkpointing (evaluate and save once per epoch)
    logging_steps=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
arguments = TrainingArguments(**_arguments2_config)

def compute_metrics(eval_pred):
    """Compute accuracy for a ``(logits, labels)`` evaluation pair.

    The predicted class for each row is the index of its largest logit
    along the last axis; the result is whatever the module-level
    ``accuracy`` metric returns from ``compute``.
    """
    scores, gold_labels = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return accuracy.compute(
        predictions=predicted_classes,
        references=gold_labels,
    )

# Trainer for model 2 (roberta-base): trains on the small train split and
# evaluates on the val split.
trainer2 = Trainer(
    model=model2,
    args=arguments,
    train_dataset=small_tokenized_ds['train'],
    eval_dataset=small_tokenized_ds['val'],
    # Must be the RoBERTa tokenizer: the data was tokenized with tokenizer2 and
    # the collator pads with it. (tokenizer1, the DistilBERT tokenizer, was
    # passed here by mistake.)
    processing_class=tokenizer2,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
# Fine-tune model 2; the returned training output is shown as the cell result.
trainer2.train()

Evaluation

In [None]:
# Reload the fine-tuned weights from disk. Prefer the checkpoint the Trainer
# recorded as best (`arguments` sets load_best_model_at_end=True), so this model
# matches the one trainer2 evaluates below. Fall back to the originally
# hard-coded checkpoint directory when no best checkpoint was recorded.
best_checkpoint2 = trainer2.state.best_model_checkpoint or "sample_cl_trainer2/checkpoint-40"
fine_tuned_model2 = AutoModelForSequenceClassification.from_pretrained(best_checkpoint2)

# Evaluate the Trainer's own model on the held-out test split.
result2 = trainer2.evaluate(small_tokenized_ds['test'])
print(result2)

In [None]:
metric = evaluate.load("accuracy")

# Reload the fine-tuned RoBERTa model: the checkpoint the Trainer recorded as
# best if there is one, otherwise the originally hard-coded checkpoint directory.
fine_tuned_model2 = AutoModelForSequenceClassification.from_pretrained(
    trainer2.state.best_model_checkpoint or "sample_cl_trainer2/checkpoint-40"
)

# Tokenize the raw test texts as one padded batch of PyTorch tensors.
model_inputs = tokenizer2(list(small_tokenized_ds['test']['text']), padding=True, truncation=True, return_tensors='pt')

# Inference only: disable autograd so no graph is kept for the logits and
# the per-layer hidden states.
with torch.no_grad():
    outputs = fine_tuned_model2(**model_inputs, output_hidden_states=True)

# Predicted class = index of the largest logit per example.
predictions = torch.argmax(outputs.logits, dim=-1)

# Keep the result under its own name. Rebinding `accuracy` to this dict would
# replace the metric object that compute_metrics calls `.compute` on, breaking
# any later trainer evaluation in the same kernel.
test_accuracy2 = metric.compute(predictions=predictions, references=small_tokenized_ds['test']['label'])
print(test_accuracy2)

Visualization

In [None]:
# The visualisation uses the *validation* texts, tokenized as one padded batch.
model_inputs = tokenizer2(list(small_tokenized_ds['val']['text']), padding=True, truncation=True, return_tensors='pt')

# Inference only: no autograd graph is needed for the hidden states.
with torch.no_grad():
    outputs = fine_tuned_model2(**model_inputs, output_hidden_states=True)

In [None]:
# Export the first-token hidden state of every validation example, layer by
# layer, in the TensorBoard embedding-projector format (one log dir per layer).
# NOTE(review): this is the same "results_vis" directory the DistilBERT
# visualisation cell writes to, so the two runs share per-layer folders and the
# earlier embeddings are presumably overwritten. Consider a separate directory.
path = "results_vis"
os.makedirs(path, exist_ok=True)

# `outputs` was computed from the validation texts, so the projector metadata
# must come from the same split. (It was previously read from the test split,
# which attached the wrong text and label to each point.)
vis_split = small_tokenized_ds['val']
labels = [
    [text, str(label)]
    for text, label in zip(list(vis_split['text']), list(vis_split['label']))
]

for layer, layer_states in enumerate(outputs['hidden_states']):
    layer_dir = path + '/layer_' + str(layer)
    os.makedirs(layer_dir, exist_ok=True)

    # One vector per example: the hidden state at sequence position 0.
    tensors = [example_states[0] for example_states in layer_states]

    # The context manager closes the writer so the event file is flushed.
    with SummaryWriter(layer_dir) as writer:
        writer.add_embedding(torch.stack(tensors), metadata=labels, metadata_header=['Text','Emotion'])