In [None]:
from google.colab import drive
import pandas as pd
from transformers import BertTokenizerFast
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
from transformers import BertTokenizerFast, BertForSequenceClassification

# Mount Google Drive at /content/drive so that files stored under
# /content/drive/MyDrive (such as the GoEmotions CSV read further down) are
# reachable from this Colab runtime.
drive.mount('/content/drive')

In [None]:
# Maps every emotion column name of the CSV to the integer class id used as
# the classification target.
emotion_to_label = {
    'admiration': 0, 'amusement': 1, 'anger': 2, 'annoyance': 3, 'approval': 4,
    'caring': 5, 'confusion': 6, 'curiosity': 7, 'desire': 8, 'disappointment': 9,
    'disapproval': 10, 'disgust': 11, 'embarrassment': 12, 'excitement': 13,
    'fear': 14, 'gratitude': 15, 'grief': 16, 'joy': 17, 'love': 18,
    'nervousness': 19, 'optimism': 20, 'pride': 21, 'realization': 22,
    'relief': 23, 'remorse': 24, 'sadness': 25, 'surprise': 26, 'neutral': 27
}
# Plain list (not a dict_keys view) so it is an unambiguous column indexer.
emotion_columns = list(emotion_to_label)

data = pd.read_csv('/content/drive/MyDrive/goemotions_1.csv')

# One indicator column per emotion.
emotion_flags = data[emotion_columns]

# Rows where no emotion is flagged have no meaningful argmax: `idxmax` would
# return the first column ('admiration') and silently mislabel them as class 0.
# Drop those rows instead of inventing a label for them.
has_emotion = (emotion_flags != 0).any(axis=1)

# NOTE: when several emotions are flagged on one row, `idxmax` keeps only the
# first flagged column (in `emotion_to_label` order); the rest are discarded.
emodata = data.loc[has_emotion, ['text']].copy()
emodata['label'] = emotion_flags.loc[has_emotion].idxmax(axis=1).map(emotion_to_label)
emodata = emodata[['text', 'label']]

In [None]:
# Balance the classes by capping each one at MAX_SAMPLES_PER_CLASS rows.
#
# Rows are drawn WITHOUT replacement: duplicating rows before the
# train/validation split would place copies of the same example on both sides
# of the split and inflate validation accuracy. If minority classes need
# oversampling, do it on the training split only, after splitting.
MAX_SAMPLES_PER_CLASS = 300

emodata = (
    emodata
    .sample(frac=1, random_state=42)   # seeded shuffle -> reproducible subset
    .groupby('label')
    .head(MAX_SAMPLES_PER_CLASS)       # first N shuffled rows of every class
    .reset_index(drop=True)            # flat index; 'label' stays a plain column
)
label_counts = emodata['label'].value_counts()

In [None]:
# Both classifiers get one output unit per emotion class.
n_emotion_classes = len(emotion_to_label)

# RoBERTa tokenizer + sequence-classification model.
roberta_checkpoint = 'roberta-base'
tokenizer = RobertaTokenizerFast.from_pretrained(roberta_checkpoint)
model = RobertaForSequenceClassification.from_pretrained(
    roberta_checkpoint,
    num_labels=n_emotion_classes,
)

# BERT tokenizer + sequence-classification model.
bert_checkpoint = 'bert-base-uncased'
bert_tokenizer = BertTokenizerFast.from_pretrained(bert_checkpoint)
bert_model = BertForSequenceClassification.from_pretrained(
    bert_checkpoint,
    num_labels=n_emotion_classes,
)

In [None]:
def tokenize_data(texts, labels, tokenizer, max_length=128):
    """Tokenize `texts` and return them together with `labels` as a tensor.

    Args:
        texts: Input passed straight to `tokenizer` as its first argument.
        labels: Class ids, either a Python sequence or an existing
            `torch.Tensor`.
        tokenizer: Callable invoked with `padding=True`, `truncation=True`,
            `max_length=max_length` and `return_tensors="pt"`.
        max_length: Truncation length forwarded to the tokenizer.

    Returns:
        A `(tokenized, label_tensor)` tuple where `tokenized` is whatever the
        tokenizer returned and `label_tensor` is a new tensor holding `labels`.
    """
    tokenized = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    if isinstance(labels, torch.Tensor):
        # `torch.tensor(existing_tensor)` emits a copy-construct UserWarning;
        # detach().clone() is the recommended way to take an independent copy.
        label_tensor = labels.detach().clone()
    else:
        label_tensor = torch.tensor(labels)
    return tokenized, label_tensor

class EmotionDataset(Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    `encodings` is a mapping (anything exposing `.items()`) whose values are
    indexable per example; `labels` is an indexable, sized collection.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict holding every encoding field at `idx` plus 'labels'."""
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = field_values[idx]
        sample['labels'] = self.labels[idx]
        return sample

texts = emodata['text'].tolist()
labels = emodata['label'].tolist()

# Seeded, stratified 80/20 split: `random_state` makes the split reproducible
# across kernel restarts and `stratify` keeps the class proportions the same in
# both splits. The raw label lists get their own names so they are not
# overwritten by the tensors returned from `tokenize_data` below.
train_texts, val_texts, train_label_ids, val_label_ids = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# RoBERTa inputs.
train_encodings, train_labels = tokenize_data(train_texts, train_label_ids, tokenizer)
val_encodings, val_labels = tokenize_data(val_texts, val_label_ids, tokenizer)
train_dataset = EmotionDataset(train_encodings, train_labels)
val_dataset = EmotionDataset(val_encodings, val_labels)

# BERT inputs: same texts and raw label lists, tokenized with the BERT
# tokenizer (previously the already-converted label tensors were fed back in).
bert_train_encodings, bert_train_labels = tokenize_data(train_texts, train_label_ids, bert_tokenizer)
bert_val_encodings, bert_val_labels = tokenize_data(val_texts, val_label_ids, bert_tokenizer)
bert_train_dataset = EmotionDataset(bert_train_encodings, bert_train_labels)
bert_val_dataset = EmotionDataset(bert_val_encodings, bert_val_labels)

In [None]:
pip install accelerate -U

In [None]:
from sklearn.metrics import accuracy_score
from transformers import EvalPrediction
from transformers import Trainer, TrainingArguments

def compute_metrics(p: EvalPrediction):
    """Compute accuracy for a batch of evaluation predictions.

    The predicted class of each example is the arg-max over the last axis of
    `p.predictions`; it is compared against `p.label_ids`.

    Returns:
        A dict with the single key "accuracy".
    """
    predicted_classes = p.predictions.argmax(-1)
    accuracy = accuracy_score(p.label_ids, predicted_classes)
    return {"accuracy": accuracy}

# Hyper-parameters shared by both fine-tuning runs.
shared_training_kwargs = dict(
    num_train_epochs=20,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    logging_steps=10,
)

# Each model gets its own TrainingArguments with its own output and logging
# directories. Sharing a single instance made the BERT run write its
# checkpoints and logs into the same folders as the RoBERTa run.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    **shared_training_kwargs,
)

bert_training_args = TrainingArguments(
    output_dir='./results_bert',
    logging_dir='./logs_bert',
    **shared_training_kwargs,
)

# RoBERTa trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# BERT trainer.
bert_trainer = Trainer(
    model=bert_model,
    args=bert_training_args,
    train_dataset=bert_train_dataset,
    eval_dataset=bert_val_dataset,
    compute_metrics=compute_metrics
)


# Fine-tune RoBERTa, then score it on its `eval_dataset`. That dataset is the
# validation split, so the number is reported as validation accuracy rather
# than test accuracy.
trainer.train()
results = trainer.evaluate()
print("RoBERTa Validation Accuracy:", results["eval_accuracy"])

# Same procedure for BERT.
bert_trainer.train()
bert_results = bert_trainer.evaluate()
print("BERT Validation Accuracy:", bert_results["eval_accuracy"])