In [1]:
# Install necessary packages
!pip install transformers datasets scikit-learn torch --quiet



In [2]:
import pandas as pd

df = pd.read_csv("go_emotions_dataset.csv")

print(df.head())
print(df.columns)



        id                                               text  \
0  eew5j0j                                    That game hurt.   
1  eemcysk   >sexuality shouldn’t be a grouping category I...   
2  ed2mah1     You do right, if you don't care then fuck 'em!   
3  eeibobj                                 Man I love reddit.   
4  eda6yn6  [NAME] was nowhere near them, he was by the Fa...   

   example_very_unclear  admiration  amusement  anger  annoyance  approval  \
0                 False           0          0      0          0         0   
1                  True           0          0      0          0         0   
2                 False           0          0      0          0         0   
3                 False           0          0      0          0         0   
4                 False           0          0      0          0         0   

   caring  confusion  ...  love  nervousness  optimism  pride  realization  \
0       0          0  ...     0            0         0      0 

In [3]:
# These are the columns representing emotion labels (1 = emotion present, 0 = absent)
emotion_columns = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
    'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love',
    'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse',
    'sadness', 'surprise', 'neutral'
]

# Optional: Drop 'example_very_unclear' if you don’t want to use those samples
df = df[df['example_very_unclear'] != 1].reset_index(drop=True)

print(f"Total samples after filtering unclear: {len(df)}")


Total samples after filtering unclear: 207814


In [4]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

print(f"Train samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")


Train samples: 187032
Validation samples: 20782


In [5]:
from datasets import Dataset

train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)


In [6]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

train_ds = train_ds.map(tokenize_function, batched=True)
val_ds = val_ds.map(tokenize_function, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/187032 [00:00<?, ? examples/s]

Map:   0%|          | 0/20782 [00:00<?, ? examples/s]

In [7]:
import torch

def format_labels(example):
    labels = [example[emotion] for emotion in emotion_columns]
    example['labels'] = torch.tensor(labels, dtype=torch.float)
    return example

train_ds = train_ds.map(format_labels)
val_ds = val_ds.map(format_labels)


Map:   0%|          | 0/187032 [00:00<?, ? examples/s]

Map:   0%|          | 0/20782 [00:00<?, ? examples/s]

In [8]:
train_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


In [9]:
from datasets import Dataset

train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)



In [10]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

train_ds = train_ds.map(tokenize_function, batched=True)
val_ds = val_ds.map(tokenize_function, batched=True)


Map:   0%|          | 0/187032 [00:00<?, ? examples/s]

Map:   0%|          | 0/20782 [00:00<?, ? examples/s]

In [11]:
import torch

def format_labels(example):
    labels = [example[emotion] for emotion in emotion_columns]
    example['labels'] = torch.tensor(labels, dtype=torch.float)
    return example

train_ds = train_ds.map(format_labels)
val_ds = val_ds.map(format_labels)


Map:   0%|          | 0/187032 [00:00<?, ? examples/s]

Map:   0%|          | 0/20782 [00:00<?, ? examples/s]

In [12]:
train_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


In [13]:
!pip install -U transformers




In [14]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score, hamming_loss
import numpy as np

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(emotion_columns),
    problem_type="multi_label_classification"
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = (torch.sigmoid(torch.tensor(logits)) > 0.5).int().numpy()
    labels = labels.numpy()
    return {
        'hamming_loss': hamming_loss(labels, preds),
        'micro_f1': f1_score(labels, preds, average='micro'),
        'macro_f1': f1_score(labels, preds, average='macro')
    }

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
trainer.train()
