In [None]:
!pip3 install transformers datasets torch scikit-learn pandas numpy
!pip3 install accelerate


In [None]:
import inspect

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

# Read the emotions CSV into a DataFrame (point the path at your local copy if it differs)
df = pd.read_csv(filepath_or_buffer="goemotions_1.csv")

# Emotion Mapping: coarse mood category -> fine-grained emotion columns that feed it.
# Dict order matters: map_emotions returns the FIRST category with a flagged emotion,
# so each emotion is listed under exactly one category. ('joy' and 'excitement' used to
# be repeated under "Party!!", but they always resolved to "Giddy" first, so the
# repeats never had any effect and were removed.)
# Rows whose flagged emotions all fall outside these lists are labelled "Unknown"
# by map_emotions.
emotion_map = {
    "Very Sad": ['sadness', 'grief'],
    "Moderately Sad": ['disappointment'],
    "Little Sad": ['remorse'],
    "Okayish": ['neutral', 'approval'],
    "Giddy": ['excitement', 'joy', 'love', 'desire'],
    "Pleasant": ['admiration', 'relief', 'caring'],
    "Party!!": ['amusement', 'pride'],
    "Yikes": ['annoyance', 'surprise', 'disapproval'],
    "Angry": ['anger', 'disgust'],
    "Spooky": ['fear']
}


# Convert one-hot encoding into a single label
def map_emotions(row, mapping=None):
    """Collapse a one-hot encoded emotion row into a single category name.

    Categories are checked in the mapping's iteration order and the first one
    with at least one emotion flagged as 1 wins, so earlier categories take
    precedence when a row has several emotions set.

    Args:
        row: Mapping-like object indexable by emotion name (e.g. a DataFrame
            row passed by ``df.apply(..., axis=1)``).
        mapping: Optional ``{category: [emotion, ...]}`` dict. Defaults to the
            notebook-level ``emotion_map``.

    Returns:
        The matching category name, or ``"Unknown"`` when no listed emotion
        is flagged.

    Raises:
        KeyError: If ``row`` lacks an emotion that the mapping refers to.
    """
    if mapping is None:
        mapping = emotion_map
    for category, emotions in mapping.items():
        if any(row[emotion] == 1 for emotion in emotions):
            return category
    return "Unknown"

# Collapse the one-hot emotion columns into one category name per row
df["mapped_emotion"] = df.apply(map_emotions, axis=1)

# Remove 'Unknown' labels. The explicit .copy() makes the filtered frame independent
# of the original, so adding the "label" column below does not write into a slice
# (which is what triggers pandas' SettingWithCopyWarning).
df = df[df["mapped_emotion"] != "Unknown"].copy()

# Encode categories as numbers (index = position of the category in emotion_map)
emotion_labels = {emotion: idx for idx, emotion in enumerate(emotion_map.keys())}
df["label"] = df["mapped_emotion"].map(emotion_labels)

# Split Data: 80% train / 20% validation, fixed seed for a repeatable split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42
)

print("✅ Data Loaded and Processed!")


In [None]:
# Tokenizer for the "bert-base-uncased" checkpoint
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased"
)

class EmotionDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable that accepts a text plus ``padding``,
            ``truncation``, ``max_length`` and ``return_tensors`` keyword
            arguments and returns a mapping with ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        max_length: Length every example is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors and label.

        Returns:
            Dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``
            (a ``torch.long`` scalar).
        """
        encoding = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            # squeeze(0) drops only the leading batch dimension added by
            # return_tensors="pt"; a bare squeeze() would also remove the
            # sequence dimension whenever it happens to have size 1.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Wrap the train and validation splits in EmotionDataset, sharing one tokenizer
train_dataset, val_dataset = (
    EmotionDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
    )
)


In [None]:
# Load the pretrained encoder with a fresh classification head. The label names are
# stored in the model config (id2label / label2id) so a saved checkpoint carries its
# own class names instead of depending on this notebook's emotion_labels dict.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(emotion_labels),  # Number of emotion classes
    id2label={idx: name for name, idx in emotion_labels.items()},
    label2id=dict(emotion_labels),
)


In [None]:
!pip install 'accelerate>=0.26.0'

In [None]:
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# Use whichever keyword the installed TrainingArguments accepts so this cell runs
# on both older and newer versions.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./bert-emotion-model",
    save_strategy="epoch",  # must match the evaluation strategy for load_best_model_at_end
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    **{_eval_strategy_key: "epoch"},  # evaluate at the end of every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()


In [None]:
# Single source of truth for where the fine-tuned model and tokenizer are stored,
# so the save and load calls cannot drift apart.
MODEL_DIR = "./bert-emotion-classifier"

model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

# Load for testing: read both artifacts back from disk to confirm the saved
# checkpoint is usable on its own.
model = BertForSequenceClassification.from_pretrained(MODEL_DIR)
tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)

def predict_emotion(text):
    """Predict the emotion category for a single piece of text.

    Uses the notebook-level ``tokenizer``, ``model`` and ``emotion_labels``.

    Args:
        text: The input string to classify (truncated to 128 tokens).

    Returns:
        The category name whose id has the highest logit.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    # Keep the inputs on the same device as the model so this also works after
    # the model has been moved to an accelerator.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    model.eval()  # disable dropout for deterministic predictions
    with torch.no_grad():  # inference only: skip building the autograd graph
        logits = model(**inputs).logits

    # argmax over the class dimension (not over the flattened tensor)
    predicted_class = logits.argmax(dim=-1).item()

    emotion_mapping = {v: k for k, v in emotion_labels.items()}
    return emotion_mapping[predicted_class]


