In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import classification_report, multilabel_confusion_matrix
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

# Emotion label vocabulary; a label's position in this list is its class index
EMOTIONS = [
    "joy",
    "sadness",
    "anger",
    "fear",
    "love",
    "neutral",
]

# Directory for the fine-tuned artifacts and name of the pre-trained checkpoint
MODEL_SAVE_PATH = "./emotion_model"
MODEL_NAME = "bert-base-uncased"

class EmotionDataset(Dataset):
    """Torch dataset that tokenizes a text on access and pairs it with its label.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of label vectors, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True, return_tensors="pt")`` that
            returns a mapping with ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (batch axis
            removed) and ``"labels"`` as a float tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # Assumes the tokenizer returns tensors with a leading batch axis of
        # size 1 for a single text. Only that axis is removed: a bare
        # ``squeeze()`` would also collapse the sequence axis when
        # ``max_length == 1`` and hand back 0-dim tensors.
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            # Float targets, as stored by the original implementation.
            "labels": torch.tensor(label, dtype=torch.float),
        }

def load_data(file_path, tokenizer, max_length):
    """Read a headerless TSV (``text``, ``labels``, ``id``) into an EmotionDataset.

    The ``labels`` column holds one or more comma-separated integer indices
    into ``EMOTIONS``. Each row becomes a multi-hot list of length
    ``len(EMOTIONS)``. Indices outside ``[0, len(EMOTIONS))`` are ignored, so
    a row with only out-of-range indices (or an empty label cell) gets an
    all-zero vector.

    Args:
        file_path: Path of the tab-separated file.
        tokenizer: Tokenizer handed to ``EmotionDataset``.
        max_length: Padding/truncation length handed to ``EmotionDataset``.

    Returns:
        EmotionDataset over the file's texts and multi-hot labels.

    Raises:
        ValueError: If a non-empty label token is not an integer.
    """
    # Read everything as plain strings with NA inference switched off.
    # With the defaults pandas turns texts such as "null" or "NA" into float
    # NaN, and a single missing label makes the whole labels column float
    # ("3.0"), which int() then rejects.
    df = pd.read_csv(
        file_path,
        sep="\t",
        names=["text", "labels", "id"],
        dtype=str,
        keep_default_na=False,
    ).fillna("")  # rows with fewer than three fields still yield strings

    texts = df["text"].tolist()
    num_classes = len(EMOTIONS)
    labels = []
    for label_str in df["labels"]:
        # Handle single or multiple labels
        label = [0] * num_classes
        for token in label_str.split(","):
            token = token.strip()
            if not token:
                # Empty cell or stray comma: nothing to set.
                continue
            idx = int(token)
            if 0 <= idx < num_classes:
                label[idx] = 1
        labels.append(label)
    return EmotionDataset(texts, labels, tokenizer, max_length)

# Tokenizer that matches the pre-trained checkpoint
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Build the training and evaluation datasets, each with max_length=128
# NOTE(review): the evaluation split is read from "test.tsv" - confirm that
# no separate validation file is intended.
train_data = load_data(file_path="train.tsv", tokenizer=tokenizer, max_length=128)
val_data = load_data(file_path="test.tsv", tokenizer=tokenizer, max_length=128)

# Pre-trained BERT with a classification head sized to the emotion list,
# configured for multi-label classification
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    problem_type="multi_label_classification",
    num_labels=len(EMOTIONS),
)

# Hyper-parameters and bookkeeping for the run; evaluation and checkpointing
# both happen once per epoch
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

# Wire model, arguments and data together
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=val_data,
)

# Run fine-tuning
trainer.train()

# Prediction function with improved thresholding
def predict(text, threshold=0.3):
    """Return the emotions whose predicted probability exceeds ``threshold``.

    Uses the module-level ``tokenizer`` and ``model``. Each label is scored
    independently with a sigmoid, so zero, one or several emotions may be
    returned.

    Args:
        text: A single input text.
        threshold: Probability a label must exceed (strictly) to be returned.

    Returns:
        list[str]: Names from ``EMOTIONS``, in ``EMOTIONS`` order.
    """
    encoding = tokenizer(
        text, max_length=128, padding="max_length", truncation=True, return_tensors="pt"
    )
    # After training the model may live on an accelerator; the inputs have to
    # be on the same device or the forward pass raises.
    device = next(model.parameters()).device
    encoding = {name: tensor.to(device) for name, tensor in encoding.items()}

    # Inference must run in eval mode (dropout off). Restore the previous
    # mode afterwards so calling predict() has no lasting effect on the model.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**encoding)
    finally:
        if was_training:
            model.train()

    # squeeze(0) removes only the batch axis (a bare squeeze() would also
    # collapse a single-label output); .cpu() is required before .numpy()
    # when the logits are on an accelerator.
    probabilities = torch.sigmoid(outputs.logits).squeeze(0).cpu().numpy()
    return [
        emotion
        for emotion, prob in zip(EMOTIONS, probabilities)
        if prob > threshold
    ]

# Comprehensive model evaluation
def evaluate_model(dataset, threshold=0.3, batch_size=16):
    """Print a classification report and per-emotion confusion matrices.

    Runs the module-level ``model`` over ``dataset`` and thresholds the
    per-label sigmoid probabilities. The result is compared with the
    dataset's multi-hot labels.

    Args:
        dataset: Dataset whose items are dicts with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        threshold: Probability a label must exceed (strictly) to count as
            predicted.
        batch_size: Number of examples per forward pass.

    Returns:
        None. Results are printed.
    """
    # Local import: only ``Dataset`` is imported at file level.
    from torch.utils.data import DataLoader

    model.eval()
    device = next(model.parameters()).device

    true_rows = []
    pred_rows = []
    # Feed the dataset's own token ids to the model. Decoding them back to
    # text and tokenizing again is not guaranteed to reproduce the same ids
    # (special tokens are dropped by the decode) and doubles the work.
    with torch.no_grad():
        for batch in DataLoader(dataset, batch_size=batch_size):
            logits = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            ).logits
            probabilities = torch.sigmoid(logits).cpu().numpy()
            pred_rows.append((probabilities > threshold).astype(float))
            # A label counts as true only where the stored value equals 1.
            true_rows.append((batch["labels"].numpy() == 1).astype(float))

    # Binary indicator matrices of shape (num_examples, num_emotions)
    if true_rows:
        y_true = np.concatenate(true_rows)
        y_pred = np.concatenate(pred_rows)
    else:
        y_true = np.zeros((0, len(EMOTIONS)))
        y_pred = np.zeros((0, len(EMOTIONS)))

    # Print detailed classification report
    print("Classification Report:")
    print(classification_report(y_true, y_pred, target_names=EMOTIONS))

    # Confusion Matrix for Multi-label Classification
    print("\nMulti-label Confusion Matrix:")
    conf_matrices = multilabel_confusion_matrix(y_true, y_pred)
    for emotion, matrix in zip(EMOTIONS, conf_matrices):
        print(f"\n{emotion}:")
        print(matrix)

# Evaluate the model on val_data (prints the report and confusion matrices)
evaluate_model(val_data)

# Save the fine-tuned model and tokenizer to MODEL_SAVE_PATH.
# Nothing is reloaded here; reloading would be a separate, optional step.
model.save_pretrained(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
