In [None]:
!pip install --upgrade transformers --quiet


In [None]:
!pip install --upgrade datasets fsspec


In [None]:
pip install transformers datasets scikit-learn torch pandas numpy tqdm


In [None]:
pip install datasets

# **IMPORTING LIBRARIES**

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score, hamming_loss


# **LOADING DATASET**

In [None]:
from datasets import load_dataset

# Load GoEmotions by its dataset identifier; later cells index the result by split name.
dataset = load_dataset(path="go_emotions")


# **INSPECTING DATASET**

In [None]:
# Inspect a sample from the training data (the first row of the 'train' split).
print(dataset['train'][0])


# **TEXT PREPROCESSING**

In [None]:
# Tokenizer for the 'bert-base-uncased' checkpoint; preprocess() below reads it as a global.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def preprocess(examples, num_labels=28):
    """Tokenize a batch of texts and multi-hot encode their label lists.

    Args:
        examples: Batch mapping with a "text" entry (sequence of strings) and a
            "labels" entry (one iterable of integer class ids per text).
        num_labels: Width of every multi-hot vector. Defaults to 28, the value
            that used to be hard-coded in this function.

    Returns:
        The tokenizer output with an added "labels" entry: one list of
        ``num_labels`` floats per example, 1.0 at each listed class id and
        0.0 everywhere else.

    Raises:
        IndexError: If a class id is out of range for a list of ``num_labels``.
    """
    # Truncate and pad to a fixed 128 tokens so every row has the same length.
    encoding = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

    # Multi-hot encode the labels
    multi_hot_labels = []
    for label_list in examples["labels"]:
        vec = [0.0] * num_labels
        for label in label_list:
            vec[label] = 1.0
        multi_hot_labels.append(vec)

    encoding["labels"] = multi_hot_labels
    return encoding

# Run preprocess() over every split, one batch of examples per call.
encoded_dataset = dataset.map(function=preprocess, batched=True)

In [None]:
from datasets import ClassLabel

# Return rows as PyTorch tensors and expose only the model inputs and the labels.
# (This selects the output format; it does not by itself rewrite the stored label values.)
model_columns = ['input_ids', 'attention_mask', 'labels']
encoded_dataset.set_format(type='torch', columns=model_columns, output_all_columns=False)

# Manually cast labels to float
def cast_labels_to_float(batch):
    """Replace ``batch["labels"]`` with a float32 version and return the same mapping.

    Expects ``batch["labels"]`` to be a torch tensor (the dataset is in 'torch'
    format when this is mapped). The mapping is modified in place.
    """
    float_labels = batch["labels"].to(dtype=torch.float32)
    batch["labels"] = float_labels
    return batch

# Apply the float cast to the two splits used for training and evaluation.
for split_name in ("train", "validation"):
    encoded_dataset[split_name] = encoded_dataset[split_name].map(cast_labels_to_float)

In [None]:
# Sanity check: dtype of the label tensor of the first training row.
print(encoded_dataset['train'][0]['labels'].dtype)  # should print torch.float32

# **LOADING PRE-TRAINED BERT**

In [None]:
# Load pre-trained BERT with a 28-output head configured for multi-label classification
head_options = {
    "num_labels": 28,
    "problem_type": "multi_label_classification",
}
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", **head_options)

# **PREPARING DATA COLLATOR**

In [None]:
from transformers import DataCollatorWithPadding

# Sequence classification needs a plain padding collator. The seq2seq collator used here
# before treats `labels` as token-id sequences (integer padding/conversion), which does not
# fit the float multi-hot label vectors built during preprocessing.
data_collator = DataCollatorWithPadding(tokenizer)


# **DEFINING EVALUATION METRICS**

In [None]:
def compute_metrics(p, threshold=0.5):
    """Compute multi-label metrics from raw model outputs.

    Args:
        p: Object exposing ``predictions`` (logits, or a tuple whose first
            element is the logits) and ``label_ids`` (multi-hot targets of the
            same shape as the logits).
        threshold: Sigmoid probability above which a label counts as
            predicted. Defaults to 0.5, the value previously hard-coded.

    Returns:
        Dict with 'accuracy' (exact-match over all labels of a row),
        'f1' (micro-averaged) and 'hamming_loss'.
    """
    # If predictions arrive as a tuple, assume the logits are its first element.
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    probabilities = torch.sigmoid(torch.as_tensor(logits)).numpy()

    # Binarize once and reuse the result for every metric.
    predicted = (probabilities > threshold).astype(int)
    labels = p.label_ids
    return {
        'accuracy': accuracy_score(labels, predicted),
        # zero_division=0 keeps the score at 0 without a warning when nothing is predicted.
        'f1': f1_score(labels, predicted, average='micro', zero_division=0),
        'hamming_loss': hamming_loss(labels, predicted)
    }


# **SETTING UP TRAINING ARGUMENTS**

In [None]:
from transformers import TrainerCallback

class EvalCallback(TrainerCallback):
    """Trainer callback that requests an evaluation pass at the end of every epoch."""

    def on_epoch_end(self, args, state, control, model=None, **kwargs):
        """Flag the epoch that just finished for evaluation.

        The request is made through ``control`` so that the Trainer running this
        callback performs the evaluation itself. This avoids calling
        ``evaluate()`` on a module-level ``trainer`` variable, which may have
        been rebound to a different Trainer by a later cell.

        Returns:
            The same ``control`` object with ``should_evaluate`` set to True.
        """
        control.should_evaluate = True
        return control

# Hyperparameters and output locations shared by the Trainer instances built in later cells.
training_config = dict(
    output_dir="./results",
    logging_dir="./logs",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
)
training_args = TrainingArguments(**training_config)

# Train and evaluate on the tokenized, multi-hot encoded splits. The raw `dataset` still holds
# untokenized text and integer label lists, which the model cannot consume.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EvalCallback()]  # Add the callback here
)


# **INITIALIZE TRAINER**

In [None]:
# Rebinds `trainer` to a new Trainer without extra callbacks. It must be given the
# preprocessed `encoded_dataset`; the raw `dataset` holds untokenized text.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['validation'],
)


# **TRAINING MODEL**

In [None]:
import transformers

# Show which transformers release the kernel actually imported.
installed_version = transformers.__version__
print(installed_version)


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
import numpy as np

# 1. Load the GoEmotions dataset (rebinds `dataset`)
dataset = load_dataset(path="go_emotions")

# 2. Build the tokenizer for the checkpoint named by `model_name`
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 3. Tokenize function
def tokenize(example):
    """Tokenize ``example["text"]`` with the global tokenizer.

    Texts are truncated and padded to a fixed length of 64 tokens; the
    tokenizer's output is returned unchanged.
    """
    tokenizer_options = {"truncation": True, "padding": "max_length", "max_length": 64}
    return tokenizer(example["text"], **tokenizer_options)

tokenized_dataset = dataset.map(tokenize, batched=True)

# 4. Convert labels to multi-hot float32 vectors
num_labels = 28  # 27 emotions + neutral

def encode_labels(example):
    """Replace ``example['labels']`` (class ids) with a multi-hot float32 vector.

    The vector has ``num_labels`` entries: 1.0 at every listed class id and 0.0
    elsewhere. The example mapping is updated in place and returned.
    """
    active_ids = list(example['labels'])
    multi_hot = np.zeros(num_labels, dtype=np.float32)
    multi_hot[active_ids] = 1.0  # one indexed assignment marks all listed ids
    example['labels'] = multi_hot
    return example

encoded_dataset = tokenized_dataset.map(encode_labels)

# 5. Set PyTorch format: rows come back as tensors limited to the model inputs and labels.
# One call is enough; repeating it with the same arguments changes nothing.
encoded_dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels'],
    output_all_columns=False
)


In [None]:
# Sanity check: dtype of the label tensor of the first training row after re-encoding.
print(encoded_dataset["train"][0]["labels"].dtype)  # should print torch.float32

In [None]:
from transformers import BertForSequenceClassification

# Fresh classifier for `model_name`, with one output per label and a multi-label head.
head_options = {
    "num_labels": num_labels,
    "problem_type": "multi_label_classification",
}
model = BertForSequenceClassification.from_pretrained(model_name, **head_options)


In [None]:
import inspect

from transformers import Trainer, DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer)

# Recent transformers releases take the tokenizer through `processing_class` and deprecate the
# `tokenizer` keyword; older releases only know `tokenizer`. Choose whichever the installed
# Trainer accepts so this cell runs on both.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # metric function defined in an earlier cell
    **{_tokenizer_kwarg: tokenizer},
)


In [None]:
# Small slices for a quick run: the first 500 training rows and the first 100 validation rows.
TRAIN_SUBSET_SIZE = 500
EVAL_SUBSET_SIZE = 100
train_data = encoded_dataset["train"].select(range(TRAIN_SUBSET_SIZE))
eval_data = encoded_dataset["validation"].select(range(EVAL_SUBSET_SIZE))

In [None]:
from torch.utils.data.dataloader import default_collate

def collate_fn(batch):
    """Stack a list of examples with ``default_collate`` and return float labels.

    Args:
        batch: List of per-example mappings that each contain a 'labels' entry.

    Returns:
        The collated mapping, with its 'labels' tensor converted via ``.float()``.
    """
    collated = default_collate(batch)
    float_labels = collated['labels'].float()  # force float tensor
    collated['labels'] = float_labels
    return collated

import inspect

# Recent transformers releases take the tokenizer through `processing_class` and deprecate the
# `tokenizer` keyword; older releases only know `tokenizer`. Choose whichever the installed
# Trainer accepts so this cell runs on both.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# Final trainer: small train/eval subsets and the float-forcing collate function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

In [None]:
# Fine-tune with whichever `trainer` was created most recently above.
trainer.train()

# **EVALUATE MODEL**

In [None]:
# Evaluate the current `trainer` and print what evaluate() returns.
eval_results = trainer.evaluate()
print(eval_results)


# **TEST ON REAL WORLD TEXT**

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import matplotlib.pyplot as plt

# Sample texts
sample_texts = [
    "I'm so happy and excited about the new opportunity!",
    "This is absolutely terrible and I feel hopeless.",
    "I don't know how to feel, it's all so confusing."
]

# Load pre-trained GoEmotions model
# NOTE: this rebinds `model_name`, `tokenizer` and `model`, so the predictions below come from
# this published checkpoint, not from the model fine-tuned earlier in the notebook.
model_name = "monologg/bert-base-cased-goemotions-original"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Label names ordered by class index, so position k always names logit k.
# Falls back to the bare index when the config has no name for a class.
id2label = getattr(model.config, "id2label", None) or {}
label_list = [id2label.get(idx, idx) for idx in range(model.config.num_labels)]

# Tokenize
inputs = tokenizer(sample_texts, return_tensors="pt", padding=True, truncation=True)

# Predict
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    probs = torch.sigmoid(outputs.logits)  # independent per-label probabilities

# Threshold for emotion detection
threshold = 0.2
preds = (probs >= threshold).int().numpy()

# Show predictions and probabilities
for text, prob, pred in zip(sample_texts, probs.numpy(), preds):
    print(f"\nText: {text}")

    # Show the five most probable emotions, highest first
    print("Probabilities (top 5):")
    top_indices = prob.argsort()[-5:][::-1]
    for idx in top_indices:
        print(f"{label_list[idx]}: {prob[idx]:.2f}")

    # Predicted emotions: every label whose probability reached the threshold
    emotions = [label_list[label_idx] for label_idx, val in enumerate(pred) if val == 1]
    print(f"Predicted Emotions (threshold {threshold}): {emotions}")

    # Optional: Plot emotion probabilities
    plt.figure(figsize=(8, 4))
    plt.bar([label_list[idx] for idx in top_indices], [prob[idx] for idx in top_indices], color='skyblue')
    plt.title(f"Emotion Probabilities for: \"{text}\"")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
