In [6]:
pip uninstall -y transformers


[0m

In [7]:
!pip install -q transformers==4.44.2 datasets torch scikit-learn pandas tqdm


In [8]:
import transformers
print(transformers.__version__)


4.44.2


In [9]:
# ============================================
# 1. Install dependencies
# ============================================
!pip install -q transformers datasets torch scikit-learn pandas tqdm

# ============================================
# 2. Import libraries
# ============================================
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score

# ============================================
# 3. Load & preprocess GoEmotions dataset â†’ 4 classes
# ============================================
ds = load_dataset("go_emotions")

label_list = ds["train"].features["labels"].feature.names
print("GoEmotions labels:", label_list)

# Map 27 classes â†’ 4 custom classes
map_manual = {
    # Depression
    "sadness": "depression",
    "disappointment": "depression",
    "grief": "depression",

    # Anxiety
    "fear": "anxiety",
    "nervousness": "anxiety",
    "panic": "anxiety",
    "surprise": "anxiety",

    # Frustration
    "anger": "frustration",
    "annoyance": "frustration",
    "disgust": "frustration",

    # Calmness / Neutral
    "joy": "calmness",
    "relief": "calmness",
    "neutral": "calmness",
    "love": "calmness",
    "optimism": "calmness",
}

# Map indices â†’ 4-class labels
idx_to_4 = {}
for idx, name in enumerate(label_list):
    if name in map_manual:
        idx_to_4[idx] = map_manual[name]
    else:
        idx_to_4[idx] = "calmness"  # fallback


# Function to convert splits
def convert_split(split_name):
    rows = []
    for example in ds[split_name]:
        text = example["text"].strip()
        labels = example["labels"]
        mapped = [idx_to_4[l] for l in labels if idx_to_4.get(l)]
        if not mapped:
            continue
        final = max(set(mapped), key=mapped.count)
        rows.append((text, final))
    return pd.DataFrame(rows, columns=["text", "label"])


train_df = convert_split("train")
val_df = convert_split("validation")
test_df = convert_split("test")

# ============================================
# 4. Encode labels
# ============================================
LABEL2ID = {"depression":0, "anxiety":1, "frustration":2, "calmness":3}
ID2LABEL = {v:k for k,v in LABEL2ID.items()}

for df in [train_df, val_df, test_df]:
    df["label_id"] = df["label"].map(LABEL2ID)

train_ds = Dataset.from_pandas(train_df[["text", "label_id"]])
val_ds = Dataset.from_pandas(val_df[["text", "label_id"]])
test_ds = Dataset.from_pandas(test_df[["text", "label_id"]])

# ============================================
# 5. Tokenization
# ============================================
MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

train_ds = train_ds.map(preprocess, batched=True)
val_ds = val_ds.map(preprocess, batched=True)
test_ds = test_ds.map(preprocess, batched=True)

train_ds = train_ds.rename_column("label_id", "labels")
val_ds = val_ds.rename_column("label_id", "labels")
test_ds = test_ds.rename_column("label_id", "labels")

train_ds.set_format(type="torch", columns=["input_ids","attention_mask","labels"])
val_ds.set_format(type="torch", columns=["input_ids","attention_mask","labels"])
test_ds.set_format(type="torch", columns=["input_ids","attention_mask","labels"])

# ============================================
# 6. Define model
# ============================================
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(LABEL2ID),
    id2label=ID2LABEL,
    label2id=LABEL2ID
)

# ============================================
# 7. Metrics
# ============================================
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro")
    }

# ============================================
# 8. Training setup
# ============================================
training_args = TrainingArguments(
    output_dir="./models/goemotions_4class",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    logging_dir="./logs",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# ============================================
# 9. Train the model
# ============================================
trainer.train()

# ============================================
# 10. Save final model + tokenizer
# ============================================
SAVE_DIR = "./models/goemotions_4class_final"
trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

print("âœ… Training complete! Final model saved at", SAVE_DIR)

# ============================================
# 11. Run Inference (test sentence)
# ============================================
from transformers import pipeline

inference_pipeline = pipeline("text-classification", model=SAVE_DIR, tokenizer=SAVE_DIR)

test_text = "I feel very nervous and worried about my exams."
result = inference_pipeline(test_text)
print("\nðŸ”® Inference Result:", result)


GoEmotions labels: ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']




Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.3979,0.369871,0.860855,0.619555


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.3979,0.369871,0.860855,0.619555
2,0.2836,0.39363,0.86362,0.671846


âœ… Training complete! Final model saved at ./models/goemotions_4class_final


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.



ðŸ”® Inference Result: [{'label': 'anxiety', 'score': 0.983787477016449}]


In [10]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
import shutil
from google.colab import files

# Compress folder
shutil.make_archive("goemotions_4class_final", 'zip', "./models/goemotions_4class_final")

# Download zip
files.download("goemotions_4class_final.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>