In [None]:
import os

if not os.path.exists('data/full_dataset/'):
    os.makedirs('data/full_dataset/')

!wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv
!wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_2.csv
!wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_3.csv

--2025-12-10 06:28:18--  https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.210.207, 108.177.12.207, 173.194.212.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.210.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14174600 (14M) [application/octet-stream]
Saving to: ‘data/full_dataset/goemotions_1.csv’


2025-12-10 06:28:18 (71.4 MB/s) - ‘data/full_dataset/goemotions_1.csv’ saved [14174600/14174600]

--2025-12-10 06:28:19--  https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_2.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.210.207, 108.177.12.207, 173.194.212.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.210.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14173154 (14M) [application/octet-stream]
Saving to:

In [1]:
# ------------------------
# Energy-Level Classifier (GPU-ready Colab script)
# Uses GoEmotions -> mapped energy labels -> DistilBERT -> Gradio UI
# Copy-paste entire cell into Google Colab (enable GPU) and run.
# ------------------------

# 0) Install / upgrade dependencies (compatible versions)
!pip install -q --upgrade pip
!pip install -q "transformers>=4.41.0" "sentence-transformers" datasets gradio accelerate scikit-learn matplotlib sentencepiece

# 1) Imports
import os
import random
import time
from typing import List, Tuple, Dict

import numpy as np
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import load_dataset, Dataset, DatasetDict
import gradio as gr
import matplotlib.pyplot as plt

# 2) Device info
print("Torch version:", torch.__version__)
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# 3) Reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if device == "cuda":
    torch.cuda.manual_seed_all(RANDOM_SEED)

# 4) Energy labels and mapping (heuristic)
ENERGY_LABELS = ["energetic", "calm", "tired", "irritated"]
LABEL2ID = {lbl: i for i, lbl in enumerate(ENERGY_LABELS)}
ID2LABEL = {i: lbl for lbl, i in LABEL2ID.items()}

# Map many GoEmotions categories to energy classes (heuristic)
EMOTION_TO_ENERGY = {
    # energetic / high-arousal positive
    "excitement": "energetic",
    "joy": "energetic",
    "amusement": "energetic",
    "optimism": "energetic",
    "curiosity": "energetic",
    "surprise": "energetic",
    "approval": "energetic",
    "admiration": "energetic",
    "desire": "energetic",
    # calm / low arousal / neutral
    "neutral": "calm",
    "relief": "calm",
    "gratitude": "calm",
    "love": "calm",
    # tired / low-arousal negative
    "sadness": "tired",
    "disappointment": "tired",
    "grief": "tired",
    "remorse": "tired",
    # irritated / high-arousal negative
    "anger": "irritated",
    "annoyance": "irritated",
    "disgust": "irritated",
    "fear": "irritated",
    "embarrassment": "irritated",
    "confusion": "irritated"
}
# normalize keys to lowercase when comparing

# 5) Load GoEmotions (raw)
print("Loading GoEmotions dataset (this may take a moment)...")
raw = load_dataset("go_emotions", "raw")  # train / validation / test splits
print("Loaded raw splits:", list(raw.keys()))

# For 'raw' split, emotions are individual columns. We define the list explicitly.
GOEMOTIONS_LABELS = [
    "admiration", "amusement", "anger", "annoyance", "approval", "caring", "confusion",
    "curiosity", "desire", "disappointment", "disapproval", "disgust", "embarrassment",
    "excitement", "fear", "gratitude", "grief", "joy", "love", "nervousness", "optimism",
    "pride", "realization", "relief", "remorse", "sadness", "surprise", "neutral"
]

# 6) Build mapped energy dataset (robust, returns Dataset objects)
def map_example_to_energy(example) -> Tuple[str,int] or None:
    """
    Convert a single raw example to (text, label_id) or None if not mappable.
    We only map examples that have exactly 1 emotion label to keep mapping clean.
    """
    active_emotions = []
    for emo_name in GOEMOTIONS_LABELS:
        if example.get(emo_name) == 1:
            active_emotions.append(emo_name)

    if len(active_emotions) != 1:
        # skip multi-label or zero-label examples for this simple prototype
        return None

    emo_name = active_emotions[0].lower()
    energy = EMOTION_TO_ENERGY.get(emo_name)
    if energy is None:
        return None
    return (example["text"], LABEL2ID[energy])

def build_full_energy_dataset(split_name: str) -> Dataset:
    texts = []
    labels = []
    for ex in raw[split_name]:
        out = map_example_to_energy(ex)
        if out is None:
            continue
        t, lab = out
        texts.append(t)
        labels.append(lab)
    # ensure we have something
    if len(texts) == 0:
        raise ValueError(f"No mappable examples found in split {split_name}")
    return Dataset.from_dict({"text": texts, "label": labels})

print("Mapping GoEmotions \u2192 energy labels (this may take ~1-2 mins)...")
# Since only 'train' split is available, build one large dataset and then split it.
ds_full_train = build_full_energy_dataset("train")

# Split the full train dataset into train, validation, and test
train_val_test = ds_full_train.train_test_split(test_size=0.2, seed=RANDOM_SEED)
ds_train = train_val_test['train']

# Split the remaining 20% into validation and test sets
val_test = train_val_test['test'].train_test_split(test_size=0.5, seed=RANDOM_SEED)
ds_val = val_test['train']
ds_test = val_test['test']

print("Raw mapped sizes:", len(ds_train), len(ds_val), len(ds_test))

# 7) Optional subsampling to limit GPU time in Colab
MAX_TRAIN = 8000  # change upward if you have more time & GPU
if len(ds_train) > MAX_TRAIN:
    ds_train = ds_train.shuffle(seed=RANDOM_SEED).select(range(MAX_TRAIN))
print("After optional subsample, train size:", len(ds_train))

dataset = DatasetDict({"train": ds_train, "validation": ds_val, "test": ds_test})
print({k: len(v) for k, v in dataset.items()})

# 8) Tokenizer & tokenization
MODEL_NAME = "distilbert-base-uncased"
print("Loading tokenizer:", MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def tokenize_fn(batch):
    return tokenizer(batch["text"], truncation=True, padding=False, max_length=256)

print("Tokenizing datasets (batched)...")
tokenized = dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 9) Model init
print("Loading model:", MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(ENERGY_LABELS),
    id2label=ID2LABEL,
    label2id=LABEL2ID
)

# 10) Training arguments (GPU, fp16)
training_args = TrainingArguments(
    output_dir="energy_real_gpu_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_strategy="steps",
    logging_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=2,
    fp16=True if torch.cuda.is_available() else False,
    seed=RANDOM_SEED
)

# 11) Metrics
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="macro")
    return {"accuracy": acc, "f1_macro": f1}

# 12) Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 13) Train
print("Starting training (this will use GPU if available)...")
t0 = time.time()
trainer.train()
t1 = time.time()
print(f"Training done in {(t1-t0)/60:.2f} minutes")

# 14) Save best model & tokenizer
best_dir = "energy_real_gpu_model/best_model"
os.makedirs(best_dir, exist_ok=True)
trainer.save_model(best_dir)  # saves best model due to load_best_model_at_end
tokenizer.save_pretrained(best_dir)
print("Saved model & tokenizer to", best_dir)

# 15) Inference helpers (load best model from disk for safety)
print("Loading inference model from", best_dir)
inf_tokenizer = AutoTokenizer.from_pretrained(best_dir, use_fast=True)
inf_model = AutoModelForSequenceClassification.from_pretrained(best_dir)
inf_model.eval()
if torch.cuda.is_available():
    inf_model.cuda()

def predict_text(text: str) -> Tuple[str, float]:
    enc = inf_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)
    if torch.cuda.is_available():
        enc = {k: v.cuda() for k, v in enc.items()}
    with torch.no_grad():
        out = inf_model(**enc)
        logits = out.logits[0].cpu().numpy()
        probs = F.softmax(torch.tensor(logits), dim=0).numpy()
        pred_id = int(np.argmax(probs))
        label = ID2LABEL[pred_id]
        conf = float(probs[pred_id])
    return label, conf

# 16) Simple Gradio UI for inference
def classify_energy(text: str):
    if not text or len(text.strip()) == 0:
        return "No input text provided.", ""
    label, conf = predict_text(text)
    return f"Predicted energy: **{label}** (confidence {conf:.2f})", ""

with gr.Blocks() as demo:
    gr.Markdown("# Energy Level Classifier (GoEmotions \u2192 Energy mapping)")
    txt = gr.Textbox(lines=6, placeholder="Paste text to classify...", label="Input Text")
    btn = gr.Button("Predict Energy")
    out_md = gr.Markdown()
    btn.click(classify_energy, inputs=[txt], outputs=[out_md, gr.Text()])
    gr.Markdown("**Note:** energy labels are heuristically mapped from GoEmotions emotions; for research-grade claims collect human-labeled energy/arousal data.")
print("Launching Gradio app...")
demo.launch()

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.7/1.8 MB[0m [31m21.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.8/1.8 MB[0m [31m24.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hTorch version: 2.9.0+cu126
Using device: cpu
Loading GoEmotions dataset (this may take a moment)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

raw/train-00000-of-00001.parquet:   0%|          | 0.00/24.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/211225 [00:00<?, ? examples/s]

Loaded raw splits: ['train']
Mapping GoEmotions → energy labels (this may take ~1-2 mins)...
Raw mapped sizes: 123528 15441 15442
After optional subsample, train size: 8000
{'train': 8000, 'validation': 15441, 'test': 15442}
Loading tokenizer: distilbert-base-uncased


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizing datasets (batched)...


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15441 [00:00<?, ? examples/s]

Map:   0%|          | 0/15442 [00:00<?, ? examples/s]

Loading model: distilbert-base-uncased


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting training (this will use GPU if available)...


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mharshinibalakumar2004[0m ([33mharshinibalakumar2004-iit-madras-foundation[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, mcp] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,1.0056,0.963616,0.580532,0.517561
2,0.8628,0.956517,0.588433,0.543497




Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,1.0056,0.963616,0.580532,0.517561
2,0.8628,0.956517,0.588433,0.543497
3,0.7463,0.999438,0.584224,0.539419


Training done in 114.59 minutes
Saved model & tokenizer to energy_real_gpu_model/best_model
Loading inference model from energy_real_gpu_model/best_model
Launching Gradio app...
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://4a8286161971b4ea12.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


