# Dataset Preprocessing

In [1]:
import pandas as pd
import numpy as np

In [2]:
CSV_PATHS = [
  "data/Emotion NLP/goemotions_1.csv",
  "data/Emotion NLP/goemotions_2.csv",
  "data/Emotion NLP/goemotions_3.csv",
]

KEPT_LABELS = ["joy", "sadness", "anger", "fear", "surprise", "love"]

TARGET_MIN_SAMPLES = 1000
TARGET_MAX_SAMPLES = 3000
MAX_LABEL_FRACTION = 0.30  # no emotion should dominate >30%

RANDOM_SEED = 42

In [3]:
dfs = [pd.read_csv(path) for path in CSV_PATHS]
df = pd.concat(dfs, ignore_index=True)

print(f"Loaded {len(df)} total samples")

Loaded 211225 total samples


In [4]:
if "example_very_unclear" in df.columns:
  before = len(df)
  df = df[df["example_very_unclear"] == False]
  print(f"Dropped {before - len(df)} unclear samples")

Dropped 3411 unclear samples


In [5]:
keep_cols = ["id", "text"] + KEPT_LABELS
df = df[keep_cols]

In [6]:
label_sum = df[KEPT_LABELS].sum(axis=1)
before = len(df)
df = df[label_sum > 0]
print(f"Dropped {before - len(df)} zero-label samples")

Dropped 169556 zero-label samples


In [7]:
df = df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

In [8]:
def label_frequencies(frame):
  return frame[KEPT_LABELS].sum() / len(frame)

In [9]:
while True:
  freqs = label_frequencies(df)
  max_label = freqs.idxmax()
  max_frac = freqs[max_label]

  if max_frac <= MAX_LABEL_FRACTION:
    break

  # find rows dominated only by the max label
  mask = (df[max_label] == 1) & (df[KEPT_LABELS].sum(axis=1) == 1)
  removable = df[mask]

  if len(removable) == 0:
    break

  # remove a small chunk
  drop_n = min(50, len(removable))
  drop_idx = removable.sample(n=drop_n, random_state=RANDOM_SEED).index
  df = df.drop(drop_idx)
  

print("Final label distribution:")
print(df[KEPT_LABELS].sum().sort_values(ascending=False))

Final label distribution:
love        8191
anger       8084
joy         7983
sadness     6758
surprise    5514
fear        3197
dtype: int64


In [10]:
if len(df) > TARGET_MAX_SAMPLES:
  df = df.sample(n=TARGET_MAX_SAMPLES, random_state=RANDOM_SEED)

elif len(df) < TARGET_MIN_SAMPLES:
  print("Warning: dataset smaller than target minimum")

print(f"Final dataset size: {len(df)}")

Final dataset size: 3000


In [11]:
OUTPUT_PATH = "data/Emotion NLP/goemotions_eval_set.csv"
df.to_csv(OUTPUT_PATH, index=False)

print(f"Saved frozen evaluation set → {OUTPUT_PATH}")

Saved frozen evaluation set → data/Emotion NLP/goemotions_eval_set.csv


# Inference

In [12]:
import time
import json
import torch
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader

In [13]:
DIST_LABELS = ["sadness", "joy", "love", "anger", "fear", "surprise"]
ROBERTA_LABELS = ["anger", "anticipation", "disgust", "fear", "joy", "love", "optimism", "pessimism", "sadness", "surprise", "trust"]

MODEL_NAMES = {
  "distilbert": ["bhadresh-savani/distilbert-base-uncased-emotion", DIST_LABELS],
  "roberta": ["cardiffnlp/twitter-roberta-base-emotion-multilabel-latest", ROBERTA_LABELS],
}

BATCH_SIZE = 16
THRESHOLD = 0.5
MAX_LENGTH = 128

DATA_PATH = "data/Emotion NLP/goemotions_eval_set.csv"
OUTPUT_DIR = "data/Emotion NLP/results/"


In [17]:
df = pd.read_csv(DATA_PATH)

texts = df["text"].tolist()
ids = df["id"].tolist()

n_samples = len(df)
print(f"Loaded {n_samples} samples")

Loaded 3000 samples


In [14]:
class TextDataset(Dataset):
  def __init__(self, texts, tokenizer):
    self.texts = texts
    self.tokenizer = tokenizer

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    return self.tokenizer(
      self.texts[idx],
      padding="max_length",
      truncation=True,
      max_length=MAX_LENGTH,
      return_tensors="pt"
    )


In [15]:
def run_inference(model_name, hf_name, labels_len):
  print(f"\nRunning inference: {model_name}")

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  print(f"Device: {device}")

  tokenizer = AutoTokenizer.from_pretrained(hf_name)
  model = AutoModelForSequenceClassification.from_pretrained(
    hf_name,
    num_labels=labels_len,
    problem_type="multi_label_classification"
  )

  model.to(device)
  model.eval()

  dataset = TextDataset(texts, tokenizer)
  dataloader = DataLoader(dataset, batch_size=BATCH_SIZE)

  all_probs = []

  start_time = time.perf_counter()

  with torch.no_grad():
    for batch in dataloader:
      batch = {k: v.squeeze(1).to(device) for k, v in batch.items()}
      outputs = model(**batch)
      logits = outputs.logits
      probs = torch.sigmoid(logits)
      all_probs.append(probs.cpu().numpy())

  end_time = time.perf_counter()

  probs = np.vstack(all_probs)

  timing = {
    "model": hf_name,
    "device": str(device),
    "batch_size": BATCH_SIZE,
    "num_samples": n_samples,
    "total_inference_time_sec": round(end_time - start_time, 3),
    "avg_time_per_sample_ms": round(
      (end_time - start_time) / n_samples * 1000, 3
    )
  }

  return probs, timing


## Sanity Check

In [7]:
SANITY_N = 5
df_sanity = df.head(SANITY_N)

texts = df_sanity["text"].tolist()
ids = df_sanity["id"].tolist()


probs, timing = run_inference(
  model_name="distilbert",
  hf_name="bhadresh-savani/distilbert-base-uncased-emotion",
  labels_len=len(DIST_LABELS)
)

print("Probabilities shape:", probs.shape)
print("Sample probabilities:\n", probs[:2])

preds = (probs >= THRESHOLD).astype(int)
print("Sample predictions:\n", preds[:2])

for i in range(df_sanity.shape[0]):
  print("\nTEXT:", texts[i])
  for j, label in enumerate(DIST_LABELS):
    print(f"{label:10s} → {probs[i][j]:.3f}")



Running inference: distilbert
Device: cpu
Probabilities shape: (5, 6)
Sample probabilities:
 [[0.15915503 0.99852896 0.28175303 0.1580044  0.0782557  0.12352173]
 [0.20308957 0.99863285 0.25943714 0.13026811 0.08543865 0.11956318]]
Sample predictions:
 [[0 1 0 0 0 0]
 [0 1 0 0 0 0]]

TEXT: Rusty you the bomb! I'm okish now but just knowing you are out there makes me smile! Cheers! xoxo
sadness    → 0.159
joy        → 0.999
love       → 0.282
anger      → 0.158
fear       → 0.078
surprise   → 0.124

TEXT: happy birthday, have some gold!
sadness    → 0.203
joy        → 0.999
love       → 0.259
anger      → 0.130
fear       → 0.085
surprise   → 0.120

TEXT: You're a mean person.
sadness    → 0.325
joy        → 0.171
love       → 0.140
anger      → 0.998
fear       → 0.227
surprise   → 0.074

TEXT: Honestly, State and Revolution is what made me ananarchist.
sadness    → 0.314
joy        → 0.910
love       → 0.104
anger      → 0.946
fear       → 0.197
surprise   → 0.057

TEXT: that's ace, 

In [8]:
probs, timing = run_inference(
  model_name="roberta",
  hf_name="cardiffnlp/twitter-roberta-base-emotion-multilabel-latest",
  labels_len=len(ROBERTA_LABELS)
)

print("Probabilities shape:", probs.shape)
print("Sample probabilities:\n", probs[:2])

preds = (probs >= THRESHOLD).astype(int)
print("Sample predictions:\n", preds[:2])

for i in range(df_sanity.shape[0]):
  print("\nTEXT:", texts[i])
  for j, label in enumerate(DIST_LABELS):
    print(f"{label:10s} → {probs[i][j]:.3f}")


Running inference: roberta
Device: cpu
Probabilities shape: (5, 11)
Sample probabilities:
 [[0.0184311  0.03651299 0.0166482  0.01362048 0.98732346 0.8460089
  0.7814287  0.01076462 0.01864295 0.04037486 0.12000268]
 [0.01707987 0.03240285 0.01807778 0.01351091 0.98402    0.8782865
  0.6787714  0.01236995 0.02928596 0.03801468 0.09515911]]
Sample predictions:
 [[0 0 0 0 1 1 1 0 0 0 0]
 [0 0 0 0 1 1 1 0 0 0 0]]

TEXT: Rusty you the bomb! I'm okish now but just knowing you are out there makes me smile! Cheers! xoxo
sadness    → 0.018
joy        → 0.037
love       → 0.017
anger      → 0.014
fear       → 0.987
surprise   → 0.846

TEXT: happy birthday, have some gold!
sadness    → 0.017
joy        → 0.032
love       → 0.018
anger      → 0.014
fear       → 0.984
surprise   → 0.878

TEXT: You're a mean person.
sadness    → 0.980
joy        → 0.023
love       → 0.931
anger      → 0.039
fear       → 0.018
surprise   → 0.006

TEXT: Honestly, State and Revolution is what made me ananarchist.
sad

In [9]:
# from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

# # Load tokenizer
# distil_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# # Load model
# distil_model = DistilBertForSequenceClassification.from_pretrained(
#   "distilbert-base-uncased",
#   problem_type="multi_label_classification",  # for multi-label
#   num_labels=len(kept_labels)               # your 8 labels
# )

from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

# Load tokenizer
distil_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Load model
distil_model = DistilBertForSequenceClassification.from_pretrained(
  "bhadresh-savani/distilbert-base-uncased-emotion",
  problem_type="multi_label_classification",  # for multi-label
  num_labels=len(["sadness", "joy", "love", "anger", "fear", "surprise"])               # your 8 labels
)


In [10]:
# from transformers import RobertaTokenizerFast, RobertaForSequenceClassification

# # Load tokenizer
# roberta_tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

# # Load model
# roberta_model = RobertaForSequenceClassification.from_pretrained(
#   "roberta-base",
#   problem_type="multi_label_classification",
#   num_labels=len(kept_labels)
# )

from transformers import RobertaTokenizerFast, RobertaForSequenceClassification

# Load tokenizer
roberta_tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

# Load model
roberta_model = RobertaForSequenceClassification.from_pretrained(
  "cardiffnlp/twitter-roberta-base-emotion-multilabel-latest",
  problem_type="multi_label_classification",
  num_labels=len(["anger", "anticipation", "disgust", "fear", "joy", "love", "optimism", "pessimism", "sadness", "surprise", "trust"])
)

In [11]:
# quick inference
import torch

def predict(model, tokenizer, texts, threshold=0.5):
  inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
  with torch.no_grad():
    outputs = model(**inputs)
    probs = torch.sigmoid(outputs.logits)  # multi-label probabilities
    preds = (probs >= threshold).int()
  return preds, probs


In [12]:
print(["sadness", "joy", "love", "anger", "fear", "surprise"])
predict(distil_model, distil_tokenizer, ["I am so happy today!", "This is terrible..."])


['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']


(tensor([[0, 1, 0, 0, 0, 0],
         [1, 0, 0, 0, 0, 0]], dtype=torch.int32),
 tensor([[0.2172, 0.9986, 0.2412, 0.1349, 0.0895, 0.1017],
         [0.9974, 0.1571, 0.0907, 0.4971, 0.2588, 0.0890]]))

In [13]:
print(["anger", "anticipation", "disgust", "fear", "joy", "love", "optimism", "pessimism", "sadness", "surprise", "trust"])
predict(roberta_model, roberta_tokenizer, ["I am so happy today!", "This is terrible..."])

['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']


(tensor([[0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0],
         [1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0]], dtype=torch.int32),
 tensor([[0.0148, 0.0433, 0.0136, 0.0122, 0.9900, 0.6960, 0.8425, 0.0085, 0.0150,
          0.0384, 0.1132],
         [0.9562, 0.0266, 0.9608, 0.6311, 0.0089, 0.0081, 0.0064, 0.1003, 0.5182,
          0.0625, 0.0075]]))

## Full Inference

In [18]:
for short_name, (hf_name, labels) in MODEL_NAMES.items():

  probs, timing = run_inference(short_name, hf_name, len(labels))

  # Convert probabilities → predictions
  preds = (probs >= THRESHOLD).astype(int)

  # Build output dataframe
  out_df = pd.DataFrame({
    "id": ids,
    "text": texts
  })

  for i, label in enumerate(labels):
    out_df[f"conf_{label}"] = probs[:, i]
    out_df[f"pred_{label}"] = preds[:, i]

  # Save predictions
  pred_path = f"{OUTPUT_DIR}/{short_name}_predictions.csv"
  out_df.to_csv(pred_path, index=False)

  # Save timing
  timing_path = f"{OUTPUT_DIR}/{short_name}_timing.json"
  with open(timing_path, "w") as f:
    json.dump(timing, f, indent=2)

  print(f"Saved → {pred_path}")
  print(f"Saved → {timing_path}")



Running inference: distilbert
Device: cpu
Saved → data/Emotion NLP/results//distilbert_predictions.csv
Saved → data/Emotion NLP/results//distilbert_timing.json

Running inference: roberta
Device: cpu
Saved → data/Emotion NLP/results//roberta_predictions.csv
Saved → data/Emotion NLP/results//roberta_timing.json


# Evaluation

In [19]:
import pandas as pd
import numpy as np

from sklearn.metrics import (precision_recall_fscore_support,
                             classification_report)


In [20]:
def evaluate_multilabel(
  df_preds,
  df_gold,
  labels,
  threshold=0.5):
  """
  df_preds: dataframe with conf_<label> columns
  df_gold: dataframe with true <label> columns
  labels: list of labels to evaluate
  """

  # Build prediction matrix
  y_pred = np.vstack([
    (df_preds[f"conf_{lbl}"] >= threshold).astype(int)
    for lbl in labels
  ]).T

  # Build gold matrix
  y_true = df_gold[labels].values

  # Macro + micro
  macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(
    y_true, y_pred, average="macro", zero_division=0
  )

  micro_p, micro_r, micro_f1, _ = precision_recall_fscore_support(
    y_true, y_pred, average="micro", zero_division=0
  )

  overall = pd.DataFrame([{
    "macro_precision": macro_p,
    "macro_recall": macro_r,
    "macro_f1": macro_f1,
    "micro_precision": micro_p,
    "micro_recall": micro_r,
    "micro_f1": micro_f1
  }])

  # Per-label metrics
  per_label = []
  for i, lbl in enumerate(labels):
    p, r, f1, _ = precision_recall_fscore_support(
      y_true[:, i],
      y_pred[:, i],
      average="binary",
      zero_division=0
    )
    per_label.append({
      "label": lbl,
      "precision": p,
      "recall": r,
      "f1": f1
    })

  per_label_df = pd.DataFrame(per_label)

  return overall, per_label_df


In [None]:
df_distil_preds = pd.read_csv("data/Emotion NLP/results/distilbert_predictions.csv")
df_gold = pd.read_csv("data/Emotion NLP/goemotions_eval_set.csv")

distil_overall, distil_per_label = evaluate_multilabel(
  df_preds=df_distil_preds,
  df_gold=df_gold,
  labels=KEPT_LABELS,
  threshold=0.5
)

print("DistilBERT - Overall")
print(distil_overall)

print("\nDistilBERT - Per emotion F1")
print(distil_per_label.sort_values("f1"))


DistilBERT - Overall
   macro_precision  macro_recall  macro_f1  micro_precision  micro_recall  \
0         0.413491      0.605861  0.435764         0.357182       0.62784   

   micro_f1  
0  0.455326  

DistilBERT - Per emotion F1
      label  precision    recall        f1
4  surprise   0.512438  0.227876  0.315467
3      fear   0.251603  0.554770  0.346196
0       joy   0.310972  0.879421  0.459471
2     anger   0.327097  0.889984  0.478376
5      love   0.666667  0.401899  0.501481
1   sadness   0.412170  0.681214  0.513591


Unnamed: 0,macro_precision,macro_recall,macro_f1,micro_precision,micro_recall,micro_f1
0,0.413491,0.605861,0.435764,0.357182,0.62784,0.455326


In [24]:
def collapse_roberta_predictions(
  df_roberta,
  mapping,
  threshold=0.5):
  """
  Converts RoBERTa predictions into 6-label emotion space
  """

  df_out = df_roberta[["id", "text"]].copy()

  for target_label, source_labels in mapping.items():
    conf_cols = [f"conf_{lbl}" for lbl in source_labels]

    # Max confidence across mapped labels
    df_out[f"conf_{target_label}"] = df_roberta[conf_cols].max(axis=1)

    # Binary prediction
    df_out[f"pred_{target_label}"] = (
      df_out[f"conf_{target_label}"] >= threshold
    ).astype(int)

  return df_out


In [28]:
ROBERTA_TO_6 = {
  "anger": ["anger", "disgust"],
  "fear": ["fear", "pessimism"],
  "joy": ["joy", "optimism"],
  "love": ["love", "trust"],
  "sadness": ["sadness"],
  "surprise": ["surprise", "anticipation"]
}


df_roberta_preds = pd.read_csv("data/Emotion NLP/results/roberta_predictions.csv")

df_roberta_6 = collapse_roberta_predictions(
  df_roberta_preds,
  ROBERTA_TO_6,
  threshold=0.5
)

df_roberta_6.head()

roberta_overall, roberta_per_label = evaluate_multilabel(
  df_preds=df_roberta_6,
  df_gold=df_gold,
  labels=KEPT_LABELS,
  threshold=0.5
)

print("RoBERTa - Overall")
print(roberta_overall)

print("\nRoBERTa - Per emotion F1")
print(roberta_per_label.sort_values("f1"))


RoBERTa - Overall
   macro_precision  macro_recall  macro_f1  micro_precision  micro_recall  \
0         0.518823       0.70346  0.577528         0.492351        0.7312   

   micro_f1  
0  0.588463  

RoBERTa - Per emotion F1
      label  precision    recall        f1
3      fear   0.368664  0.565371  0.446304
4  surprise   0.619632  0.446903  0.519280
0       joy   0.388453  0.876206  0.538272
2     anger   0.465442  0.873563  0.607306
1   sadness   0.558651  0.722960  0.630273
5      love   0.712098  0.735759  0.723735


In [31]:
distil_common_overall, distil_common_per_label = evaluate_multilabel(
  df_distil_preds, df_gold, KEPT_LABELS
)

roberta_common_overall, roberta_common_per_label = evaluate_multilabel(
  df_roberta_preds, df_gold, KEPT_LABELS
)

comparison = pd.concat([
  distil_common_overall.assign(model="DistilBERT"),
  roberta_common_overall.assign(model="RoBERTa")
])

comparison


Unnamed: 0,macro_precision,macro_recall,macro_f1,micro_precision,micro_recall,micro_f1,model
0,0.413491,0.605861,0.435764,0.357182,0.62784,0.455326,DistilBERT
0,0.590085,0.659299,0.5742,0.522958,0.69248,0.595897,RoBERTa
