# Dataset Preprocessing

In [1]:
import pandas as pd
import numpy as np

In [3]:
CSV_PATHS = [
  "data/Emotion NLP/goemotions_1.csv",
  "data/Emotion NLP/goemotions_2.csv",
  "data/Emotion NLP/goemotions_3.csv",
]

KEPT_LABELS = [
  "joy", "sadness", "anger", "fear",
  "disgust", "surprise", "neutral"
]

TARGET_MIN_SAMPLES = 1000
TARGET_MAX_SAMPLES = 3000
MAX_LABEL_FRACTION = 0.30  # no emotion should dominate >30%

RANDOM_SEED = 42

In [4]:
dfs = [pd.read_csv(path) for path in CSV_PATHS]
df = pd.concat(dfs, ignore_index=True)

print(f"Loaded {len(df)} total samples")

Loaded 211225 total samples


In [5]:
if "example_very_unclear" in df.columns:
  before = len(df)
  df = df[df["example_very_unclear"] == False]
  print(f"Dropped {before - len(df)} unclear samples")

Dropped 3411 unclear samples


In [6]:
keep_cols = ["id", "text"] + KEPT_LABELS
df = df[keep_cols]

In [None]:
label_sum = df[KEPT_LABELS].sum(axis=1)
before = len(df)
df = df[label_sum > 0]
print(f"Dropped {before - len(df)} zero-label samples")

Dropped 117116 zero-label samples


In [9]:
df = df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

In [19]:
def label_frequencies(frame):
  return frame[KEPT_LABELS].sum() / len(frame)

In [21]:
while True:
  freqs = label_frequencies(df)
  max_label = freqs.idxmax()
  max_frac = freqs[max_label]

  if max_frac <= MAX_LABEL_FRACTION:
    break

  # find rows dominated only by the max label
  mask = (df[max_label] == 1) & (df[KEPT_LABELS].sum(axis=1) == 1)
  removable = df[mask]

  if len(removable) == 0:
    break

  # remove a small chunk
  drop_n = min(50, len(removable))
  drop_idx = removable.sample(n=drop_n, random_state=RANDOM_SEED).index
  df = df.drop(drop_idx)
  

print("Final label distribution:")
print(df[KEPT_LABELS].sum().sort_values(ascending=False))

Final label distribution:
neutral     15148
anger        8084
joy          7983
sadness      6758
surprise     5514
disgust      5301
fear         3197
dtype: int64


In [22]:
if len(df) > TARGET_MAX_SAMPLES:
  df = df.sample(n=TARGET_MAX_SAMPLES, random_state=RANDOM_SEED)

elif len(df) < TARGET_MIN_SAMPLES:
  print("Warning: dataset smaller than target minimum")

print(f"Final dataset size: {len(df)}")

Final dataset size: 3000


In [23]:
OUTPUT_PATH = "data/Emotion NLP/goemotions_eval_set.csv"
df.to_csv(OUTPUT_PATH, index=False)

print(f"Saved frozen evaluation set â†’ {OUTPUT_PATH}")

Saved frozen evaluation set â†’ data/Emotion NLP/goemotions_eval_set.csv


# Inference

In [24]:
import time
import json
import torch
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader

In [53]:
KEPT_LABELS = ["joy", "sadness", "anger", "fear", "disgust", "surprise", "neutral"]
DIST_LABELS = ["sadness", "joy", "love", "anger", "fear", "surprise"]
ROBERTA_LABELS = ["anger", "anticipation", "disgust", "fear", "joy", "love", "optimism", "pessimism", "sadness", "surprise", "trust"]

MODEL_NAMES = {
  "distilbert": ["bhadresh-savani/distilbert-base-uncased-emotion", DIST_LABELS],
  "roberta": ["cardiffnlp/twitter-roberta-base-emotion-multilabel-latest", ROBERTA_LABELS],
}

BATCH_SIZE = 16
THRESHOLD = 0.5
MAX_LENGTH = 128

DATA_PATH = "data/Emotion NLP/goemotions_eval_set.csv"
OUTPUT_DIR = "data/Emotion NLP/results/"


In [56]:
df = pd.read_csv(DATA_PATH)

texts = df["text"].tolist()
ids = df["id"].tolist()

n_samples = len(df)
print(f"Loaded {n_samples} samples")

Loaded 3000 samples


In [27]:
class TextDataset(Dataset):
  def __init__(self, texts, tokenizer):
    self.texts = texts
    self.tokenizer = tokenizer

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    return self.tokenizer(
      self.texts[idx],
      padding="max_length",
      truncation=True,
      max_length=MAX_LENGTH,
      return_tensors="pt"
    )


In [None]:
def run_inference(model_name, hf_name, labels_len):
  print(f"\nRunning inference: {model_name}")

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  print(f"Device: {device}")

  tokenizer = AutoTokenizer.from_pretrained(hf_name)
  model = AutoModelForSequenceClassification.from_pretrained(
    hf_name,
    num_labels=labels_len,
    problem_type="multi_label_classification"
  )

  model.to(device)
  model.eval()

  dataset = TextDataset(texts, tokenizer)
  dataloader = DataLoader(dataset, batch_size=BATCH_SIZE)

  all_probs = []

  start_time = time.perf_counter()

  with torch.no_grad():
    for batch in dataloader:
      batch = {k: v.squeeze(1).to(device) for k, v in batch.items()}
      outputs = model(**batch)
      logits = outputs.logits
      probs = torch.sigmoid(logits)
      all_probs.append(probs.cpu().numpy())

  end_time = time.perf_counter()

  probs = np.vstack(all_probs)

  timing = {
    "model": hf_name,
    "device": str(device),
    "batch_size": BATCH_SIZE,
    "num_samples": n_samples,
    "total_inference_time_sec": round(end_time - start_time, 3),
    "avg_time_per_sample_ms": round(
      (end_time - start_time) / n_samples * 1000, 3
    )
  }

  return probs, timing


## Sanity Check

In [50]:
SANITY_N = 5
df_sanity = df.head(SANITY_N)

texts = df_sanity["text"].tolist()
ids = df_sanity["id"].tolist()


probs, timing = run_inference(
  model_name="distilbert",
  hf_name="bhadresh-savani/distilbert-base-uncased-emotion",
  labels_len=len(DIST_LABELS)
)

print("Probabilities shape:", probs.shape)
print("Sample probabilities:\n", probs[:2])

preds = (probs >= THRESHOLD).astype(int)
print("Sample predictions:\n", preds[:2])

for i in range(df_sanity.shape[0]):
  print("\nTEXT:", texts[i])
  for j, label in enumerate(DIST_LABELS):
    print(f"{label:10s} â†’ {probs[i][j]:.3f}")



Running inference: distilbert
Device: cpu
Probabilities shape: (5, 6)
Sample probabilities:
 [[0.97813433 0.7523837  0.09782096 0.48692903 0.2512284  0.04176866]
 [0.9988959  0.18733689 0.15640457 0.37029728 0.1045764  0.09828914]]
Sample predictions:
 [[1 1 0 0 0 0]
 [1 0 0 0 0 0]]

TEXT: She had really bad chronic back pain from what I can remember, so was self treating with alcohol
sadness    â†’ 0.978
joy        â†’ 0.752
love       â†’ 0.098
anger      â†’ 0.487
fear       â†’ 0.251
surprise   â†’ 0.042

TEXT: Ah well I still feel a bit bad for the parrot but at least he wonâ€˜t be harmed :)
sadness    â†’ 0.999
joy        â†’ 0.187
love       â†’ 0.156
anger      â†’ 0.370
fear       â†’ 0.105
surprise   â†’ 0.098

TEXT: But is that a good thing though ðŸ¤”ðŸ¤”
sadness    â†’ 0.450
joy        â†’ 0.993
love       â†’ 0.151
anger      â†’ 0.336
fear       â†’ 0.131
surprise   â†’ 0.081

TEXT: The balance above the base fee gets put somewhere else. Education, healthcare, etc. Voil

In [51]:
probs, timing = run_inference(
  model_name="roberta",
  hf_name="cardiffnlp/twitter-roberta-base-emotion-multilabel-latest",
  labels_len=len(ROBERTA_LABELS)
)

print("Probabilities shape:", probs.shape)
print("Sample probabilities:\n", probs[:2])

preds = (probs >= THRESHOLD).astype(int)
print("Sample predictions:\n", preds[:2])

for i in range(df_sanity.shape[0]):
  print("\nTEXT:", texts[i])
  for j, label in enumerate(DIST_LABELS):
    print(f"{label:10s} â†’ {probs[i][j]:.3f}")


Running inference: roberta
Device: cpu


tokenizer_config.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Probabilities shape: (5, 11)
Sample probabilities:
 [[0.22499156 0.05597249 0.35495785 0.07578404 0.01990303 0.00538129
  0.0675408  0.5977556  0.94322455 0.01272445 0.00929501]
 [0.04210198 0.08851668 0.04805688 0.65962917 0.66848457 0.0283549
  0.85636437 0.03561831 0.08661252 0.01710225 0.04877528]]
Sample predictions:
 [[0 0 0 0 0 0 0 1 1 0 0]
 [0 0 0 1 1 0 1 0 0 0 0]]

TEXT: She had really bad chronic back pain from what I can remember, so was self treating with alcohol
sadness    â†’ 0.225
joy        â†’ 0.056
love       â†’ 0.355
anger      â†’ 0.076
fear       â†’ 0.020
surprise   â†’ 0.005

TEXT: Ah well I still feel a bit bad for the parrot but at least he wonâ€˜t be harmed :)
sadness    â†’ 0.042
joy        â†’ 0.089
love       â†’ 0.048
anger      â†’ 0.660
fear       â†’ 0.668
surprise   â†’ 0.028

TEXT: But is that a good thing though ðŸ¤”ðŸ¤”
sadness    â†’ 0.013
joy        â†’ 0.730
love       â†’ 0.027
anger      â†’ 0.086
fear       â†’ 0.444
surprise   â†’ 0.012

TEX

In [38]:
# from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

# # Load tokenizer
# distil_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# # Load model
# distil_model = DistilBertForSequenceClassification.from_pretrained(
#   "distilbert-base-uncased",
#   problem_type="multi_label_classification",  # for multi-label
#   num_labels=len(kept_labels)               # your 8 labels
# )

from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

# Load tokenizer
distil_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Load model
distil_model = DistilBertForSequenceClassification.from_pretrained(
  "bhadresh-savani/distilbert-base-uncased-emotion",
  problem_type="multi_label_classification",  # for multi-label
  num_labels=len(["sadness", "joy", "love", "anger", "fear", "surprise"])               # your 8 labels
)


In [42]:
# from transformers import RobertaTokenizerFast, RobertaForSequenceClassification

# # Load tokenizer
# roberta_tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

# # Load model
# roberta_model = RobertaForSequenceClassification.from_pretrained(
#   "roberta-base",
#   problem_type="multi_label_classification",
#   num_labels=len(kept_labels)
# )

from transformers import RobertaTokenizerFast, RobertaForSequenceClassification

# Load tokenizer
roberta_tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

# Load model
roberta_model = RobertaForSequenceClassification.from_pretrained(
  "cardiffnlp/twitter-roberta-base-emotion-multilabel-latest",
  problem_type="multi_label_classification",
  num_labels=len(["anger", "anticipation", "disgust", "fear", "joy", "love", "optimism", "pessimism", "sadness", "surprise", "trust"])
)

config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [36]:
# quick inference
import torch

def predict(model, tokenizer, texts, threshold=0.5):
  inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
  with torch.no_grad():
    outputs = model(**inputs)
    probs = torch.sigmoid(outputs.logits)  # multi-label probabilities
    preds = (probs >= threshold).int()
  return preds, probs


In [44]:
print(["sadness", "joy", "love", "anger", "fear", "surprise"])
predict(distil_model, distil_tokenizer, ["I am so happy today!", "This is terrible..."])


['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']


(tensor([[0, 1, 0, 0, 0, 0],
         [1, 0, 0, 0, 0, 0]], dtype=torch.int32),
 tensor([[0.2172, 0.9986, 0.2412, 0.1349, 0.0895, 0.1017],
         [0.9974, 0.1571, 0.0907, 0.4971, 0.2588, 0.0890]]))

In [45]:
print(["anger", "anticipation", "disgust", "fear", "joy", "love", "optimism", "pessimism", "sadness", "surprise", "trust"])
predict(roberta_model, roberta_tokenizer, ["I am so happy today!", "This is terrible..."])

['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']


(tensor([[0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0],
         [1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0]], dtype=torch.int32),
 tensor([[0.0148, 0.0433, 0.0136, 0.0122, 0.9900, 0.6960, 0.8425, 0.0085, 0.0150,
          0.0384, 0.1132],
         [0.9562, 0.0266, 0.9608, 0.6311, 0.0089, 0.0081, 0.0064, 0.1003, 0.5182,
          0.0625, 0.0075]]))

## Full Inference

In [61]:
for short_name, (hf_name, labels) in MODEL_NAMES.items():

  probs, timing = run_inference(short_name, hf_name, len(labels))

  # Convert probabilities â†’ predictions
  preds = (probs >= THRESHOLD).astype(int)

  # Build output dataframe
  out_df = pd.DataFrame({
    "id": ids,
    "text": texts
  })

  for i, label in enumerate(labels):
    out_df[f"conf_{label}"] = probs[:, i]
    out_df[f"pred_{label}"] = preds[:, i]

  # Save predictions
  pred_path = f"{OUTPUT_DIR}/{short_name}_predictions.csv"
  out_df.to_csv(pred_path, index=False)

  # Save timing
  timing_path = f"{OUTPUT_DIR}/{short_name}_timing.json"
  with open(timing_path, "w") as f:
    json.dump(timing, f, indent=2)

  print(f"Saved â†’ {pred_path}")
  print(f"Saved â†’ {timing_path}")



Running inference: distilbert
Device: cpu
Saved â†’ data/Emotion NLP/results//distilbert_predictions.csv
Saved â†’ data/Emotion NLP/results//distilbert_timing.json

Running inference: roberta
Device: cpu
Saved â†’ data/Emotion NLP/results//roberta_predictions.csv
Saved â†’ data/Emotion NLP/results//roberta_timing.json
