In [3]:
!pip install -r requirements.txt

Defaulting to user installation because normal site-packages is not writeable


In [4]:
!pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/nightly/cpu


In [5]:
!pip install --upgrade transformers

Defaulting to user installation because normal site-packages is not writeable


In [6]:
# Import all libs
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sklearn.metrics import (
    f1_score, recall_score, balanced_accuracy_score,
    matthews_corrcoef, precision_recall_curve, average_precision_score
)
from sklearn.preprocessing import label_binarize
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

In [7]:
# Authenticate with the Hugging Face Hub using a token read from the environment
import os
from dotenv import load_dotenv
from huggingface_hub import login

# Google Colab alternative (kept for reference): read the token from Colab user data
# from google.colab import userdata
# hugging_face_token = userdata.get("hf_token")

load_dotenv()  # Local / VS Code: load variables from a .env file into the process environment
hugging_face_token = os.getenv("hf_token")  # None when the "hf_token" variable is not set

login(token=hugging_face_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [8]:
# Report which accelerator is available and select the compute device.
# `device` is assigned on every path (MPS, CUDA or CPU) so later cells can rely on it.
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("MPS device available:", torch.backends.mps.is_available())
    # Smoke test: allocate a tensor on the MPS device to confirm the backend works
    x = torch.randn(1024, 1024, device=device)
    print("Tensor allocated on MPS:", x.device)
else:
    print(f"CUDA device available: {torch.cuda.is_available()}")
    # Fall back to CPU when neither MPS nor CUDA is present
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

CUDA device available: True


Load AG news datasets

In [None]:
# Integer class id -> AG News category name
label_map = {
    0: "world",
    1: "sports",
    2: "business",
    3: "sci/tech"
}
# Load existing prepared files (kept for reference)
ag_news_imbalanced_data_99_to_1 = pd.read_parquet("Data/ag_news/ag_news_train_imbalanced_99_to_1.parquet")
balanced_data = pd.read_parquet("Data/ag_news/ag_news_train_balanced.parquet")
ag_news_imbalanced_data_49_to_1 = pd.read_parquet("Data/ag_news/ag_news_train_imbalanced_49_to_1_ratio.parquet")

# Map numeric labels into text labels
balanced_data["label"] = balanced_data["label"].map(label_map)
ag_news_imbalanced_data_99_to_1["label"] = ag_news_imbalanced_data_99_to_1["label"].map(label_map)
ag_news_imbalanced_data_49_to_1["label"] = ag_news_imbalanced_data_49_to_1["label"].map(label_map)

# Shuffle each dataset with a fixed seed so the row order is identical on every run
ag_news_imbalanced_data_99_to_1 = ag_news_imbalanced_data_99_to_1.sample(frac=1, random_state=42).reset_index(drop=True)
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)
ag_news_imbalanced_data_49_to_1 = ag_news_imbalanced_data_49_to_1.sample(frac=1, random_state=42).reset_index(drop=True)

# Utility: create imbalanced AG News dataset programmatically by choosing a majority label
def split_ratio_for_ag_news(df, majority_label, majority_count, minority_count, random_state=42):
    """Create an imbalanced AG News subset.

    Rows labelled `majority_label` are sampled `majority_count` times; every other
    label found in `df['label']` is sampled `minority_count` times. The combined
    frame is then shuffled and re-indexed.

    Args:
        df (pd.DataFrame): Source frame with a 'label' column.
        majority_label: Label value that receives `majority_count` rows.
        majority_count (int): Number of rows to draw for the majority label.
        minority_count (int): Number of rows to draw for each remaining label.
        random_state (int): Seed used for both the per-label sampling and the final
            shuffle, so repeated calls return the same rows in the same order.

    Returns:
        pd.DataFrame: The shuffled subset with a fresh 0..n-1 index.

    Raises:
        ValueError: If a label has fewer rows than the number requested for it.
    """
    parts = []
    for lab in df['label'].unique().tolist():
        wanted = majority_count if lab == majority_label else minority_count
        subset = df[df['label'] == lab]
        if len(subset) < wanted:
            # Same exception type pandas would raise, but naming the offending label
            raise ValueError(
                f"Label {lab!r} has only {len(subset)} rows but {wanted} were requested"
            )
        parts.append(subset.sample(wanted, random_state=random_state))
    out = pd.concat(parts, ignore_index=True, sort=False)
    # Seed the shuffle too; an unseeded shuffle changes the row order on every call
    return out.sample(frac=1, random_state=random_state).reset_index(drop=True)

# Example: create variants with different majority classes (minority_count=20 as requested)
# Each variant asks for 980 rows of the majority label and 20 rows of every other label.
# NOTE(review): the displayed preview of balanced_data ends at index 998, i.e. roughly 1,000 rows
# in total; if that is the current frame, sampling 980 rows of a single label will raise.
# Confirm the source frame holds enough rows per label.
ag_news_world_majority_99 = split_ratio_for_ag_news(balanced_data, 'world', majority_count=980, minority_count=20)
ag_news_sports_majority_99 = split_ratio_for_ag_news(balanced_data, 'sports', majority_count=980, minority_count=20)
ag_news_business_majority_99 = split_ratio_for_ag_news(balanced_data, 'business', majority_count=980, minority_count=20)

# Keep original balanced_data as the balanced dataset
# Display the balanced frame as a quick visual check
balanced_data


Unnamed: 0,text,label
0,Winamp vulnerable to camouflaged-skin attacks ...,sci/tech
1,Cricket: Warne back to face NZ Australia's Dar...,world
2,Woods Unsure About Tee Time With Injury (AP) A...,sports
3,Greece proved to be ideal host ATHENS - No que...,sports
4,Plan unveiled to ease Iraq debt Germany has un...,world
...,...,...
995,Oil Prices Slip Below #36;46; Risks Abound (R...,business
996,"Poverty, Persecution Stalk Many Indigenous Peo...",sci/tech
997,IBM Putting PC Business Up for Sale Internatio...,sci/tech
998,Video Shows G.I.'s at Weapon Cache A videotape...,world


Load toxic text dataset (The dataset is already imbalanced)

- Rename the columns to have the same names as other datasets
- Label = 0 --> Not toxic
- Label = 1 --> Toxic
- Map label column into words

In [None]:
# Binary toxicity flag -> label name (0 = not toxic, 1 = toxic)
toxic_label_map = {0: "nontoxic", 1: "toxic"}

def split_ratio_for_toxic_dataset(df, majority_label='nontoxic', majority_count=500, minority_count=20, random_state=42):
    """Create an imbalanced toxic_text subset.

    Rows labelled `majority_label` are sampled `majority_count` times; every other
    label found in `df['label']` is sampled `minority_count` times. The combined
    frame is then shuffled and re-indexed.

    Args:
        df (pd.DataFrame): Source frame with a 'label' column.
        majority_label: Label value that receives `majority_count` rows.
        majority_count (int): Number of rows to draw for the majority label.
        minority_count (int): Number of rows to draw for each remaining label.
        random_state (int): Seed used for both the per-label sampling and the final
            shuffle, so repeated calls return the same rows in the same order.

    Returns:
        pd.DataFrame: The shuffled subset with a fresh 0..n-1 index.

    Raises:
        ValueError: If a label has fewer rows than the number requested for it.
    """
    parts = []
    for lab in df['label'].unique():
        wanted = majority_count if lab == majority_label else minority_count
        subset = df[df['label'] == lab]
        if len(subset) < wanted:
            # Same exception type pandas would raise, but naming the offending label
            raise ValueError(
                f"Label {lab!r} has only {len(subset)} rows but {wanted} were requested"
            )
        parts.append(subset.sample(wanted, random_state=random_state))
    out = pd.concat(parts, ignore_index=True, sort=False)
    # Seed the shuffle too; an unseeded shuffle changes the row order on every call
    return out.sample(frac=1, random_state=random_state).reset_index(drop=True)

# Read the toxic-comment training file, keep only the comment text and the binary
# toxic flag, and rename them to the 'text' / 'label' columns used by the other datasets
toxic_text = (
    pd.read_csv("Data/toxic_text/train.csv")[["comment_text", "toxic"]]
    .rename(columns={"comment_text": "text", "toxic": "label"})
)
toxic_text["label"] = toxic_text["label"].map(toxic_label_map)

# Small subsets with different class ratios and majority classes.
# Balanced: 500 nontoxic rows and 500 toxic rows
toxic_balanced = split_ratio_for_toxic_dataset(
    toxic_text, majority_label='nontoxic', majority_count=500, minority_count=500
)
# minority_count was raised to 20 as requested, so the actual ratios are:
# 980 nontoxic : 20 toxic (49:1)
toxic_99_to_1 = split_ratio_for_toxic_dataset(
    toxic_text, majority_label='nontoxic', majority_count=980, minority_count=20
)
# 940 nontoxic : 20 toxic (47:1)
toxic_49_to_1 = split_ratio_for_toxic_dataset(
    toxic_text, majority_label='nontoxic', majority_count=940, minority_count=20
)
# 980 toxic : 20 nontoxic (toxic is the majority class)
toxic_toxic_majority_99 = split_ratio_for_toxic_dataset(
    toxic_text, majority_label='toxic', majority_count=980, minority_count=20
)


Load twitter emotion type dataset (This is also imbalanced)
- Create 3 small datasets, roughly 2000 rows each, with balanced, 99:1, 49:1 imbalanced ratios

In [None]:
# Integer class id -> emotion name (ids follow the list order, starting at 0)
emotion_map = dict(enumerate(["sadness", "joy", "love", "anger", "fear", "surprise"]))

# Load the Twitter emotion data and replace the numeric labels with their names
emotion_df = pd.read_parquet("Data/twit/twitter_emotion.parquet").assign(
    label=lambda frame: frame["label"].map(emotion_map)
)

def split_ratio_for_emotion_dataset(df, majority_label='sadness', majority_count=200, minority_count=20, random_state=42):
    """Create an imbalanced emotion subset.

    Rows labelled `majority_label` are sampled `majority_count` times; every other
    label found in `df['label']` is sampled `minority_count` times. The combined
    frame is then shuffled and re-indexed.

    Args:
        df (pd.DataFrame): Source frame with a 'label' column.
        majority_label: Label value that receives `majority_count` rows.
        majority_count (int): Number of rows to draw for the majority label.
        minority_count (int): Number of rows to draw for each remaining label.
        random_state (int): Seed used for both the per-label sampling and the final
            shuffle, so repeated calls return the same rows in the same order.

    Returns:
        pd.DataFrame: The shuffled subset with a fresh 0..n-1 index.

    Raises:
        ValueError: If a label has fewer rows than the number requested for it.
    """
    parts = []
    for lab in df['label'].unique().tolist():
        wanted = majority_count if lab == majority_label else minority_count
        subset = df[df['label'] == lab]
        if len(subset) < wanted:
            # Same exception type pandas would raise, but naming the offending label
            raise ValueError(
                f"Label {lab!r} has only {len(subset)} rows but {wanted} were requested"
            )
        parts.append(subset.sample(wanted, random_state=random_state))
    out = pd.concat(parts, ignore_index=True, sort=False)
    # Seed the shuffle too; an unseeded shuffle changes the row order on every call
    return out.sample(frac=1, random_state=random_state).reset_index(drop=True)

# Build small subsets of the emotion data with different ratios and majority labels.
# Counts are per label: the majority label gets `majority_count` rows and each of the
# other labels gets `minority_count` rows.
# Balanced: 200 rows for every label
emotion_balanced = split_ratio_for_emotion_dataset(emotion_df, majority_label='sadness', majority_count=200, minority_count=200)
# Use minority_count=20 as requested to reduce random variation
# 950 'sadness' rows and 20 rows for each other label
emotion_imbalanced_99_to_1 = split_ratio_for_emotion_dataset(emotion_df, majority_label='sadness', majority_count=950, minority_count=20)
# 202 'sadness' rows and 20 rows for each other label
emotion_imbalanced_49_to_1 = split_ratio_for_emotion_dataset(emotion_df, majority_label='sadness', majority_count=202, minority_count=20)
# Also create variants where the majority class is 'joy' or 'love' to compare
emotion_joy_majority_99 = split_ratio_for_emotion_dataset(emotion_df, majority_label='joy', majority_count=950, minority_count=20)
emotion_love_majority_99 = split_ratio_for_emotion_dataset(emotion_df, majority_label='love', majority_count=950, minority_count=20)

# Quick check: per-label row counts of the sadness-majority subset
emotion_imbalanced_99_to_1["label"].value_counts()




label
sadness     950
love         10
joy          10
anger        10
surprise     10
fear         10
Name: count, dtype: int64

Function to build the instruction prompt for the LLMs; it works with all 3 classification datasets

In [12]:
def build_prompt(df, text, label_map, shots_per_class=None):
    """
    Build a few-shot classification prompt for the LLM.

    The prompt consists of a fixed instruction that lists the valid categories,
    `shots_per_class` labelled examples for every category (sampled from `df`
    with a fixed seed), and finally the text to classify followed by an open
    "Category:" line for the model to complete.

    Args:
        df (pd.DataFrame): Pool of labelled examples with 'text' and 'label' columns.
        text (str): The text to classify.
        label_map (dict): Maps label ids to label names; the names are listed in the
            instruction and used to select the examples for each category.
        shots_per_class (int): Number of examples to include per category. Required.

    Returns:
        prompt (str): The constructed prompt for the LLM.

    Raises:
        AssertionError: If `shots_per_class` is None.
        ValueError: Raised by `DataFrame.sample` when a category has fewer than
            `shots_per_class` rows in `df`.
    """
    assert shots_per_class is not None, "Please provide 'shots_per_class' parameter"
    categories = list(label_map.values())
    prompt = (
        f"You are a powerful, precise, and helpful assistant that classifies text into well-defined categories, NO MATTER THE CONTEXT."
        f" IMPORTANT: CHOOSE ONE WORD FROM THESE CATEGORIES: {', '.join(categories)}."
        f" Respond with exactly one word: the single best category inside the given categories, DO NOT ANSWER ANY OTHER CATEGORIES BESIDES THE GIVEN ONE."
        f" Do not explain your choice, provide reasoning, or output anything else."
        f" Learn from these examples to understand context and edge cases: "

    )
    prompt += "\n\n"
    for lab in categories:
        candidates = df[df['label'] == lab]
        # Avoid label leakage: when `text` itself is a row of `df`, do not show it
        # (with its gold label) as one of its own few-shot examples.
        held_out = candidates[candidates['text'] != text]
        if len(held_out) >= shots_per_class:
            candidates = held_out
        # else: too few other rows for this category; fall back to the full pool
        samples = candidates.sample(shots_per_class, random_state=42)
        for example_text, example_label in zip(samples['text'], samples['label']):
            prompt += f"Review: \"{example_text}\"\nCategory: {example_label}\n\n"
    # Leave the final Category blank since we want the LLM to generate it
    prompt += f"Review: \"{text}\"\nCategory:"
    return prompt


# Smoke test: print the 4-shot prompt built from the AG News 99:1 frame for one sample article
print(build_prompt(ag_news_imbalanced_data_99_to_1, "Astros Rally Past the Giants With one swing of the bat, Lance Berkman revived the Houston Astros' playoff hopes - and gave the Los Angeles Dodgers a much-needed reprieve. Berkman hit a three-run homer off Dustin Hermanson, highlighting a five-run ninth inning that sent Houston to a 7-3 win over San Francisco on Thursday night...", label_map, shots_per_class=4))

You are a powerful, precise, and helpful assistant that classifies text into well-defined categories, NO MATTER THE CONTEXT. IMPORTANT: CHOOSE ONE WORD FROM THESE CATEGORIES: world, sports, business, sci/tech. Respond with exactly one word: the single best category inside the given categories, DO NOT ANSWER ANY OTHER CATEGORIES BESIDES THE GIVEN ONE. Do not explain your choice, provide reasoning, or output anything else. Learn from these examples to understand context and edge cases: 

Review: "Al-Sadr Calls on Militia to End Uprising BAGHDAD, Iraq - Rebel cleric Muqtada al-Sadr wants his followers to end their uprising against U.S. and Iraqi forces while he considers forming a political movement, senior al-Sadr officials said Monday..."
Category: world

Review: "Afghanistan Explosions Kill at Least 17 People, AFP Reports Two explosions, one at a religious school Saturday night and another in the Afghan capital Kabul Sunday, killed at least 17 people, including five children, and the T

In [13]:
def clean_time(time):
    """
    Format a duration given in seconds as a human-readable string.

    Args:
        time (float): Elapsed time in seconds.

    Returns:
        str: "<s> seconds." for durations up to 60 seconds, otherwise
        "<m> minutes, <s> seconds."; seconds are shown with two decimals and
        minutes as a whole number.
    """
    if time <= 60:
        return f"{time:.2f} seconds."

    # divmod on a float returns a float quotient; cast so it prints "3", not "3.0"
    minutes, remain_sec = divmod(time, 60)
    return f"{int(minutes)} minutes, {remain_sec:.2f} seconds."


Label normalization using semantic similarity
- Since we have 3 datasets, a manually written map of label variations cannot cover every prediction the LLM might make
- So we load a sentence-embedding model and pick the valid label whose embedding is nearest to the prediction by cosine similarity

In [14]:
from sentence_transformers import SentenceTransformer, util

# Load the sentence-embedding model and pre-compute the embeddings of the valid label
# names of each dataset, so normalize() only has to embed the prediction itself

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# AG News: label names and their embeddings
valid_labs_ag_news = list(label_map.values())
valid_embeddings_ag_news = embedding_model.encode(valid_labs_ag_news, convert_to_tensor=True)

# Toxic Text: label names and their embeddings
valid_labs_toxic_text = list(toxic_label_map.values())
valid_embeddings_toxic_text = embedding_model.encode(valid_labs_toxic_text, convert_to_tensor=True)

# Twitter Emotion: label names and their embeddings
valid_labs_emotion = list(emotion_map.values())
valid_embeddings_emotion = embedding_model.encode(valid_labs_emotion, convert_to_tensor=True)

def normalize(label, valid_embeddings, valid_labs):
    """Return the entry of `valid_labs` that is most similar to `label`.

    `label` is embedded with the module-level `embedding_model` and compared by
    cosine similarity against `valid_embeddings`; the entry of `valid_labs` at
    the position of the highest score is returned.
    """
    query_embedding = embedding_model.encode(label, convert_to_tensor=True)
    # cos_sim returns one row of scores per query; there is a single query here
    similarities = util.cos_sim(query_embedding, valid_embeddings)[0]
    best_position = similarities.argmax().item()
    return valid_labs[best_position]

# Smoke test: map an out-of-vocabulary prediction onto the Twitter Emotion label set
raw_preds = "Fucked UP"
normalized_preds = normalize(raw_preds, valid_embeddings=valid_embeddings_emotion, valid_labs=valid_labs_emotion)
print(normalized_preds)


sadness


In [15]:

from tqdm.auto import tqdm
from transformers import pipeline, logging
from time import time


# Load model


# CREATE A FUNCTION TO RUN CLASSIFICATION
def classify(model, df, label_map, shots, batch_size=16, max_new_tokens=3, valid_embeddings=None, valid_labs=None):
    """
    Classify every row of `df` with a few-shot prompted text-generation model.

    A "text-generation" pipeline is created for `model` on every call, one prompt
    per row is built with `build_prompt`, and the prompts are generated in batches
    with greedy decoding. The first whitespace-separated word of each completion
    is taken as the prediction; if it is not one of the label names it is mapped
    to the nearest valid label with `normalize`.

    Args:
        model (str): Model name passed to `transformers.pipeline`.
        df (pd.DataFrame): Data to classify; its 'text' column is read here and the
            frame is also passed to `build_prompt` as the pool of few-shot examples.
        label_map (dict): Maps label ids to label names.
        shots (int): Number of few-shot examples per label.
        batch_size (int): Number of prompts sent to the pipeline per call.
        max_new_tokens (int): Generation budget for each completion.
        valid_embeddings: Embeddings of the valid label names, forwarded to `normalize`.
        valid_labs (List[str]): Valid label names, forwarded to `normalize`.

    Returns:
        pred_arr (List[str]): One predicted label per row of `df`, in row order.
    """
    # Author's notes on pipeline tasks: text2text-generation for the gemma model,
    # text-generation (or text-classification) for the others, fill-mask for distilbert
    pipe = pipeline("text-generation", model=model, dtype=torch.float16)
    logging.set_verbosity_error()

    # Built once instead of on every row; used for the membership test below
    allowed_labels = set(label_map.values())

    # Generate prompts for all rows
    prompts = [build_prompt(df, text, label_map, shots_per_class=shots) for text in df["text"]]

    pred_arr = []
    start_time = time()

    for i in range(0, len(prompts), batch_size):
        batch = prompts[i:i + batch_size]
        results = pipe(batch, max_new_tokens=max_new_tokens, do_sample=False)
        for prompt, res in zip(batch, results):
            # The pipeline output starts with the prompt; keep only the completion
            words = res[0]['generated_text'][len(prompt):].strip().lower().split()
            # An empty completion must still yield one prediction per row, so use ""
            # and let normalize() choose a label instead of failing on words[0]
            first_word = words[0] if words else ""
            if first_word in allowed_labels:
                pred_arr.append(first_word)
            else:
                pred_arr.append(normalize(first_word, valid_embeddings=valid_embeddings, valid_labs=valid_labs))
    end_time = time()

    total_time = clean_time(end_time - start_time)

    print("Total running time is " + total_time)
    return pred_arr



Function to compute all metrics

In [None]:
# Pass list(df["label"]) for y_true
# list(label_map.values())

def eval_llm(y_true, y_pred, label_map):
    """
    Compute classification metrics for predicted labels against gold labels.

    Args:
        y_true (List[str]): Gold label names.
        y_pred (List[str]): Predicted label names, aligned with `y_true`.
        label_map (dict): Maps label ids to label names; the lower-cased names define
            the label set and the order of the per-class results.

    Returns:
        dict: "macro_f1", "macro_recall", "balanced_accuracy" and "mcc" as floats,
        plus "auprc_per_class", "precision_per_class", "recall_per_class" and
        "f1_per_class" as dicts keyed by label name.
    """
    # precision_score is not part of the notebook's top-level sklearn import,
    # so bring it into scope here to avoid a NameError
    from sklearn.metrics import precision_score

    # Compare labels case- and whitespace-insensitively
    y_true_arr = np.array([x.lower().strip() for x in y_true])
    y_pred_arr = np.array([x.lower().strip() for x in y_pred])

    labels = [lab.lower() for lab in list(label_map.values())]

    # Calculate macro scores:
    macro_f1 = f1_score(y_true_arr, y_pred_arr, labels=labels, zero_division=0, average='macro')
    macro_recall = recall_score(y_true_arr, y_pred_arr, labels=labels, average='macro', zero_division=0)
    bal_acc = balanced_accuracy_score(y_true_arr, y_pred_arr)
    mcc = matthews_corrcoef(y_true_arr, y_pred_arr)

    # Calculate per-class precision, recall, f1 (returns arrays aligned with labels)
    precision_per_class_vals = precision_score(y_true_arr, y_pred_arr, labels=labels, average=None, zero_division=0)
    recall_per_class_vals = recall_score(y_true_arr, y_pred_arr, labels=labels, average=None, zero_division=0)
    f1_per_class_vals = f1_score(y_true_arr, y_pred_arr, labels=labels, average=None, zero_division=0)

    precision_per_class = {}
    recall_per_class = {}
    f1_per_class = {}
    for idx, cls in enumerate(labels):
        precision_per_class[cls] = float(precision_per_class_vals[idx])
        recall_per_class[cls] = float(recall_per_class_vals[idx])
        f1_per_class[cls] = float(f1_per_class_vals[idx])

    # Calculate AUPRC per class.
    # Note: the "scores" are the one-hot encoded hard predictions, not model probabilities.
    y_true_bin = label_binarize(y_true_arr, classes=labels)
    y_pred_bin = label_binarize(y_pred_arr, classes=labels)
    if len(labels) == 2 and y_true_bin.shape[1] == 1:
        # With two classes label_binarize returns a single column; expand it to one
        # column per label so the per-class loop below can index both
        y_true_bin = np.hstack([1 - y_true_bin, y_true_bin])
        y_pred_bin = np.hstack([1 - y_pred_bin, y_pred_bin])

    auprc_per_class = {}
    for idx, cls in enumerate(labels):
        ap = average_precision_score(y_true_bin[:, idx], y_pred_bin[:, idx])
        auprc_per_class[cls] = float(ap)

    return {
        "macro_f1": float(macro_f1),
        "macro_recall": float(macro_recall),
        "balanced_accuracy": float(bal_acc),
        "mcc": float(mcc),
        "auprc_per_class": auprc_per_class,
        "precision_per_class": precision_per_class,
        "recall_per_class": recall_per_class,
        "f1_per_class": f1_per_class
    }

    

NOW FOCUSING ON QWEN2.5 INSTRUCT MODELS

In [None]:
# model, df, label_map, shots, batch_size=8, max_new_tokens=3
def run_experiments(model, datasets_dict, dataset_name, label_map, shots_list=[2,4,8], batch_size=16, valid_embeddings=None, valid_labs=None):
    """
    Run few-shot classification for every dataset variant and shot count, and save the metrics.

    For each (dataset, shots) pair the shuffled dataset is classified with `classify`,
    scored with `eval_llm`, and the metrics are written to disk under
    "results/<dataset_name>/": an aggregated CSV per model that is rewritten after
    every run, and a single-row timestamped CSV per run.

    Args:
        model (str): Model name; also used (with '/' replaced by '_') in the file names.
        datasets_dict (dict): Maps a dataset variant name to its DataFrame; each frame's
            'label' column supplies the gold labels.
        dataset_name (str): Name of the results sub-folder.
        label_map (dict): Maps label ids to label names.
        shots_list (list): Shot counts to evaluate; only iterated, never mutated.
        batch_size (int): Batch size forwarded to `classify`.
        valid_embeddings: Forwarded to `classify`.
        valid_labs: Forwarded to `classify`.

    Returns:
        pd.DataFrame: One row per (dataset, shots) run with the model, dataset,
        shots and metric columns.
    """
    import os
    from datetime import datetime, timezone

    results = []
    # Ensure results folder exists for this dataset
    out_dir = os.path.join("results", dataset_name)
    os.makedirs(out_dir, exist_ok=True)

    # File-name pieces that do not change between runs
    safe_model = model.replace('/', '_')
    agg_path = os.path.join(out_dir, f"few_shot_results_{safe_model}.csv")

    for ds_name, df in datasets_dict.items():
        # Fixed seed so every execution evaluates the rows in the same order
        test_df = df.sample(frac=1, random_state=42).reset_index(drop=True)
        for shots in shots_list:
            preds = classify(model, test_df, label_map, shots=shots, batch_size=batch_size, valid_embeddings=valid_embeddings, valid_labs=valid_labs)
            metrics = eval_llm(test_df['label'].tolist(), preds, label_map=label_map)
            row = {
                "model": model,
                "dataset": ds_name,
                "shots": shots,
                **metrics
            }
            results.append(row)

            # persist intermediate aggregated results (single file per model)
            pd.DataFrame(results).to_csv(agg_path, index=False)

            # Also write a per-run file with a UTC timestamp and explicit fields to make later parsing easier.
            # datetime.utcnow() is deprecated; this yields the same "YYYY-MM-DDTHH-MM-SS" text.
            ts = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H-%M-%S')
            perrun_name = f"results__{safe_model}__{ds_name}__shots-{shots}__{ts}.csv"
            perrun_path = os.path.join(out_dir, perrun_name)

            # Save the newest row as a single-row CSV for quick inspection (nested dicts are not flattened)
            pd.DataFrame([row]).to_csv(perrun_path, index=False)

    return pd.DataFrame(results)


In [None]:
# Create a dataset dict for easy mapping

# Each dict maps a dataset variant name to its DataFrame; run_experiments stores the key
# in the "dataset" column of the results and uses it in the per-run file names.
ag_news_datasets_dict = {
    "ag_news_balanced": balanced_data,
    "ag_news_imbalanced_data_99_to_1": ag_news_imbalanced_data_99_to_1,
    "ag_news_imbalanced_data_49_to_1": ag_news_imbalanced_data_49_to_1,
    # Variants where a specific class is the majority (minority_count=20)
    "ag_news_world_majority_99": ag_news_world_majority_99,
    "ag_news_sports_majority_99": ag_news_sports_majority_99,
    "ag_news_business_majority_99": ag_news_business_majority_99
}

toxic_datasets_dict = {
    # The "toxic_text" key holds the balanced subset (toxic_balanced), not the full dataset
    "toxic_text": toxic_balanced,
    "toxic_99_to_1": toxic_99_to_1,
    "toxic_49_to_1": toxic_49_to_1,
    "toxic_toxic_majority_99": toxic_toxic_majority_99
}

emotion_datasets_dict = {
    # The "emotion_df" key holds the balanced subset (emotion_balanced), not the full dataset
    "emotion_df": emotion_balanced,
    "emotion_imbalanced_99_to_1": emotion_imbalanced_99_to_1,
    "emotion_imbalanced_49_to_1": emotion_imbalanced_49_to_1,
    "emotion_joy_majority_99": emotion_joy_majority_99,
    "emotion_love_majority_99": emotion_love_majority_99
}

Run models + evals

In [None]:
# Model id passed to run_experiments for every dataset run below
model = "Qwen/Qwen2.5-3B-Instruct"

In [None]:
# RUN AG NEWS DATASET
# Results are written under results/ag_news/; res_df is reassigned by each dataset run

res_df = run_experiments(model, ag_news_datasets_dict, 'ag_news',label_map, shots_list=[2,4,8], valid_embeddings=valid_embeddings_ag_news, valid_labs=valid_labs_ag_news)

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]



model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]



model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]



model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]



model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]



model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

In [20]:
# RUN TOXIC TEXT DATASET
# Results are written under results/toxic_text/; res_df is reassigned by each dataset run
res_df = run_experiments(model, toxic_datasets_dict, "toxic_text",toxic_label_map, shots_list=[2,4,8], valid_embeddings=valid_embeddings_toxic_text, valid_labs=valid_labs_toxic_text)

Device set to use cuda:0


Total running time is 3.0 minutes, 31.50 seconds.
['nontoxic', 'nontoxic', 'nontoxic', 'toxic', 'nontoxic', 'nontoxic', 'toxic', 'nontoxic', 'nontoxic', 'nontoxic', 'nontoxic', 'toxic', 'toxic', 'toxic', 'toxic', 'nontoxic', 'nontoxic', 'toxic', 'nontoxic', 'nontoxic', 'toxic', 'toxic', 'toxic', 'nontoxic', 'nontoxic', 'toxic', 'toxic', 'nontoxic', 'nontoxic', 'toxic', 'nontoxic', 'nontoxic', 'toxic', 'nontoxic', 'nontoxic', 'nontoxic', 'nontoxic', 'nontoxic', 'toxic', 'nontoxic', 'toxic', 'nontoxic', 'nontoxic', 'nontoxic', 'nontoxic', 'toxic', 'toxic', 'nontoxic', 'nontoxic', 'nontoxic', 'toxic', 'nontoxic', 'nontoxic', 'nontoxic', 'toxic', 'nontoxic', 'nontoxic', 'nontoxic', 'nontoxic', 'toxic', 'toxic', 'toxic', 'toxic', 'nontoxic', 'nontoxic', 'nontoxic', 'nontoxic', 'nontoxic', 'toxic', 'nontoxic', 'nontoxic', 'nontoxic', 'toxic', 'nontoxic', 'nontoxic', 'nontoxic', 'toxic', 'toxic', 'nontoxic', 'nontoxic', 'nontoxic', 'nontoxic', 'nontoxic', 'nontoxic', 'nontoxic', 'nontoxic', '

In [21]:
# RUN TWITTER EMOTION DATASET
# Results are written under results/twitter_emotion/; res_df is reassigned by each dataset run
res_df = run_experiments(model, emotion_datasets_dict, 'twitter_emotion', emotion_map, shots_list=[2,4,8], valid_embeddings=valid_embeddings_emotion, valid_labs=valid_labs_emotion)

Total running time is 3.0 minutes, 52.37 seconds.
['sadness', 'sadness', 'fear', 'joy', 'sadness', 'anger', 'sadness', 'sadness', 'sadness', 'fear', 'surprise', 'sadness', 'sadness', 'joy', 'anger', 'anger', 'anger', 'joy', 'love', 'joy', 'sadness', 'joy', 'sadness', 'joy', 'joy', 'joy', 'joy', 'surprise', 'sadness', 'sadness', 'sadness', 'surprise', 'surprise', 'sadness', 'surprise', 'fear', 'surprise', 'joy', 'fear', 'surprise', 'love', 'anger', 'surprise', 'sadness', 'love', 'anger', 'anger', 'joy', 'sadness', 'anger', 'anger', 'surprise', 'joy', 'sadness', 'anger', 'joy', 'sadness', 'sadness', 'fear', 'love', 'sadness', 'sadness', 'fear', 'anger', 'joy', 'joy', 'surprise', 'sadness', 'sadness', 'sadness', 'sadness', 'love', 'joy', 'love', 'sadness', 'fear', 'sadness', 'anger', 'joy', 'fear', 'joy', 'sadness', 'sadness', 'sadness', 'joy', 'joy', 'joy', 'joy', 'joy', 'joy', 'sadness', 'love', 'surprise', 'surprise', 'sadness', 'surprise', 'fear', 'surprise', 'love', 'sadness', 'joy',