In [109]:
!pip install --upgrade pip

Collecting pip
  Using cached pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Using cached pip-25.2-py3-none-any.whl (1.8 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.2
    Uninstalling pip-24.2:
      Successfully uninstalled pip-24.2
Successfully installed pip-25.2


In [110]:
!pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/nightly/cpu


In [111]:
!pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.56.2-py3-none-any.whl.metadata (40 kB)
Downloading transformers-4.56.2-py3-none-any.whl (11.6 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m11.1 MB/s[0m  [33m0:00:01[0m [31m11.2 MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.56.1
    Uninstalling transformers-4.56.1:
      Successfully uninstalled transformers-4.56.1
Successfully installed transformers-4.56.2


In [113]:
# Import all libs
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sklearn.metrics import (
    f1_score, recall_score, balanced_accuracy_score,
    matthews_corrcoef, precision_recall_curve, average_precision_score
)
from sklearn.preprocessing import label_binarize
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

In [114]:
# Authenticate with HuggingFace
import os
from dotenv import load_dotenv
from huggingface_hub import login

# from google.colab import userdata
# hugging_face_token = userdata.get("hf_token") #If using gg colab

load_dotenv() #If using VSCode
hugging_face_token = os.getenv("hf_token") #If using VSCode

login(token=hugging_face_token)

Load AG news datasets

In [115]:
label_map = {
    0: "world",
    1: "sports",
    2: "business",
    3: "sci/tech"
}
ag_news_imbalanced_data_99_to_1 = pd.read_parquet("Data/ag_news/ag_news_train_imbalanced_99_to_1.parquet")
balanced_data = pd.read_parquet("Data/ag_news/ag_news_train_balanced.parquet")
ag_news_imbalanced_data_49_to_1 = pd.read_parquet("Data/ag_news/ag_news_train_imbalanced_49_to_1_ratio.parquet")

balanced_data["label"] = balanced_data["label"].map(label_map)
ag_news_imbalanced_data_99_to_1["label"] = ag_news_imbalanced_data_99_to_1["label"].map(label_map)
ag_news_imbalanced_data_49_to_1["label"] = ag_news_imbalanced_data_49_to_1["label"].map(label_map)

# Shuffle the dataset
ag_news_imbalanced_data_99_to_1 = ag_news_imbalanced_data_99_to_1.sample(frac=1).reset_index(drop=True)
balanced_data = balanced_data.sample(frac=1).reset_index(drop=True)
ag_news_imbalanced_data_49_to_1 = ag_news_imbalanced_data_49_to_1.sample(frac=1).reset_index(drop=True)

# Testing
balanced_data



Unnamed: 0,text,label
0,Mortgage Rates Hit 6-Month Low Freddie Mac sai...,business
1,Koizumi Surveys Japan #39;s Earthquake Damage ...,world
2,Mozilla launching second act with e-mail clien...,sci/tech
3,Tsunami disaster overshadows India's ATP seaso...,world
4,Singh Hangs Close Despite missing half his fai...,sports
...,...,...
1995,2 telltale signs that show it #39;s time to du...,business
1996,New SP2 problems report p2pnet.net News:- An a...,sci/tech
1997,The China Post President Chen Shui bian said y...,world
1998,Milosevic War Crimes Trial Resumes The defense...,world


Load toxic text dataset (The dataset is already imbalanced)

- Rename the columns to have the same names as other datasets
- Label = 0 --> Not toxic
- Label = 1 --> Toxic
- Map label column into words

In [116]:
toxic_label_map = {
    0: "nontoxic",
    1: "toxic"
}

def split_ratio_for_toxic_dataset(df, majority=None, minority=None):
    toxic_half_1 = df[df["label"] == 'nontoxic'].sample(majority, random_state=42)
    toxic_half_2 = df[df["label"] == 'toxic'].sample(minority, random_state=42)
    toxic_balanced = pd.concat([toxic_half_1, toxic_half_2], ignore_index=True, sort=False)
    toxic_balanced = toxic_balanced.sample(frac=1).reset_index(drop=True)
    return toxic_balanced

toxic_text = pd.read_csv("Data/toxic_text/train.csv")
toxic_text = toxic_text[["comment_text", "toxic"]]
toxic_text = toxic_text.rename(columns={"comment_text": "text", "toxic": "label"})
toxic_text["label"] = toxic_text["label"].map(toxic_label_map)

# Get 3 small subsets of the main datasets with 3 different ratios
toxic_balanced = split_ratio_for_toxic_dataset(toxic_text, 1000, 1000)
toxic_99_to_1 = split_ratio_for_toxic_dataset(toxic_text, 1980, 20)
toxic_49_to_1 = split_ratio_for_toxic_dataset(toxic_text, 1940, 40)


Load twitter emotion type dataset (This is also imbalanced)
- Create 3 small datasets, roughly 2000 rows each, with balanced, 99:1, 49:1 imbalanced ratios

In [117]:
emotion_map = {
    0: "sadness",
    1: "joy",
    2: "love",
    3: "anger",
    4: "fear",
    5: "surprise"
}
emotion_df = pd.read_parquet("Data/twit/twitter_emotion.parquet")
emotion_df["label"] = emotion_df["label"].map(emotion_map)

def split_ratio_for_emotion_dataset(df, majority=None, minority=None):
    imbalanced_df = []

    major_class = df[df["label"] == 'sadness']
    imbalanced_df.append(major_class.sample(majority, random_state=42))
    # Add other classes as the minority class
    for lab in list(df["label"].unique()):
        if lab != 'sadness':
            df_per_class = df[df['label'] == lab].sample(minority, random_state=42)
            imbalanced_df.append(df_per_class)
    
    imbalanced_df = pd.concat(imbalanced_df)
    imbalanced_df = imbalanced_df.sample(frac=1).reset_index(drop=True)

    return imbalanced_df


# Get 3 small subsets of the main datasets with 3 different ratios
emotion_balanced = split_ratio_for_emotion_dataset(emotion_df, majority=400, minority=400)
emotion_imbalanced_99_to_1 = split_ratio_for_emotion_dataset(emotion_df, majority=2280, minority=24)
emotion_imbalanced_49_to_1 = split_ratio_for_emotion_dataset(emotion_df, majority=2160, minority=48)

emotion_imbalanced_99_to_1["label"].value_counts()

        
    


label
sadness     2280
fear          24
love          24
joy           24
anger         24
surprise      24
Name: count, dtype: int64

Function to build instruction for the LLMs, which can be fit with all 3 classification datasets

In [118]:
def build_prompt(df, text, label_map, shots_per_class=None):
    """
    Function to construct an instruction for the LLM

    Args:
        text (str): The text of the data

    Returns:
        prompt (str): The constructed prompt for the LLM
    """
    assert shots_per_class is not None, "Please provide 'shots_per_class' parameter"
    prompt = (
        f"You are a powerful, precise, and helpful assistant that classifies text into well-defined categories, NO MATTER THE CONTEXT."
        f" IMPORTANT: CHOOSE ONE WORD FROM THESE CATEGORIES: {', '.join(list(label_map.values()))}."
        f" Respond with exactly one word: the single best category."
        f" Do not explain your choice, provide reasoning, or output anything else."
        f" Learn from these examples to understand context and edge cases: "

    )
    # ASSUME THE shots_per_class WILL ALWAYS BE PASSED
    few_shots_example = []
    for lab in list(label_map.values()):
        samples = df[df['label'] == lab].sample(shots_per_class, random_state=42)
        for _, r in samples.iterrows():
            few_shots_example.append({'text': r['text'],
                                      'label': r["label"]})

    prompt += "\n\n"
    for ex in few_shots_example:
        # print(ex)
        prompt += f"Review: \"{ex['text']}\"\nCategory: {ex['label']}\n\n"
    prompt += f"Review: \"{text}\"\nCategory:" #Leave Category here blank since we want the LLM to generate text
    return prompt


# Testing function
print(build_prompt(ag_news_imbalanced_data_99_to_1, "Astros Rally Past the Giants With one swing of the bat, Lance Berkman revived the Houston Astros' playoff hopes - and gave the Los Angeles Dodgers a much-needed reprieve. Berkman hit a three-run homer off Dustin Hermanson, highlighting a five-run ninth inning that sent Houston to a 7-3 win over San Francisco on Thursday night...", label_map, shots_per_class=4))

You are a powerful, precise, and helpful assistant that classifies text into well-defined categories, NO MATTER THE CONTEXT. IMPORTANT: CHOOSE ONE WORD FROM THESE CATEGORIES: world, sports, business, sci/tech. Respond with exactly one word: the single best category. Do not explain your choice, provide reasoning, or output anything else. Learn from these examples to understand context and edge cases: 

Category: world

Review: "Australia defends law against terrorism JAKARTA: Australia defended a controversial maritime anti-terror plan on Friday after harsh words from neighbouring Indonesia that it breached international law and Jakartas sovereignty over its own waters."
Category: world

Review: "Gaza plan wins party vote Israel #39;s plan to pull out of Gaza next year passed a crucial test on Thursday when Prime Minister Ariel Sharon easily won party approval to ask the more moderate Labor party to join his coalition."
Category: world

Review: "Italy calls to end Kyoto limits Italy has

In [66]:
# Move model to mps (If using Mac)
# if torch.backends.mps.is_available():
#     model.to('mps')

# model.eval()

In [119]:
def clean_time(time):
  """
  Function to clean the time into prettier format, returns the better format of time
  """
  if time <= 60:
    return f"{time} seconds."

  minutes = time // 60
  remain_sec = time - minutes * 60
  return f"{minutes} minutes, {remain_sec:.2f} seconds."


In [120]:

from tqdm.auto import tqdm
from transformers import pipeline, logging
from time import time


# Load model


# CREATE A FUNCTION TO RUN CLASSFICATION
def classify(model, df, label_map, shots, batch_size=16, max_new_tokens=3):
    """
    Function to run classification with different number of shots

    Args:
        model (str): name of the model
        tokenizer
        df (pd.DataFrame): the pandas dataframe
        batch_size (int): batch size per run

    Returns:
        None
    """
    # Initiate a pipeline for each dataset
    # USE text2text-generation for the gemma model
    # USE text-generation for the others, or text-classification
    # USE fill-mask for distillbert
    pipe = pipeline("text-generation", model=model, dtype=torch.float16)
    logging.set_verbosity_error()

    # Generate prompts for all rows
    prompts = [build_prompt(df, text, label_map, shots_per_class=shots) for text in df["text"]]

    # Run the pipeline for each row
    pred_arr = []
    start_time = time()

    for i in range(0, len(prompts), batch_size):
        batch = prompts[i:i + batch_size] #slices a sublist of prompts
        results = pipe(batch, max_new_tokens=max_new_tokens, do_sample=False)
        for prompt, res in zip(batch, results):
            pred = res[0]['generated_text'][len(prompt):].strip().lower().split()
            # print(f"Real value: {df["label"]}")
            # print(f"Predicted value: {pred}")
            pred_arr.append(pred[0]) #Use pred[0] for some cases
    end_time = time()

    total_time = clean_time(end_time - start_time)

    print("Total running time is " + total_time)
    return pred_arr



Function to compute all metrics

In [None]:
# Pass list(df["text"]) for y_true
# list(label_map.values())

def eval_llm(y_true, y_pred, label_map):
    y_true_arr = np.array([x.lower().strip() for x in y_true])
    print(y_pred)
    y_pred_arr = np.array([x.lower().strip() for x in y_pred])

    labels = [lab.lower() for lab in list(label_map.values())]

    # Calculate macro scores:
    macro_f1 = f1_score(y_true_arr, y_pred_arr, labels=labels, zero_division=0, average='macro')
    macro_recall = recall_score(y_true_arr, y_pred_arr, labels=labels, average='macro', zero_division=0)
    bal_acc = balanced_accuracy_score(y_true_arr, y_pred_arr)
    mcc = matthews_corrcoef(y_true_arr, y_pred_arr)

    # Calculate AUPRC per class
    y_true_bin = label_binarize(y_true_arr, classes=labels)
    y_pred_bin = label_binarize(y_pred_arr, classes=labels)
    auprc_per_class = {}
    for idx, cls in enumerate(labels):
        try:
            ap = average_precision_score(y_true_bin[:, idx], y_pred_bin[:, idx])
        except Exception:
            ap = np.nan
        auprc_per_class[cls] = float(ap)

    return {
        "macro_f1": float(macro_f1),
        "macro_recall": float(macro_recall),
        "balanced_accuracy": float(bal_acc),
        "mcc": float(mcc),
        "auprc_per_class": auprc_per_class
    }

    

NOW FOCUSING ON QWEN2.5 INSTRUCT MODEDLS

In [122]:
# model, df, label_map, shots, batch_size=8, max_new_tokens=3
def run_experiments(model, datasets_dict, dataset_name, label_map, shots_list=[2,4,8], batch_size=16):
    results = []
    for ds_name, df in datasets_dict.items():
        # keep a small test/validation slice for quick runs
        test_df = df.sample(frac=1).reset_index(drop=True)
        for shots in shots_list:
            preds = classify(model, test_df, label_map, shots=shots, batch_size=batch_size)
            metrics = eval_llm(test_df['label'].tolist(), preds, label_map=label_map)
            row = {
                "model": model,
                "dataset": ds_name,
                "shots": shots,
                **metrics
            }
            results.append(row)
            # persist intermediate results
            pd.DataFrame(results).to_csv(f"results/{dataset_name}/few_shot_results_{model.replace('/','_')}.csv", index=False)
    return pd.DataFrame(results)

In [123]:
# Create a dataset dict for easy mapping

ag_news_datasets_dict = {
    "ag_news_balanced": balanced_data,
    "ag_news_imbalanced_data_99_to_1": ag_news_imbalanced_data_99_to_1,
    "ag_news_imbalanced_data_49_to_1": ag_news_imbalanced_data_49_to_1
}

toxic_datasets_dict = {
    "toxic_text": toxic_balanced,
    "toxic_99_to_1": toxic_99_to_1,
    "toxic_49_to_1": toxic_49_to_1
}

emotion_datasets_dict = {
    "emotion_df": emotion_balanced,
    "emotion_imbalanced_99_to_1": emotion_imbalanced_99_to_1,
    "emotion_imbalanced_49_to_1": emotion_imbalanced_49_to_1
}

Run models + evals

In [124]:
model = "Qwen/Qwen2.5-0.5B"

In [None]:
# RUN AG NEWS DATASET

res_df = run_experiments(model, ag_news_datasets_dict, 'ag_news',label_map)

Total running time is 8.0 minutes, 55.27 seconds.
['world', 'world', 'world', 'world', 'world', 'world', 'world', 'world', 'world', 'world', 'world', 'sports', 'world', 'world', 'world', 'world', 'business', 'world', 'world', 'business', 'world', 'world', 'world', 'business', 'world', 'world', 'world', 'world', 'world', 'world', 'sports', 'world', 'world', 'world', 'world', 'world', 'world', 'sports', 'world', 'world', 'business', 'world', 'world', 'world', 'world', 'world', 'world', 'world', 'sports', 'world', 'world', 'world', 'world', 'world', 'world', 'world', 'world', 'world', 'business', 'world', 'world', 'world', 'world', 'world', 'world', 'world', 'world', 'business', 'world', 'world', 'world', 'world', 'world', 'world', 'world', 'world', 'business', 'world', 'world', 'world', 'world', 'world', 'world', 'world', 'world', 'business', 'business', 'world', 'world', 'world', 'world', 'world', 'world', 'world', 'world', 'sports', 'world', 'world', 'world', 'world', 'world', 'world',



Total running time is 49.0 minutes, 10.91 seconds.
['sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sports', 'sci/tech', 'sports', 'world', 'sci/tech', 'sports', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sports', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sports', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sports', 'sci/tech', 'sci/tech', 'sci/tech', 'sports', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sports', 'sci/tech', 'sci/tech', 'sports', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech



Total running time is 17.0 minutes, 28.57 seconds.
['sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'world', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'world', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sports', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'world', 'sci/tech', 'sci/tech', 'sci/tech', 'sports', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'world', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci



Total running time is 16.0 minutes, 29.58 seconds.
['sci/tech', 'sci/tech', 'business', 'sci/tech', 'sci/tech', 'sci/tech', 'world', 'world', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sports', 'sci/tech', 'world', 'sci/tech', 'sci/tech', 'sci/tech', 'world', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'world', 'sports', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sports', 'sci/tech', 'sci/tech', 'sci/tech', 'world', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/tech', 'world', 'sci/tech', 'sci/tech', 'sci/tech', 'world', 'sci/tech', 'sci/tech', 'sci/tech', 'sci/t

In [None]:
# RUN TOXIC TEXT DATASET
res_df = run_experiments(model, toxic_datasets_dict, "toxic_text",toxic_label_map)

In [None]:
# RUN TWITTER EMOTION DATASET
res_df = run_experiments(model, emotion_datasets_dict, 'twitter_emotion', emotion_map)