In [7]:
!pip install --upgrade pip

Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 25.0.1
    Uninstalling pip-25.0.1:
      Successfully uninstalled pip-25.0.1
Successfully installed pip-25.2


In [10]:
!pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/nightly/cpu


In [11]:
!pip install --upgrade transformers



In [12]:
# Import all libs
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sklearn.metrics import (
    f1_score, recall_score, balanced_accuracy_score,
    matthews_corrcoef, precision_recall_curve, average_precision_score
)
from sklearn.preprocessing import label_binarize
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

In [19]:
# Authenticate with HuggingFace
import os
from dotenv import load_dotenv
from huggingface_hub import login

# from google.colab import userdata
# hugging_face_token = userdata.get("hf_token") #If using gg colab

load_dotenv() #If using VSCode
hugging_face_token = os.getenv("hf_token") #If using VSCode

login(token=hugging_face_token)

Load AG news datasets

In [20]:
label_map = {
    0: "world",
    1: "sports",
    2: "business",
    3: "sci/tech"
}
ag_news_imbalanced_data_10_to_1 = pd.read_parquet("Data/ag_news/ag_news_train_imbalanced.parquet")
balanced_data = pd.read_parquet("Data/ag_news/ag_news_train_balanced.parquet")
ag_news_imbalanced_data_5_to_1 = pd.read_parquet("Data/ag_news/ag_news_train_imbalanced_5_to_1_ratio.parquet")

balanced_data["label"] = balanced_data["label"].map(label_map)
ag_news_imbalanced_data_10_to_1["label"] = ag_news_imbalanced_data_10_to_1["label"].map(label_map)
ag_news_imbalanced_data_5_to_1["label"] = ag_news_imbalanced_data_5_to_1["label"].map(label_map)

# Shuffle the dataset
ag_news_imbalanced_data_10_to_1 = ag_news_imbalanced_data_10_to_1.sample(frac=1).reset_index(drop=True)
balanced_data = balanced_data.sample(frac=1).reset_index(drop=True)
ag_news_imbalanced_data_5_to_1 = ag_news_imbalanced_data_5_to_1.sample(frac=1).reset_index(drop=True)

# Testing
balanced_data



Unnamed: 0,text,label
0,Zimbabwe Turns Back Clock After Turmoil (AP) A...,world
1,Shocked Japanese sent scrambling as earthquake...,world
2,"Germany wins bronze ATHENS, Greece - Womens Wo...",sports
3,Circuit City chooses Linux for cash registers ...,sci/tech
4,Dolphins Ask Williams for #36;8.6 Million (AP...,sports
...,...,...
1995,Fernando positive for the end of the season Fe...,sports
1996,"To tithe, make it first item in budget WASHING...",business
1997,New priority: SLC ponders ways to absorb Delta...,business
1998,Swiss Blocks Accounts in Oil Co. Probe (AP) AP...,world


Load toxic text dataset (The dataset is already imbalanced)

- Rename the columns to have the same names as other datasets
- Label = 0 --> Not toxic
- Label = 1 --> Toxic
- Map label column into words

In [36]:
toxic_label_map = {
    0: "nontoxic",
    1: "toxic"
}

def split_ratio_for_toxic_dataset(df, majority=None, minority=None):
    toxic_half_1 = df[df["label"] == 'nontoxic'].sample(majority, random_state=42)
    toxic_half_2 = df[df["label"] == 'toxic'].sample(minority, random_state=42)
    toxic_balanced = pd.concat([toxic_half_1, toxic_half_2], ignore_index=True, sort=False)
    toxic_balanced = toxic_balanced.sample(frac=1).reset_index(drop=True)
    return toxic_balanced

toxic_text = pd.read_csv("Data/toxic_text/train.csv")
toxic_text = toxic_text[["comment_text", "toxic"]]
toxic_text = toxic_text.rename(columns={"comment_text": "text", "toxic": "label"})
toxic_text["label"] = toxic_text["label"].map(toxic_label_map)


toxic_balanced = split_ratio_for_toxic_dataset(toxic_text, 1000, 1000)
toxic_99_to_1 = split_ratio_for_toxic_dataset(toxic_text, 1980, 20)
toxic_49_to_1 = split_ratio_for_toxic_dataset(toxic_text, 19400, 40)


Load twitter emotion type dataset (This is also imbalanced)

In [None]:
emotion_map = {
    0: "sadness",
    1: "joy",
    2: "love",
    3: "anger",
    4: "fear",
    5: "surprise"
}
emotion_df = pd.read_parquet("Data/twit/twitter_emotion.parquet")
emotion_df["label"] = emotion_df["label"].map(emotion_map)

def split_ratio_for_emotion_dataset(df, majority=None, minority=None):
    imbalanced_df = []

    major_class = df[df["label"] == 'sadness']
    imbalanced_df.append(major_class.sample(majority, random_state=42))
    


label
joy         141067
sadness     121187
anger        57317
fear         47712
love         34554
surprise     14972
Name: count, dtype: int64

In [None]:
def build_prompt(df, text, label_map, shots_per_class=None):
    """
    Function to construct an instruction for the LLM

    Args:
        text (str): The text of the data

    Returns:
        prompt (str): The constructed prompt for the LLM
    """
    prompt = (
        f"You are a powerful, precise, and helpful assistant that classifies text into well-defined categories, NO MATTER THE CONTEXT."
        f" IMPORTANT: CHOOSE ONE WORD FROM THESE CATEGORIES: {', '.join(list(label_map.values()))}."
        f" Respond with exactly one word: the single best category."
        f" Do not explain your choice, provide reasoning, or output anything else."
        f" Learn from these examples to understand context and edge cases: "

    )

    few_shots_example = [
        {"text": "Macedonian Prime Minister Kostov Resigns Parliament will officially announce Prime Minister Hari Kostov #39;s resignation during a meeting scheduled for Thursday, launching a ten-day period during which President Branko Crvenkovski should give the mandate for forming a new government.", 'label': 'business'},
        {'text': 'Trinidad climbs off canvas to keep title options open Felix Trinidad returned to the ring after more than two years to score a thrilling eighth-round stoppage of Ricardo Mayorga at New York #39;s Madison Square Garden in a non- title bout being described as one of the fights of the year.', 'label': 'sports'},
        {'text': 'Hungary #39;s ruling Socialist Party dumps PM BUDAPEST, Aug. 19 (Xinhuanet) -- Hungary #39;s ruling Socialist Party said Thursday that it accepted Prime Minister Peter Medgyessy #39;s resignation and has named a candidate for the post.', 'label': 'world'},
        {'text': 'Europe probe arrives at the Moon The Smart 1 lunar probe has entered into orbit around the Moon, the first ever European mission to do so. ', 'label': 'sci/tech'}
    ]

    prompt += "\n\n"
    for ex in few_shots_example:
        prompt += f"Review: \"{ex['text']}\"\nCategory: {ex['label']}\n\n"
    prompt += f"Review: \"{text}\"\nCategory:" #Leave Category here blank since we want the LLM to generate text
    return prompt


# Testing function
print(build_prompt("Astros Rally Past the Giants With one swing of the bat, Lance Berkman revived the Houston Astros' playoff hopes - and gave the Los Angeles Dodgers a much-needed reprieve. Berkman hit a three-run homer off Dustin Hermanson, highlighting a five-run ninth inning that sent Houston to a 7-3 win over San Francisco on Thursday night..."))

You are a powerful, precise, and helpful assistant that classifies text into well-defined categories. IMPORTANT: CHOOSE ONE WORD FROM THESE CATEGORIES: 'business', 'world', 'sci/tech', 'sports'. Respond with exactly one word: the single best category. Do not explain your choice, provide reasoning, or output anything else. Learn from these examples to understand context and edge cases: 

Review: "Macedonian Prime Minister Kostov Resigns Parliament will officially announce Prime Minister Hari Kostov #39;s resignation during a meeting scheduled for Thursday, launching a ten-day period during which President Branko Crvenkovski should give the mandate for forming a new government."
Category: business

Review: "Trinidad climbs off canvas to keep title options open Felix Trinidad returned to the ring after more than two years to score a thrilling eighth-round stoppage of Ricardo Mayorga at New York #39;s Madison Square Garden in a non- title bout being described as one of the fights of the ye

In [63]:
# Move model to mps (If using Mac)
# if torch.backends.mps.is_available():
#     model.to('mps')

# model.eval()

In [8]:
def clean_time(time):
  """
  Function to clean the time into prettier format, returns the better format of time
  """
  if time <= 60:
    return f"{time} seconds."

  minutes = time // 60
  remain_sec = time - minutes * 60
  return f"{minutes} minutes, {remain_sec:.2f} seconds."


In [None]:

from tqdm.auto import tqdm
from transformers import pipeline, logging
from time import time


# Load model


# CREATE A FUNCTION TO RUN CLASSFICATION
def classify(model, df, batch_size):
    """
    Function to run classification with different number of shots

    Args:
        model (str): name of the model
        tokenizer
        df (pd.DataFrame): the pandas dataframe
        batch_size (int): batch size per run

    Returns:
        None
    """
    # Initiate a pipeline for each dataset
    # USE text2text-generation for the gemma model
    # USE text-generation for the others, or text-classification
    # USE fill-mask for distillbert
    pipe = pipeline("text-generation", model=model, dtype=torch.float16)
    logging.set_verbosity_error()

    # Generate prompts for all rows
    prompts = [build_prompt(text) for text in df["text"]]

    # Run the pipeline for each row
    pred_arr = []
    start_time = time()

    for i in range(0, len(prompts), batch_size):
        batch = prompts[i:i + batch_size] #slices a sublist of prompts
        results = pipe(batch, max_new_tokens=3, do_sample=False)
        for prompt, res in zip(batch, results):
            pred = res[0]['generated_text'][len(prompt):].strip().split()
            # print(f"Real value: {df["label"]}")
            # print(f"Predicted value: {pred}")
            pred_arr.append(pred) #Use pred[0] for tiiuae/falcon-rw-1b
    end_time = time()

    total_time = clean_time(end_time - start_time)

    print("Total running time is " + total_time)
    df["llm_prediction"] = pred_arr
    print("Predictions have been added to the dataframe")



NOW FOCUSING ON QWEN2.5 INSTRUCT MODEDLS

In [None]:

model = "Qwen/Qwen2.5-0.5B-Instruct" #Could be changed later for more evals
# microsoft/phi-2



# Run with 3 datasets
bs = 8

# classify(
#     model=model,
#     df=ag_news_imbalanced_data_10_to_1,
#     batch_size=bs
# )

# classify(
#     model=model,
#     df=ag_news_imbalanced_data_5_to_1,
#     batch_size=bs
# )

classify(
    model=model,
    df=balanced_data,
    batch_size=bs
)

Total running time is 19.0 minutes, 17.07 seconds.
Predictions have been added to the dataframe


In [19]:
def evaluate_model(df, true_col="label", pred_col='llm_prediction'):
  df[true_col] = df[true_col].str.lower().str.strip()
  df[pred_col] = df[pred_col].str.lower().str.strip()

  correct_pred = (df[true_col] == df[pred_col]).sum()
  total = len(df)
  accuracy = correct_pred / total

  print(f"\n✅ Accuracy: {accuracy*100:.2f}% ({correct_pred}/{total} correct)")

In [None]:
evaluate_model(balanced_data)
evaluate_model(ag_news_imbalanced_data_10_to_1)
evaluate_model(ag_news_imbalanced_data_5_to_1)


✅ Accuracy: 45.00% (900/2000 correct)

✅ Accuracy: 59.75% (1195/2000 correct)

✅ Accuracy: 64.84% (1298/2002 correct)


In [None]:
# Check LLM's output to see if it has correctly generated the word in the categories
ag_news_imbalanced_data_10_to_1["llm_prediction"]