In [56]:
!pip install -r req.txt



In [57]:
!pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/nightly/cpu


In [1]:
import torch

# Report whether the Apple-silicon (MPS) backend can be used. `torch.backends.mps` is the
# same entry point the (commented-out) device-move cell further down relies on.
print(torch.backends.mps.is_available()) #if using mac
# print(torch.cuda.is_available()) #if using GPU with cuda

True


In [3]:
import os
from dotenv import load_dotenv
from huggingface_hub import login

# Google Colab alternative: fetch the token through Colab's `userdata` helper instead.
# from google.colab import userdata
# hugging_face_token = userdata.get("hf_token")

# Local run (e.g. VSCode): let python-dotenv populate the environment,
# then read the token from the "hf_token" environment variable.
load_dotenv()
hugging_face_token = os.environ.get("hf_token")

# Authenticate this session against the Hugging Face Hub.
login(token=hugging_face_token)

In [4]:
import pandas as pd

# Integer class id -> category name used in the prompt and in the evaluation.
label_map = {
    0: "world",
    1: "sports",
    2: "business",
    3: "sci/tech"
}

# Fixed seed so the shuffled row order (and therefore the sample displayed at the
# end of this cell) is identical on every Restart & Run All.
SEED = 42

imbalanced_data = pd.read_parquet("Data/ag_news_train_imbalanced.parquet")
balanced_data = pd.read_parquet("Data/ag_news_train_balanced.parquet")
imbalanced_data_5_to_1 = pd.read_parquet("Data/ag_news_train_imbalanced_5_to_1_ratio.parquet")

# Replace the numeric labels with their category names.
balanced_data["label"] = balanced_data["label"].map(label_map)
imbalanced_data["label"] = imbalanced_data["label"].map(label_map)
imbalanced_data_5_to_1["label"] = imbalanced_data_5_to_1["label"].map(label_map)

# Shuffle the dataset (reproducibly) and rebuild a clean 0..n-1 index
imbalanced_data = imbalanced_data.sample(frac=1, random_state=SEED).reset_index(drop=True)
balanced_data = balanced_data.sample(frac=1, random_state=SEED).reset_index(drop=True)
imbalanced_data_5_to_1 = imbalanced_data_5_to_1.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Testing: display the text of the second 'sci/tech' row as a sanity check
balanced_data[balanced_data["label"] == 'sci/tech'].iloc[1]["text"]



'Study claims lake on Mars was wide, deep NASA #39;s Mars rover Opportunity found evidence for a lake or sea on Mars, and new research suggests the body of water was deep, large and long-lasting.'

In [5]:
def build_prompt(text):
    """
    Assemble the few-shot classification prompt for a single piece of text.

    The prompt consists of a fixed instruction paragraph, four labelled
    demonstrations (one per category label) and finally the text to classify,
    followed by an open "Category:" slot for the model to complete.

    Args:
        text (str): The text to be classified

    Returns:
        str: The complete prompt, ending with "Category:" and no trailing space
    """
    # Instruction sentences; each one after the first carries its own leading space.
    instruction_parts = (
        "You are a powerful, precise, and helpful assistant that classifies text into well-defined categories.",
        " Your task is to carefully analyze the meaning, tone, and intent of the given text, then select the most appropriate category.",
        " Choose only from the following categories: 'business', 'world', 'sci/tech', 'sports'.",
        " Respond with exactly one word: the single best category.",
        " Do not explain your choice, provide reasoning, or output anything else.",
        " Learn from these examples to understand context and edge cases: ",
    )

    # (text, label) demonstrations shown to the model before the real query.
    # NOTE(review): the first demonstration reads like political/world news but is
    # labelled 'business' - confirm that this label is intended.
    demonstrations = (
        (
            "Macedonian Prime Minister Kostov Resigns Parliament will officially announce Prime Minister Hari Kostov #39;s resignation during a meeting scheduled for Thursday, launching a ten-day period during which President Branko Crvenkovski should give the mandate for forming a new government.",
            'business',
        ),
        (
            'Trinidad climbs off canvas to keep title options open Felix Trinidad returned to the ring after more than two years to score a thrilling eighth-round stoppage of Ricardo Mayorga at New York #39;s Madison Square Garden in a non- title bout being described as one of the fights of the year.',
            'sports',
        ),
        (
            'Hungary #39;s ruling Socialist Party dumps PM BUDAPEST, Aug. 19 (Xinhuanet) -- Hungary #39;s ruling Socialist Party said Thursday that it accepted Prime Minister Peter Medgyessy #39;s resignation and has named a candidate for the post.',
            'world',
        ),
        (
            'Europe probe arrives at the Moon The Smart 1 lunar probe has entered into orbit around the Moon, the first ever European mission to do so. ',
            'sci/tech',
        ),
    )

    solved_block = 'Review: "{}"\nCategory: {}\n\n'
    pieces = ["".join(instruction_parts), "\n\n"]
    pieces.extend(
        solved_block.format(sample_text, sample_label)
        for sample_text, sample_label in demonstrations
    )
    # The category of the final entry is left blank: the LLM is expected to generate it.
    pieces.append('Review: "{}"\nCategory:'.format(text))
    return "".join(pieces)


# Smoke test: print the full prompt produced for one sample article
sample_article = "Astros Rally Past the Giants With one swing of the bat, Lance Berkman revived the Houston Astros' playoff hopes - and gave the Los Angeles Dodgers a much-needed reprieve. Berkman hit a three-run homer off Dustin Hermanson, highlighting a five-run ninth inning that sent Houston to a 7-3 win over San Francisco on Thursday night..."
print(build_prompt(sample_article))

You are a powerful, precise, and helpful assistant that classifies text into well-defined categories. Your task is to carefully analyze the meaning, tone, and intent of the given text, then select the most appropriate category. Choose only from the following categories: 'business', 'world', 'sci/tech', 'sports'. Respond with exactly one word: the single best category. Do not explain your choice, provide reasoning, or output anything else. Learn from these examples to understand context and edge cases: 

Review: "Macedonian Prime Minister Kostov Resigns Parliament will officially announce Prime Minister Hari Kostov #39;s resignation during a meeting scheduled for Thursday, launching a ten-day period during which President Branko Crvenkovski should give the mandate for forming a new government."
Category: business

Review: "Trinidad climbs off canvas to keep title options open Felix Trinidad returned to the ring after more than two years to score a thrilling eighth-round stoppage of Rica

In [10]:
# Load the model
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Hub identifier of the checkpoint; kept in its own variable so that `model`
# only ever refers to the loaded model object.
model_id = "Qwen/Qwen2.5-0.5B" #Could be changed later for more evals

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Load the causal LM weights in half precision (float16).
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/681 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

In [63]:
# Move model to mps (If using Mac)
# if torch.backends.mps.is_available():
#     model.to('mps')

# model.eval()

In [11]:
def clean_time(time):
  """
  Format a duration given in seconds as a human-readable string.

  Args:
      time (int | float): elapsed time in seconds

  Returns:
      str: "<s.ss> seconds." for durations of at most 60 seconds, otherwise
      "<m> minutes, <s.ss> seconds." where <m> is a whole number of minutes.
  """
  # Round once up front so the seconds part can never be rendered as "60.00".
  total = round(float(time), 2)
  if total <= 60:
    return f"{total:.2f} seconds."

  # divmod on a float returns a float quotient; cast it so we print "5", not "5.0".
  minutes, remain_sec = divmod(total, 60)
  return f"{int(minutes)} minutes, {remain_sec:.2f} seconds."


In [12]:

from tqdm.auto import tqdm
from transformers import pipeline, logging
from time import time


# Load model


# CREATE A FUNCTION TO RUN CLASSIFICATION
def classify(model, tokenizer, df, batch_size):
    """
    Classify every row of ``df`` with the LLM and store the predictions in ``df``.

    A prompt is built with ``build_prompt`` for each value of ``df["text"]``. The prompts
    are handed to a ``text-generation`` pipeline in slices of ``batch_size``, and the first
    whitespace-delimited word of each completion is kept as the prediction.

    Args:
        model: model handed straight to ``transformers.pipeline`` (this notebook passes
            the already loaded causal LM object)
        tokenizer: tokenizer handed to ``transformers.pipeline``; its ``eos_token_id`` is
            also used as ``pad_token_id`` during generation
        df (pd.DataFrame): dataframe with a "text" column; it is modified in place
        batch_size (int): number of prompts passed to the pipeline per call (must be >= 1)

    Returns:
        None. Predictions are written to ``df["llm_prediction"]`` and the total running
        time is printed.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Initiate a pipeline for this dataset, then silence further transformers log output
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
    logging.set_verbosity_error()

    # Generate prompts for all rows
    prompts = [build_prompt(text) for text in df["text"]]

    pred_arr = []
    start_time = time()

    # NOTE(review): the pipeline call below receives no `batch_size=` argument, so whether a
    # slice is executed as one padded batch or prompt by prompt depends on the pipeline's
    # default - confirm against the transformers pipeline batching documentation.
    for start in tqdm(range(0, len(prompts), batch_size), desc="Classifying"):
        batch = prompts[start:start + batch_size] #slices a sublist of prompts
        results = pipe(batch, max_new_tokens=3, do_sample=False, pad_token_id=tokenizer.eos_token_id)
        for prompt, res in zip(batch, results):
            # The generated text echoes the prompt; keep only what the model added.
            completion = res[0]['generated_text'][len(prompt):]
            words = completion.split()
            # An empty / whitespace-only completion is recorded as "" (scored as a miss
            # later) instead of aborting the whole run with an IndexError.
            pred_arr.append(words[0] if words else "")
    end_time = time()

    total_time = clean_time(end_time - start_time)

    print("Total running time is " + total_time)
    df["llm_prediction"] = pred_arr
    print("Predictions have been added to the dataframe")



In [13]:
# Run with 3 datasets
bs = 8

# Same model, tokenizer and batch size for every dataset; predictions are written
# into each dataframe by classify().
for dataset in (imbalanced_data, imbalanced_data_5_to_1, balanced_data):
    classify(model=model, tokenizer=tokenizer, df=dataset, batch_size=bs)

Device set to use mps:0


Total running time is 5.0 minutes, 50.05 seconds.
Predictions have been added to the dataframe
Total running time is 5.0 minutes, 50.75 seconds.
Predictions have been added to the dataframe
Total running time is 5.0 minutes, 51.57 seconds.
Predictions have been added to the dataframe


In [19]:
def evaluate_model(df, true_col="label", pred_col='llm_prediction'):
  """
  Print the accuracy of the predictions stored in ``df``.

  Both columns are compared case-insensitively and with surrounding whitespace
  removed. The normalisation is done on temporary copies, so ``df`` itself is
  left untouched.

  Args:
      df (pd.DataFrame): dataframe holding the true and the predicted labels
      true_col (str): name of the column with the true labels
      pred_col (str): name of the column with the predicted labels

  Returns:
      None. The accuracy is printed.
  """
  truth = df[true_col].str.lower().str.strip()
  guess = df[pred_col].str.lower().str.strip()

  correct_pred = int((truth == guess).sum())
  total = len(df)
  # An empty dataframe is reported as 0.00% (0/0) instead of dividing by zero.
  accuracy = correct_pred / total if total else 0.0

  print(f"\n✅ Accuracy: {accuracy*100:.2f}% ({correct_pred}/{total} correct)")

In [20]:
# Print the dataset name in front of each report so every accuracy line can be
# attributed to its dataset without counting the order of the calls.
for dataset_name, dataset in (
    ("balanced_data", balanced_data),
    ("imbalanced_data", imbalanced_data),
    ("imbalanced_data_5_to_1", imbalanced_data_5_to_1),
):
    print(f"\n{dataset_name}:", end="")
    evaluate_model(dataset)


✅ Accuracy: 34.60% (692/2000 correct)

✅ Accuracy: 67.15% (1343/2000 correct)

✅ Accuracy: 79.67% (1595/2002 correct)


In [18]:
# Distinct values the model produced for this dataset (quick sanity check of the outputs)
imbalanced_data_5_to_1.loc[:, "llm_prediction"].unique()

array(['world', 'business', 'sports', 'sci/tech'], dtype=object)