In [1]:
import pandas as pd

# Get training dataset
splits = {'train': 'data/train-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
df_train = pd.read_parquet("hf://datasets/fancyzhx/ag_news/" + splits["train"])
df_train.head()

# Get testing dataset
df_test = pd.read_parquet("hf://datasets/fancyzhx/ag_news/" + splits["test"])
df_test.head()

Unnamed: 0,text,label
0,Fears for T N pension after talks Unions repre...,2
1,The Race is On: Second Private Team Sets Launc...,3
2,Ky. Company Wins Grant to Study Peptides (AP) ...,3
3,Prediction Unit Helps Forecast Wildfires (AP) ...,3
4,Calif. Aims to Limit Farm-Related Smog (AP) AP...,3


In [25]:
from dotenv import load_dotenv
import os
import requests
import time
import json


load_dotenv()
API_KEY = os.getenv("API_KEY")

# Sample out an balanced training data
n_rows_per_class = 1000
balanced_dfs = []

for label in sorted(df_train["label"].unique()):
    class_samples = df_train[df_train["label"] == label]
    balanced_dfs.append(class_samples.sample(n_rows_per_class, random_state=42))

balanced_data = pd.concat(balanced_dfs)

# Sample out an imbalanced training data (Assume label 0 as the majority class)
n_majority = 2000
n_minority = 200

imbalanced_dfs = []
label_0_class = df_train[df_train["label"] == 0]
imbalanced_dfs.append(label_0_class.sample(n_majority, random_state=42))

for label in sorted(df_train["label"].unique())[1:]:
    class_samples = df_train[df_train['label'] == label]
    imbalanced_dfs.append(class_samples.sample(n_minority, random_state=42))

imbalanced_data = pd.concat(imbalanced_dfs)
# Shuffle the imbalanced dataset to mix the classes
imbalanced_data = imbalanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

print("\nBalanced Training Set Label Distribution:")
print(balanced_data['label'].value_counts().sort_index())

print("\nImbalanced Training Set Label Distribution:")
print(imbalanced_data['label'].value_counts().sort_index())




Balanced Training Set Label Distribution:
label
0    1000
1    1000
2    1000
3    1000
Name: count, dtype: int64

Imbalanced Training Set Label Distribution:
label
0    2000
1     200
2     200
3     200
Name: count, dtype: int64


In [21]:
# Create a small testing set
n_rows_per_class = 1000
test_balanced_dfs = []
for label in sorted(df_test["label"].unique()):
    test_samples = df_test[df_test["label"] == label]
    test_balanced_dfs.append(test_samples.sample(n_rows_per_class, random_state=42))

testing_set = pd.concat(test_balanced_dfs)
testing_set = testing_set.sample(frac=1, random_state=42).reset_index(drop=True)



In [26]:
# Save datasets to parquet for later use
balanced_data.to_parquet('Data/ag_news_train_balanced.parquet')
imbalanced_data.to_parquet('Data/ag_news_train_imbalanced.parquet')
testing_set.to_parquet('Data/ag_news_test_small.parquet')

In [5]:
# Create a function to build the prompt strings on both balanced and imbalanced
label_map = {
    0: "World",
    1: "Sports",
    2: "Business",
    3: "Sci/Tech"
}

def build_shots_prompt(train_df, shots=None, imbalanced_ratio=None):
    prompt_lines = ["Classify the following text into one of these categories: World, Sports, Business, Sci/Tech", 
                   "", 
                   "IMPORTANT: Respond with ONLY the category name, nothing else.",
                   "",
                   "Examples:"]
    
    if shots:
        # Build a balanced prompt
        for label in sorted(train_df['label'].unique()):
            class_samples = train_df[train_df['label'] == label].sample(shots, random_state=42)
            for _, row in class_samples.iterrows():
                prompt_lines.append(f"Text: {row['text']}")
                prompt_lines.append(f"Category: {label_map[row['label']]}")
                prompt_lines.append("")  # Add a blank line between examples
                
    elif imbalanced_ratio:
        # Build an imbalanced prompt based on the provided ratios
        for label, n_shots in imbalanced_ratio.items():
            class_samples = train_df[train_df['label'] == label].sample(n_shots, random_state=42)
            for _, row in class_samples.iterrows():
                prompt_lines.append(f"Text: {row['text']}")
                prompt_lines.append(f"Category: {label_map[row['label']]}")
                prompt_lines.append("")  # Add a blank line between examples
    else:
        raise ValueError("Must provide either 'shots_per_class' or 'imbalanced_ratios'")
        
    # Join all lines into a single string
    prompt_str = "\n".join(prompt_lines)
    return prompt_str


In [6]:
def classify(text, examples_prompt, model_name, api_key):
    full_prompt = f"{examples_prompt}\n\nNow classify this new text:\nText: {text}\nCategory:"
    payload = {
        "model": model_name, 
        "messages": [
            {"role": "user", "content": full_prompt}
        ],
        "temperature": 0.0,  
        "max_tokens": 10     
    }

    # The headers required by Open Router
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    try:
        response = requests.post(
            url="https://openrouter.ai/api/v1/chat/completions",
            headers=headers,
            data=json.dumps(payload)
        )
        response.raise_for_status() 
        
        result = response.json()
        # Extract the model's response, which is similar to the OpenAI API.
        prediction = result['choices'][0]['message']['content'].strip()
        pred_lines = prediction.splitlines()[0].lower()
        for label in label_map.values():
            if label.lower() in pred_lines:
                return label
        return prediction.strip()
        
    except requests.exceptions.RequestException as e:
        print(f"Request error for text '{text[:50]}...': {e}")
        time.sleep(5)  
        return "Error"
    except KeyError as e:
        print(f"Could not parse response for text '{text[:50]}...': {e}. Response: {result}")
        return "Error"


In [10]:
from sklearn.metrics import classification_report
import re

model = "meta-llama/llama-4-maverick:free"

# Define expected labels
expected_labels = list(label_map.values())
print(f"Expected labels: {expected_labels}")

balanced_prompt = build_shots_prompt(balanced_data, shots=2)
imbalanced_prompt = build_shots_prompt(balanced_data, imbalanced_ratio={0: 4, 1: 1, 2: 1, 3: 1})

print(f"--- EVALUATING {model} ---")
model_res = {}

def clean_prediction(pred, expected_labels):
    if not pred:
        return "Unknown"
    
    # Convert to lowercase and remove extra spaces
    pred = pred.lower().strip()
    
    
    pred = re.sub(r'[^\w\s]', '', pred)  
    pred = re.sub(r'\b(category|is|the|a|an)\b', '', pred)  
    pred = pred.strip()
    
    
    for label in expected_labels:
        if label.lower() in pred or pred in label.lower():
            return label
    
   
    variation_map = {
        'sport': 'Sports',
        'sci': 'Sci/Tech',
        'technology': 'Sci/Tech',
        'tech': 'Sci/Tech',
        'businesses': 'Business',
        'world news': 'World'
    }
    
    for variation, correct_label in variation_map.items():
        if variation in pred:
            return correct_label
    
    
    print(f"Unrecognized prediction: '{pred}' -> Mapping to 'Unknown'")
    return "Unknown"

for prompt_name, prompt in [("Balanced", balanced_prompt), ("Imbalanced", imbalanced_prompt)]:
    print(f"Running with {prompt_name} prompt")
    
    y_pred_arr = []
    y_true_arr = []
    
    n_rows = 20

    for _, row in testing_set.iloc[n_rows:40].iterrows():
        y_true = label_map[row["label"]]
        raw_pred = classify(
            text=row["text"],
            examples_prompt=prompt,
            model_name=model,
            api_key=API_KEY
        )
        
       
        cleaned_pred = clean_prediction(raw_pred, expected_labels)
        
        y_true_arr.append(y_true)
        y_pred_arr.append(cleaned_pred)
        
        
        if len(y_pred_arr) % 5 == 0:
            print(f"Sample: True='{y_true}', Raw='{raw_pred}', Cleaned='{cleaned_pred}'")

    
    unique_preds = set(y_pred_arr)
    print(f"Unique predictions: {unique_preds}")
    
    
    report = classification_report(y_true_arr, y_pred_arr, 
                                    labels=expected_labels,
                                    target_names=expected_labels,
                                    output_dict=True, 
                                    zero_division=0)
    
    model_res[prompt_name] = {
        'predictions': y_pred_arr,
        'true_labels': y_true_arr,
        'raw_predictions': [clean_prediction(p, expected_labels) for p in y_pred_arr],
        'classification_report': report,
        'macro_f1': report['macro avg']['f1-score']
    }

    print(f"    {prompt_name} Prompt Macro-F1: {report['macro avg']['f1-score']:.4f}")
    
   
    time.sleep(60)  

print("Evaluation done")

# Print detailed results
for prompt_name in model_res:
    print(f"\n--- {prompt_name} Prompt Results ---")
    print(f"Macro-F1: {model_res[prompt_name]['macro_f1']:.4f}")
    
    
    pred_counts = {}
    for pred in model_res[prompt_name]['predictions']:
        pred_counts[pred] = pred_counts.get(pred, 0) + 1
    print(f"Prediction distribution: {pred_counts}")

Expected labels: ['World', 'Sports', 'Business', 'Sci/Tech']
--- EVALUATING meta-llama/llama-4-maverick:free ---
Running with Balanced prompt
Sample: True='Business', Raw='Business', Cleaned='Business'
Sample: True='Sports', Raw='Sports', Cleaned='Sports'
Sample: True='Business', Raw='Business', Cleaned='Business'
Request error for text 'U.S. Treasuries Inch Up, Await Data  LONDON (Reute...': 429 Client Error: Too Many Requests for url: https://openrouter.ai/api/v1/chat/completions
Unrecognized prediction: 'error' -> Mapping to 'Unknown'
Request error for text 'Toyota reports a silicon carbide breakthrough Move...': 429 Client Error: Too Many Requests for url: https://openrouter.ai/api/v1/chat/completions
Unrecognized prediction: 'error' -> Mapping to 'Unknown'
Sample: True='Sci/Tech', Raw='Error', Cleaned='Unknown'
Unique predictions: {'Sci/Tech', 'Unknown', 'World', 'Business', 'Sports'}
Unrecognized prediction: 'unknown' -> Mapping to 'Unknown'
Unrecognized prediction: 'unknown' -> 