In [1]:
!pip install transformers peft bitsandbytes datasets accelerate tqdm torch

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2


In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import time
import os
from datasets import load_dataset
import pandas as pd
from tqdm.notebook import tqdm
from peft import PeftModel

In [3]:
# --- 1. Configuration ---

# Identifier of the model that is loaded and evaluated below.
base_model_name = "microsoft/Phi-3-mini-4k-instruct"

# Everything downstream runs on CUDA, so stop right away when no GPU is visible.
if torch.cuda.is_available():
    print(f"GPU detected: {torch.cuda.get_device_name(0)}")
else:
    raise SystemError("GPU not available. Please enable GPU runtime in Colab.")

GPU detected: Tesla T4


In [4]:


# --- 2. Load Base Model and Tokenizer (Quantized) ---
print("Loading base model and tokenizer (4-bit)...")

# Choose the 4-bit compute dtype from the GPU generation instead of hard-coding it:
# bfloat16 is only native on compute capability >= 8.0 (Ampere and newer); older
# cards such as the Tesla T4 (capability 7.5) should compute in float16.
compute_capability_major = torch.cuda.get_device_capability(0)[0]
compute_dtype = torch.bfloat16 if compute_capability_major >= 8 else torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
    # NOTE(review): trust_remote_code runs modeling code downloaded from the Hub;
    # consider pinning `revision=` to a reviewed commit so the code cannot change between runs.
    trust_remote_code=True,
)

# Default KV-cache setting stored on the model config; an explicit `use_cache=`
# argument passed to generate() takes precedence over it.
base_model.config.use_cache = True

tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token
tokenizer.padding_side = "left"  # left padding keeps generated tokens adjacent to the prompt
print('Finished config')

Loading base model and tokenizer (4-bit)...


config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

configuration_phi3.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

Finished config


In [5]:
# No adapter is attached in this notebook: the quantized base model is used as-is.
base_model.eval()  # put the module in evaluation mode; eval() returns the module itself
model = base_model
print("Base Model ready for inference.")

Base Model ready for inference.


In [6]:
# --- 4. Load and Prepare Test Data (Same as before) ---
# Integer label -> category name, and the category names in that same order.
label_map = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
category_names = [label_map[label_id] for label_id in label_map]

print("Loading ag_news test dataset...")
ag_news_dataset = load_dataset("ag_news")
test_data = ag_news_dataset['test']

# --- !! Use the SAME subset size as the Gemini test !! ---
NUM_TEST_SAMPLES = 100  # must match the Gemini script
# Fixed seed + fixed range => the same subset is drawn on every run.
test_sample = (
    test_data
    .shuffle(seed=42)
    .select(range(NUM_TEST_SAMPLES))
)
print(f"Using {len(test_sample)} samples from the test set.")

Loading ag_news test dataset...


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Using 100 samples from the test set.


In [7]:
# --- 5. Define Prompt Formatting Function (Must Match Fine-tuning Format!) ---
def create_zeroshot_prompt(example):
    """Build the zero-shot classification prompt for a single example.

    Args:
        example: Mapping with a 'text' entry holding the article string.

    Returns:
        A string containing the fixed classification instruction followed by
        the whitespace-stripped article, wrapped in ``<s>[INST] ... [/INST]``
        markers. No answer is appended.
    """
    task = "Classify the following news article into one of these categories: World, Sports, Business, or Sci/Tech."
    article = example['text'].strip()
    # NOTE(review): the [INST] wrapper looks like a Llama/Mistral-style format rather
    # than the Phi-3 chat template -- confirm this is the intended prompt format.
    return "<s>[INST] " + task + " Article: '" + article + "' [/INST]"


In [8]:
# --- 6. Function to Run Inference and Measure Latency ---
@torch.inference_mode()  # no autograd bookkeeping is needed for generation
def get_tuned_slm_classification(prompt_text, max_new_tokens=10):
    """Generate a short completion for one prompt and parse a category from it.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``category_names``.

    Args:
        prompt_text: Fully formatted prompt string.
        max_new_tokens: Upper bound on the number of generated tokens. The
            default of 10 is enough for a category name.

    Returns:
        A tuple ``(predicted_category, latency, cleaned_response)`` where
        ``predicted_category`` is the entry of ``category_names`` that occurs
        earliest in the response (case-insensitive) or ``None`` when none
        occurs, ``latency`` is the duration of the ``generate`` call in
        seconds, and ``cleaned_response`` is the stripped generated text.
    """
    # Place the inputs on the device the model lives on rather than assuming "cuda".
    inputs = tokenizer(
        prompt_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=1024,
    ).to(model.device)

    # CUDA kernels are launched asynchronously: synchronize before starting and
    # before stopping the clock so the measurement covers exactly the generation
    # work. perf_counter() is monotonic and intended for measuring intervals.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    start_time = time.perf_counter()
    # --- Generation Call ---
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        do_sample=False,   # greedy decoding for classification
        temperature=None,  # not used for greedy decoding
        top_p=None,        # not used for greedy decoding
        use_cache=False,   # cache disabled to avoid the error seen with it enabled
    )
    # ---------------------
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    latency = time.perf_counter() - start_time  # generation step only, in seconds

    # Decode only the newly generated tokens: everything after the prompt tokens.
    input_token_len = inputs.input_ids.shape[1]
    generated_text = tokenizer.decode(outputs[0][input_token_len:], skip_special_tokens=True)

    # --- Basic Response Cleaning ---
    cleaned_response = generated_text.strip()
    response_lower = cleaned_response.lower()  # lower-case once, not once per category

    # Pick the category whose name appears earliest in the response. min() returns
    # the first minimal element, so ties resolve in category_names order.
    hits = []
    for cat_name in category_names:
        pos = response_lower.find(cat_name.lower())
        if pos != -1:
            hits.append((pos, cat_name))
    predicted_category = min(hits, key=lambda hit: hit[0])[1] if hits else None

    if predicted_category is None:
        print(f"Warning: Could not parse category from SLM response: '{cleaned_response}' for prompt: '{prompt_text[:100]}...'")

    return predicted_category, latency, cleaned_response

In [9]:
# --- 7. Run Evaluation Loop ---
# One record per test example: gold label, parsed prediction, generation latency, raw text.
# (The list keeps its historical name because later cells read it; the model evaluated
# here is the base model without an adapter.)
results_tuned_slm = []

print("\nStarting evaluation for base (untuned) SLM...")
for example in tqdm(test_sample, desc="Evaluating"):
    true_label_num = example['label']
    true_label_name = label_map[true_label_num]

    # Build the zero-shot prompt for this article
    prompt = create_zeroshot_prompt(example)

    # Get prediction and latency
    pred_slm, lat_slm, raw_slm = get_tuned_slm_classification(prompt)

    results_tuned_slm.append({
        "true_label": true_label_name,
        "predicted_label": pred_slm,
        "latency": lat_slm,
        "raw_response": raw_slm
    })
    # Optional: Add a small delay if running into CUDA memory issues between runs
    # torch.cuda.empty_cache()
    # time.sleep(0.1)

print("\nEvaluation complete.")


Starting evaluation for fine-tuned SLM...


  0%|          | 0/100 [00:00<?, ?it/s]




Evaluation complete.


In [11]:
# --- 8. Calculate and Print Results ---
def calculate_metrics(results):
    """Summarize a list of per-example result records.

    Args:
        results: Sequence of dicts with 'latency', 'predicted_label' and
            'true_label' entries. A 'predicted_label' of None marks a response
            that could not be parsed; it counts as incorrect.

    Returns:
        A tuple ``(accuracy, avg_latency_ms, parse_rate)``: percentage of
        records whose prediction equals the true label, mean latency converted
        from seconds to milliseconds, and percentage of records with a
        non-None prediction. All three are 0 for an empty input.
    """
    n_records = len(results)
    if n_records == 0:
        return 0, 0, 0

    latency_sum = 0
    n_parsed = 0
    n_correct = 0
    for record in results:
        latency_sum += record['latency']
        prediction = record['predicted_label']
        if prediction is None:
            # Unparseable response: contributes to latency only.
            continue
        n_parsed += 1
        if record['true_label'] == prediction:
            n_correct += 1

    accuracy = (n_correct / n_records) * 100
    avg_latency_ms = (latency_sum / n_records) * 1000
    parse_rate = (n_parsed / n_records) * 100
    return accuracy, avg_latency_ms, parse_rate

acc_slm, lat_slm_avg, parse_slm = calculate_metrics(results_tuned_slm)

# No adapter is loaded in this notebook, so the numbers below describe the
# base (untuned) model; label the report accordingly.
print("\n--- Base (Untuned) SLM Performance on AG News ---")
print(f"Base Model: {base_model_name}")
print(f"Number of test samples: {len(test_sample)}")
print(f"  Accuracy: {acc_slm:.2f}%")
print(f"  Avg. Inference Latency: {lat_slm_avg:.2f} ms per request (GPU local)")
print(f"  Successfully Parsed Responses: {parse_slm:.2f}%")


--- Fine-Tuned SLM Performance on AG News ---
Base Model: microsoft/Phi-3-mini-4k-instruct
Number of test samples: 100
  Accuracy: 48.00%
  Avg. Inference Latency: 5596.79 ms per request (GPU local)
  Successfully Parsed Responses: 55.00%


In [14]:
# --- 9. Save results
# Single source of truth for the output location, so the confirmation message
# always names the file that was actually written.
# NOTE(review): the path is under /content/drive -- presumably Google Drive must be mounted first; confirm.
results_csv_path = "/content/drive/MyDrive/untuned_tuned_slm_class_results.csv"
df_slm = pd.DataFrame(results_tuned_slm)
df_slm.to_csv(results_csv_path, index=False)
print(f"\nDetailed results saved to {results_csv_path}")


Detailed results saved to untuned_slm_results.csv


In [None]:
# One-shot process snapshot: top in batch mode (-b), sorted by memory share (-o %MEM),
# a single iteration (-n 1), trimmed to the first 15 lines of output.
!top -b -o %MEM -n 1 | head -n 15

top - 02:08:29 up 3 min,  0 users,  load average: 1.34, 0.59, 0.24
Tasks:  18 total,   1 running,  16 sleeping,   0 stopped,   1 zombie
%Cpu(s): 90.3 us,  6.5 sy,  0.0 ni,  3.2 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
MiB Mem :  12975.6 total,   7567.6 free,   2038.7 used,   3369.3 buff/cache
MiB Swap:      0.0 total,      0.0 free,      0.0 used.  10653.4 avail Mem 

    PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ COMMAND
    695 root      20   0 2914652   1.3g  41392 S 181.2  10.5   0:23.17 node
    111 root      20   0  400716 146780  27224 S   0.0   1.1   0:04.41 jupyter+
    666 root      20   0  659888  98820  26308 S   6.2   0.7   0:01.07 python3
     66 root      20   0   92840  72856  19728 S   0.0   0.5   0:02.17 colab-f+
      7 root      20   0 1160808  57548  41380 S   0.0   0.4   0:00.55 node
    689 root      20   0 1275108  19464  11816 S   0.0   0.1   0:00.04 languag+
     12 root      20   0 1269860  13476   9608 S   0.0   0.1   0:00.04 kernel_+
 

In [None]:
# Print the NVIDIA driver/GPU status table (device name, memory usage, utilization).
!nvidia-smi


Wed Oct 29 01:36:58 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   35C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                