In [None]:
!pip uninstall -y bitsandbytes

Found existing installation: bitsandbytes 0.44.1
Uninstalling bitsandbytes-0.44.1:
  Successfully uninstalled bitsandbytes-0.44.1


# Prompt Engineering vs. Fine-Tuning: Cross-Domain Sentiment Classification

**Project Overview:** Comparing baseline (zero-shot), prompt engineering, and fine-tuning (LoRA) on 5-class sentiment classification for Yelp and Amazon reviews.

---

## Environment Setup

Install required packages and check GPU availability.

In [None]:
# Install required packages
!pip install transformers==4.44.0 datasets==2.20.0 accelerate==0.33.0 peft==0.12.0 scikit-learn

Collecting transformers==4.44.0
  Downloading transformers-4.44.0-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==2.20.0
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting accelerate==0.33.0
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Collecting peft==0.12.0
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.44.0)
  Downloading tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting pyarrow-hotfix (from datasets==2.20.0)
  Downloading pyarrow_hotfix-0.7-py3-none-any.whl.metadata (3.6 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets==2.20.0)
 

In [None]:
# Check GPU availability
import torch
print(f"GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

GPU Available: True
GPU Name: NVIDIA L4
GPU Memory: 23.80 GB


In [None]:
# Mount and login
from google.colab import drive
drive.mount('/content/drive')

from huggingface_hub import login
login()  # Enter token

Mounted at /content/drive


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset, DatasetDict
import pandas as pd
import numpy as np
from peft import LoraConfig, get_peft_model, TaskType
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, classification_report
import re, json, pickle
from datetime import datetime
import gc

np.random.seed(42)

## Load Datasets

Loading Yelp Review Full and Amazon Reviews Full (both 5-class sentiment).

In [None]:
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
import numpy as np
from google.colab import drive
import csv

# Mount Google Drive
print("Mounting Google Drive...")
drive.mount('/content/drive')

# Set random seed for reproducibility
np.random.seed(42)

# Load Yelp dataset (5-class sentiment: 1-5 stars, labels 0-4)
print("\nLoading Yelp Review Full dataset from Hugging Face...")
print("Source: https://huggingface.co/datasets/Yelp/yelp_review_full")
yelp_full = load_dataset("Yelp/yelp_review_full")

# Load Amazon dataset from Kaggle CSV
print("\nLoading Amazon dataset from Kaggle CSV...")
print("Source: Amazon Reviews fine-grained 5 classes (Kaggle)")

# Update this path to match where you saved the files in your Drive
amazon_train_path = "/content/drive/MyDrive/train.csv"
amazon_test_path = "/content/drive/MyDrive/test.csv"

# Custom function to load the Amazon CSV with proper parsing
def load_amazon_csv(filepath):
    data = []
    with open(filepath, 'r', encoding='utf-8') as f:
        csv_reader = csv.reader(f, quotechar='"', doublequote=True, strict=False)
        for i, row in enumerate(csv_reader):
            # Skip header row if present
            if i == 0 and (row[0] == 'class_index' or not row[0].isdigit()):
                continue

            if len(row) >= 3:
                # First column is label, second is title, rest is review text
                try:
                    label = int(row[0])
                    review_title = row[1]
                    review_text = row[2] if len(row) == 3 else ','.join(row[2:])
                    data.append({
                        'label': label - 1,  # Convert from 1-5 to 0-4
                        'review_title': review_title,
                        'review_text': review_text,
                        'text': review_text  # Use just review_text
                        # Or combine: 'text': review_title + '. ' + review_text
                    })
                except ValueError:
                    # Skip rows that can't be parsed
                    print(f"Skipping row {i}: {row[0]}")
                    continue
    return pd.DataFrame(data)

print("Loading train.csv...")
amazon_train_df = load_amazon_csv(amazon_train_path)

print("Loading test.csv...")
amazon_test_df = load_amazon_csv(amazon_test_path)

# Keep only needed columns
amazon_train_df = amazon_train_df[['text', 'label']]
amazon_test_df = amazon_test_df[['text', 'label']]

# Convert to HuggingFace Dataset format
amazon_full = DatasetDict({
    'train': Dataset.from_pandas(amazon_train_df, preserve_index=False),
    'test': Dataset.from_pandas(amazon_test_df, preserve_index=False)
})

print("\n=== Dataset Information ===")
print(f"Yelp - Train: {len(yelp_full['train']):,} samples")
print(f"Yelp - Test: {len(yelp_full['test']):,} samples")
print(f"\nAmazon - Train: {len(amazon_full['train']):,} samples")
print(f"Amazon - Test: {len(amazon_full['test']):,} samples")

# Verify label distributions
print("\n=== Label Distribution Check ===")
print("Yelp label range:", min(yelp_full['train']['label']), "-", max(yelp_full['train']['label']))
print("Amazon label range:", min(amazon_full['train']['label']), "-", max(amazon_full['train']['label']))

# Verify class balance
from collections import Counter
print("\nYelp train label distribution:", Counter(yelp_full['train']['label']))
print("Amazon train label distribution:", Counter(amazon_full['train']['label']))

# Check sample data
print("\n=== Sample Data Structure ===")
print("\nYelp sample:", yelp_full['train'][0])
print("\nAmazon sample:", amazon_full['train'][0])

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Loading Yelp Review Full dataset from Hugging Face...
Source: https://huggingface.co/datasets/Yelp/yelp_review_full


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/299M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/23.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]


Loading Amazon dataset from Kaggle CSV...
Source: Amazon Reviews fine-grained 5 classes (Kaggle)
Loading train.csv...
Loading test.csv...

=== Dataset Information ===
Yelp - Train: 650,000 samples
Yelp - Test: 50,000 samples

Amazon - Train: 3,000,000 samples
Amazon - Test: 650,000 samples

=== Label Distribution Check ===
Yelp label range: 0 - 4
Amazon label range: 0 - 4

Yelp train label distribution: Counter({4: 130000, 1: 130000, 3: 130000, 0: 130000, 2: 130000})
Amazon train label distribution: Counter({2: 600000, 4: 600000, 3: 600000, 0: 600000, 1: 600000})

=== Sample Data Structure ===

Yelp sample: {'label': 4, 'text': "dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specia

In [None]:
# Subsample Amazon dataset to match scale with Yelp
print("\n=== Subsampling Amazon Dataset ===")
print("Original Amazon size - Train: 3,000,000, Test: 650,000")

# We'll subsample to a more manageable size that still gives us plenty of data
# Target: ~200k train, ~50k test (similar scale to Yelp)
amazon_train_subset = amazon_full['train'].shuffle(seed=42).select(range(200000))
amazon_test_subset = amazon_full['test'].shuffle(seed=42).select(range(50000))

# Update the amazon_full dataset
amazon_full = DatasetDict({
    'train': amazon_train_subset,
    'test': amazon_test_subset
})

print(f"\nSubsampled Amazon - Train: {len(amazon_full['train']):,} samples")
print(f"Subsampled Amazon - Test: {len(amazon_full['test']):,} samples")

# Verify class distribution is still balanced
from collections import Counter
print("\nAmazon subsampled label distribution:", Counter(amazon_full['train']['label']))


=== Subsampling Amazon Dataset ===
Original Amazon size - Train: 3,000,000, Test: 650,000

Subsampled Amazon - Train: 200,000 samples
Subsampled Amazon - Test: 50,000 samples

Amazon subsampled label distribution: Counter({2: 40318, 0: 40261, 1: 39996, 4: 39912, 3: 39513})


## Create Train/Val/Test Splits

Creating stratified splits: **5,000 train / 1,000 val / 3,000 test** per dataset.

**Why these sizes?**
- **Train (5k)**: 1,000 samples per class - sufficient for LoRA fine-tuning to show its potential
- **Val (1k)**: 200 samples per class - reliable for hyperparameter selection (LoRA rank)
- **Test (3k)**: 600 samples per class - robust evaluation metrics
- **Total**: 9,000 samples per dataset (still very manageable)

In [None]:
def create_splits(dataset, train_size=5000, val_size=1000, test_size=3000, label_field='label'):
    """
    Create stratified train/val/test splits - OPTIMIZED VERSION.
    """
    print(f"Creating splits from dataset with {len(dataset['train']):,} train samples...")

    # Convert to pandas for faster filtering
    train_df = dataset['train'].to_pandas()

    # Detect label range
    min_label = train_df[label_field].min()
    max_label = train_df[label_field].max()
    num_classes = max_label - min_label + 1

    print(f"Detected {num_classes} classes: labels from {min_label} to {max_label}")

    samples_per_class_train = train_size // num_classes
    samples_per_class_val = val_size // num_classes

    train_indices = []
    val_indices = []

    # Sample from each class
    for label in range(min_label, max_label + 1):
        # Get indices for this class
        class_mask = train_df[label_field] == label
        class_indices = train_df[class_mask].index.tolist()

        # Shuffle and sample
        np.random.shuffle(class_indices)
        train_indices.extend(class_indices[:samples_per_class_train])
        val_indices.extend(class_indices[samples_per_class_train:samples_per_class_train + samples_per_class_val])

    # Create splits using select
    train_split = dataset['train'].select(train_indices)
    val_split = dataset['train'].select(val_indices)
    test_split = dataset['test'].shuffle(seed=42).select(range(min(test_size, len(dataset['test']))))

    return {
        'train': train_split,
        'val': val_split,
        'test': test_split
    }

# Create splits for both datasets (5k train / 1k val / 3k test)
print("Creating Yelp splits (5,000 train / 1,000 val / 3,000 test)...")
yelp_splits = create_splits(yelp_full, train_size=5000, val_size=1000, test_size=3000)

print("\nCreating Amazon splits (5,000 train / 1,000 val / 3,000 test)...")
amazon_splits = create_splits(amazon_full, train_size=5000, val_size=1000, test_size=3000)

print("\n=== Split Sizes ===")
print(f"Yelp - Train: {len(yelp_splits['train']):,}, Val: {len(yelp_splits['val']):,}, Test: {len(yelp_splits['test']):,}")
print(f"Amazon - Train: {len(amazon_splits['train']):,}, Val: {len(amazon_splits['val']):,}, Test: {len(amazon_splits['test']):,}")
print(f"\nTotal samples per dataset: {len(yelp_splits['train']) + len(yelp_splits['val']) + len(yelp_splits['test']):,}")

Creating Yelp splits (5,000 train / 1,000 val / 3,000 test)...
Creating splits from dataset with 650,000 train samples...
Detected 5 classes: labels from 0 to 4

Creating Amazon splits (5,000 train / 1,000 val / 3,000 test)...
Creating splits from dataset with 200,000 train samples...
Detected 5 classes: labels from 0 to 4

=== Split Sizes ===
Yelp - Train: 5,000, Val: 1,000, Test: 3,000
Amazon - Train: 5,000, Val: 1,000, Test: 3,000

Total samples per dataset: 9,000


In [None]:
# Verify class distribution in splits
def check_class_distribution(split, name, label_field='label'):
    labels = [example[label_field] for example in split]
    unique, counts = np.unique(labels, return_counts=True)
    print(f"\n{name} class distribution:")
    for label, count in zip(unique, counts):
        print(f"  Class {label}: {count} samples ({count/len(labels)*100:.1f}%)")

print("=== Yelp Class Distributions ===")
check_class_distribution(yelp_splits['train'], "Yelp Train")
check_class_distribution(yelp_splits['val'], "Yelp Val")
check_class_distribution(yelp_splits['test'], "Yelp Test")

print("\n=== Amazon Class Distributions ===")
amazon_label_field = 'stars' if 'stars' in amazon_full['train'].column_names else 'label'
check_class_distribution(amazon_splits['train'], "Amazon Train", amazon_label_field)
check_class_distribution(amazon_splits['val'], "Amazon Val", amazon_label_field)
check_class_distribution(amazon_splits['test'], "Amazon Test", amazon_label_field)

=== Yelp Class Distributions ===

Yelp Train class distribution:
  Class 0: 1000 samples (20.0%)
  Class 1: 1000 samples (20.0%)
  Class 2: 1000 samples (20.0%)
  Class 3: 1000 samples (20.0%)
  Class 4: 1000 samples (20.0%)

Yelp Val class distribution:
  Class 0: 200 samples (20.0%)
  Class 1: 200 samples (20.0%)
  Class 2: 200 samples (20.0%)
  Class 3: 200 samples (20.0%)
  Class 4: 200 samples (20.0%)

Yelp Test class distribution:
  Class 0: 645 samples (21.5%)
  Class 1: 583 samples (19.4%)
  Class 2: 579 samples (19.3%)
  Class 3: 650 samples (21.7%)
  Class 4: 543 samples (18.1%)

=== Amazon Class Distributions ===

Amazon Train class distribution:
  Class 0: 1000 samples (20.0%)
  Class 1: 1000 samples (20.0%)
  Class 2: 1000 samples (20.0%)
  Class 3: 1000 samples (20.0%)
  Class 4: 1000 samples (20.0%)

Amazon Val class distribution:
  Class 0: 200 samples (20.0%)
  Class 1: 200 samples (20.0%)
  Class 2: 200 samples (20.0%)
  Class 3: 200 samples (20.0%)
  Class 4: 200 sam

## Load Gemma Model

Loading Gemma-2-2B-it

In [None]:
# Model configuration
model_id = "google/gemma-2-2b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True)
model.config.use_cache = False
print(f"Model: {torch.cuda.memory_allocated()/1e9:.2f} GB")

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Model: 5.23 GB


## Baseline (Zero-Shot) Evaluation

Simple zero-shot prompting as baseline.

In [None]:
def create_zero_shot_prompt(text):
    """
    Create a zero-shot prompt for sentiment classification.

    Returns:
        Formatted prompt string
    """
    prompt = f"""Classify the sentiment of the following review on a scale of 1 to 5:
1 = Very Negative
2 = Negative
3 = Neutral
4 = Positive
5 = Very Positive

Review: {text}

Sentiment (1-5):"""
    return prompt

# Test on a sample
sample_text = yelp_splits['test'][0]['text']
sample_label = yelp_splits['test'][0]['label']

prompt = create_zero_shot_prompt(sample_text)
print("=== Sample Zero-Shot Prompt ===")
print(prompt)
print(f"\nTrue label: {sample_label + 1}")

=== Sample Zero-Shot Prompt ===
Classify the sentiment of the following review on a scale of 1 to 5:
1 = Very Negative
2 = Negative
3 = Neutral
4 = Positive
5 = Very Positive

Review: Kabuto is your run-of-the-mill Japanese Steakhouse. Different stations with chefs slinging shrimp tails around the communal dining areas like it's a lunchtime magic show. Always a plethora of laughs and gags going around the group. \n\nThis place is great for lunch. $9 and 30 minutes and you're out the door. Uhhh...If I'm craving a salad with ginger dressing, which I always am, (you do too. admit it) fried rice, steak, shrimp and white sauce (DUDE) then Kabuto is king of lunch options in my book. Always super clean and full of kindhearted staff. The parking lot is super difficult to get in and out of though. 51 traffic at lunch is a beast. Good luck getting stuck behind someone trying to cut across traffic at 12pm on a weekday. It's murder. This place would greatly benefit from another exit/entrance or a 

### Generate Model Predictions

Function to generate predictions from the model and extract sentiment ratings.

In [None]:
import re
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, classification_report
import torch

def generate_prediction(prompt, max_new_tokens=10):
    """
    Generate model prediction for a given prompt.
    """
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the generated tokens
    generated_text = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    return generated_text

def extract_rating(response_text):
    """
    Extract rating (1-5) from model response.
    """
    # Try to find a number between 1-5
    matches = re.findall(r'\b[1-5]\b', response_text)
    if matches:
        return int(matches[0])
    return None

def evaluate_zero_shot(dataset_split, dataset_name, num_samples=None):
    """
    Evaluate zero-shot performance on a dataset.
    """
    if num_samples is None:
        num_samples = len(dataset_split)

    predictions = []
    true_labels = []
    failed_parses = 0

    print(f"\nEvaluating {dataset_name} zero-shot on {num_samples} samples...")

    for i in tqdm(range(num_samples)):
        text = dataset_split[i]['text']
        true_label = dataset_split[i]['label']

        # Create prompt
        prompt = create_zero_shot_prompt(text)

        # Generate response
        response = generate_prediction(prompt, max_new_tokens=10)

        # Extract predicted rating
        pred = extract_rating(response)

        if pred is not None:
            # Convert back to 0-4 scale (model outputs 1-5)
            predictions.append(pred - 1)
            true_labels.append(true_label)
        else:
            failed_parses += 1
            # Default to middle class if parsing fails
            predictions.append(2)
            true_labels.append(true_label)

    # Calculate metrics
    accuracy = accuracy_score(true_labels, predictions)
    f1_macro = f1_score(true_labels, predictions, average='macro')
    f1_weighted = f1_score(true_labels, predictions, average='weighted')

    print(f"\n{'='*50}")
    print(f"{dataset_name} Zero-Shot Results")
    print(f"{'='*50}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 (Macro): {f1_macro:.4f}")
    print(f"F1 (Weighted): {f1_weighted:.4f}")
    print(f"Failed parses: {failed_parses}/{num_samples}")
    print(f"\nClassification Report:")
    print(classification_report(true_labels, predictions,
                                target_names=['Class 0', 'Class 1', 'Class 2', 'Class 3', 'Class 4']))

    return {
        'accuracy': accuracy,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'predictions': predictions,
        'true_labels': true_labels,
        'failed_parses': failed_parses
    }

### Full Yelp Test Evaluation

Evaluate zero-shot performance on the complete Yelp test set (3,000 samples).

In [None]:
# Full evaluation on Yelp test set (3000 samples)
print("\n" + "="*70)
print("FULL YELP TEST SET EVALUATION")
print("="*70)
yelp_zero_shot_results = evaluate_zero_shot(yelp_splits['test'], "Yelp Test")


FULL YELP TEST SET EVALUATION

Evaluating Yelp Test zero-shot on 3000 samples...


100%|██████████| 3000/3000 [30:56<00:00,  1.62it/s]


Yelp Test Zero-Shot Results
Accuracy: 0.5293
F1 (Macro): 0.4872
F1 (Weighted): 0.4913
Failed parses: 6/3000

Classification Report:
              precision    recall  f1-score   support

     Class 0       0.92      0.26      0.41       645
     Class 1       0.45      0.74      0.56       583
     Class 2       0.58      0.61      0.60       579
     Class 3       0.48      0.85      0.61       650
     Class 4       0.86      0.15      0.26       543

    accuracy                           0.53      3000
   macro avg       0.66      0.52      0.49      3000
weighted avg       0.66      0.53      0.49      3000






In [None]:
import json
import pickle
from datetime import datetime

# Create timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Save results as JSON (metrics only)
yelp_results_summary = {
    'dataset': 'Yelp',
    'method': 'zero_shot',
    'timestamp': timestamp,
    'num_samples': 3000,
    'accuracy': yelp_zero_shot_results['accuracy'],
    'f1_macro': yelp_zero_shot_results['f1_macro'],
    'f1_weighted': yelp_zero_shot_results['f1_weighted'],
    'failed_parses': yelp_zero_shot_results['failed_parses']
}

with open(f'/content/drive/MyDrive/yelp_zero_shot_results_{timestamp}.json', 'w') as f:
    json.dump(yelp_results_summary, f, indent=2)

# Save full results including predictions (pickle)
with open(f'/content/drive/MyDrive/yelp_zero_shot_full_{timestamp}.pkl', 'wb') as f:
    pickle.dump(yelp_zero_shot_results, f)

print("Results saved to Google Drive:")
print(f"   - Summary: yelp_zero_shot_results_{timestamp}.json")
print(f"   - Full data: yelp_zero_shot_full_{timestamp}.pkl")

Results saved to Google Drive:
   - Summary: yelp_zero_shot_results_20251202_235037.json
   - Full data: yelp_zero_shot_full_20251202_235037.pkl


### Full Amazon Test Evaluation

Evaluate zero-shot performance on the complete Amazon test set (3,000 samples).

In [None]:
# Full evaluation on Amazon test set (3000 samples)
print("\n" + "="*70)
print("FULL AMAZON TEST SET EVALUATION")
print("="*70)
amazon_zero_shot_results = evaluate_zero_shot(amazon_splits['test'], "Amazon Test")


FULL AMAZON TEST SET EVALUATION

Evaluating Amazon Test zero-shot on 3000 samples...


100%|██████████| 3000/3000 [30:51<00:00,  1.62it/s]


Amazon Test Zero-Shot Results
Accuracy: 0.4270
F1 (Macro): 0.3675
F1 (Weighted): 0.3655
Failed parses: 10/3000

Classification Report:
              precision    recall  f1-score   support

     Class 0       0.83      0.14      0.24       599
     Class 1       0.41      0.72      0.52       629
     Class 2       0.47      0.50      0.48       607
     Class 3       0.37      0.75      0.50       548
     Class 4       0.80      0.05      0.10       617

    accuracy                           0.43      3000
   macro avg       0.58      0.43      0.37      3000
weighted avg       0.58      0.43      0.37      3000






In [None]:
import json
import pickle
from datetime import datetime

# Create timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Save results as JSON (metrics only)
amazon_results_summary = {
    'dataset': 'Amazon',
    'method': 'zero_shot',
    'timestamp': timestamp,
    'num_samples': 3000,
    'accuracy': amazon_zero_shot_results['accuracy'],
    'f1_macro': amazon_zero_shot_results['f1_macro'],
    'f1_weighted': amazon_zero_shot_results['f1_weighted'],
    'failed_parses': amazon_zero_shot_results['failed_parses']
}

with open(f'/content/drive/MyDrive/amazon_zero_shot_results_{timestamp}.json', 'w') as f:
    json.dump(amazon_results_summary, f, indent=2)

# Save full results including predictions (pickle)
with open(f'/content/drive/MyDrive/amazon_zero_shot_full_{timestamp}.pkl', 'wb') as f:
    pickle.dump(amazon_zero_shot_results, f)

print("Amazon results saved to Google Drive:")
print(f"   - Summary: amazon_zero_shot_results_{timestamp}.json")
print(f"   - Full data: amazon_zero_shot_full_{timestamp}.pkl")

# Quick comparison
print("\n" + "="*70)
print("ZERO-SHOT BASELINE COMPARISON")
print("="*70)
print(f"Yelp    - Accuracy: {yelp_zero_shot_results['accuracy']:.4f}, F1 (Macro): {yelp_zero_shot_results['f1_macro']:.4f}")
print(f"Amazon  - Accuracy: {amazon_zero_shot_results['accuracy']:.4f}, F1 (Macro): {amazon_zero_shot_results['f1_macro']:.4f}")

Amazon results saved to Google Drive:
   - Summary: amazon_zero_shot_results_20251203_002229.json
   - Full data: amazon_zero_shot_full_20251203_002229.pkl

ZERO-SHOT BASELINE COMPARISON
Yelp    - Accuracy: 0.5293, F1 (Macro): 0.4872
Amazon  - Accuracy: 0.4270, F1 (Macro): 0.3675


## Few-Shot Prompting (4-shot)

Add 4 diverse examples to the prompt to help the model learn the task through demonstration.

### Select Representative Examples

Choose balanced examples from the training set (one per class, plus one extra).

In [None]:
import numpy as np

def select_few_shot_examples(train_split, n_shots=4, label_field='label'):
    """
    Select diverse examples for few-shot prompting.
    Ensures representation across different sentiment classes.

    Args:
        train_split: Training dataset split
        n_shots: Number of examples to select
        label_field: Name of the label field

    Returns:
        List of example dictionaries
    """
    examples = []

    # Convert to list for easier sampling
    train_data = list(train_split)

    # Get examples from each class (stratified sampling)
    num_classes = 5
    samples_per_class = n_shots // num_classes
    remaining = n_shots % num_classes

    for label in range(num_classes):
        # Get all samples for this class
        class_samples = [s for s in train_data if s[label_field] == label]

        # Sample randomly
        n_samples = samples_per_class + (1 if label < remaining else 0)
        if n_samples > 0 and len(class_samples) > 0:
            sampled = np.random.choice(len(class_samples),
                                      min(n_samples, len(class_samples)),
                                      replace=False)
            examples.extend([class_samples[i] for i in sampled])

    return examples[:n_shots]

# Select 4-shot examples from Yelp training set
np.random.seed(42)  # For reproducibility
yelp_4shot_examples = select_few_shot_examples(yelp_splits['train'], n_shots=4)

print("Selected 4-shot examples:")
for i, ex in enumerate(yelp_4shot_examples):
    label = ex['label']
    text_preview = ex['text'][:100] + "..." if len(ex['text']) > 100 else ex['text']
    print(f"\nExample {i+1} - Label: {label} (Sentiment: {label+1})")
    print(f"Text: {text_preview}")

Selected 4-shot examples:

Example 1 - Label: 0 (Sentiment: 1)
Text: had a rep out yesterday to give me an estimate on my area rug & runner. Both a wool blend & very exp...

Example 2 - Label: 1 (Sentiment: 2)
Text: i really cant say much about this place, but they have juk (rice porridge) that is really fricken ch...

Example 3 - Label: 2 (Sentiment: 3)
Text: Personnellement, je ne m'y arr\u00eate jamais pour manger sur place. L'endroit y est trop bruyantt, ...

Example 4 - Label: 3 (Sentiment: 4)
Text: This is a difficult restaurant to review because you simply want to love it.  The owners have spruce...


### Create Few-Shot Prompt Template

Build prompts that include example demonstrations before the test query.

In [None]:
def create_few_shot_prompt(text, examples, label_field='label'):
    """
    Create a few-shot prompt with example demonstrations.

    Args:
        text: The review text to classify
        examples: List of example dictionaries
        label_field: Name of the label field

    Returns:
        Formatted prompt string with examples
    """
    prompt = """Classify the sentiment of reviews on a scale of 1 to 5:
1 = Very Negative
2 = Negative
3 = Neutral
4 = Positive
5 = Very Positive

Here are some examples:

"""

    # Add example demonstrations
    for i, ex in enumerate(examples):
        example_text = ex['text']
        example_label = ex[label_field] + 1  # Convert 0-4 to 1-5
        prompt += f"Review: {example_text}\nSentiment: {example_label}\n\n"

    # Add the query
    prompt += f"Now classify this review:\n\nReview: {text}\nSentiment:"

    return prompt

# Test the few-shot prompt
sample_text = yelp_splits['test'][0]['text']
sample_label = yelp_splits['test'][0]['label']

few_shot_prompt = create_few_shot_prompt(sample_text, yelp_4shot_examples)
print("=== Sample 4-Shot Prompt ===")
print(few_shot_prompt[:1000] + "..." if len(few_shot_prompt) > 1000 else few_shot_prompt)
print(f"\n\nTrue label: {sample_label + 1}")

=== Sample 4-Shot Prompt ===
Classify the sentiment of reviews on a scale of 1 to 5:
1 = Very Negative
2 = Negative
3 = Neutral
4 = Positive
5 = Very Positive

Here are some examples:

Review: had a rep out yesterday to give me an estimate on my area rug & runner. Both a wool blend & very expensive. Rep came in w/an attitude - measured both rugs, said \"I assume these are synthetic?\", (how rude for him to say that without asking me) asked if he could sit down, mumbled under his breath as he was plugging in figures on his calculator, kept asking me if there were any other products I needed since they had this \"50% sale going on\". The figure he presented me was twice the price as I got 2 years ago - so much for the 50% sale. I told him what I paid for the exact 2 items the previous time, he got all full of himself & said he had to use what numbers he was given & couldn't deviate. Not even knowing who the rep was that came out before, he proceeded to say \"he's no longer with the compa

### Test 4-Shot Generation

Verify that the model can generate predictions with the few-shot prompt.

In [None]:
# Test generation with few-shot prompt
response = generate_prediction(few_shot_prompt, max_new_tokens=10)
print("\n=== Model Response ===")
print(response)


=== Model Response ===
 4 


**Explanation:**

The review


### Evaluate 4-Shot on Yelp Test Set

Run full evaluation with 4-shot prompting on Yelp (3,000 samples).

In [None]:
def evaluate_few_shot(dataset_split, dataset_name, examples, num_samples=None, label_field='label'):
    """
    Evaluate few-shot performance on a dataset.

    Args:
        dataset_split: The dataset split to evaluate
        dataset_name: Name for logging
        examples: List of few-shot examples
        num_samples: Number of samples to evaluate (None = all)
        label_field: Name of the label field

    Returns:
        Dictionary with evaluation results
    """
    if num_samples is None:
        num_samples = len(dataset_split)

    predictions = []
    true_labels = []
    failed_parses = 0

    print(f"\nEvaluating {dataset_name} few-shot ({len(examples)} examples) on {num_samples} samples...")

    for i in tqdm(range(num_samples)):
        text = dataset_split[i]['text']
        true_label = dataset_split[i][label_field]

        # Create few-shot prompt
        prompt = create_few_shot_prompt(text, examples, label_field)

        # Generate response
        response = generate_prediction(prompt, max_new_tokens=10)

        # Extract predicted rating
        pred = extract_rating(response)

        if pred is not None:
            # Convert back to 0-4 scale (model outputs 1-5)
            predictions.append(pred - 1)
            true_labels.append(true_label)
        else:
            failed_parses += 1
            # Default to middle class if parsing fails
            predictions.append(2)
            true_labels.append(true_label)

    # Calculate metrics
    accuracy = accuracy_score(true_labels, predictions)
    f1_macro = f1_score(true_labels, predictions, average='macro')
    f1_weighted = f1_score(true_labels, predictions, average='weighted')

    print(f"\n{'='*50}")
    print(f"{dataset_name} Few-Shot ({len(examples)}-shot) Results")
    print(f"{'='*50}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 (Macro): {f1_macro:.4f}")
    print(f"F1 (Weighted): {f1_weighted:.4f}")
    print(f"Failed parses: {failed_parses}/{num_samples}")
    print(f"\nClassification Report:")
    print(classification_report(true_labels, predictions,
                                target_names=['Class 0', 'Class 1', 'Class 2', 'Class 3', 'Class 4'],
                                zero_division=0))

    return {
        'accuracy': accuracy,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'predictions': predictions,
        'true_labels': true_labels,
        'failed_parses': failed_parses,
        'n_shots': len(examples)
    }

# Full evaluation on Yelp test set with 4-shot
print("\n" + "="*70)
print("YELP 4-SHOT EVALUATION")
print("="*70)
yelp_4shot_results = evaluate_few_shot(yelp_splits['test'], "Yelp Test", yelp_4shot_examples)


YELP 4-SHOT EVALUATION

Evaluating Yelp Test few-shot (4 examples) on 3000 samples...


100%|██████████| 3000/3000 [31:32<00:00,  1.59it/s]


Yelp Test Few-Shot (4-shot) Results
Accuracy: 0.4490
F1 (Macro): 0.4563
F1 (Weighted): 0.4571
Failed parses: 1320/3000

Classification Report:
              precision    recall  f1-score   support

     Class 0       0.78      0.56      0.65       645
     Class 1       0.43      0.32      0.37       583
     Class 2       0.28      0.74      0.40       579
     Class 3       0.54      0.25      0.34       650
     Class 4       0.78      0.38      0.51       543

    accuracy                           0.45      3000
   macro avg       0.56      0.45      0.46      3000
weighted avg       0.56      0.45      0.46      3000






In [None]:
# Save 4-shot results
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Save results as JSON (metrics only)
yelp_4shot_summary = {
    'dataset': 'Yelp',
    'method': '4_shot',
    'n_shots': 4,
    'timestamp': timestamp,
    'num_samples': 3000,
    'accuracy': yelp_4shot_results['accuracy'],
    'f1_macro': yelp_4shot_results['f1_macro'],
    'f1_weighted': yelp_4shot_results['f1_weighted'],
    'failed_parses': yelp_4shot_results['failed_parses']
}

with open(f'/content/drive/MyDrive/yelp_4shot_results_{timestamp}.json', 'w') as f:
    json.dump(yelp_4shot_summary, f, indent=2)

# Save full results including predictions (pickle)
with open(f'/content/drive/MyDrive/yelp_4shot_full_{timestamp}.pkl', 'wb') as f:
    pickle.dump(yelp_4shot_results, f)

print("Results saved to Google Drive")

# Comparison with zero-shot
print("\n" + "="*70)
print("YELP: ZERO-SHOT vs 4-SHOT COMPARISON")
print("="*70)
print(f"Zero-shot - Accuracy: {yelp_zero_shot_results['accuracy']:.4f}, F1 (Macro): {yelp_zero_shot_results['f1_macro']:.4f}")
print(f"4-shot    - Accuracy: {yelp_4shot_results['accuracy']:.4f}, F1 (Macro): {yelp_4shot_results['f1_macro']:.4f}")
print(f"\nImprovement: {(yelp_4shot_results['accuracy'] - yelp_zero_shot_results['accuracy'])*100:.2f}% accuracy gain")

Results saved to Google Drive

YELP: ZERO-SHOT vs 4-SHOT COMPARISON
Zero-shot - Accuracy: 0.5293, F1 (Macro): 0.4872
4-shot    - Accuracy: 0.4490, F1 (Macro): 0.4563

Improvement: -8.03% accuracy gain


### Re-run 4-Shot with Improved Extraction

After identifying that the worsening performance using 4-shot might be due to extraction issues, I improved the extraction function and re-evaluated for fair comparison with CoT.

In [None]:
# Define improved extraction function
def extract_rating_improved(response_text):
    """
    Improved extraction that handles various formats and markdown.
    Looks for ratings in different parts of the response.
    """
    # Remove markdown formatting
    cleaned = response_text.replace('**', '').replace('*', '').strip()

    # Strategy 1: Look for "Sentiment: X" or "Rating: X" patterns
    patterns = [
        r'[Ss]entiment[:\s]+([1-5])',
        r'[Rr]ating[:\s]+([1-5])',
        r'^([1-5])\s*$',  # Just a number on its own
        r'^([1-5])\s*\n',  # Number followed by newline
    ]

    for pattern in patterns:
        match = re.search(pattern, cleaned)
        if match:
            return int(match.group(1))

    # Strategy 2: Find ANY number 1-5 in the text
    matches = re.findall(r'\b([1-5])\b', cleaned)
    if matches:
        # Take the last occurrence (likely the final answer)
        return int(matches[-1])

    # Strategy 3: Look in first 100 characters only
    first_part = cleaned[:100]
    matches = re.findall(r'([1-5])', first_part)
    if matches:
        return int(matches[0])

    return None

In [None]:
# Re-run 4-shot with improved extraction for fair comparison
print("\n" + "="*70)
print("YELP 4-SHOT EVALUATION (WITH IMPROVED EXTRACTION)")
print("="*70)

predictions = []
true_labels = []
failed_parses = 0

print(f"\nRe-evaluating Yelp Test few-shot (4 examples) with improved extraction...")

for i in tqdm(range(len(yelp_splits['test']))):
    text = yelp_splits['test'][i]['text']
    true_label = yelp_splits['test'][i]['label']

    # Create few-shot prompt
    prompt = create_few_shot_prompt(text, yelp_4shot_examples)

    # Generate response with ONLY 15 tokens
    response = generate_prediction(prompt, max_new_tokens=15)

    # Extract with IMPROVED function
    pred = extract_rating_improved(response)

    if pred is not None:
        predictions.append(pred - 1)
        true_labels.append(true_label)
    else:
        failed_parses += 1
        predictions.append(2)
        true_labels.append(true_label)

# Calculate metrics
accuracy = accuracy_score(true_labels, predictions)
f1_macro = f1_score(true_labels, predictions, average='macro')
f1_weighted = f1_score(true_labels, predictions, average='weighted')

print(f"\n{'='*50}")
print(f"Yelp Test Few-Shot (4-shot) Results - IMPROVED")
print(f"{'='*50}")
print(f"Accuracy: {accuracy:.4f} (was {yelp_4shot_results['accuracy']:.4f})")
print(f"F1 (Macro): {f1_macro:.4f} (was {yelp_4shot_results['f1_macro']:.4f})")
print(f"F1 (Weighted): {f1_weighted:.4f}")
print(f"Failed parses: {failed_parses}/3000 (was 1320/3000)")
print(f"\nImprovement from better extraction:")
print(f"  Accuracy: +{(accuracy - yelp_4shot_results['accuracy'])*100:.2f}%")
print(f"  Failed parses reduced: {1320 - failed_parses}")

print(f"\nClassification Report:")
print(classification_report(true_labels, predictions,
                            target_names=['Class 0', 'Class 1', 'Class 2', 'Class 3', 'Class 4'],
                            zero_division=0))

# Update the results variable to use for comparisons
yelp_4shot_results = {
    'accuracy': accuracy,
    'f1_macro': f1_macro,
    'f1_weighted': f1_weighted,
    'predictions': predictions,
    'true_labels': true_labels,
    'failed_parses': failed_parses,
    'n_shots': 4
}

# Save the improved results
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
with open(f'/content/drive/MyDrive/yelp_4shot_improved_{timestamp}.json', 'w') as f:
    json.dump({
        'dataset': 'Yelp',
        'method': '4_shot_improved_extraction',
        'timestamp': timestamp,
        'num_samples': 3000,
        'accuracy': accuracy,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'failed_parses': failed_parses
    }, f, indent=2)

with open(f'/content/drive/MyDrive/yelp_4shot_improved_full_{timestamp}.pkl', 'wb') as f:
    pickle.dump(yelp_4shot_results, f)

print("\n Improved 4-shot results saved to Google Drive")


YELP 4-SHOT EVALUATION (WITH IMPROVED EXTRACTION)

Re-evaluating Yelp Test few-shot (4 examples) with improved extraction...


100%|██████████| 3000/3000 [41:09<00:00,  1.21it/s]


Yelp Test Few-Shot (4-shot) Results - IMPROVED
Accuracy: 0.4467 (was 0.4490)
F1 (Macro): 0.4553 (was 0.4563)
F1 (Weighted): 0.4564
Failed parses: 1331/3000 (was 1320/3000)

Improvement from better extraction:
  Accuracy: +-0.23%
  Failed parses reduced: -11

Classification Report:
              precision    recall  f1-score   support

     Class 0       0.79      0.56      0.66       645
     Class 1       0.42      0.32      0.36       583
     Class 2       0.27      0.73      0.40       579
     Class 3       0.55      0.25      0.35       650
     Class 4       0.79      0.38      0.51       543

    accuracy                           0.45      3000
   macro avg       0.57      0.45      0.46      3000
weighted avg       0.57      0.45      0.46      3000


 Improved 4-shot results saved to Google Drive





### 4-Shot Prompting on Amazon

Apply the same 4-shot approach to Amazon to test cross-domain consistency.




In [None]:
def select_few_shot_examples(train_split, n_shots=4, label_field='label'):
    """Select diverse examples for few-shot prompting."""
    examples = []
    train_data = list(train_split)

    num_classes = 5
    samples_per_class = n_shots // num_classes
    remaining = n_shots % num_classes

    for label in range(num_classes):
        class_samples = [s for s in train_data if s[label_field] == label]
        n_samples = samples_per_class + (1 if label < remaining else 0)
        if n_samples > 0 and len(class_samples) > 0:
            sampled = np.random.choice(len(class_samples),
                                      min(n_samples, len(class_samples)),
                                      replace=False)
            examples.extend([class_samples[i] for i in sampled])

    return examples[:n_shots]

def create_few_shot_prompt(text, examples, label_field='label'):
    """Create a few-shot prompt with example demonstrations."""
    prompt = """Classify the sentiment of reviews on a scale of 1 to 5:
1 = Very Negative
2 = Negative
3 = Neutral
4 = Positive
5 = Very Positive

Here are some examples:

"""

    for i, ex in enumerate(examples):
        example_text = ex['text']
        example_label = ex[label_field] + 1
        prompt += f"Review: {example_text}\nSentiment: {example_label}\n\n"

    prompt += f"Now classify this review:\n\nReview: {text}\nSentiment:"

    return prompt

# Select 4-shot examples from Amazon training set
np.random.seed(42)
amazon_4shot_examples = select_few_shot_examples(amazon_splits['train'], n_shots=4)

print("Selected 4-shot examples for Amazon:")
for i, ex in enumerate(amazon_4shot_examples):
    label = ex['label']
    text_preview = ex['text'][:100] + "..." if len(ex['text']) > 100 else ex['text']
    print(f"\nExample {i+1} - Label: {label} (Sentiment: {label+1})")
    print(f"Text: {text_preview}")

Selected 4-shot examples for Amazon:

Example 1 - Label: 0 (Sentiment: 1)
Text: I was aware this was an import. When I received my DVD I was unable to play it because it is not num...

Example 2 - Label: 1 (Sentiment: 2)
Text: I was expecting a better movie. The plot was a little thin and the movie was boring. The movie is de...

Example 3 - Label: 2 (Sentiment: 3)
Text: The Honeywell C7089U1006 Outdoor Temperature Sensor was easy to hook up and install, but a little mo...

Example 4 - Label: 3 (Sentiment: 4)
Text: These are nice canisters, but we have granite countertops, so some anti-abrasive stuff on the bottom...


In [None]:
# Test the few-shot prompt on Amazon
sample_text = amazon_splits['test'][0]['text']
sample_label = amazon_splits['test'][0]['label']

few_shot_prompt = create_few_shot_prompt(sample_text, amazon_4shot_examples)
print("=== Sample 4-Shot Prompt (Amazon) ===")
print(few_shot_prompt[:1000] + "..." if len(few_shot_prompt) > 1000 else few_shot_prompt)
print(f"\n\nTrue label: {sample_label + 1}")

# Test generation
response = generate_prediction(few_shot_prompt, max_new_tokens=15)
print("\n=== Model Response ===")
print(response)
print(f"\nExtracted: {extract_rating_improved(response)}")

=== Sample 4-Shot Prompt (Amazon) ===
Classify the sentiment of reviews on a scale of 1 to 5:
1 = Very Negative
2 = Negative
3 = Neutral
4 = Positive
5 = Very Positive

Here are some examples:

Review: I was aware this was an import. When I received my DVD I was unable to play it because it is not numbered for my area.When I received my DVD the envelope said Belgium. So I do not have a playable video.
Sentiment: 1

Review: I was expecting a better movie. The plot was a little thin and the movie was boring. The movie is definitely geared to an adult audience.
Sentiment: 2

Review: The Honeywell C7089U1006 Outdoor Temperature Sensor was easy to hook up and install, but a little more $$ than I really should have paid, I put this in when I installed the new programmable thermostat. Now I have a reference to the outside temp at the thermostat. I don't know if they make a compatible cordless remote, which would be easier, but this one, so far seems accurate and operable. Plan on splicing the

### Evaluate 4-Shot on Amazon Test Set

In [None]:
# Full evaluation on Amazon test set with 4-shot
print("\n" + "="*70)
print("AMAZON 4-SHOT EVALUATION (WITH IMPROVED EXTRACTION)")
print("="*70)

predictions = []
true_labels = []
failed_parses = 0

print(f"\nEvaluating Amazon Test few-shot (4 examples) with improved extraction...")

for i in tqdm(range(len(amazon_splits['test']))):
    text = amazon_splits['test'][i]['text']
    true_label = amazon_splits['test'][i]['label']

    # Create few-shot prompt
    prompt = create_few_shot_prompt(text, amazon_4shot_examples)

    # Generate response with optimized token count
    response = generate_prediction(prompt, max_new_tokens=15)

    # Extract with IMPROVED function
    pred = extract_rating_improved(response)

    if pred is not None:
        predictions.append(pred - 1)
        true_labels.append(true_label)
    else:
        failed_parses += 1
        predictions.append(2)
        true_labels.append(true_label)

# Calculate metrics
accuracy = accuracy_score(true_labels, predictions)
f1_macro = f1_score(true_labels, predictions, average='macro')
f1_weighted = f1_score(true_labels, predictions, average='weighted')

print(f"\n{'='*50}")
print(f"Amazon Test Few-Shot (4-shot) Results")
print(f"{'='*50}")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 (Macro): {f1_macro:.4f}")
print(f"F1 (Weighted): {f1_weighted:.4f}")
print(f"Failed parses: {failed_parses}/3000")

print(f"\nClassification Report:")
print(classification_report(true_labels, predictions,
                            target_names=['Class 0', 'Class 1', 'Class 2', 'Class 3', 'Class 4'],
                            zero_division=0))

# Store results
amazon_4shot_results = {
    'accuracy': accuracy,
    'f1_macro': f1_macro,
    'f1_weighted': f1_weighted,
    'predictions': predictions,
    'true_labels': true_labels,
    'failed_parses': failed_parses,
    'n_shots': 4
}


AMAZON 4-SHOT EVALUATION (WITH IMPROVED EXTRACTION)

Evaluating Amazon Test few-shot (4 examples) with improved extraction...


100%|██████████| 3000/3000 [41:07<00:00,  1.22it/s]


Amazon Test Few-Shot (4-shot) Results
Accuracy: 0.2433
F1 (Macro): 0.1611
F1 (Weighted): 0.1620
Failed parses: 2718/3000

Classification Report:
              precision    recall  f1-score   support

     Class 0       0.67      0.02      0.05       599
     Class 1       0.32      0.04      0.07       629
     Class 2       0.21      0.94      0.34       607
     Class 3       0.49      0.06      0.11       548
     Class 4       0.83      0.14      0.24       617

    accuracy                           0.24      3000
   macro avg       0.50      0.24      0.16      3000
weighted avg       0.50      0.24      0.16      3000






## LoRA Fine-Tuning on Yelp

Fine-tune the model using LoRA (Low-Rank Adaptation) with different rank values to compare parameter efficiency.

**Why LoRA?**
- LoRA adapts only a small subset of parameters
- Efficient training on consumer GPUs

**I'll test:**
- r=8 (standard, efficient)

### Prepare Data for Fine-Tuning

Format the training data for the model with proper instruction templates.

In [None]:
# Format training data
def format_instruction(example):
    text = example['text']
    label = example['label'] + 1  # Convert 0-4 to 1-5

    prompt = f"""Classify the sentiment of the following review on a scale of 1 to 5:
1 = Very Negative
2 = Negative
3 = Neutral
4 = Positive
5 = Very Positive

Review: {text}

Sentiment (1-5): {label}"""

    return {"formatted_text": prompt}

print("Formatting datasets...")
train_dataset = yelp_splits['train'].map(format_instruction)
val_dataset = yelp_splits['val'].map(format_instruction)

print(f"Formatted {len(train_dataset)} train, {len(val_dataset)} val samples")

Formatting datasets...
Formatted 5000 train, 1000 val samples


In [None]:
# Tokenize with SHORT sequences to save memory
def tokenize_function(examples):
    tokenized = tokenizer(
        examples['formatted_text'],
        truncation=True,
        max_length=128,  # Very short to save memory
        padding='max_length',
    )
    tokenized['labels'] = tokenized['input_ids'].copy()
    return tokenized

print("Tokenizing...")
tokenized_train = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names,
    desc="Tokenizing train"
)

tokenized_val = val_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=val_dataset.column_names,
    desc="Tokenizing val"
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Tokenizing...


Tokenizing val:   0%|          | 0/1000 [00:00<?, ? examples/s]

### Configure LoRA for Fine-Tuning

Set up LoRA configuration for efficient parameter adaptation.

In [None]:
import gc
torch.cuda.empty_cache()
gc.collect()

# Reload base model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# Enable gradient checkpointing BEFORE adding LoRA
model.gradient_checkpointing_enable()
model.enable_input_require_grads()
model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Now create LoRA model
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model_lora = get_peft_model(model, lora_config)

# Verify trainability
trainable = sum(p.numel() for p in model_lora.parameters() if p.requires_grad)
total = sum(p.numel() for p in model_lora.parameters())
print(f"\nTrainable: {trainable:,} ({100*trainable/total:.2f}%)")

# Check that some parameters actually require grad
for name, param in model_lora.named_parameters():
    if param.requires_grad:
        print(f"Found trainable param: {name}")
        break


Trainable: 3,194,880 (0.12%)
Found trainable param: base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight


In [None]:
# memory-optimized training settings
training_args = TrainingArguments(
    output_dir=f"/content/drive/MyDrive/yelp_lora_r8",
    num_train_epochs=2,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    warmup_steps=50,
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    fp16=True,
    optim="adamw_torch",
    report_to="none",
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model_lora,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
)

print("\n Starting training...")
train_result = trainer.train()

print(f"\n Training complete")
print(f"Final train loss: {train_result.training_loss:.4f}")

# Save model
model_lora.save_pretrained("/content/drive/MyDrive/yelp_lora_r8_final")
tokenizer.save_pretrained("/content/drive/MyDrive/yelp_lora_r8_final")
print("Model saved to Drive")

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)



 Starting training...


Epoch,Training Loss,Validation Loss
1,1.7136,1.722168
2,1.6637,1.71982



 Training complete
Final train loss: 1.7174
Model saved to Drive


### Evaluate Fine-Tuned Model on Yelp Test Set

Test the LoRA r=8 model on Yelp test data.

In [None]:
# Improved extraction function (same as before)
import re

def extract_rating_improved(response_text):
    """
    Improved extraction that handles various formats.
    """
    cleaned = response_text.replace('**', '').replace('*', '').strip()

    # Strategy 1: Look for patterns
    patterns = [
        r'[Ss]entiment[:\s]+([1-5])',
        r'[Rr]ating[:\s]+([1-5])',
        r'^([1-5])\s*$',
        r'^([1-5])\s*\n',
    ]

    for pattern in patterns:
        match = re.search(pattern, cleaned)
        if match:
            return int(match.group(1))

    # Strategy 2: Find ANY number 1-5
    matches = re.findall(r'\b([1-5])\b', cleaned)
    if matches:
        return int(matches[-1])  # Take last occurrence

    return None

def evaluate_finetuned(model, tokenizer, dataset_split, dataset_name, num_samples=None):
    """
    Evaluate fine-tuned model performance.
    """
    if num_samples is None:
        num_samples = len(dataset_split)

    predictions = []
    true_labels = []
    failed_parses = 0

    print(f"\nEvaluating {dataset_name} on {num_samples} samples...")

    model.eval()

    for i in tqdm(range(num_samples)):
        text = dataset_split[i]['text']
        true_label = dataset_split[i]['label']

        # Create prompt (same format as training)
        prompt = f"""Classify the sentiment of the following review on a scale of 1 to 5:
1 = Very Negative
2 = Negative
3 = Neutral
4 = Positive
5 = Very Positive

Review: {text}

Sentiment (1-5):"""

        # Generate prediction
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=10,
                temperature=0.1,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

        response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

        # Extract rating
        pred = extract_rating_improved(response)

        if pred is not None:
            predictions.append(pred - 1)  # Convert to 0-4
            true_labels.append(true_label)
        else:
            failed_parses += 1
            predictions.append(2)
            true_labels.append(true_label)

    # Calculate metrics
    accuracy = accuracy_score(true_labels, predictions)
    f1_macro = f1_score(true_labels, predictions, average='macro')
    f1_weighted = f1_score(true_labels, predictions, average='weighted')

    print(f"\n{'='*50}")
    print(f"{dataset_name} Fine-Tuned (LoRA r=8) Results")
    print(f"{'='*50}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 (Macro): {f1_macro:.4f}")
    print(f"F1 (Weighted): {f1_weighted:.4f}")
    print(f"Failed parses: {failed_parses}/{num_samples}")
    print(f"\nClassification Report:")
    print(classification_report(true_labels, predictions,
                                target_names=['Class 0', 'Class 1', 'Class 2', 'Class 3', 'Class 4'],
                                zero_division=0))

    return {
        'accuracy': accuracy,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'predictions': predictions,
        'true_labels': true_labels,
        'failed_parses': failed_parses
    }

# Evaluate on Yelp test set
print("\n" + "="*70)
print("EVALUATION: LoRA r=8 on Yelp Test")
print("="*70)
yelp_finetuned_results = evaluate_finetuned(model_lora, tokenizer, yelp_splits['test'], "Yelp Test")


EVALUATION: LoRA r=8 on Yelp Test

Evaluating Yelp Test on 3000 samples...


100%|██████████| 3000/3000 [39:10<00:00,  1.28it/s]


Yelp Test Fine-Tuned (LoRA r=8) Results
Accuracy: 0.6733
F1 (Macro): 0.6739
F1 (Weighted): 0.6741
Failed parses: 146/3000

Classification Report:
              precision    recall  f1-score   support

     Class 0       0.82      0.75      0.78       645
     Class 1       0.64      0.65      0.64       583
     Class 2       0.58      0.64      0.61       579
     Class 3       0.64      0.59      0.61       650
     Class 4       0.70      0.76      0.73       543

    accuracy                           0.67      3000
   macro avg       0.68      0.67      0.67      3000
weighted avg       0.68      0.67      0.67      3000






In [None]:
# Save results
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

with open(f'/content/drive/MyDrive/yelp_finetuned_r8_results_{timestamp}.json', 'w') as f:
    json.dump({
        'dataset': 'Yelp',
        'method': 'lora_finetuned',
        'lora_rank': 8,
        'timestamp': timestamp,
        'num_samples': 3000,
        'accuracy': yelp_finetuned_results['accuracy'],
        'f1_macro': yelp_finetuned_results['f1_macro'],
        'f1_weighted': yelp_finetuned_results['f1_weighted'],
        'failed_parses': yelp_finetuned_results['failed_parses'],
        'train_loss': train_result.training_loss
    }, f, indent=2)

with open(f'/content/drive/MyDrive/yelp_finetuned_r8_full_{timestamp}.pkl', 'wb') as f:
    pickle.dump(yelp_finetuned_results, f)

print("Results saved")

Results saved


### Evaluate on Amazon Test Set (Transfer Learning)

Test whether the Yelp-trained model generalizes to Amazon reviews.

In [None]:
# Evaluate on Amazon test set (cross-domain transfer)
print("\n" + "="*70)
print("EVALUATION: LoRA r=8 on Amazon Test (Transfer Learning)")
print("="*70)
amazon_finetuned_results = evaluate_finetuned(model_lora, tokenizer, amazon_splits['test'], "Amazon Test (Transfer)")

# Save Amazon results
with open(f'/content/drive/MyDrive/amazon_finetuned_transfer_{timestamp}.json', 'w') as f:
    json.dump({
        'dataset': 'Amazon',
        'method': 'lora_finetuned_transfer',
        'lora_rank': 8,
        'trained_on': 'Yelp',
        'timestamp': timestamp,
        'num_samples': 3000,
        'accuracy': amazon_finetuned_results['accuracy'],
        'f1_macro': amazon_finetuned_results['f1_macro'],
        'f1_weighted': amazon_finetuned_results['f1_weighted'],
        'failed_parses': amazon_finetuned_results['failed_parses']
    }, f, indent=2)

print(" Amazon transfer results saved")


EVALUATION: LoRA r=8 on Amazon Test (Transfer Learning)

Evaluating Amazon Test (Transfer) on 3000 samples...


100%|██████████| 3000/3000 [39:07<00:00,  1.28it/s]


Amazon Test (Transfer) Fine-Tuned (LoRA r=8) Results
Accuracy: 0.6043
F1 (Macro): 0.6019
F1 (Weighted): 0.6033
Failed parses: 1/3000

Classification Report:
              precision    recall  f1-score   support

     Class 0       0.70      0.69      0.70       599
     Class 1       0.53      0.61      0.57       629
     Class 2       0.56      0.46      0.51       607
     Class 3       0.52      0.53      0.52       548
     Class 4       0.71      0.72      0.71       617

    accuracy                           0.60      3000
   macro avg       0.60      0.60      0.60      3000
weighted avg       0.61      0.60      0.60      3000

 Amazon transfer results saved





### Final Results Comparison

Complete comparison of all approaches across both datasets.

In [None]:
# Load all saved results from Google Drive
import json
import pickle
from glob import glob

# Find the most recent result files
def load_latest_json(pattern):
    files = glob(pattern)
    if not files:
        raise FileNotFoundError(f"No files found matching: {pattern}")
    latest = max(files, key=lambda x: x.split('_')[-1])
    with open(latest, 'r') as f:
        return json.load(f)

# Load Yelp results
yelp_zero_shot_data = load_latest_json('/content/drive/MyDrive/yelp_zero_shot_results_*.json')
yelp_4shot_data = load_latest_json('/content/drive/MyDrive/yelp_4shot_improved_*.json')
yelp_finetuned_data = load_latest_json('/content/drive/MyDrive/yelp_finetuned_r8_results_*.json')

# Load Amazon results
amazon_zero_shot_data = load_latest_json('/content/drive/MyDrive/amazon_zero_shot_results_*.json')
amazon_4shot_data = load_latest_json('/content/drive/MyDrive/amazon_4shot_results_*.json')
amazon_finetuned_data = load_latest_json('/content/drive/MyDrive/amazon_finetuned_transfer_*.json')

# Display final comparison
print("="*70)
print("FINAL RESULTS: ALL METHODS")
print("="*70)

print("\n YELP RESULTS:")
print(f"{'Method':<25} {'Accuracy':<12} {'F1 (Macro)':<12} {'Failed Parses'}")
print("-" * 65)
print(f"{'Zero-shot':<25} {yelp_zero_shot_data['accuracy']:<12.4f} {yelp_zero_shot_data['f1_macro']:<12.4f} {yelp_zero_shot_data['failed_parses']}")
print(f"{'4-shot':<25} {yelp_4shot_data['accuracy']:<12.4f} {yelp_4shot_data['f1_macro']:<12.4f} {yelp_4shot_data['failed_parses']}")
print(f"{'Fine-tuned (LoRA r=8)':<25} {yelp_finetuned_data['accuracy']:<12.4f} {yelp_finetuned_data['f1_macro']:<12.4f} {yelp_finetuned_data['failed_parses']}")

print("\n AMAZON RESULTS:")
print(f"{'Method':<25} {'Accuracy':<12} {'F1 (Macro)':<12} {'Failed Parses'}")
print("-" * 65)
print(f"{'Zero-shot':<25} {amazon_zero_shot_data['accuracy']:<12.4f} {amazon_zero_shot_data['f1_macro']:<12.4f} {amazon_zero_shot_data['failed_parses']}")
print(f"{'4-shot':<25} {amazon_4shot_data['accuracy']:<12.4f} {amazon_4shot_data['f1_macro']:<12.4f} {amazon_4shot_data['failed_parses']}")
print(f"{'Fine-tuned (transfer)':<25} {amazon_finetuned_data['accuracy']:<12.4f} {amazon_finetuned_data['f1_macro']:<12.4f} {amazon_finetuned_data['failed_parses']}")

print("\n" + "="*70)
print(" KEY FINDINGS")
print("="*70)

yelp_improvement = (yelp_finetuned_data['accuracy'] - yelp_zero_shot_data['accuracy']) * 100
amazon_improvement = (amazon_finetuned_data['accuracy'] - amazon_zero_shot_data['accuracy']) * 100

print(f"\n1. Fine-Tuning vs Prompting")
print(f"   Yelp:   {yelp_zero_shot_data['accuracy']:.1%} → {yelp_finetuned_data['accuracy']:.1%} ({yelp_improvement:+.1f}%)")
print(f"   Amazon: {amazon_zero_shot_data['accuracy']:.1%} → {amazon_finetuned_data['accuracy']:.1%} ({amazon_improvement:+.1f}%)")
print(f"   Fine-tuning provides substantial improvement on both datasets")

print(f"\n2. Cross-Domain Generalization")
print(f"   Model trained on Yelp achieved {amazon_finetuned_data['accuracy']:.1%} on Amazon")
print(f"   Strong transfer learning - +{amazon_improvement:.1f}% over zero-shot")

print(f"\n3. Few-Shot Prompting Failure (Consistent Pattern)")
print(f"   Zero-shot: {yelp_zero_shot_data['accuracy']:.1%}")
print(f"   Yelp:")
print(f"     Zero-shot: {yelp_zero_shot_data['accuracy']:.1%} | 4-shot: {yelp_4shot_data['accuracy']:.1%} ({(yelp_4shot_data['accuracy']-yelp_zero_shot_data['accuracy'])*100:.1f}%)")
print(f"   Amazon:")
print(f"     Zero-shot: {amazon_zero_shot_data['accuracy']:.1%} | 4-shot: {amazon_4shot_data['accuracy']:.1%} ({(amazon_4shot_data['accuracy']-amazon_zero_shot_data['accuracy'])*100:.1f}%)")
print(f"   Few-shot examples consistently hurt performance across both datasets")
print(f"   Gemma-2-2B struggles with complex prompts (high failed parse rates)")

print(f"\n4. Output Reliability")
print(f"   Method          Yelp Failed    Amazon Failed")
print(f"   Zero-shot       {yelp_zero_shot_data['failed_parses']:>4}/3000     {amazon_zero_shot_data['failed_parses']:>4}/3000")
print(f"   4-shot          {yelp_4shot_data['failed_parses']:>4}/3000     {amazon_4shot_data['failed_parses']:>4}/3000")
print(f"   Fine-tuned      {yelp_finetuned_data['failed_parses']:>4}/3000     {amazon_finetuned_data['failed_parses']:>4}/3000")
print(f"   Fine-tuned model produces dramatically more consistent output")

print("\n" + "="*70)
print("CONCLUSION")
print("="*70)
print("Fine-tuning with LoRA clearly justifies the additional complexity:")
print(f"• +{yelp_improvement:.1f}% improvement on in-domain data (Yelp)")
print(f"• +{amazon_improvement:.1f}% improvement on out-of-domain data (Amazon)")
print("• Strong cross-domain generalization")
print("• Dramatically more reliable output (low failed parse rates)")
print("• Few-shot prompting consistently underperforms zero-shot")
print("• Training time: ~90 minutes for 5,000 samples")
print("• Memory efficient: LoRA adds only 0.12% trainable parameters")
print("=" * 70)

FINAL RESULTS: ALL METHODS

 YELP RESULTS:
Method                    Accuracy     F1 (Macro)   Failed Parses
-----------------------------------------------------------------
Zero-shot                 0.5293       0.4872       6
4-shot                    0.4467       0.4553       1331
Fine-tuned (LoRA r=8)     0.6733       0.6739       146

 AMAZON RESULTS:
Method                    Accuracy     F1 (Macro)   Failed Parses
-----------------------------------------------------------------
Zero-shot                 0.4270       0.3675       10
4-shot                    0.2433       0.1611       2718
Fine-tuned (transfer)     0.6043       0.6019       1

 KEY FINDINGS

1. Fine-Tuning vs Prompting
   Yelp:   52.9% → 67.3% (+14.4%)
   Amazon: 42.7% → 60.4% (+17.7%)
   Fine-tuning provides substantial improvement on both datasets

2. Cross-Domain Generalization
   Model trained on Yelp achieved 60.4% on Amazon
   Strong transfer learning - +17.7% over zero-shot

3. Few-Shot Prompting Failure 