# Model Evaluation with SageMaker Processing Job

This notebook evaluates the fine-tuned Qwen3-0.6B model on the test data using a SageMaker Processing job.

## 1. Setup and Import Libraries

In [None]:
import os
import boto3
import sagemaker
from sagemaker.processing import ProcessingInput, ProcessingOutput, ScriptProcessor
from sagemaker.pytorch import PyTorchProcessor
from datetime import datetime
import json

## 2. Configure SageMaker Session and Parameters

In [None]:
# Session and execution role shared by every later cell.
sagemaker_session = sagemaker.Session()
role = "arn:aws:iam::637423390840:role/WSParticipantRole"  # hard-coded ARN: replace with your own role

# S3 location: the session's default bucket plus a project-specific key prefix.
bucket = sagemaker_session.default_bucket()
prefix = "qwen3-0-6-lora-samples"

# Name of the training job whose model artifacts will be evaluated.
# UPDATE THIS to the job name produced by the previous (training) notebook.
training_job_name = "qwen3-0-6b-lora-fine-tuning-lora-2025-08-31-12-28-29"

# Echo the configuration so a wrong bucket/prefix/job name is noticed early.
print(f"Using bucket: {bucket}")
print(f"Using prefix: {prefix}")
print(f"Evaluating model from training job: {training_job_name}")

## 3. Upload Test Data to S3

In [None]:
# Upload the local test set (samples/test.jsonl) under <prefix>/data/test in
# the bucket. The value returned by upload_data is kept in `test_s3_uri` and
# later passed to the processing job as the "test" input.
print("Uploading test.jsonl data to S3...")
test_s3_uri = sagemaker_session.upload_data(
    path='samples/test.jsonl',
    bucket=bucket,
    key_prefix=f'{prefix}/data/test'
)
print(f"Test data uploaded to: {test_s3_uri}")

## 4. Get Model S3 Location from Training Job

In [None]:
# Look up the training job by name and read where its model artifacts live.
sm_client = boto3.client('sagemaker')
training_job = sm_client.describe_training_job(TrainingJobName=training_job_name)

# S3 URI of the model artifacts; passed to the processing job as the "model" input.
# NOTE(review): the job status is not checked here - confirm the training job
# has completed before running the evaluation.
model_s3_uri = training_job['ModelArtifacts']['S3ModelArtifacts']
print(f"Model artifacts location: {model_s3_uri}")

## 5. Create Evaluation Script

In [None]:
# Directory that is shipped to the processing job as `source_dir`.
# NOTE(review): the script below imports transformers, peft and tqdm; confirm
# the processing image provides them, or place a requirements.txt next to
# evaluate.py in this directory.
os.makedirs('src/evaluation', exist_ok=True)

# CPU-only evaluation script, kept as a string and written to
# src/evaluation/evaluate.py below.
#
# Inside this literal a doubled backslash ("\\n") becomes a single escape
# ("\n") in the generated file, and docstrings must use double quotes so they
# do not terminate the surrounding triple-single-quoted string.
evaluation_script = '''#!/usr/bin/env python3
"""Evaluate a LoRA fine-tuned Qwen3-0.6B adapter on a JSONL test set (CPU only).

Reads the model archive and test data from the SageMaker Processing input
directories and writes evaluation_metrics.json and sample_predictions.json
to the processing output directory.
"""
import json
import os
import tarfile

import numpy as np
import torch
from peft import PeftModel
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_DIR = "/opt/ml/processing/model"
TEST_DATA_PATH = "/opt/ml/processing/input/test/test.jsonl"
OUTPUT_DIR = "/opt/ml/processing/output"
BASE_MODEL_NAME = "Qwen/Qwen3-0.6B"
SEED = 42


class TestDataset(Dataset):
    """Tokenized view over a JSONL file holding one JSON object per line.

    Each object is expected to carry the prompt under the "text" key; a
    missing key is treated as an empty prompt.
    """

    def __init__(self, file_path, tokenizer, max_length=256):
        self.data = []
        self.tokenizer = tokenizer
        self.max_length = max_length

        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                # Blank lines (for example a trailing newline) are not JSON.
                if not line:
                    continue
                self.data.append(json.loads(line))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        text = item.get("text", "")

        inputs = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # The tokenizer returns a leading batch axis of size 1; drop it so the
        # DataLoader can add its own.
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "text": text,
        }


def calculate_similarity(predictions, references):
    """Return the mean fraction of reference words that occur in the prediction.

    Words are lower-cased and split on whitespace. A pair whose reference has
    no words scores 0. Returns 0.0 when there are no pairs at all.
    """
    similarities = []
    for pred, ref in zip(predictions, references):
        ref_words = set(ref.lower().split())
        if not ref_words:
            similarities.append(0.0)
            continue
        pred_words = set(pred.lower().split())
        similarities.append(len(pred_words & ref_words) / len(ref_words))
    return float(np.mean(similarities)) if similarities else 0.0


def mean_word_count(texts):
    """Return the average whitespace-separated word count, or 0.0 if empty."""
    if not texts:
        return 0.0
    return float(np.mean([len(text.split()) for text in texts]))


def evaluate_model():
    """Generate a continuation per test sample and write metrics to OUTPUT_DIR.

    Returns the metrics dictionary that is also saved as
    evaluation_metrics.json.
    """
    print("Extracting model artifacts...")
    with tarfile.open(os.path.join(MODEL_DIR, "model.tar.gz"), "r:gz") as tar:
        # The "data" filter refuses absolute paths, parent-directory escapes
        # and special files while unpacking.
        tar.extractall(MODEL_DIR, filter="data")

    print("Loading tokenizer and base model...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, trust_remote_code=True)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Prompts are padded to a fixed length. A decoder-only model must be
    # padded on the left so generation continues from the last real token
    # rather than from padding.
    tokenizer.padding_side = "left"

    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_NAME,
        torch_dtype=torch.float32,  # full precision for CPU inference
        device_map="cpu",
        trust_remote_code=True,
    )

    print("Loading LoRA adapter...")
    model = PeftModel.from_pretrained(base_model, MODEL_DIR)
    model.eval()

    print("Loading test data...")
    test_dataset = TestDataset(TEST_DATA_PATH, tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

    print("Starting evaluation...")
    all_predictions = []
    all_references = []

    # Generation samples tokens; a fixed seed keeps the metrics reproducible.
    torch.manual_seed(SEED)

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]

            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=64,  # kept small for CPU throughput
                temperature=0.7,
                do_sample=True,
                top_p=0.95,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

            # generate() returns prompt + continuation. Slice by token count:
            # the decoded prompt need not match the raw text character for
            # character (truncation, normalisation), so slicing the decoded
            # string by len(text) is unreliable.
            prompt_length = input_ids.shape[1]
            generated_text = tokenizer.decode(
                outputs[0][prompt_length:], skip_special_tokens=True
            ).strip()

            all_predictions.append(generated_text)
            all_references.append(batch["text"][0])

    num_samples = len(all_predictions)
    avg_pred_length = mean_word_count(all_predictions)
    avg_ref_length = mean_word_count(all_references)
    text_similarity = calculate_similarity(all_predictions, all_references)

    metrics = {
        "text_similarity": text_similarity,
        "avg_prediction_length": avg_pred_length,
        "avg_reference_length": avg_ref_length,
        "num_samples": num_samples,
        "model_location": MODEL_DIR,
        "evaluation_type": "cpu_based",
    }

    print("\\nEvaluation Results:")
    print(f"Text Similarity: {text_similarity:.4f}")
    print(f"Average Prediction Length: {avg_pred_length:.2f} words")
    print(f"Average Reference Length: {avg_ref_length:.2f} words")
    print(f"Number of samples: {num_samples}")

    with open(os.path.join(OUTPUT_DIR, "evaluation_metrics.json"), "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)

    # Keep the first few pairs for manual inspection.
    samples = [
        {"prediction": pred, "reference": ref}
        for pred, ref in zip(all_predictions[:5], all_references[:5])
    ]

    with open(os.path.join(OUTPUT_DIR, "sample_predictions.json"), "w", encoding="utf-8") as f:
        json.dump(samples, f, indent=2, ensure_ascii=False)

    print("\\nEvaluation completed successfully!")
    return metrics


if __name__ == "__main__":
    evaluate_model()
'''

# Explicit encoding so the generated file does not depend on the local locale.
with open('src/evaluation/evaluate.py', 'w', encoding='utf-8') as f:
    f.write(evaluation_script)

print("CPU-optimized evaluation script created successfully")

## 6. Create and Run Processing Job

In [None]:
# Create PyTorchProcessor - CPU based evaluation to avoid g5 quota issues
# CPU-based processor for the evaluation job (ml.m5.xlarge is used instead of
# ml.g5.2xlarge to avoid GPU quota issues).
processor = PyTorchProcessor(
    framework_version="2.6.0",
    py_version="py312",
    role=role,
    instance_type="ml.m5.xlarge",
    instance_count=1,
    base_job_name="qwen3-evaluation",
    sagemaker_session=sagemaker_session,
)

# A timestamped job name keeps each run's results under its own S3 prefix.
timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
processing_job_name = f"qwen3-evaluation-{timestamp}"

# Built once so the job's output destination and the message printed below
# cannot drift apart.
evaluation_output_uri = f"s3://{bucket}/{prefix}/evaluation/{processing_job_name}"

print(f"Starting processing job: {processing_job_name}")
processor.run(
    code="evaluate.py",
    source_dir="src/evaluation",
    inputs=[
        # Model artifacts from the training job; evaluate.py reads them from
        # /opt/ml/processing/model.
        ProcessingInput(
            source=model_s3_uri,
            destination="/opt/ml/processing/model",
            input_name="model"
        ),
        # Test set uploaded earlier; evaluate.py reads test.jsonl from here.
        ProcessingInput(
            source=test_s3_uri,
            destination="/opt/ml/processing/input/test",
            input_name="test"
        )
    ],
    outputs=[
        # Everything evaluate.py writes to /opt/ml/processing/output is
        # uploaded to the run-specific prefix.
        ProcessingOutput(
            source="/opt/ml/processing/output",
            destination=evaluation_output_uri,
            output_name="evaluation_results"
        )
    ],
    job_name=processing_job_name,
    wait=False  # return immediately; a later cell polls the job status
)

# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n.
print(f"\nProcessing job '{processing_job_name}' has been submitted!")
print(f"Evaluation results will be saved to: {evaluation_output_uri}")

## 8. Monitor Processing Job and Get Results

In [None]:
# Poll the processing job until it reaches a terminal state.
# (Alternative: block on the SDK object with processor.jobs[-1].wait().)
import time

print("Waiting for processing job to complete...")
print("This may take 10-15 minutes...")

# `status` is read again by the results cell, so it stays a module-level name.
status = None
while status not in ('Completed', 'Failed', 'Stopped'):
    # Pause between polls, but not before the very first one.
    if status is not None:
        time.sleep(30)

    job_description = sm_client.describe_processing_job(ProcessingJobName=processing_job_name)
    status = job_description['ProcessingJobStatus']
    print(f"Job status: {status}")

## 9. Load and Display Evaluation Results

In [None]:
# Download evaluation results
# Fetch the metrics file written by the processing job and display it.
s3_client = boto3.client('s3')

# Defined up front so later cells can test for None instead of hitting a
# NameError when the job failed or the metrics could not be loaded.
evaluation_metrics = None

if status == 'Completed':
    # Key of the metrics file under the run-specific evaluation prefix.
    metrics_key = f"{prefix}/evaluation/{processing_job_name}/evaluation_metrics.json"

    try:
        response = s3_client.get_object(Bucket=bucket, Key=metrics_key)
        metrics = json.loads(response['Body'].read())

        # "\n" (not "\\n") so a blank line is printed rather than a literal
        # backslash-n.
        print("\n=== Evaluation Results ===")
        # The value is the word-overlap similarity, not a loss.
        print(f"Text Similarity: {metrics['text_similarity']:.4f}")
        print(f"Number of test samples: {metrics['num_samples']}")

        # Store metrics for the model registry decision.
        evaluation_metrics = metrics

    except Exception as e:
        # Best effort: report the problem and leave evaluation_metrics as None.
        print(f"Error loading metrics: {e}")
else:
    print(f"Processing job failed with status: {status}")
    # Surface the service-provided reason, when there is one, to aid debugging.
    failure_reason = job_description.get('FailureReason')
    if failure_reason:
        print(f"Failure reason: {failure_reason}")