## 1. Initialize SageMaker Environment

In [None]:
# Install dependencies if needed
import sys
import subprocess

def install_packages(packages):
    """Install every requirement in *packages* with pip, one at a time.

    Each requirement gets its own quiet ``pip install`` run under the
    interpreter that is executing this code. ``subprocess.check_call``
    raises ``CalledProcessError`` on a non-zero exit, which stops the loop
    at the first requirement that fails to install.
    """
    pip_install = [sys.executable, "-m", "pip", "install", "-q"]
    for requirement in packages:
        subprocess.check_call([*pip_install, requirement])

# Requirements for the whole notebook. The first five cover the SageMaker
# workflow; the rest are imported by the local test/evaluation cells further
# down (transformers, datasets, evaluate, torch) or needed by the ROUGE metric
# that evaluate.load("rouge") loads (rouge_score).
packages = [
    "sagemaker>=2.0",
    "peft",
    "boto3",
    "python-dotenv",
    "PyYAML",
    "torch",
    "transformers",
    "datasets",
    "evaluate",
    "rouge_score",
]
install_packages(packages)

print("✓ Packages installed")

In [None]:
# Import libraries
import logging
import os
import sys

from dotenv import load_dotenv

# Add src to path for imports (later cells import modules such as
# sagemaker_config directly by name).
sys.path.insert(0, './src')

# Load environment variables via python-dotenv
load_dotenv()

# Show INFO-level log records in the notebook output.
logging.basicConfig(level=logging.INFO)

print("✓ Imports complete")

In [None]:
# Initialize SageMaker session
# NOTE(review): sagemaker_config is presumably resolved from ./src, which an
# earlier cell puts on sys.path — run that cell first.
from sagemaker_config import initialize_sagemaker

# The four values below are reused by every later cell (upload, estimator,
# training launch, model download).
session, role, bucket, region = initialize_sagemaker()

print("✓ SageMaker initialized")
print(f"  Region: {region}")
print(f"  Bucket: {bucket}")
print(f"  Role: {role}")

## 2. Prepare Dataset

Run this section once to prepare the dataset and upload it to S3.

In [None]:
# Prepare dataset
import os
import sys

# Put the project root on sys.path so the `src` package import below resolves.
sys.path.insert(0, './')
from src.dataset_utils import load_dialogsum_subset, save_jsonl
import boto3  # used by the S3 upload cell that follows

# Load dataset
train_size = 125
val_size = 32

print(f"Loading DialogSum (train: {train_size}, val: {val_size})...")
train_data, val_data = load_dialogsum_subset(train_size, val_size)

# Create local directory
data_dir = "data/jsonl"
os.makedirs(data_dir, exist_ok=True)

# Save to JSONL. The paths are built with "/" so the printed values stay
# "data/jsonl/train.jsonl" and "data/jsonl/val.jsonl"; the upload cell reads
# train_path and val_path.
train_path = f"{data_dir}/train.jsonl"
val_path = f"{data_dir}/val.jsonl"

save_jsonl(train_data, train_path)
save_jsonl(val_data, val_path)

print(f"✓ Saved to {train_path} and {val_path}")

In [None]:
# Upload to S3
s3_prefix = "llm"
s3 = boto3.client("s3", region_name=region)

# Build each object key once so the upload and the printed URI cannot drift.
train_key = f"{s3_prefix}/train.jsonl"
val_key = f"{s3_prefix}/val.jsonl"

s3.upload_file(train_path, bucket, train_key)
s3.upload_file(val_path, bucket, val_key)

print("✓ Uploaded to S3:")
print(f"  s3://{bucket}/{train_key}")
print(f"  s3://{bucket}/{val_key}")

## 3. Configure and Launch Training Job

In [None]:
# Load configuration
import yaml

with open("config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Display configuration: each entry is a label plus the key path to follow
# inside the nested config mapping. Values are looked up lazily, one line at
# a time, in the order they are printed.
config_summary = (
    ("Model", ("model", "name")),
    ("Instance", ("training", "instance_type")),
    ("Epochs", ("training", "epochs")),
    ("Batch size", ("training", "batch_size")),
    ("Learning rate", ("training", "learning_rate")),
    ("LoRA rank", ("model", "peft", "r")),
)

print("Training Configuration:")
for label, key_path in config_summary:
    value = config
    for key in key_path:
        value = value[key]
    print(f"  {label}: {value}")

In [None]:
# Create SageMaker estimator
from sagemaker.huggingface import HuggingFace

model_name = config["model"]["name"]
training_cfg = config["training"]
peft_cfg = config["model"]["peft"]

# Managed spot training: max_wait only applies when spot instances are
# enabled, and SageMaker is expected to reject it otherwise, so it is
# forwarded conditionally.
# NOTE(review): with spot enabled, max_wait must be >= max_run (the SDK
# default for max_run is 24 h) — confirm the value in config.yaml or pass an
# explicit max_run.
use_spot = training_cfg["use_spot"]
spot_kwargs = {"max_wait": training_cfg["max_wait"]} if use_spot else {}

estimator = HuggingFace(
    entry_point="train.py",
    source_dir="src",
    instance_type=training_cfg["instance_type"],
    instance_count=training_cfg["instance_count"],
    role=role,
    # Reuse the session created by initialize_sagemaker() so the job runs in
    # the same region/account as the rest of the notebook.
    # NOTE(review): assumes `session` is a sagemaker.Session — confirm.
    sagemaker_session=session,
    transformers_version="4.36",
    pytorch_version="2.1",
    py_version="py310",
    use_spot_instances=use_spot,
    # Forwarded to src/train.py as command-line arguments.
    hyperparameters={
        "model-name": model_name,
        "num-epochs": training_cfg["epochs"],
        "batch-size": training_cfg["batch_size"],
        "learning-rate": training_cfg["learning_rate"],
        "lora-r": peft_cfg["r"],
        "lora-alpha": peft_cfg["lora_alpha"],
        "lora-dropout": peft_cfg["dropout"],
    },
    **spot_kwargs,
)

print("✓ Estimator created")

In [None]:
# Launch training job
print("🚀 Launching training job...")
print(f"Dataset: s3://{bucket}/{s3_prefix}/")
print("")

# One input channel per split, pointing at the JSONL files uploaded earlier.
# fit() blocks until the remote job finishes.
estimator.fit({
    "train": f"s3://{bucket}/{s3_prefix}/train.jsonl",
    "validation": f"s3://{bucket}/{s3_prefix}/val.jsonl"
})

print("\n✓ Training complete!")
print(f"Job name: {estimator.latest_training_job.name}")
# model_data is the S3 URI of the trained artifact. model_uri is only the
# optional pre-trained-model constructor argument and is None for this
# estimator.
print(f"Model URI: {estimator.model_data}")

## 4. Download and Test Model Locally

In [None]:
# Download model artifacts from S3
import os
import shutil
import tarfile

from sagemaker.s3 import S3Downloader

model_local_path = "./model_artifacts"
# Start from an empty directory so files from an earlier run cannot mix in.
if os.path.exists(model_local_path):
    shutil.rmtree(model_local_path)
os.makedirs(model_local_path)

# estimator.model_data is the S3 URI of the trained artifact
# (e.g., s3://bucket/path/output/model.tar.gz). estimator.model_uri is a
# constructor argument for a pre-trained model and is None here.
model_s3_uri = estimator.model_data
print(f"Downloading model from: {model_s3_uri}")

# Session.download_data() takes (path, bucket, key_prefix) and has no
# source/target_dir parameters; S3Downloader accepts the full s3:// URI.
model_data = S3Downloader.download(
    s3_uri=model_s3_uri,
    local_path=model_local_path,
    sagemaker_session=session,
)

# from_pretrained() needs the unpacked files, not the tarball.
# NOTE(review): whether the archive holds a full model or only LoRA adapter
# weights depends on src/train.py — confirm before loading it with
# AutoModelForSeq2SeqLM.
archive_path = os.path.join(model_local_path, os.path.basename(model_s3_uri))
with tarfile.open(archive_path, "r:gz") as archive:
    # The archive is produced by our own training job, so it is trusted input.
    archive.extractall(model_local_path)

print(f"✓ Model downloaded to: {model_local_path}")

In [None]:
# Test model on sample dialogue
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig
from datasets import load_dataset

# Load the knkarthick/dialogsum dataset and pick one fixed test example;
# `dataset`, `dialogue` and `human_summary` are reused by later cells.
dataset = load_dataset("knkarthick/dialogsum")
test_sample = dataset["test"][200]

dialogue = test_sample["dialogue"]
human_summary = test_sample["summary"]

# Preview only the first 300 characters of the dialogue.
print("Sample Dialogue:")
print(60 * "-")
print(f"{dialogue[:300]}...")
print(f"\n{60 * '='}")

In [None]:
# Load trained model for inference
print("Loading model...")
model_path = model_local_path  # Path where model was downloaded
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path, torch_dtype=torch.float32)

# Generate summary for the sample dialogue selected in the previous cell.
prompt = f"Summarize the following conversation:\n\n{dialogue}\n\nSummary:"
inputs = tokenizer(prompt, return_tensors="pt")

# Generation settings: num_beams=1 (no beam search), at most 200 new tokens.
generation_kwargs = {"max_new_tokens": 200, "num_beams": 1}
outputs = model.generate(inputs["input_ids"], **generation_kwargs)

model_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Display results: reference summary first, then the model's output.
divider = "=" * 60
for line in (
    "Human Summary:",
    human_summary,
    "\n" + divider,
    "Model Generated Summary:",
    model_summary,
    divider,
):
    print(line)

## 5. Evaluate on Test Set

In [None]:
# Evaluate on the first few test samples
import evaluate

rouge = evaluate.load("rouge")

# Single knob for the evaluation size; every message below derives from it.
num_eval_samples = 10

# Slice the test split once and read both columns from the same batch.
eval_batch = dataset["test"][:num_eval_samples]
test_dialogues = eval_batch["dialogue"]
test_summaries = eval_batch["summary"]
total = len(test_dialogues)

predictions = []

print(f"Generating predictions on {total} samples...")
# Loop-local names carry an eval_ prefix so the sample `dialogue`, `prompt`,
# `inputs` and `outputs` from the single-example cell are not overwritten.
for idx, eval_dialogue in enumerate(test_dialogues, start=1):
    eval_prompt = f"Summarize the following conversation:\n\n{eval_dialogue}\n\nSummary:"
    eval_inputs = tokenizer(eval_prompt, return_tensors="pt")

    eval_outputs = model.generate(eval_inputs["input_ids"], max_new_tokens=200)
    prediction = tokenizer.decode(eval_outputs[0], skip_special_tokens=True)
    predictions.append(prediction)

    # Progress line every five samples.
    if idx % 5 == 0:
        print(f"  Processed {idx}/{total}")

# Compute ROUGE
results = rouge.compute(
    predictions=predictions,
    references=test_summaries,
    use_aggregator=True,
    use_stemmer=True
)

print("\n" + "=" * 60)
print(f"ROUGE Evaluation Results ({total} test samples)")
print("=" * 60)
print(f"ROUGE-1: {results['rouge1']:.4f}")
print(f"ROUGE-2: {results['rouge2']:.4f}")
print(f"ROUGE-L: {results['rougeL']:.4f}")
print("=" * 60)

## 6. Optional: Deploy Endpoint

Uncomment to deploy a real-time inference endpoint (costs ~$0.05/hour).

In [None]:
# # Deploy endpoint
# print("Deploying endpoint...")
# predictor = estimator.deploy(
#     instance_type="ml.m5.xlarge",
#     initial_instance_count=1,
#     endpoint_name="flan-t5-dialogsum-endpoint"
# )
# print(f"✓ Endpoint deployed: {predictor.endpoint_name}")

In [None]:
# # Test endpoint
# test_prompt = "Summarize: Person A: Hello. Person B: Hi, how are you? Person A: Good, thanks!"
# 
# result = predictor.predict({
#     "inputs": test_prompt
# })
# 
# print(f"Endpoint response: {result}")

In [None]:
# # Delete endpoint when done (to save costs)
# predictor.delete_endpoint()
# print("✓ Endpoint deleted")

## Summary

✅ Complete workflow for fine-tuning FLAN-T5-Base:
1. ✓ Initialized SageMaker environment
2. ✓ Prepared and uploaded DialogSum dataset
3. ✓ Configured training with LoRA
4. ✓ Launched remote training job
5. ✓ Downloaded and tested model locally
6. ✓ Evaluated on test set with ROUGE metrics

**Total cost**: ~$1-3 USD (using spot instances)
**Training time**: 5-15 minutes

For production deployment, consider using SageMaker endpoints or batch transform jobs.