In [2]:
import sagemaker
from sagemaker.s3 import S3Uploader
from datasets import load_dataset
import os

# --- 1. Choose the Hugging Face dataset and pull it from the Hub ---
# Source: 'yahma/alpaca-cleaned'. Its rows carry 'instruction', 'input', and
# 'output' fields, which the formatting step further down relies on.
DATASET_ID = "yahma/alpaca-cleaned"
print(f"Loading dataset: {DATASET_ID}")

# Only the 'train' split is requested, by name.
dataset = load_dataset(path=DATASET_ID, split="train")

# --- 2. Format the Dataset for the SFTTrainer ---
# The original Alpaca dataset has 'instruction', 'input', and 'output' columns.
# We must combine these into a single 'text' column for the SFTTrainer.

def formatting_function(example):
    """
    Render one Alpaca record as a single prompt string.

    Args:
        example: Mapping with 'instruction' and 'output' keys and an
            optional 'input' key holding extra context.

    Returns:
        A dict with one key, 'text', containing the sections
        '### Instruction:', optionally '### Input:', and '### Response:',
        each separated by a blank line.

    Raises:
        KeyError: If 'instruction' or 'output' is missing from `example`.
    """
    sections = [f"### Instruction:\n{example['instruction']}"]

    # The context section is emitted only for a present, truthy 'input';
    # a missing key, None, or an empty string all skip it.
    context = example.get("input")
    if context:
        sections.append(f"### Input:\n{context}")

    sections.append(f"### Response:\n{example['output']}")

    return {"text": "\n\n".join(sections)}

# Build the 'text' column for every row, drop the three source columns,
# and keep nothing but 'text' in the resulting dataset.
dataset = (
    dataset
    .map(formatting_function, remove_columns=['instruction', 'input', 'output'])
    .select_columns(['text'])
)

# Show the first formatted record as a quick sanity check.
print("\nFormatted Dataset Example:")
print(dataset[0]['text'])

# --- 3. Save Dataset Locally and Upload to S3 ---
local_data_path = 'alpaca_qlora_data'
# exist_ok=True creates the folder when it is missing and does nothing when it
# already exists, so the cell can be re-run without a separate check-then-create.
os.makedirs(local_data_path, exist_ok=True)

# Write the dataset to the local folder using the datasets on-disk format
dataset.save_to_disk(local_data_path)
print(f"\nDataset saved locally to: {local_data_path}")

# Setup S3 paths: everything goes under the session's default bucket
sess = sagemaker.Session()
bucket = sess.default_bucket()
s3_prefix = 'qlora-alpaca-validation'
s3_data_uri = f"s3://{bucket}/{s3_prefix}/data/train"

# Upload the local dataset directory to S3
S3Uploader.upload(
    local_path=local_data_path,
    desired_s3_uri=s3_data_uri,
    sagemaker_session=sess
)

print(f"\nTraining data successfully uploaded to S3: {s3_data_uri}")

# --- Set the Input for the Training Job ---
# Channel name -> S3 URI. The training cell below passes this dict to
# estimator.fit(), so this cell must run first.
inputs = {
    'training': s3_data_uri
}

Loading dataset: yahma/alpaca-cleaned


Map:   0%|          | 0/51760 [00:00<?, ? examples/s]


Formatted Dataset Example:
### Instruction:
Give three tips for staying healthy.

### Response:
1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.

2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.

3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.


Saving the dataset (0/1 shards):   0%|          | 0/51760 [00:00<?, ? examples/s]


Dataset saved locally to: alpaca_qlora_data

Training data successfully uploaded to S3: s3://sagemaker-us-east-1-981304421142/qlora-alpaca-validation/data/train


In [3]:
import sagemaker
from sagemaker.huggingface import HuggingFace

# --- 1. Define Sagemaker Session and Execution Role ---
# Get the default SageMaker Session and the IAM role used by the notebook
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

# --- 2. Configuration for the Training Job ---
# The three version strings below select one prebuilt Hugging Face training
# container, so they must form a combination the SageMaker SDK knows about;
# an unsupported mix is rejected when fit() resolves the image.
# Keep them compatible with scripts/requirements.txt as well.
instance_type = 'ml.g5.2xlarge'  # The powerful GPU instance for training
instance_count = 1
# NOTE(review): the transformers 4.46.1 training container is published with
# PyTorch 2.3.0 / Python 3.11 (the previous '2.1' / 'py310' pairing belongs to
# transformers 4.36.0) -- confirm against the AWS Deep Learning Containers list.
transformers_version = '4.46.1'
pytorch_version = '2.3.0'
py_version = 'py311'

# Hyperparameters for QLoRA (These are passed to your train.py script)
# NOTE(review): SageMaker hands these to the entry point as command-line
# arguments, so train.py presumably has to parse 'packing' from a string -- verify.
hyperparameters = {
    'model_id': 'mistralai/Mistral-7B-v0.1', # The base model you are fine-tuning
    'output_dir': '/opt/ml/model',         # Where the final LoRA weights will be saved
    'epochs': 3,                           # Set low for initial validation run
    'per_device_train_batch_size': 4,
    'gradient_accumulation_steps': 2,      # Use a small batch size, accumulate gradients
    'learning_rate': 2e-4,                 # Standard QLoRA learning rate
    'max_seq_length': 1024,
    'packing': True,                       # Efficient memory usage
    'lora_r': 64,                          # LoRA rank
    'lora_alpha': 16,                      # LoRA scaling factor
}

# --- 3. Create the HuggingFace Estimator ---
huggingface_estimator = HuggingFace(
    entry_point='train.py',       # Your training script
    source_dir='scripts',         # Folder containing train.py and requirements.txt
    instance_type=instance_type,
    instance_count=instance_count,
    role=role,
    transformers_version=transformers_version,
    pytorch_version=pytorch_version,
    py_version=py_version,
    hyperparameters=hyperparameters,
    max_run=36000, # Max run time in seconds (10 hours)
    sagemaker_session=sess,  # Reuse the session created above instead of an implicit one
    # Important: Setting 'use_spot_instances=True' can save a lot of money (50-70%)
    # But spot instances can be interrupted. We'll leave it as False for this validation run.
    # use_spot_instances=False,
)

# --- 4. Launch the Training Job ---
print(f"Launching QLoRA training job on {instance_type}...")

# 'inputs' (channel name -> S3 URI) is defined by the data-preparation cell,
# which must have been run in this kernel before this one.
huggingface_estimator.fit(
    inputs=inputs,
    wait=False  # Set wait=False so the notebook continues and you can stop the CPU kernel
)

print("\nTraining Job launched! Monitor progress in the SageMaker Console.")
print(f"Job Name: {huggingface_estimator.latest_training_job.job_name}")

Launching QLoRA training job on ml.g5.2xlarge...
