In [1]:
import sagemaker
from sagemaker.s3 import S3Uploader
from datasets import load_dataset
import os

# --- 1. Define the Hugging Face Dataset and Load It ---
# Source data: the 'yahma/alpaca-cleaned' instruction-tuning dataset,
# identified by its Hugging Face Hub ID.
DATASET_ID = "yahma/alpaca-cleaned"
print("Loading dataset: {}".format(DATASET_ID))

# Fetch the data from the Hub, requesting the 'train' split explicitly.
dataset = load_dataset(DATASET_ID, split="train")

# --- 2. Format the Dataset for the SFTTrainer ---
# The original Alpaca dataset has 'instruction', 'input', and 'output' columns.
# We must combine these into a single 'text' column for the SFTTrainer.

def formatting_function(example):
    """
    Render one Alpaca record as a single prompt string under the 'text' key.

    Args:
        example: Mapping with 'instruction' and 'output' entries and an
            optional 'input' entry that carries extra context.

    Returns:
        dict: {"text": prompt}. The prompt is an "### Instruction:" section,
        then an "### Input:" section only when 'input' holds non-blank
        content, then a "### Response:" section; sections are separated by
        one blank line.

    Raises:
        KeyError: If 'instruction' or 'output' is missing from the record.
    """
    instruction = example['instruction']
    output = example['output']
    context = example.get("input")

    sections = [f"### Instruction:\n{instruction}"]

    # A missing, empty, or whitespace-only 'input' carries no context.
    # Emitting an "### Input:" header with nothing under it would only add
    # noise to the prompt, so the section is skipped in all three cases.
    if context and str(context).strip():
        sections.append(f"### Input:\n{context}")

    sections.append(f"### Response:\n{output}")

    return {"text": "\n\n".join(sections)}

# Run every record through formatting_function, dropping the three source
# columns as part of the map, then keep only the resulting 'text' column.
dataset = dataset.map(
    formatting_function,
    remove_columns=["instruction", "input", "output"],
).select_columns(["text"])

# Preview the first formatted prompt as a sanity check.
print("\nFormatted Dataset Example:")
print(dataset[0]["text"])

# --- 3. Save Dataset Locally and Upload to S3 ---
local_data_path = 'alpaca_qlora_data'
# exist_ok=True keeps the cell safe to re-run and avoids the race inherent in
# checking for the directory and then creating it as two separate steps.
os.makedirs(local_data_path, exist_ok=True)

# Write the formatted dataset into the local folder via Dataset.save_to_disk.
dataset.save_to_disk(local_data_path)
print(f"\nDataset saved locally to: {local_data_path}")

# Setup S3 paths: the data goes to <default bucket>/<prefix>/data/train.
sess = sagemaker.Session()
bucket = sess.default_bucket()
s3_prefix = 'qlora-alpaca-validation'
s3_data_uri = f"s3://{bucket}/{s3_prefix}/data/train"

# Upload the local dataset directory to S3 using the same session that
# supplied the bucket name.
S3Uploader.upload(
    local_path=local_data_path,
    desired_s3_uri=s3_data_uri,
    sagemaker_session=sess
)

print(f"\nTraining data successfully uploaded to S3: {s3_data_uri}")

# --- Set the Input for the Training Job ---
# Maps the 'training' channel name to the uploaded data; the training cell
# passes this dict to estimator.fit().
inputs = {
    'training': s3_data_uri
}

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Loading dataset: yahma/alpaca-cleaned


In [None]:
import sagemaker
from sagemaker.huggingface import HuggingFace

# --- 1. Define Sagemaker Session and Execution Role ---
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

# --- 2. Configuration for the Training Job ---
# Compute: instance type and how many instances to launch.
instance_type = 'ml.g5.2xlarge'
instance_count = 1

# Framework versions handed to the HuggingFace estimator. These three values
# have to correspond to a training image supported by the installed SDK.
transformers_version = '4.46.1'
pytorch_version = '2.3.0'
py_version = 'py311'

# Hyperparameters forwarded to the entry-point script. How each value is
# interpreted depends on scripts/train.py, which is not part of this notebook.
hyperparameters = dict(
    model_id='mistralai/Mistral-7B-v0.1',
    output_dir='/opt/ml/model',
    epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    max_seq_length=1024,
    packing=True,
    lora_r=64,
    lora_alpha=16,
)

# --- 3. Create the HuggingFace Estimator ---
# Describes the training job: scripts/train.py is the entry point, run on the
# instance configuration and framework versions chosen above, with
# `hyperparameters` forwarded to it.
huggingface_estimator = HuggingFace(
    entry_point='train.py',
    source_dir='scripts',
    instance_type=instance_type,
    instance_count=instance_count,
    role=role,
    transformers_version=transformers_version,
    pytorch_version=pytorch_version,
    py_version=py_version,
    hyperparameters=hyperparameters,
    max_run=36000,  # upper bound on job runtime in seconds (10 hours)
    # Use the session created at the top of this cell rather than letting the
    # estimator build an implicit one, mirroring how the upload step passes
    # its session explicitly.
    sagemaker_session=sess,
)

# --- 4. Launch the Training Job ---
print("Launching QLoRA training job on {}...".format(instance_type))

# `inputs` is the channel mapping built after the S3 upload.
# wait=False: do not block this cell on the job; monitor it remotely instead.
huggingface_estimator.fit(inputs, wait=False)

print("\nTraining Job launched! Monitor progress in the SageMaker Console.")
print("Job Name: {}".format(huggingface_estimator.latest_training_job.job_name))