# Fine-tuning Setup for Llama Model

## Environment Setup
- Load environment variables
- HuggingFace authentication setup
- Load stored dataset paths
- Initialize SageMaker session
- Set up AWS role and bucket configuration
- Configure region settings

In [1]:
import os
from dotenv import load_dotenv

In [None]:
load_dotenv('../end.local')
HF_TOKEN = os.getenv('HF_TOKEN')
if not HF_TOKEN:
    raise ValueError("HF_TOKEN not found in environment variables")

!huggingface-cli login --token {HF_TOKEN}

In [3]:
# Restore variables previously persisted with IPython's %store
# (presumably by an earlier notebook — confirm). json_paths is iterated
# below as split name -> local file path when uploading to S3.
%store -r data_folder
%store -r json_paths

In [None]:
# Display the restored values so a missing or stale %store entry is noticed
# before any upload starts.
print(data_folder)
print(json_paths)

In [None]:
import sagemaker
import boto3

# Session shared by the notebook; its default bucket is the upload target
# for the datasets and the training config.
sess = sagemaker.Session()
sagemaker_session_bucket = sess.default_bucket()

# Resolve the IAM role ARN used for the training job. If
# get_execution_role() raises ValueError, fetch the role by name through
# the IAM API instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam_client = boto3.client('iam')
    role_description = iam_client.get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']

# Summarise the resolved settings, one item per line.
print(
    f"sagemaker role arn: {role}",
    f"sagemaker bucket: {sagemaker_session_bucket}",
    f"sagemaker session region: {sess.boto_region_name}",
    sep="\n",
)

## Data Preparation
- Create S3 paths for datasets
- Upload training, validation and test datasets to S3
- Create and upload the training configuration file (YAML) and write the training script's `requirements.txt`

In [None]:
# Import S3Uploader in the cell that uses it, so the upload below does not
# depend on an import made in some other cell.
from sagemaker.s3 import S3Uploader

# Create S3 paths
DATASET_NAME = "dolly_dataset"
s3_base_path = f's3://{sess.default_bucket()}/datasets/{DATASET_NAME}'

# Map each split name to the S3 URI its file will have once uploaded:
#   <s3_base_path>/<split_name>/<local file name>
s3_paths = {}
for split_name, local_path in json_paths.items():
    filename = os.path.basename(local_path)
    s3_paths[split_name] = f"{s3_base_path}/{split_name}/{filename}"
    print(f"{split_name} dataset S3 path: {s3_paths[split_name]}")


# Upload every split into its own prefix. A failure is reported with the
# split name and then re-raised so the notebook stops instead of training
# on an incomplete dataset.
for split_name, local_path in json_paths.items():
    # Build the prefix directly instead of applying os.path.dirname to an
    # S3 URI; this is the same value without relying on filesystem path rules.
    s3_prefix = f"{s3_base_path}/{split_name}"
    try:
        s3_path = S3Uploader.upload(
            local_path=local_path,
            desired_s3_uri=s3_prefix
        )
    except Exception as e:
        print(f"Error uploading {split_name} dataset: {str(e)}")
        raise
    else:
        print(f"Successfully uploaded {local_path} to {s3_path}")

In [None]:
# List everything currently stored under the dataset prefix in S3.
print("\nVerifying S3 uploads:")
!aws s3 ls {s3_base_path} --recursive --human-readable

# Per-split S3 URIs. These lookups require s3_paths to contain the keys
# 'train', 'validation' and 'test'; a missing key raises KeyError.
train_s3_path = s3_paths['train']
validation_s3_path = s3_paths['validation']
test_s3_path = s3_paths['test']

In [11]:
import os
# Local folder that receives the generated training config file;
# exist_ok=True makes re-running this cell harmless.
config_folder_name = "accelerator_config"
os.makedirs(config_folder_name, exist_ok=True)

In [12]:
# YAML training configuration consumed by the training script through its
# `config` hyperparameter.
#
# learning_rate is written as `2.0e-4` rather than `2e-4`: YAML 1.1 parsers
# such as PyYAML only recognise scientific notation as a float when the
# mantissa contains a decimal point, and otherwise load the value as the
# string "2e-4". NOTE(review): assumes the training script reads this file
# with a YAML 1.1 loader — the value is a valid float either way.
config_content = """
# script parameters
model_id: "meta-llama/Llama-3.2-3B"
max_seq_length: 2048
train_dataset_path: "/opt/ml/input/data/train/"
validation_dataset_path: "/opt/ml/input/data/validation/"
output_dir: "/tmp/llama3"

# training parameters
report_to: "tensorboard"
learning_rate: 2.0e-4
lr_scheduler_type: "constant"

num_train_epochs: 3
per_device_train_batch_size: 4
per_device_eval_batch_size: 4
gradient_accumulation_steps: 8

optim: adamw_torch
logging_steps: 10
save_strategy: epoch
eval_strategy: epoch
max_grad_norm: 0.3
warmup_ratio: 0.03
bf16: true
tf32: true
gradient_checkpointing: true
"""

# Write the config next to the notebook; the explicit encoding keeps the
# file contents independent of the platform's default locale.
config_file_path = os.path.join(config_folder_name, "sm_llama_3_2_3b_qlora.yaml")
with open(config_file_path, 'w', encoding='utf-8') as f:
    f.write(config_content)

In [None]:
%%writefile ../scripts/requirements.txt

datasets==3.0.0
trl==0.11.1
bitsandbytes==0.44.0
peft==0.12.0
accelerate==0.34.2
sagemaker==2.232.1
transformers==4.44.2

In [None]:
# Import S3Uploader in the cell that uses it, so the upload below does not
# depend on an import made in some other cell.
from sagemaker.s3 import S3Uploader

# Local config file written earlier and the S3 prefix it is uploaded under.
CONFIG_FILE_PATH = "accelerator_config/sm_llama_3_2_3b_qlora.yaml"
config_s3_prefix = f"{s3_base_path}/config"

# Upload config file to S3. A failure is reported and re-raised, because
# train_config_s3_path is needed later as the `config` input.
try:
    train_config_s3_path = S3Uploader.upload(
        local_path=CONFIG_FILE_PATH,
        desired_s3_uri=config_s3_prefix
    )
except Exception as e:
    print(f"Error uploading configuration: {str(e)}")
    raise
else:
    print(f"\nConfiguration file uploaded successfully to {train_config_s3_path}")

## Training Configuration 
- Set up training instance configuration (ml.g5.4xlarge)
- Configure metrics for monitoring
- Set hyperparameters and environment variables

In [20]:
# Inputs handed to the estimator's fit() call. The key names line up with
# the /opt/ml/input/data/<name> paths referenced by the YAML config and by
# the `config` hyperparameter.
s3_data = dict(
    train=train_s3_path,
    validation=validation_s3_path,
    config=train_config_s3_path,
)

# Compute resources requested for the training job.
INSTANCE_CONFIG = dict(type='ml.g5.4xlarge', count=1)

# Estimator settings derived from the instance choice, plus the regexes
# that define the train/validation loss metrics.
training_config = dict(
    instance_type=INSTANCE_CONFIG['type'],
    instance_count=INSTANCE_CONFIG['count'],
    metric_definitions=[
        dict(Name="train:loss", Regex="'train_loss':(.*?),"),
        dict(Name="validation:loss", Regex="'eval_loss':(.*?),"),
    ],
)

In [21]:
from sagemaker.huggingface import HuggingFace
import time

# Static settings for the training job: entry script and its source folder,
# runtime limit, volume size and the framework versions of the container.
JOB_CONFIG = dict(
    entry_point='train.py',
    source_dir='../scripts',
    max_run_seconds=24 * 60 * 60,  # 1 day
    volume_size=256,
    framework_version=dict(
        transformers='4.36.0',
        pytorch='2.1.0',
        python='py310',
    ),
)

# Timestamped name that is passed to the estimator as base_job_name.
job_name = 'llama3-2-3b-finetune-' + time.strftime("%Y-%m-%d-%H-%M-%S")

# Build the estimator. Arguments are grouped by concern; the values come
# from JOB_CONFIG / training_config defined above.
_framework_versions = JOB_CONFIG['framework_version']
huggingface_estimator = HuggingFace(
    # --- code to run ---
    entry_point=JOB_CONFIG['entry_point'],
    source_dir=JOB_CONFIG['source_dir'],
    hyperparameters={
        "config": "/opt/ml/input/data/config/sm_llama_3_2_3b_qlora.yaml"
    },
    # --- container ---
    transformers_version=_framework_versions['transformers'],
    pytorch_version=_framework_versions['pytorch'],
    py_version=_framework_versions['python'],
    environment={
        "HUGGINGFACE_HUB_CACHE": "/tmp/.cache",
        "HF_TOKEN": HF_TOKEN
    },
    # --- infrastructure ---
    instance_type=training_config['instance_type'],
    instance_count=training_config['instance_count'],
    volume_size=JOB_CONFIG['volume_size'],
    max_run=JOB_CONFIG['max_run_seconds'],
    keep_alive_period_in_seconds=3600,
    distribution={"torch_distributed": {"enabled": False}},
    # --- job identity and session ---
    base_job_name=job_name,
    role=role,
    sagemaker_session=sagemaker.session.Session(),
    # --- outputs and monitoring ---
    metric_definitions=training_config['metric_definitions'],
    disable_output_compression=True,
)

## Training Job Launch
- Initialize HuggingFace estimator
- Configure training job settings
- Launch training experiment
- Monitor training logs

In [None]:
from sagemaker.experiments.run import Run

# Names of the SageMaker experiment and of this particular run; the run
# name carries a timestamp.
experiment_config = dict(
    name='dolly-ft',
    run_name='training-job-experiment-' + time.strftime("%Y-%m-%d-%H-%M-%S"),
)

# Start the training job inside the experiment run context. wait=False
# asks fit() not to block until the job has finished.
experiment_run = Run(
    experiment_name=experiment_config['name'],
    run_name=experiment_config['run_name'],
    sagemaker_session=sagemaker.session.Session(),
)
with experiment_run as run:
    huggingface_estimator.fit(s3_data, wait=False)

In [None]:
# Show the logs of the estimator's training job (the job was started with
# wait=False, so this is where its output is followed).
huggingface_estimator.logs()

## Model Artifacts
- Store model S3 path for later use

In [None]:
# Location of the trained model artifacts as reported by the estimator.
# NOTE(review): the estimator was created with disable_output_compression=True;
# depending on the SDK version, model_data may then be a dict describing an
# S3 prefix rather than a plain S3 URI string — confirm before reuse.
model_s3_path = huggingface_estimator.model_data
print("model_s3_path: \n", model_s3_path)

# Persist the value with IPython's %store so it can be restored elsewhere
# with `%store -r model_s3_path`.
%store model_s3_path