# Fine-tune Gemma on Sagemaker with QLoRA

- Notebook running with a different EC2 instance type
- Use Hugging Face libraries
  - SQL generator dataset from Hugging Face
  - Transformers with the Gemma model
  - SFT Trainer from the TRL library by Hugging Face
  - QLoRA-based training from PEFT
  - Deploy using saved artifacts on S3
  - Evaluate the model

In [33]:
! pip install -r ./requirements_local.txt

# Add the access token in a .env file for the model gemma-2b-it: obtain permission from Hugging Face, create a token in the UI, and copy it into the .env file as HF_TOKEN=[your token]

Collecting sagemaker==2.227.0 (from -r ./requirements_local.txt (line 2))
  Using cached sagemaker-2.227.0-py3-none-any.whl.metadata (15 kB)
Collecting boto3==1.34.152 (from -r ./requirements_local.txt (line 3))
  Using cached boto3-1.34.152-py3-none-any.whl.metadata (6.6 kB)
Collecting datasets==2.18.0 (from -r ./requirements_local.txt (line 4))
  Using cached datasets-2.18.0-py3-none-any.whl.metadata (20 kB)
Collecting botocore<1.35.0,>=1.34.152 (from boto3==1.34.152->-r ./requirements_local.txt (line 3))
  Using cached botocore-1.34.162-py3-none-any.whl.metadata (5.7 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3==1.34.152->-r ./requirements_local.txt (line 3))
  Using cached s3transfer-0.10.4-py3-none-any.whl.metadata (1.7 kB)
Collecting fsspec<=2024.2.0,>=2023.1.0 (from fsspec[http]<=2024.2.0,>=2023.1.0->datasets==2.18.0->-r ./requirements_local.txt (line 4))
  Using cached fsspec-2024.2.0-py3-none-any.whl.metadata (6.8 kB)
Using cached sagemaker-2.227.0-py3-none-any.whl (1

# ADD HF_TOKEN

# Setup SageMaker

In [46]:
# Load environment variables, SageMaker SDK, and AWS SDK
import dotenv
import sagemaker
import boto3
from botocore.exceptions import ClientError

# Load environment variables from the .env file
# Useful for keeping AWS credentials, role names, or other secrets outside the code
# (the HF_TOKEN handed to the training job later is read from os.environ).
# load_dotenv's return value is printed as a quick sanity check; the recorded run printed True.
print(dotenv.load_dotenv('./.env'))

# ---------- Configuration Section ----------

# Unique user identifier to namespace resources
# (prefixed onto the bucket, training-job and model names below)
USER_ID = 'u5'

# Define the S3 bucket name for storing training/testing data and model artifacts
s3_bucket = f"{USER_ID}-av-llmops-sagemaker"

# job_name is used as the base name of the SageMaker training job;
# deploy_model_name is the name given to the model created for the endpoint
job_name = f"{USER_ID}-qlora-gemma-2b-sql-generator"
deploy_model_name = f"{USER_ID}-sql-generator-model"

# Uncomment and set your IAM role name if needed (for local or Studio execution)
# NOTE: the IAM role fallback further down reads `role_name` whenever
# sagemaker.get_execution_role() raises ValueError, so it must be defined in that case.
# role_name = "llmops_workshop_sagemaker_exec_role"

# Define the instance types for training and deployment
train_instance = 'ml.g5.2xlarge'
deploy_instance = 'ml.g5.2xlarge'

# Base model from Hugging Face to fine-tune
model_id = "google/gemma-2b-it"

# For each split: (local file path, object key inside s3_bucket).
# The local file is written first and then uploaded to the bucket under the key.
train_local, train_path = './tmp/train.jsonl', 'dataset/train.jsonl'
test_local, test_path = './tmp/test.jsonl', 'dataset/test.jsonl'

# ---------- S3 Bucket Setup ----------

def create_bucket(bucket_name, region="ap-south-1"):
    """
    Create an S3 bucket in the given region, reporting (not raising) API errors.

    Args:
        bucket_name: Name of the bucket to create.
        region: AWS region used both for the S3 client and as the bucket's
            location constraint.

    Returns:
        None. The outcome is printed: a success message, or the error code
        carried by the ClientError (for example when the bucket already exists).
    """
    s3_client = boto3.client('s3', region_name=region)

    # us-east-1 is S3's default location and an explicit LocationConstraint of
    # "us-east-1" is rejected, so the configuration is only sent elsewhere.
    create_kwargs = {'Bucket': bucket_name}
    if region != "us-east-1":
        create_kwargs['CreateBucketConfiguration'] = {'LocationConstraint': region}

    try:
        s3_client.create_bucket(**create_kwargs)
    except ClientError as e:
        # Deliberately best-effort: an existing bucket is an expected outcome
        # when the notebook is re-run, so the error code is only reported.
        print(f"Bucket {bucket_name} got response {e.response['Error']['Code']}")
    else:
        print(f"Bucket {bucket_name} created successfully.")


create_bucket(s3_bucket)

# ---------- IAM Role Setup ----------

# Attempt to get the SageMaker execution role automatically
try:
    role = sagemaker.get_execution_role()
except ValueError as err:
    # Fallback to manually fetching the role via boto3. This path needs a
    # `role_name` variable, which the configuration section only provides as a
    # commented-out line; fail with an actionable message instead of a bare
    # NameError when it has not been set.
    if 'role_name' not in globals():
        raise RuntimeError(
            "Could not determine the SageMaker execution role automatically; "
            "define role_name (the IAM role name) in the configuration section "
            "and re-run this cell."
        ) from err
    iam = boto3.client('iam')
    role = iam.get_role(RoleName=role_name)['Role']['Arn']

# ---------- SageMaker Session Setup ----------

# Initialize the SageMaker session with the default bucket
sess = sagemaker.Session(default_bucket=s3_bucket)

# ---------- Output Configuration Summary ----------

# Print details for verification
print(f"{role =}")
print(f"{s3_bucket =}")
print(f"{sess.boto_region_name =}")

True
Bucket u5-av-llmops-sagemaker-workshop got response BucketAlreadyExists
role ='arn:aws:iam::536769277173:role/BI-DEV'
s3_bucket ='u5-av-llmops-sagemaker-workshop'
sess.boto_region_name ='us-west-2'


# Dataset Overview

In [47]:
from datasets import load_dataset

ds = load_dataset("gretelai/synthetic_text_to_sql")
print(ds["train"][0])

{'id': 5097, 'domain': 'forestry', 'domain_description': 'Comprehensive data on sustainable forest management, timber production, wildlife habitat, and carbon sequestration in forestry.', 'sql_complexity': 'single join', 'sql_complexity_description': 'only one join (specify inner, outer, cross)', 'sql_task_type': 'analytics and reporting', 'sql_task_type_description': 'generating reports, dashboards, and analytical insights', 'sql_prompt': 'What is the total volume of timber sold by each salesperson, sorted by salesperson?', 'sql_context': "CREATE TABLE salesperson (salesperson_id INT, name TEXT, region TEXT); INSERT INTO salesperson (salesperson_id, name, region) VALUES (1, 'John Doe', 'North'), (2, 'Jane Smith', 'South'); CREATE TABLE timber_sales (sales_id INT, salesperson_id INT, volume REAL, sale_date DATE); INSERT INTO timber_sales (sales_id, salesperson_id, volume, sale_date) VALUES (1, 1, 120, '2021-01-01'), (2, 1, 150, '2021-02-01'), (3, 2, 180, '2021-01-01');", 'sql': 'SELECT

In [36]:
print(ds.shape)
ds

{'train': (100000, 11), 'test': (5851, 11)}


DatasetDict({
    train: Dataset({
        features: ['id', 'domain', 'domain_description', 'sql_complexity', 'sql_complexity_description', 'sql_task_type', 'sql_task_type_description', 'sql_prompt', 'sql_context', 'sql', 'sql_explanation'],
        num_rows: 100000
    })
    test: Dataset({
        features: ['id', 'domain', 'domain_description', 'sql_complexity', 'sql_complexity_description', 'sql_task_type', 'sql_task_type_description', 'sql_prompt', 'sql_context', 'sql', 'sql_explanation'],
        num_rows: 5851
    })
})

In [37]:
ds["train"].features # Data Set Features#

{'id': Value(dtype='int32', id=None),
 'domain': Value(dtype='string', id=None),
 'domain_description': Value(dtype='string', id=None),
 'sql_complexity': Value(dtype='string', id=None),
 'sql_complexity_description': Value(dtype='string', id=None),
 'sql_task_type': Value(dtype='string', id=None),
 'sql_task_type_description': Value(dtype='string', id=None),
 'sql_prompt': Value(dtype='string', id=None),
 'sql_context': Value(dtype='string', id=None),
 'sql': Value(dtype='string', id=None),
 'sql_explanation': Value(dtype='string', id=None)}

In [22]:
for label, val in ds["train"][0].items():
    print(f"{label}: {val}\n\n")

id: 5097


domain: forestry


domain_description: Comprehensive data on sustainable forest management, timber production, wildlife habitat, and carbon sequestration in forestry.


sql_complexity: single join


sql_complexity_description: only one join (specify inner, outer, cross)


sql_task_type: analytics and reporting


sql_task_type_description: generating reports, dashboards, and analytical insights


sql_prompt: What is the total volume of timber sold by each salesperson, sorted by salesperson?


sql_context: CREATE TABLE salesperson (salesperson_id INT, name TEXT, region TEXT); INSERT INTO salesperson (salesperson_id, name, region) VALUES (1, 'John Doe', 'North'), (2, 'Jane Smith', 'South'); CREATE TABLE timber_sales (sales_id INT, salesperson_id INT, volume REAL, sale_date DATE); INSERT INTO timber_sales (sales_id, salesperson_id, volume, sale_date) VALUES (1, 1, 120, '2021-01-01'), (2, 1, 150, '2021-02-01'), (3, 2, 180, '2021-01-01');


sql: SELECT salesperson_id, name, SUM(vo

# Prepare Dataset

In [48]:
USER_PROMPT_TEMPLATE = """ 
You are a database management system expert, proficient in Structured Query Language (SQL). 
Your job is to write an SQL query that answers the following question, based on the given database schema and any additional information provided. 
Use SQLite syntax and please output only SQL without any kind of explanations. 
### Schema: {sql_context} 
 
### Knowledge: This "{sql_task_type}" type task is commonly used for {sql_task_type_description} in the domain of {domain}, which involves {domain_description}. 
 
### Question: {sql_prompt} 
"""

In [49]:
def get_messages(item_dict):
    """
    Turn one dataset record into a two-turn chat example.

    The user turn is USER_PROMPT_TEMPLATE filled in from the record's fields;
    the assistant turn is the record's reference SQL (``item_dict["sql"]``).
    Returns a dict with a single ``"messages"`` key holding both turns in order.
    """
    user_turn = {"role": "user", "content": USER_PROMPT_TEMPLATE.format(**item_dict)}
    assistant_turn = {"role": "assistant", "content": item_dict["sql"]}
    return {"messages": [user_turn, assistant_turn]}

In [50]:
from transformers import AutoTokenizer
from datasets import load_dataset  # Only needed if you're loading from HF Hub

# Load the tokenizer of the base model (model_id comes from the configuration cell).
# It is not needed by the map below; a later cell uses it to render the chat template.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Convert every record into the {"messages": [...]} chat format.
# ds holds both the train and test splits and map is applied to the whole object;
# the train split's column names are used as the list of original columns to drop,
# so only "messages" is left (see the sample printed below).
formated_ds = ds.map(
    get_messages,
    remove_columns=ds["train"].column_names,  # Or use list(ds["train"].features)
    batched=False  # get_messages is written for a single record (dict) per call
)

# Inspect a sample record
print(formated_ds['train'][1])

{'messages': [{'content': ' \nYou are a database management system expert, proficient in Structured Query Language (SQL). \nYour job is to write an SQL query that answers the following question, based on the given database schema and any additional information provided. \nUse SQLite syntax and please output only SQL without any kind of explanations. \n### Schema: CREATE TABLE equipment_maintenance (equipment_type VARCHAR(255), maintenance_frequency INT); \n \n### Knowledge: This "analytics and reporting" type task is commonly used for generating reports, dashboards, and analytical insights in the domain of defense industry, which involves Defense contract data, military equipment maintenance, threat intelligence metrics, and veteran employment stats.. \n \n### Question: List all the unique equipment types and their corresponding total maintenance frequency from the equipment_maintenance table. \n',
   'role': 'user'},
  {'content': 'SELECT equipment_type, SUM(maintenance_frequency) AS 

In [51]:
tokenizer.apply_chat_template(formated_ds["train"][5]['messages'], tokenize=False)

'<bos><start_of_turn>user\nYou are a database management system expert, proficient in Structured Query Language (SQL). \nYour job is to write an SQL query that answers the following question, based on the given database schema and any additional information provided. \nUse SQLite syntax and please output only SQL without any kind of explanations. \n### Schema: CREATE SCHEMA if not exists defense; CREATE TABLE if not exists eu_humanitarian_assistance (id INT PRIMARY KEY, year INT, spending INT); INSERT INTO defense.eu_humanitarian_assistance (id, year, spending) VALUES (1, 2019, 1500), (2, 2020, 1800), (3, 2021, 2100); \n \n### Knowledge: This "analytics and reporting" type task is commonly used for generating reports, dashboards, and analytical insights in the domain of defense operations, which involves Defense data on military innovation, peacekeeping operations, defense diplomacy, and humanitarian assistance.. \n \n### Question: What is the total spending on humanitarian assistance 

In [52]:
ds['train'][1]

{'id': 5098,
 'domain': 'defense industry',
 'domain_description': 'Defense contract data, military equipment maintenance, threat intelligence metrics, and veteran employment stats.',
 'sql_complexity': 'aggregation',
 'sql_complexity_description': 'aggregation functions (COUNT, SUM, AVG, MIN, MAX, etc.), and HAVING clause',
 'sql_task_type': 'analytics and reporting',
 'sql_task_type_description': 'generating reports, dashboards, and analytical insights',
 'sql_prompt': 'List all the unique equipment types and their corresponding total maintenance frequency from the equipment_maintenance table.',
 'sql_context': 'CREATE TABLE equipment_maintenance (equipment_type VARCHAR(255), maintenance_frequency INT);',
 'sql': 'SELECT equipment_type, SUM(maintenance_frequency) AS total_maintenance_frequency FROM equipment_maintenance GROUP BY equipment_type;',
 'sql_explanation': 'This query groups the equipment_maintenance table by equipment_type and calculates the sum of maintenance_frequency fo

In [56]:
import boto3

s3 = boto3.client('s3')
buckets = [bucket['Name'] for bucket in s3.list_buckets()['Buckets']]
print("Available Buckets:", buckets)

Available Buckets: ['536769277173-sagemaker-us-west-2', 'allens-test', 'auth0-dev-events', 'auth0-email-events-dev', 'aws-glue-cdm-output', 'aws-glue-cdm-test-push', 'aws-glue-scripts-536769277173-us-west-2', 'aws-glue-temporary-536769277173-us-west-2', 'capricorn-sagemaker', 'cb-knowledgebase-test', 'cdm-d-partner-delivery-001', 'cdm-data-engg-tf-infra-backend-dev', 'cdm-data-science', 'cdm-dev-auth0-activity', 'cdm-dev-auth0-ses-activity', 'cdm-dev-chatbot-feedback', 'cdm-ds-batch-scoring', 'cdm-patch-manager-logs-536769277173', 'cdm-revjet-post-processing', 'cdm-simple-test', 'cdmtestsftp', 'cf-templates-1w94ssdr08wz4-us-west-2', 'chatbot-intent-recognition-test', 'ds-changepoint-detection-dev', 'esteban-dev', 'folders-sagemaker', 'foo-test-push', 'ga-temp-dev', 'genaiic-mlflow-artifacts-536769277173-us-west-2', 'gsc-topic-modeling', 'institutional-knowledge-agent-kb-test', 'kgoud', 'lhp-rtp-initial-match', 'manual-cdm-dev-chatbot-feedback', 'manual-cdm-dev-chatbot-interactions', 'm

# Upload Dataset to S3

In [54]:
import boto3
import os

# Create the local output directory up front instead of relying on the writer
# to do it (the evaluation cell creates ./tmp the same way before downloading).
for local_file in (train_local, test_local):
    os.makedirs(os.path.dirname(local_file) or '.', exist_ok=True)

# pandas orient='records' for jsonl. List like [{column -> value}, ... , {column -> value}]
# Only a random subset is exported: 1000 training and 100 test examples.
formated_ds["train"].shuffle().select(range(1000)).to_json(train_local, orient="records")
formated_ds["test"].shuffle().select(range(100)).to_json(test_local, orient="records")



Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

121946

In [60]:
import boto3
print(boto3.session.Session().region_name)
s3=boto3.client('s3')
s3.upload_file(train_local, s3_bucket, train_path)
s3.upload_file(test_local, s3_bucket, test_path)

us-west-2


# Training

In [61]:
# Hyperparameters handed to the training entry point (qlora.py) through the estimator.
hyperparameters = {
  # Dataset path for the trainer script to access input data
  # (the "training" path segment matches the channel name passed to fit())
  'dataset_path': '/opt/ml/input/data/training/train.jsonl',

  # Base model (e.g., "google/gemma-2b-it")
  'model_id': model_id,

  # Tokenization and model input control
  'max_seq_len': 3072,  # Good for models like Gemma (optimize based on GPU RAM)

  # Enable QLoRA-style adapter-based fine-tuning
  'use_qlora': True,

  # Training loop parameters
  'num_train_epochs': 1,
  'per_device_train_batch_size': 1,
  'gradient_accumulation_steps': 4,  # 1x4 => effective batch size = 4
  'gradient_checkpointing': True,  # Saves memory for large models

  # Optimizer
  'optim': "adamw_torch_fused",  # NVIDIA fused optim (needs PyTorch 2.x)

  # Logging
  'logging_steps': 5,

  # Checkpointing
  'save_strategy': "epoch",

  # Learning rate and scheduler
  # NOTE(review): with a "constant" schedule the warmup_ratio below presumably has
  # no effect; "constant_with_warmup" would be needed for warmup — confirm in qlora.py.
  'learning_rate': 2e-4,
  'lr_scheduler_type': "constant",
  'warmup_ratio': 0.03,

  # Numerical precision settings
  # NOTE(review): training runs on train_instance (ml.g5.2xlarge); confirm its GPU
  # supports both BF16 and TF32 before changing the instance type.
  'bf16': True,   # Use BF16 if your GPU supports it (like A100, H100)
  'tf32': True,   # Use TF32 for matrix multiplies on Ampere GPUs (NVIDIA)

  # Gradient control
  'max_grad_norm': 0.3,

  # Output logging and artifacts
  'report_to': "tensorboard",        # Optional: "wandb" or None
  'output_dir': '/tmp/tun',

  # Post-training: merge LoRA adapters into base model weights
  'merge_adapters': True
}

In [62]:
from sagemaker.huggingface import HuggingFace

# Estimator describing the SageMaker training job that runs qlora.py.
# The container is selected from the transformers/pytorch/python versions below;
# the commented-out image_uri is an alternative that pins an explicit image
# (note that the example URI points at the ap-south-1 registry).
# image_uri = "763104351884.dkr.ecr.ap-south-1.amazonaws.com/huggingface-pytorch-training:2.1.0-transformers4.36.0-gpu-py310-cu121-ubuntu20.04"
huggingface_estimator = HuggingFace(
     base_job_name=job_name,
    # image_uri=image_uri,
    # if not image_uri
    transformers_version = '4.36.0',
    pytorch_version      = '2.1.0',
    
    instance_type=train_instance,
    instance_count=1,
    max_run=int(3600 * 0.5),  # 1800, i.e. a 30-minute cap if the unit is seconds
    role=role,
    environment={
        "HUGGINGFACE_HUB_CACHE": "/tmp/.cache",
        # Raises KeyError here if HF_TOKEN was not loaded into the environment (.env)
        "HF_TOKEN": os.environ["HF_TOKEN"]
    },
    py_version='py310',
    entry_point='qlora.py',
    source_dir=".", # Copy source to S3 and auto installs the requirements.txt file
    hyperparameters=hyperparameters,
    disable_output_compression = True, # not compress output to save training time and cost
    # Regexes that pick metric values out of log text shaped like "'loss': 1.23,";
    # presumably matched against the trainer's printed log dicts — confirm against job logs.
    metric_definitions=[
      {'Name': 'loss', 'Regex': "'loss': (.*?),"},
      {'Name': 'grad_norm', 'Regex': "'grad_norm': (.*?),"},
      {'Name': 'learning_rate', 'Regex': "'learning_rate': (.*?),"},
      {'Name': 'epoch', 'Regex': "'epoch': (.*?)}"}
    ]
)

In [63]:
data = {'training': f's3://{s3_bucket}/dataset'}
huggingface_estimator.fit(data, wait=True) # Training the model

2025-04-21 20:27:43 Starting - Starting the training job
...........20:27:43 Pending - Training job waiting for capacity.
..25-04-21 20:29:44 Pending - Preparing the instances for training.
..25-04-21 20:30:15 Downloading - Downloading input data.
.................30 Downloading - Downloading the training image.
.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m..
[34mbash: no job control in this shell[0m
  "cipher": algorithms.TripleDES,[0m
  "class": algorithms.TripleDES,[0m
[34m2025-04-21 20:34:00,845 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2025-04-21 20:34:00,862 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2025-04-21 20:34:00,871 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2025-04-21 20:34:00,873 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m


# Create Model and Endpoint

In [72]:
from sagemaker.huggingface import get_huggingface_llm_image_uri
import sagemaker
import boto3

#S3 path to the model artifacts from training

model_s3_path = huggingface_estimator.model_data["S3DataSource"]["S3Uri"]

# Container URI for Hugging Face Text Generation Inference (TGI)
# Make sure region matches where you'll deploy the model
 
llm_image = "763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.3.0-tgi2.2.0-gpu-py310-cu121-ubuntu22.04-v2.0"

In [73]:
import json
from sagemaker.huggingface import HuggingFaceModel
 
# Environment variables for TGI container
# Values are passed as strings; json.dumps(1024) produces the string "1024".
config = {
  'HF_MODEL_ID': "/opt/ml/model",  # load the model from the container's local model directory
  'SM_NUM_GPUS': '1',
  'MAX_INPUT_LENGTH': json.dumps(1024),
  'MAX_TOTAL_TOKENS': json.dumps(2048) # req prompt tokens + req generated tokens in the GPU for this req
}
# model_data is passed as an S3DataSource dict pointing at an uncompressed S3 prefix
# (CompressionType 'None'): training ran with disable_output_compression=True, so the
# artifacts under model_s3_path are presumably plain files rather than a single archive.
llm_model = HuggingFaceModel(
  name=deploy_model_name,
  role=role,
  image_uri=llm_image,
  model_data={'S3DataSource':{'S3Uri': model_s3_path,'S3DataType': 'S3Prefix','CompressionType': 'None'}},
  env=config
)

In [74]:
llm = llm_model.deploy(
  initial_instance_count=1,
  instance_type=deploy_instance,
  container_startup_health_check_timeout=300,
)

-----------!

# Evaluation

In [75]:
import os
from transformers import AutoTokenizer
from sagemaker import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer
from datasets import load_dataset
# ---------- Configuration ----------
os.makedirs('./tmp', exist_ok=True)
# Ensure model_id, s3_bucket, and test_path are defined earlier in your script
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Download test dataset from S3
boto3.client('s3').download_file(s3_bucket, test_path, test_local)
# Load dataset using Hugging Face Datasets
test_dataset = load_dataset("json", data_files=test_local, split="train")

#  Make sure `llm.endpoint_name` or the actual endpoint name is defined
deployed_llm = Predictor(
    endpoint_name=llm.endpoint_name,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer()
)
# ---------- Inference Function ----------
def request(sample):
    """
    Send a chat sample to the deployed endpoint and return the reply as a message.

    ``sample`` is a list of chat messages; it is rendered with the tokenizer's
    chat template (generation prompt appended), printed, and posted to the
    endpoint. The first generated text is stripped and wrapped as an
    ``{"role": "assistant", "content": ...}`` dict.
    """
    prompt = tokenizer.apply_chat_template(
        sample,
        tokenize=False,
        add_generation_prompt=True,
    )
    print(f"prompt:\n {prompt} \n\n")

    # Generation settings sent alongside the prompt.
    generation_params = {
        "max_new_tokens": 512,
        "temperature": 0.01,
        "return_full_text": False,
    }
    response = deployed_llm.predict({"inputs": prompt, "parameters": generation_params})

    generated_text = response[0]["generated_text"].strip()
    return {"role": "assistant", "content": generated_text}

random_sample = test_dataset[10]
print(f"input message:\n {random_sample['messages'][0]} \n\n")
print(f"expected output:\n {random_sample['messages'][1]} \n\n")
print(f"generated output:\n {request([random_sample['messages'][0]])} \n\n")

Generating train split: 0 examples [00:00, ? examples/s]

input message:
 {'content': ' \nYou are a database management system expert, proficient in Structured Query Language (SQL). \nYour job is to write an SQL query that answers the following question, based on the given database schema and any additional information provided. \nUse SQLite syntax and please output only SQL without any kind of explanations. \n### Schema: CREATE TABLE costs (county_id INT, year INT, cost INT); \n \n### Knowledge: This "analytics and reporting" type task is commonly used for generating reports, dashboards, and analytical insights in the domain of rural health, which involves Detailed records on healthcare access, disease prevalence, and resource allocation in rural health.. \n \n### Question: Identify counties in New Mexico with increasing healthcare costs over the past 4 years. \n', 'role': 'user'} 


expected output:
 {'content': "SELECT county_id, COUNT(*) AS years FROM costs WHERE costs[ROW_NUMBER() OVER (PARTITION BY county_id ORDER BY year) - 1] < cost G

In [76]:

# Tear down the model and the endpoint created by the deploy cell.
# NOTE(review): the model is deleted before the endpoint — presumably because the
# predictor resolves the model through the endpoint; confirm before reordering.
deployed_llm.delete_model()
deployed_llm.delete_endpoint()