In [2]:
!pip install transformers torch bitsandbytes peft trl sagemaker huggingface_hub sentencepiece

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting torch
  Downloading torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl.metadata (28 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting peft
  Downloading peft-0.15.2-py3-none-any.whl.metadata (13 kB)
Collecting trl
  Downloading trl-0.18.1-py3-none-any.whl.metadata (11 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.33.0-py3-none-any.whl.metadata (14 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Co

In [3]:
from datasets import Dataset, DatasetDict, load_dataset
from sklearn.model_selection import train_test_split
import random
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, prepare_model_for_kbit_training
import torch
import sagemaker
import boto3
import os
from huggingface_hub import login


  from pandas.core.computation.check import NUMEXPR_INSTALLED


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [4]:

# Throwaway session, used only to discover the account's default bucket.
sess = sagemaker.Session()

# Bucket for uploaded data, models and logs. Leave it as None to fall back to the
# session's default bucket (SageMaker creates that bucket if it does not exist yet).
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role; on ValueError fall back to an IAM lookup by role name.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Session used by the rest of the notebook, bound to the bucket chosen above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")



sagemaker role arn: arn:aws:iam::329599621791:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole
sagemaker bucket: sagemaker-us-east-1-329599621791
sagemaker session region: us-east-1


Data loading, shuffling and splitting

In [34]:
def load_shuffle_split_dataset(seed=42):
    """Split the raw Q&A dataset 80/10/10 and publish the splits to S3.

    Reads ``context_immigration_data.json`` from the session's default bucket,
    shuffles it, carves out train/validation/test splits, writes each split as
    JSON Lines under ``split_data/`` locally and uploads the three files to
    ``s3://<default bucket>/split_data/``.

    Args:
        seed: Seed used for the initial shuffle and for both ``train_test_split``
            calls. ``train_test_split`` shuffles again by default, so it needs
            its own seed for the split membership to be reproducible.

    Returns:
        DatasetDict with ``train``, ``validation`` and ``test`` splits.
    """
    bucket_name = sess.default_bucket()
    raw_data_path = f's3://{bucket_name}/context_immigration_data.json'
    dataset = load_dataset('json', data_files=raw_data_path, split='train')
    print(len(dataset))

    dataset = dataset.shuffle(seed=seed)

    # 80% train; the remaining 20% is halved into validation and test.
    train_val_split = dataset.train_test_split(test_size=0.2, seed=seed)
    val_test_split = train_val_split['test'].train_test_split(test_size=0.5, seed=seed)
    dataset_dict = DatasetDict({
        'train': train_val_split['train'],
        'validation': val_test_split['train'],
        'test': val_test_split['test']
    })

    split_names = ['train', 'validation', 'test']

    # Show one sample row per split as a quick sanity check.
    for split in split_names:
        print(dataset_dict[split][0])

    os.makedirs("split_data", exist_ok=True)

    s3 = boto3.client('s3')
    s3_prefix = 'split_data'

    for split in split_names:
        # Save locally first, then push the file to S3 under the same relative name.
        local_file = f"split_data/{split}.jsonl"
        dataset_dict[split].to_json(local_file, lines=True)

        s3_key = f"{s3_prefix}/{split}.jsonl"
        s3.upload_file(local_file, bucket_name, s3_key)
        print(f"Uploaded to s3://{bucket_name}/{s3_key}")

    return dataset_dict


# One-off preparation step: uncomment to regenerate and re-upload the splits.
# load_shuffle_split_dataset()

train_data_path = f's3://{sess.default_bucket()}/split_data/train.jsonl'
train_dataset = load_dataset('json', data_files=train_data_path, split='train', streaming=False)
print(len(train_dataset))

validation_data_path = f's3://{sess.default_bucket()}/split_data/validation.jsonl'
validation_dataset = load_dataset('json', data_files=validation_data_path, split='train', streaming=False)
print(len(validation_dataset))

# The held-out split is formatted and exported by later cells, so it has to be
# loaded here as well; otherwise those cells fail with a NameError on a fresh kernel.
test_data_path = f's3://{sess.default_bucket()}/split_data/test.jsonl'
test_dataset = load_dataset('json', data_files=test_data_path, split='train', streaming=False)
print(len(test_dataset))

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


7117


Downloading data:   0%|          | 0.00/1.12M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

890


Tokenizing data, preparing chat template

In [None]:
from huggingface_hub import login

# Access tokens must never be committed to a notebook. The token is read from the
# HF_TOKEN environment variable; when it is unset, login() receives token=None and
# prompts for the token instead.
# NOTE(review): the token that used to be hard-coded in this cell should be revoked.
login(token=os.environ.get("HF_TOKEN"))

def format_chat_template(batch, tokenizer):
    """Render a batch of question/answer rows into chat-formatted training text.

    Written for ``Dataset.map(..., batched=True)``: every column arrives as a list.

    Args:
        batch: Mapping with parallel lists under ``"question"`` and ``"answer"``.
        tokenizer: Object with a writable ``chat_template`` attribute and an
            ``apply_chat_template(messages, tokenize=False)`` method. Its
            ``chat_template`` is overwritten as a side effect.

    Returns:
        dict of lists: ``"instruction"`` (the questions), ``"response"`` (the
        answers) and ``"text"`` (one rendered conversation per row).
    """
    # Exactly three opening quotes: a fourth one would become part of the prompt text.
    system_prompt = """You are a legal assistant specializing in U.S. immigration law. Think through each question and provide an answer. Don't make things up, if you're unable to answer a question advise the user that you're unable to answer as it is outside of your scope."""

    # The template does not depend on the row, so it is installed once per batch
    # rather than once per sample.
    # NOTE(review): these are '<|start_header_id|>'-style header tokens applied to
    # whatever tokenizer is passed in — confirm they are intended for the base model.
    tokenizer.chat_template = (
        "{% set loop_messages = messages %}"
        "{% for message in loop_messages %}"
        "{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}"
        "{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}"
        "{{ content }}"
        "{% endfor %}"
        "{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
    )

    # Access the inputs from the batch
    questions = batch["question"]
    answers = batch["answer"]

    samples = []
    for i, question in enumerate(questions):
        row_json = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": question},
            {"role": "assistant", "content": answers[i]}
        ]
        # Render the conversation to a plain string (no tokenization here).
        samples.append(tokenizer.apply_chat_template(row_json, tokenize=False))

    # Return a dictionary with lists as expected for batched processing
    return {
        "instruction": questions,
        "response": answers,
        "text": samples  # The processed chat template text for each row
    }


base_model = "mistralai/Mistral-7B-Instruct-v0.3"

# No explicit `token` argument: an explicitly passed string (even an empty one) takes
# precedence over the credentials stored by login(), so token="" would attempt the
# download without a valid credential. Omitting it lets the stored login be used.
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    trust_remote_code=True,
    use_fast=True
)

# Shared settings for the three map() calls below.
map_kwargs = dict(num_proc=8, batched=True, batch_size=10)

train_dataset = train_dataset.map(lambda x: format_chat_template(x, tokenizer), **map_kwargs)
print(train_dataset[0])

validation_dataset = validation_dataset.map(lambda x: format_chat_template(x, tokenizer), **map_kwargs)
print(validation_dataset[0])

test_dataset = test_dataset.map(lambda x: format_chat_template(x, tokenizer), **map_kwargs)
print(test_dataset[0])

In [None]:
# Print total number of samples
print(f"Total number of samples: {len(train_dataset)}")

s3 = boto3.client('s3')
s3_prefix = "processed/mistral"


def _export_and_upload(dataset, filename, s3_client=s3, prefix=s3_prefix):
    """Write ``dataset`` as JSON Lines to ``filename`` and upload it to the default bucket.

    Args:
        dataset: Dataset to export via ``to_json(..., lines=True)``.
        filename: Local file name; also used as the object name under ``prefix``.
        s3_client: boto3 S3 client used for the upload.
        prefix: Key prefix inside the session's default bucket.

    Returns:
        The ``s3://`` URI of the uploaded object.
    """
    dataset.to_json(filename, lines=True)
    s3_key = f"{prefix}/{filename}"
    s3_client.upload_file(filename, sess.default_bucket(), s3_key)
    print(f"Uploaded to s3://{sess.default_bucket()}/{s3_key}")
    return f"s3://{sess.default_bucket()}/{s3_key}"


# The returned URIs are the input channels handed to the training job later on.
training_input_path = _export_and_upload(train_dataset, "train_dataset.jsonl")
validation_input_path = _export_and_upload(validation_dataset, "validation_dataset.jsonl")
test_input_path = _export_and_upload(test_dataset, "test_dataset.jsonl")

In [64]:
from huggingface_hub import HfFolder

# Derive the job name directly from base_model. The hyperparameters dict below needs
# job_name for its output_dir, so job_name cannot be read back out of that dict
# (doing so raises NameError on a fresh kernel).
job_name = f'huggingface-qlora-{base_model.replace("/","-").replace(".","-")}'

# hyperparameters, which are passed into the training job
hyperparameters ={
  'model_id': base_model,                             # pre-trained model
  # 'dataset_path': '/opt/ml/input/data/training',    # path where sagemaker will save training dataset
  'num_train_epochs': 2,                            # number of training epochs
  'per_device_train_batch_size': 3,                 # batch size for training
  'gradient_accumulation_steps': 2,                 # number of update steps to accumulate
  'gradient_checkpointing': True,                   # save memory but slower backward pass
  'bf16': True,                                     # use bfloat16 precision
  'tf32': True,                                     # use tf32 precision
  'learning_rate': 2e-4,                            # learning rate
  'max_grad_norm': 0.3,                             # maximum norm (for gradient clipping)
  'warmup_ratio': 0.03,                             # warmup ratio
  # NOTE(review): a "constant" schedule has no warmup phase, so warmup_ratio is
  # presumably ignored — confirm, or switch to "constant_with_warmup".
  "lr_scheduler_type": "constant",                   # learning rate scheduler
  'save_strategy': "steps",                         # save strategy for checkpoints
  "evaluation_strategy": "steps",                   # evaluate on a step schedule
  "eval_steps": 50,                                 # evaluate every x steps
  "load_best_model_at_end": True,                   # reload the best checkpoint when training ends
  "metric_for_best_model": "loss",                  # metric used to rank checkpoints
  "greater_is_better": False,                       # lower loss is better
  "logging_steps": 10,                              # log every x steps
  "merge_adapters": True,                           # whether to merge LoRA into the model (needs more memory)
  # output directory, where to save assets during training; could be used for
  # checkpointing. The final trained model will always be saved to s3 at the end of training
  "output_dir": f"s3://{sess.default_bucket()}/checkpoints/{job_name}",
  "save_total_limit": 2,                            # max checkpoints to keep (presumably forwarded to TrainingArguments)
}

# Forward the locally stored Hugging Face token, if any, to the training job.
# NOTE(review): hyperparameters are stored with the training job description;
# consider passing the token some other way if run_qlora.py allows it.
hf_token = HfFolder.get_token()
if hf_token is not None:
    hyperparameters['hf_token'] = hf_token

In [65]:
from sagemaker.huggingface import HuggingFace

# Estimator describing the fine-tuning job; the entry point lives in ./scripts.
huggingface_estimator = HuggingFace(
    # --- code and job identity ---
    entry_point='run_qlora.py',                 # train script
    source_dir='./scripts',                     # directory which includes all the files needed for training
    base_job_name=job_name,                     # the name of the training job
    role=role,                                  # IAM role used in the training job to access AWS resources, e.g. S3
    hyperparameters=hyperparameters,            # the hyperparameters passed to the training job
    # --- infrastructure ---
    instance_type='ml.g5.4xlarge',              # instance type used for the training job
    instance_count=1,                           # the number of instances used for training
    volume_size=300,                            # the size of the EBS volume in GB
    max_run=2*24*60*60,                         # maximum runtime in seconds (days * hours * minutes * seconds)
    # --- container versions ---
    transformers_version='4.28',                # the transformers version used in the training job
    pytorch_version='2.0',                      # the pytorch version used in the training job
    py_version='py310',                         # the python version used in the training job
    # --- runtime behaviour ---
    environment={ "HUGGINGFACE_HUB_CACHE": "/tmp/.cache" },  # set env variable to cache models in /tmp
    disable_output_compression=True,            # do not compress output, to save training time and cost
)

In [66]:
# Show the lines of the training script that contain "load_", i.e. where it reads
# the datasets (and any other load_* option it sets).
!cat ./scripts/run_qlora.py | grep load_
# Sanity check: HEAD the processed training file in S3 and report its size in KB.
s3 = boto3.client("s3")
obj = s3.head_object(Bucket=sess.default_bucket(), Key="processed/mistral/train_dataset.jsonl")
print(f"File size: {obj['ContentLength'] / 1024:.2f} KB")

from datasets import load_dataset
    train_dataset = load_dataset("json", data_files="/opt/ml/input/data/training/train_dataset.jsonl")["train"]
    validation_dataset = load_dataset("json", data_files="/opt/ml/input/data/validation/validation_dataset.jsonl")["train"]
        load_in_4bit=True,
File size: 15148.45 KB


In [None]:
# Input channels for the training job: channel name -> S3 URI of the uploaded file.
data = dict(
    training=training_input_path,
    validation=validation_input_path,
)

# Launch the training job and block until it finishes, streaming its logs.
huggingface_estimator.fit(data, wait=True)

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-qlora-mistralai-Mistral-7B--2025-06-13-22-34-50-126


2025-06-13 22:34:50 Starting - Starting the training job
2025-06-13 22:34:50 Pending - Training job waiting for capacity.....[34m88%|████████▊ | 98/112 [11:37<01:40,  7.19s/it]#033[A[0m
[34m88%|████████▊ | 99/112 [11:44<01:33,  7.19s/it]#033[A[0m
[34m89%|████████▉ | 100/112 [11:51<01:26,  7.19s/it]#033[A[0m
[34m90%|█████████ | 101/112 [11:59<01:19,  7.19s/it]#033[A[0m
[34m91%|█████████ | 102/112 [12:06<01:11,  7.19s/it]#033[A[0m
[34m92%|█████████▏| 103/112 [12:13<01:04,  7.19s/it]#033[A[0m
[34m93%|█████████▎| 104/112 [12:20<00:57,  7.19s/it]#033[A[0m
[34m94%|█████████▍| 105/112 [12:27<00:50,  7.19s/it]#033[A[0m
[34m95%|█████████▍| 106/112 [12:35<00:43,  7.19s/it]#033[A[0m
[34m96%|█████████▌| 107/112 [12:42<00:35,  7.19s/it]#033[A[0m
[34m96%|█████████▋| 108/112 [12:49<00:28,  7.19s/it]#033[A[0m
[34m97%|█████████▋| 109/112 [12:56<00:21,  7.19s/it]#033[A[0m
[34m98%|█████████▊| 110/112 [13:03<00:14,  7.19s/it]#033[A[0m
[34m99%|█████████▉| 111/112 [13:10<00:07,  7

In [68]:
# Turn the S3 URI of the trained model artifacts into an S3 console link.
model_s3_uri = huggingface_estimator.model_data["S3DataSource"]["S3Uri"]
model_s3_uri.replace("s3://", "https://s3.console.aws.amazon.com/s3/buckets/")

'https://s3.console.aws.amazon.com/s3/buckets/sagemaker-us-east-1-329599621791/huggingface-qlora-mistralai-Mistral-7B--2025-06-13-22-34-50-126/output/model/'

In [14]:
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Look up the ECR image URI of the Hugging Face LLM inference container
# (version 1.1.0) for the region of the current session.
llm_image = get_huggingface_llm_image_uri(
    "huggingface",
    session=sess,
    version="1.1.0",
)

# Echo the resolved image so the notebook records which container was deployed.
print(f"llm image uri: {llm_image}")

llm image uri: 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04


In [6]:
# Delete tokenizer.json from the local model directory (-f: no error if it is absent).
# NOTE(review): the reason for removing it is not recorded here — presumably the
# file should not end up in the packaged model; confirm before relying on this step.
!rm -f /home/ec2-user/SageMaker/model/tokenizer.json

In [9]:
# List the contents of /home/ec2-user/SageMaker/ (the local model directory lives here).
!ls /home/ec2-user/SageMaker/

fine_tune_mistral7B  lost+found  model


In [10]:
# Show free and used disk space per filesystem in human-readable units.
!df -h

Filesystem      Size  Used Avail Use% Mounted on
devtmpfs         16G     0   16G   0% /dev
tmpfs            16G     0   16G   0% /dev/shm
tmpfs            16G  688K   16G   1% /run
tmpfs            16G     0   16G   0% /sys/fs/cgroup
/dev/nvme0n1p1  135G   89G   47G  66% /
tmpfs           3.1G     0  3.1G   0% /run/user/0
/dev/nvme2n1     30G   14G   15G  49% /home/ec2-user/SageMaker
tmpfs           3.1G     0  3.1G   0% /run/user/1002
tmpfs           3.1G     0  3.1G   0% /run/user/1001
tmpfs           3.1G     0  3.1G   0% /run/user/1000


In [12]:
import tarfile
import os

# S3 prefix of the uncompressed training output; it matches the S3Uri reported by
# huggingface_estimator.model_data["S3DataSource"] for this training job.
prefix = "huggingface-qlora-mistralai-Mistral-7B--2025-06-13-22-34-50-126/output/model/"  # with trailing slash
model_dir = "/home/ec2-user/SageMaker/model"
tarball_path = "model.tar.gz"

s3 = boto3.resource('s3')
bucket = s3.Bucket(sess.default_bucket())

# Step 1: download the model files, unless a non-empty local copy already exists.
# NOTE(review): a partially downloaded directory also counts as "present"; delete
# model_dir to force a clean download.
if not (os.path.isdir(model_dir) and os.listdir(model_dir)):
    for obj in bucket.objects.filter(Prefix=prefix):
        if obj.key.endswith("/"):
            continue  # folder placeholder key, nothing to download
        target = os.path.join(model_dir, os.path.relpath(obj.key, prefix))
        os.makedirs(os.path.dirname(target), exist_ok=True)
        bucket.download_file(obj.key, target)

# Step 2: pack the directory into a tarball with the model files at the archive
# root, unless the tarball has already been built.
# NOTE(review): another cell deletes model/tokenizer.json; if that removal is
# required, make sure it has happened before the tarball is created.
if not os.path.exists(tarball_path):
    with tarfile.open(tarball_path, "w:gz") as tar:
        tar.add(model_dir, arcname=".")

# Step 3: upload the tarball next to the original training output.
s3_prefix = "huggingface-qlora-mistralai-Mistral-7B--2025-06-13-22-34-50-126/output/compressed_model"
s3_key = f"{s3_prefix}/model.tar.gz"

s3.meta.client.upload_file(tarball_path, sess.default_bucket(), s3_key)
print(f"Uploaded to s3://{sess.default_bucket()}/{s3_key}")

Uploaded to s3://sagemaker-us-east-1-329599621791/huggingface-qlora-mistralai-Mistral-7B--2025-06-13-22-34-50-126/output/compressed_model/model.tar.gz


In [None]:
from sagemaker.huggingface import HuggingFaceModel

hub = {
    # Serve the fine-tuned weights: SageMaker unpacks model_data into /opt/ml/model.
    # Pointing HF_MODEL_ID at the hub repository instead would make the container
    # download and serve the base model, ignoring the uploaded artifact.
    'HF_MODEL_ID': '/opt/ml/model',
    'SM_NUM_GPUS': json.dumps(1),
}

# Access tokens must never be committed to a notebook; forward one only when it is
# supplied through the HF_TOKEN environment variable.
# NOTE(review): the token that used to be hard-coded in this cell should be revoked.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    hub['HUGGING_FACE_HUB_TOKEN'] = hf_token

llm_model = HuggingFaceModel(
    image_uri=llm_image,
    role=role,
    env=hub,
    model_data=f"s3://{sess.default_bucket()}/huggingface-qlora-mistralai-Mistral-7B--2025-06-13-22-34-50-126/output/compressed_model/model.tar.gz",
    sagemaker_session=sess
)

# Create the real-time endpoint on a single GPU instance.
predictor = llm_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.2xlarge"
)

----