# Fine tune Mixtral-8x7B with QLoRA and SageMaker remote decorator 
We are using SageMaker remote decorator for running the fine-tuning job on Amazon SageMaker Training job
SageMaker Studio Kernel: PyTorch 2.0.0 Python 3.10

JupyterLab Instance Type: ml.t3.medium

Fine-Tuning:

Instance Type: ml.g5.12xlarge

Install the required libriaries, including the Hugging Face libraries, and restart the kernel.

### Install Proper Libraries

In [1]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install -q -U datasets>=2.18.0
%pip install -q -U scikit-learn
%pip install -q -U awswrangler

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Import most of the Libraries

In [3]:
import awswrangler as wr
import boto3
import os
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.session import Session

from sklearn.model_selection import train_test_split
from IPython.core.display import display, HTML

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


  from IPython.core.display import display, HTML


In [4]:
# check dependencies are stored
%store

Stored variables and their in-db values:
bucket_name                                       -> 'wizard-of-tasks-dataset-54321'
ingest_create_athena_db_passed                    -> True
ingest_create_athena_table_tsv_passed             -> True
s3_private_data_path_csv                          -> 's3://sagemaker-us-east-1-114106928417/aai-540-2-1
s3_private_path_tsv                               -> 's3://sagemaker-us-east-1-114106928417/amazon-revi
s3_public_data_path_csv                           -> '/home/sagemaker-user/aai-540-homework/homework-2-
s3_public_path_tsv                                -> 's3://dsoaws/amazon-reviews-pds/tsv'
setup_dependencies_passed                         -> True
setup_s3_bucket_passed                            -> True


In [5]:
# save Amazon information
account_id = boto3.client("sts").get_caller_identity().get("Account")
region = boto3.Session().region_name
role = get_execution_role()
sagemaker_session = Session()
s3 = boto3.client('s3', region_name=sagemaker_session.boto_region_name)

In [6]:
# get s3 path to data from stored variable
%store -r bucket_name
s3_train_dataset_path = 's3://{}/data/train_df.csv'.format(bucket_name)
print(s3_train_dataset_path)

s3://wizard-of-tasks-dataset-54321/data/train_df.csv


In [7]:
s3_validation_dataset_path = 's3://{}/data/val_df.csv'.format(bucket_name)
print(s3_validation_dataset_path)

s3://wizard-of-tasks-dataset-54321/data/val_df.csv


In [8]:
s3_test_dataset_path = 's3://{}/data/test_df.csv'.format(bucket_name)
print(s3_test_dataset_path)

s3://wizard-of-tasks-dataset-54321/data/test_df.csv


### Setup Configuration file path
We are setting the directory in which the config.yaml file resides so that remote decorator can make use of the settings through SageMaker Defaults.

This notebook is using the Hugging Face container for the us-east-1 region. Make sure you are using the right image for your AWS region, otherwise edit config.yaml.

In [9]:
# Set path to config file
os.environ["SAGEMAKER_USER_CONFIG_OVERRIDE"] = os.getcwd()

### Import the data from S3 into Pandas Dataframe
1. using aws wrangler
2. review the dataset

In [10]:
train_df_data = wr.s3.read_csv(path=s3_train_dataset_path, sep='^')

In [11]:
val_df_data = wr.s3.read_csv(path=s3_validation_dataset_path, sep='^')

In [12]:
test_df_data = wr.s3.read_csv(path=s3_test_dataset_path, sep='^')

In [13]:
train_df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404 entries, 0 to 403
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   question               404 non-null    object
 1   intent_question        404 non-null    object
 2   history                364 non-null    object
 3   conversation_id        404 non-null    object
 4   document_url_question  404 non-null    object
 5   domain_question        404 non-null    object
 6   text_answer            404 non-null    object
 7   intent_answer          404 non-null    object
 8   domain_answer          404 non-null    object
 9   question_id            404 non-null    object
 10  title                  404 non-null    object
 11  description            404 non-null    object
 12  ingredients            191 non-null    object
 13  steps                  404 non-null    object
 14  data_split             404 non-null    object
dtypes: object(15)
memory us

In [14]:
val_df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271 entries, 0 to 270
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   question               271 non-null    object
 1   intent_question        271 non-null    object
 2   history                240 non-null    object
 3   conversation_id        271 non-null    object
 4   document_url_question  271 non-null    object
 5   domain_question        271 non-null    object
 6   text_answer            271 non-null    object
 7   intent_answer          271 non-null    object
 8   domain_answer          271 non-null    object
 9   question_id            271 non-null    object
 10  title                  271 non-null    object
 11  description            271 non-null    object
 12  ingredients            117 non-null    object
 13  steps                  271 non-null    object
 14  data_split             271 non-null    object
dtypes: object(15)
memory us

In [15]:
test_df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 653 entries, 0 to 652
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   question               653 non-null    object
 1   intent_question        653 non-null    object
 2   history                583 non-null    object
 3   conversation_id        653 non-null    object
 4   document_url_question  653 non-null    object
 5   domain_question        653 non-null    object
 6   text_answer            653 non-null    object
 7   intent_answer          653 non-null    object
 8   domain_answer          653 non-null    object
 9   question_id            653 non-null    object
 10  title                  653 non-null    object
 11  description            644 non-null    object
 12  ingredients            262 non-null    object
 13  steps                  653 non-null    object
 14  data_split             653 non-null    object
dtypes: object(15)
memory us

In [16]:
train_df_data.head(2).to_string()

"                                                                                      question            intent_question                                                                                                                                                                                                                                                                                                                                                                                                                                    history       conversation_id                               document_url_question domain_question                                                                                                                              text_answer                 intent_answer domain_answer question_id                              title                                                                                                                                     

### Create a prompt and load the dataset to try question and answering
1. function takes the document_url_question and creates a domain from it
2. applies the columns from the datasets to the prompt template

Prompt template designed to apply as much context as needed from the dataset

In [17]:
from random import randint


def template_dataset(sample):
    # Check if 'document_url_question' contains 'wholefoods'
    if 'wholefoods' in sample['document_url_question']:
        domain = 'food'
    else:
        domain = 'diy'

    title = ""
    description = ""
    ingredients = ""
    steps = ""
    data_split = ""

    prompt_template  = f"""\
    <s>[INST]
    Question: {title} [/INST]
    Context: {description}. For this task, you will need the following: {', '.join(ingredients)}. This is a {domain} task.
    Answer: Please provide a step-by-step guide. {', '.join(steps)}</s>
    [INST]
    Data Split: This is a {data_split} example for the {domain} task.
    [/INST]
    """
    
    sample["text"] = prompt_template.format(title=sample['title'], 
                                            description=sample['description'], 
                                            ingredients=sample['ingredients'], 
                                            steps=sample['steps'], 
                                            domain=domain, 
                                            eos_token=tokenizer.eos_token)
    return sample

### Use Huggingface to import the Mixtral8x7B model

Also using it to import the the Hugging Face Trainer class to fine-tune the model. Define the hyperparameters we want to use. We also create a DataCollator that will take care of padding our inputs and labels.

In [18]:
access_token = "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"

In [19]:
! huggingface-cli login --token {access_token}

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/sagemaker-user/.cache/huggingface/token
Login successful


In [20]:
from transformers import AutoTokenizer

model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_id)

tokenizer.pad_token = tokenizer.eos_token

### Apply the Train, Validation, and Test Datasets to HugginFace Dataset Library and apply the prompt template to the required columns

In [21]:
from datasets import Dataset, DatasetDict

train_dataset = Dataset.from_pandas(train_df_data)
validation_dataset = Dataset.from_pandas(val_df_data)
test_dataset = Dataset.from_pandas(test_df_data)

dataset = DatasetDict({"train": train_dataset, "validation": validation_dataset, "test": test_dataset})

train_dataset = dataset["train"].map(template_dataset, remove_columns=list(dataset["train"].features))

print(train_dataset[randint(0, len(dataset))]["text"])

validation_dataset = dataset["validation"].map(template_dataset, remove_columns=list(dataset["validation"].features))

test_dataset = dataset["test"].map(template_dataset, remove_columns=list(dataset["test"].features))

Map:   0%|          | 0/404 [00:00<?, ? examples/s]

    <s>[INST]
    Question:  [/INST]
    Context: . For this task, you will need the following: . This is a diy task.
    Answer: Please provide a step-by-step guide. </s>
    [INST]
    Data Split: This is a  example for the diy task.
    [/INST]
    


Map:   0%|          | 0/271 [00:00<?, ? examples/s]

Map:   0%|          | 0/653 [00:00<?, ? examples/s]

To train our model, we need to convert our inputs (text) to token IDs. This is done by a Hugging Face Transformers Tokenizer. In addition to QLoRA, we will use bitsanbytes 4-bit precision to quantize out frozen LLM to 4-bit and attach LoRA adapters on it.

In [22]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

Utility method for finding the target modules and update the necessary matrices. Visit this link for additional info.

In [23]:
import bitsandbytes as bnb

def find_all_linear_names(hf_model):
    lora_module_names = set()
    for name, module in hf_model.named_modules():
        if isinstance(module, bnb.nn.Linear4bit):
            names = name.split(".")
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if "lm_head" in lora_module_names:  # needed for 16-bit
        lora_module_names.remove("lm_head")
    return list(lora_module_names)

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


### Define the Train Function and Execute it
Train Function applies hyperparameters, and total GPU count, trains the model on the tokenized dataset, and runs an evaluation on the test dataset

Evaluations will be run on the test dataset.

In [24]:
from accelerate import Accelerator
from huggingface_hub import login
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training
from sagemaker.remote_function import remote
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import transformers

# Start training
@remote(volume_size=100, job_name_prefix=f"train-{model_id.split('/')[-1].replace('.', '-')}-merge")
def train_fn(
        model_name,
        train_ds,
        test_ds=None,
        lora_r=64,
        lora_alpha=16,
        lora_dropout=0.1,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=1,
        learning_rate=2e-4,
        num_train_epochs=1,
        chunk_size=2048,
        gradient_checkpointing=False,
        merge_weights=False,
        token=None
):  
    print("############################################")
    print("Number of GPUs: ", torch.cuda.device_count())
    print("############################################")
    
    accelerator = Accelerator()
    
    if token is not None:
        login(token=token)

    # tokenize and chunk dataset
    with accelerator.main_process_first():
        lm_train_dataset = train_ds.map(
            lambda sample: tokenizer(sample["text"]), batched=True, batch_size=per_device_train_batch_size, remove_columns=list(train_ds.features)
        )

    # Print total number of samples
    print(f"Total number of train samples: {len(lm_train_dataset)}")


    if test_ds is not None:
        with accelerator.main_process_first():
            lm_test_dataset = test_ds.map(
                lambda sample: tokenizer(sample["text"]), batched=True, batch_size=per_device_eval_batch_size, remove_columns=list(test_ds.features)
            )

        print(f"Total number of test samples: {len(lm_test_dataset)}")
    else:
        lm_test_dataset = None

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        quantization_config=bnb_config,
        device_map="auto",
        cache_dir="/tmp/.cache"
    )

    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

    # get lora target modules
    modules = find_all_linear_names(model)
    print(f"Found {len(modules)} modules to quantize: {modules}")

    config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        target_modules=modules,
        lora_dropout=lora_dropout,
        bias="none",
        task_type="CAUSAL_LM"
    )

    model = get_peft_model(model, config)
    print_trainable_parameters(model)

    model = model.to(accelerator.device)

    if test_ds is not None:
        model, lm_train_dataset, lm_test_dataset = accelerator.prepare(
            model, lm_train_dataset, lm_test_dataset
        )
    else:
        model, lm_train_dataset = accelerator.prepare(
            model, lm_train_dataset
        )

    trainer = transformers.Trainer(
        model=model,
        train_dataset=lm_train_dataset,
        eval_dataset=lm_test_dataset if lm_test_dataset is not None else None,
        args=transformers.TrainingArguments(
            per_device_train_batch_size=per_device_train_batch_size,
            per_device_eval_batch_size=per_device_eval_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            gradient_checkpointing=gradient_checkpointing,
            logging_steps=2,
            num_train_epochs=num_train_epochs,
            learning_rate=learning_rate,
            bf16=True,
            save_strategy="no",
            output_dir="outputs"
        ),
        data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )
    model.config.use_cache = False

    trainer.train()

    if merge_weights:
        output_dir = "/tmp/model"

        # merge adapter weights with base model and save
        # save int 4 model
        trainer.model.save_pretrained(output_dir, safe_serialization=False)
        # clear memory
        del model
        del trainer
        
        torch.cuda.empty_cache()

        # load PEFT model in fp16
        model = AutoPeftModelForCausalLM.from_pretrained(
            output_dir,
            low_cpu_mem_usage=True,
            torch_dtype=torch.float16,
            cache_dir="/tmp/.cache"
        )
        
        # Merge LoRA and base model and save
        model = model.merge_and_unload()
        model.save_pretrained(
            "/opt/ml/model", safe_serialization=True, max_shard_size="2GB"
        )
    else:
        model.save_pretrained("/opt/ml/model", safe_serialization=True)

    tmp_tokenizer = AutoTokenizer.from_pretrained(model_name)
    tmp_tokenizer.save_pretrained("/opt/ml/model")

sagemaker.config INFO - Fetched defaults config from location: /home/sagemaker-user/AAI_540_SU_04/code
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.ImageUri
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.Dependencies
sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.RemoteFunction.InstanceType


In [25]:
train_fn(
    model_id,
    train_ds=train_dataset,
    test_ds=test_dataset,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    merge_weights=True,
    token=access_token
)

2024-06-09 07:34:03,834 sagemaker.remote_function INFO     Serializing function code to s3://sagemaker-us-east-1-114106928417/train-Mixtral-8x7B-Instruct-v0-1-merge-2024-06-09-07-34-03-834/function
2024-06-09 07:34:04,131 sagemaker.remote_function INFO     Serializing function arguments to s3://sagemaker-us-east-1-114106928417/train-Mixtral-8x7B-Instruct-v0-1-merge-2024-06-09-07-34-03-834/arguments
2024-06-09 07:34:04,496 sagemaker.remote_function INFO     Copied dependencies file at './requirements.txt' to '/tmp/tmpioxnghqn/temp_workspace/sagemaker_remote_function_workspace/requirements.txt'
2024-06-09 07:34:04,498 sagemaker.remote_function INFO     Successfully created workdir archive at '/tmp/tmpioxnghqn/workspace.zip'
2024-06-09 07:34:04,536 sagemaker.remote_function INFO     Successfully uploaded workdir to 's3://sagemaker-us-east-1-114106928417/train-Mixtral-8x7B-Instruct-v0-1-merge-2024-06-09-07-34-03-834/sm_rf_user_ws/workspace.zip'
2024-06-09 07:34:04,541 sagemaker.remote_func

ClientError: An error occurred (AccessDeniedException) when calling the CreateTrainingJob operation: User: arn:aws:sts::114106928417:assumed-role/LabRole/SageMaker is not authorized to perform: sagemaker:CreateTrainingJob on resource: arn:aws:sagemaker:us-east-1:114106928417:training-job/train-Mixtral-8x7B-Instruct-v0-1-merge-2024-06-09-07-34-03-834 with an explicit deny in an identity-based policy

### Deploy Fine-Tuned model
Note: Run train_fn with merge_weights=True

In [None]:
import json
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

In [None]:
model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"

bucket_name = sagemaker_session.default_bucket()
job_prefix = f"train-{model_id.split('/')[-1].replace('.', '-')}-merge"

In [None]:
def get_last_job_name(job_name_prefix):
    import boto3
    sagemaker_client = boto3.client('sagemaker')
    
    search_response = sagemaker_client.search(
        Resource='TrainingJob',
        SearchExpression={
            'Filters': [
                {
                    'Name': 'TrainingJobName',
                    'Operator': 'Contains',
                    'Value': job_name_prefix
                },
                {
                    'Name': 'TrainingJobStatus',
                    'Operator': 'Equals',
                    'Value': "Completed"
                }
            ]
        },
        SortBy='CreationTime',
        SortOrder='Descending',
        MaxResults=1)
    
    return search_response['Results'][0]['TrainingJob']['TrainingJobName']

In [None]:
job_name = get_last_job_name(job_prefix)

job_name

### Inference configurations

In [None]:
instance_count = 1
instance_type = "ml.g5.12xlarge"
number_of_gpu = 4
health_check_timeout = 700

In [None]:
image_uri = get_huggingface_llm_image_uri(
    "huggingface",
    version="1.4"
)

image_uri

In [None]:
model = HuggingFaceModel(
    image_uri=image_uri,
    model_data=f"s3://{bucket_name}/{job_name}/{job_name}/output/model.tar.gz",
    role=get_execution_role(),
    env={
        'HF_MODEL_ID': "/opt/ml/model", # path to where sagemaker stores the model
        'SM_NUM_GPUS': json.dumps(number_of_gpu), # Number of GPU used per replica
        'HF_MODEL_QUANTIZE': "bitsandbytes"
    }
)

In [None]:
predictor = model.deploy(
    initial_instance_count=instance_count,
    instance_type=instance_type,
    container_startup_health_check_timeout=health_check_timeout,
    model_data_download_timeout=3600
)

### Predict

In [None]:
from sagemaker.huggingface.model import HuggingFacePredictor

In [None]:
endpoint_name = "<ENDPOINT_NAME>" #Required if you want to create a predictor without running the previous code

In [None]:
if 'predictor' not in locals() and 'predictor' not in globals():
    print("Create predictor")
    predictor = HuggingFacePredictor(
        endpoint_name=endpoint_name
    )

In [None]:
base_prompt = f"""
<s>[INST]
{{question}} 
[/INST]
"""

In [None]:
prompt = base_prompt.format(question="How do I make Windy City Wasabeans?")

predictor.predict({
	"inputs": prompt,
    "parameters": {
        "n_predict": -1,
        "temperature": 0.2,
        "top_p": 0.9
    }
})

### Delete Endpoint

In [None]:
predictor.delete_model()
predictor.delete_endpoint(delete_endpoint_config=True)

### Shutdown Notebook

In [None]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [None]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}