# **Fine-Tuning LLaMA 2 chat model**

### Import all the necessary libraries

In [None]:
import torch
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline
from peft import LoraConfig, PeftModel
from datasets import load_dataset, load_from_disk, Dataset
from trl import SFTTrainer
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


### Setting up the training parameters according to the available computational resources

In [None]:
# Point the TensorBoard integration at the binary inside the conda environment.
os.environ['TENSORBOARD_BINARY'] = '/data/home/ec23781/.conda/envs/finetune/bin/tensorboard'

# SECURITY: never hard-code a Hugging Face access token in the notebook; a token
# that has been committed must be revoked and regenerated. Provide it from
# outside instead (export HF_TOKEN=... before starting Jupyter, or run
# `huggingface-cli login` once on this machine).
if not os.environ.get("HF_TOKEN"):
    print(
        "HF_TOKEN is not set; gated models will only download if you have "
        "already logged in with `huggingface-cli login`."
    )

# --- Model, dataset and output locations ---
model_name = "meta-llama/Llama-2-7b-chat-hf"
dataset_name = "theatticusproject/cuad-qa"
new_model = "/data/scratch/ec23781/Llama-2-7b-chat-Finetuned"
output_dir = "/data/scratch/ec23781/results-llama-chat"

# --- LoRA settings (consumed by LoraConfig) ---
lora_r, lora_alpha, lora_dropout = 32, 64, 0.1

# --- bitsandbytes quantization settings (consumed by BitsAndBytesConfig) ---
use_4bit = True
use_nested_quant = True
bnb_4bit_quant_type = "nf4"
bnb_4bit_compute_dtype = "float16"

# --- Precision flags (consumed by TrainingArguments) ---
fp16 = bf16 = False

# --- Batching and schedule (consumed by TrainingArguments) ---
num_train_epochs = 1
max_steps = -1
per_device_train_batch_size = 8
gradient_accumulation_steps = 2
gradient_checkpointing = True
group_by_length = True

# --- Optimizer (consumed by TrainingArguments) ---
optim = "paged_adamw_32bit"
learning_rate = 2e-4
weight_decay = 0.001
max_grad_norm = 0.3
lr_scheduler_type = "cosine"
warmup_ratio = 0.02

# --- Checkpointing and logging cadence, in steps ---
save_steps, save_total_limit = 35, 2
logging_steps = 25

# --- Sequence handling ---
max_seq_length = 4096
packing = False

# --- Device placement used when loading the model ---
device_map = {"": 0}

# bitsandbytes quantization settings, assembled from the values configured above
quantization_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=use_nested_quant,
)

### Loading the base LLaMA 2 chat model

In [None]:
# Tokenizer for the base model; the EOS token doubles as the padding token and
# padding is applied on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Base model, loaded with the quantization settings onto the configured device.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=quantization_config,
)
model.config.pretraining_tp = 1
model.config.use_cache = False

Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.24s/it]


### Creating a chat prompt template for the model. The template used here follows the official chat format provided by Meta.

In [None]:
def create_prompt(context, question, answer):
    """Build one complete training prompt in the [INST] chat layout.

    Args:
        context: Text the question must be answered from.
        question: The question to ask about the context.
        answer: Reference answer, placed after the closing [/INST] tag.

    Returns:
        The full prompt string, from the opening <s> to the closing </s>.
    """
    # The f-string must start on the same line as `return`; a bare `return`
    # followed by the string on the next line returns None.
    return f'''<s>[INST] You are a helpful assistant. Answer the given question from the context provided.

Context:
{context}

Question:
{question}

If you cannot answer the question from given context then don't try to make up an answer.[/INST] {answer} </s>'''

### Pre-process the train dataset for efficient training

In [None]:
# Build the fine-tuning set: keep every training example that has an answer and
# whose full prompt fits in max_seq_length tokens, collecting token statistics.
filtered_examples = []

# Use the dataset id configured in `dataset_name` instead of repeating the literal.
train_dataset = load_dataset(dataset_name, split="train")

# Token statistics over the kept examples
total_tokens = 0
max_tokens = 0
num_examples = 0

for example in tqdm(train_dataset):
    answer_text = example["answers"]["text"]
    if not answer_text:
        # No answer text for this example, so there is nothing to train on.
        continue

    # Keep only what follows "Details: ". When the marker is missing, keep the
    # whole question: str.find() returns -1 in that case, and slicing from
    # -1 + len("Details: ") would silently drop the first eight characters.
    _, marker, details = example["question"].partition("Details: ")
    question = (details if marker else example["question"]).strip()

    # Create the prompt from the first listed answer
    prompt = create_prompt(example["context"], question, answer_text[0])

    # Only the length is needed, so count the plain list of token ids.
    num_tokens = len(tokenizer(prompt).input_ids)

    # Keep the example only if it fits within the maximum sequence length
    if num_tokens <= max_seq_length:
        filtered_examples.append({"text": prompt})
        num_examples += 1
        total_tokens += num_tokens
        max_tokens = max(max_tokens, num_tokens)

filtered_dataset = Dataset.from_list(filtered_examples)

# Average token length; guarded so an empty selection does not divide by zero
avg_tokens = total_tokens / num_examples if num_examples else 0.0

# Print the results
print(f"{num_examples} are good for training out of {len(train_dataset)} examples in dataset")
print(f"Maximum token length: {max_tokens}")
print(f"Average token length: {avg_tokens:.2f}")

# Saving the pre-processed training dataset locally for future use
os.makedirs("/data/scratch/ec23781/filtered-dataset-llama", exist_ok=True)
filtered_dataset.save_to_disk("/data/scratch/ec23781/filtered-dataset-llama")

In [None]:
# Load the pre-processed training dataset from the local directory it was saved to
filtered_dataset = load_from_disk("/data/scratch/ec23781/filtered-dataset-llama")

### Training the model using SFTTrainer

In [None]:
# LoRA adapter configuration, built from the LoRA settings configured earlier
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
)

# Trainer settings, grouped by concern; every value comes from the configuration cell
training_arguments = TrainingArguments(
    # Where checkpoints go and how many are kept
    output_dir=output_dir,
    save_steps=save_steps,
    save_total_limit=save_total_limit,
    # Length of training and batching
    num_train_epochs=num_train_epochs,
    max_steps=max_steps,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    group_by_length=group_by_length,
    # Optimizer and learning-rate schedule
    optim=optim,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    max_grad_norm=max_grad_norm,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    # Precision flags
    fp16=fp16,
    bf16=bf16,
    # Logging
    logging_steps=logging_steps,
    report_to="tensorboard",
)


from transformers.trainer_utils import get_last_checkpoint

# Initialize the SFTTrainer
trainer = SFTTrainer(
    model=model,
    train_dataset=filtered_dataset,
    tokenizer=tokenizer,
    dataset_text_field="text",
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    args=training_arguments,
    packing=packing,  # honour the `packing` setting instead of a second literal
)

# Resume from the newest checkpoint in output_dir when one exists; otherwise
# start from scratch. Passing resume_from_checkpoint=True unconditionally
# raises on a first run, because there is no checkpoint to resume from.
last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
trainer.train(resume_from_checkpoint=last_checkpoint)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Map: 100%|██████████| 1062/1062 [00:08<00:00, 130.16 examples/s]
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  return fn(*args, **kwargs)


Step,Training Loss
25,1.3278
50,1.1546


  return fn(*args, **kwargs)


TrainOutput(global_step=66, training_loss=1.2197666746197324, metrics={'train_runtime': 1619.8966, 'train_samples_per_second': 0.656, 'train_steps_per_second': 0.041, 'total_flos': 1.114009666167767e+17, 'train_loss': 1.2197666746197324, 'epoch': 0.9924812030075187})

### Save the fine-tuned model and tokenizer locally for future use

In [None]:
# Write the model held by the trainer, then the tokenizer, into the `new_model` directory
trainer.model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

('/data/scratch/ec23781/Llama-2-7b-chat-Finetuned/tokenizer_config.json',
 '/data/scratch/ec23781/Llama-2-7b-chat-Finetuned/special_tokens_map.json',
 '/data/scratch/ec23781/Llama-2-7b-chat-Finetuned/tokenizer.json')

### View the training loss and other graphs with TensorBoard

In [None]:
%load_ext tensorboard
%tensorboard --logdir /data/scratch/ec23781/results/runs

### Load the locally saved finetuned model and tokenizer

In [None]:
# Read the tokenizer and the model back from the local `new_model` directory
tokenizer = AutoTokenizer.from_pretrained(new_model)
model = AutoModelForCausalLM.from_pretrained(
    new_model,
    device_map={"": 0},
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:13<00:00,  6.92s/it]


### Merging the weights from LoRA with the base model.

In [None]:
# Load the base model once more, this time in float16
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    torch_dtype=torch.float16,
    return_dict=True,
    low_cpu_mem_usage=True,
)

# Attach the saved LoRA adapter and fold its weights into the base model
model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

# Fresh tokenizer from the base model, configured the same way as for training
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.03s/it]


### Uploading the model to the Hugging Face Hub

In [None]:
# Upload the merged model and then its tokenizer to the same Hub repository.
# The previous `check_pr=True` argument was dropped: push_to_hub documents
# `create_pr`, not `check_pr`, and the earlier upload committed directly
# (pr_url=None), so the argument had no effect.
model.push_to_hub("jay11125/Llama-2-7b-chat-finetune")

tokenizer.push_to_hub("jay11125/Llama-2-7b-chat-finetune")


model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s][A

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s][A[A


model-00003-of-00003.safetensors:   0%|          | 0.00/3.59G [00:00<?, ?B/s][A[A[A
model-00001-of-00003.safetensors:   0%|          | 1.62M/4.94G [00:00<05:07, 16.0MB/s][A


model-00003-of-00003.safetensors:   0%|          | 1.52M/3.59G [00:00<04:01, 14.8MB/s][A[A[A
model-00001-of-00003.safetensors:   0%|          | 6.19M/4.94G [00:00<02:29, 33.0MB/s][A


model-00003-of-00003.safetensors:   0%|          | 5.36M/3.59G [00:00<02:05, 28.5MB/s][A[A[A
model-00001-of-00003.safetensors:   0%|          | 15.7M/4.94G [00:00<01:20, 60.9MB/s][A


model-00003-of-00003.safetensors:   0%|          | 12.7M/3.59G [00:00<01:13, 48.9MB/s][A[A[A


model-00003-of-00003.safetensors:   0%|          | 17.6M/3.59G [00:00<01:37, 36.6MB/s][A[A[A


model-00003-of-00003.safetensors:   1%|          | 21.6M/3.59G [00:00<01:44, 34.3MB/s][A[A[A
mode

model-00002-of-00003.safetensors:  17%|█▋        | 826M/4.95G [00:25<02:03, 33.4MB/s][A


model-00001-of-00003.safetensors:  19%|█▊        | 919M/4.94G [00:25<01:36, 41.8MB/s][A[A[A


model-00003-of-00003.safetensors:  24%|██▍       | 872M/3.59G [00:25<01:01, 44.3MB/s][A[A[A
model-00001-of-00003.safetensors:  19%|█▉        | 944M/4.94G [00:25<01:34, 42.3MB/s][A
model-00002-of-00003.safetensors:  17%|█▋        | 848M/4.95G [00:26<02:25, 28.2MB/s][A


model-00001-of-00003.safetensors:  19%|█▉        | 960M/4.94G [00:26<01:34, 42.2MB/s][A[A[A
model-00002-of-00003.safetensors:  17%|█▋        | 864M/4.95G [00:26<02:09, 31.5MB/s][A


model-00001-of-00003.safetensors:  20%|█▉        | 976M/4.94G [00:26<01:41, 39.0MB/s][A[A[A
model-00002-of-00003.safetensors:  18%|█▊        | 880M/4.95G [00:26<01:59, 34.1MB/s][A


model-00001-of-00003.safetensors:  20%|██        | 992M/4.94G [00:27<01:35, 41.5MB/s][A[A[A


model-00003-of-00003.safetensors:  26%|██▌       | 928M/3.59G [00:27

model-00001-of-00003.safetensors:  40%|███▉      | 1.97G/4.94G [00:53<01:16, 38.7MB/s][A


model-00001-of-00003.safetensors:  40%|████      | 1.98G/4.94G [00:53<01:09, 42.6MB/s][A[A[A
model-00002-of-00003.safetensors:  39%|███▉      | 1.95G/4.95G [00:53<01:27, 34.4MB/s][A


model-00001-of-00003.safetensors:  40%|████      | 1.99G/4.94G [00:53<01:09, 42.2MB/s][A[A[A
model-00001-of-00003.safetensors:  40%|████      | 1.99G/4.94G [00:53<01:25, 34.5MB/s][A
model-00002-of-00003.safetensors:  40%|███▉      | 1.97G/4.95G [00:53<01:10, 42.3MB/s][A


model-00001-of-00003.safetensors:  40%|████      | 2.00G/4.94G [00:53<01:18, 37.3MB/s][A[A[A
model-00001-of-00003.safetensors:  41%|████      | 2.00G/4.94G [00:54<01:44, 28.1MB/s][A


model-00003-of-00003.safetensors:  54%|█████▍    | 1.94G/3.59G [00:54<00:37, 43.9MB/s][A[A[A
model-00001-of-00003.safetensors:  41%|████      | 2.02G/4.94G [00:54<01:31, 31.8MB/s][A


model-00003-of-00003.safetensors:  54%|█████▍    | 1.95G/3.59G [00

model-00001-of-00003.safetensors:  62%|██████▏   | 3.04G/4.94G [01:21<00:42, 45.0MB/s][A


model-00001-of-00003.safetensors:  62%|██████▏   | 3.05G/4.94G [01:22<00:37, 50.8MB/s][A[A[A
model-00002-of-00003.safetensors:  61%|██████    | 3.01G/4.95G [01:22<00:49, 39.0MB/s][A


model-00001-of-00003.safetensors:  62%|██████▏   | 3.06G/4.94G [01:22<00:46, 40.8MB/s][A[A[A


model-00001-of-00003.safetensors:  62%|██████▏   | 3.08G/4.94G [01:22<00:39, 46.8MB/s][A[A[A


model-00003-of-00003.safetensors:  81%|████████  | 2.90G/3.59G [01:22<00:16, 41.9MB/s][A[A[A
model-00001-of-00003.safetensors:  63%|██████▎   | 3.09G/4.94G [01:22<00:31, 58.0MB/s][A


model-00003-of-00003.safetensors:  81%|████████  | 2.91G/3.59G [01:23<00:16, 39.9MB/s][A[A[A
model-00001-of-00003.safetensors:  63%|██████▎   | 3.09G/4.94G [01:23<00:39, 46.5MB/s][A


model-00001-of-00003.safetensors:  63%|██████▎   | 3.10G/4.94G [01:23<00:41, 43.8MB/s][A[A[A
model-00001-of-00003.safetensors:  63%|██████▎   | 3

model-00001-of-00003.safetensors:  90%|████████▉ | 4.43G/4.94G [02:01<00:51, 9.80MB/s][A
model-00002-of-00003.safetensors:  90%|████████▉ | 4.43G/4.95G [02:01<00:14, 35.9MB/s][A
model-00001-of-00003.safetensors:  90%|█████████ | 4.45G/4.94G [02:02<00:38, 12.7MB/s][A
model-00001-of-00003.safetensors:  90%|█████████ | 4.46G/4.94G [02:02<00:28, 16.5MB/s][A
model-00002-of-00003.safetensors:  90%|█████████ | 4.46G/4.95G [02:02<00:13, 35.8MB/s][A
model-00001-of-00003.safetensors:  91%|█████████ | 4.48G/4.94G [02:02<00:21, 21.0MB/s][A
model-00002-of-00003.safetensors:  91%|█████████ | 4.48G/4.95G [02:02<00:12, 36.3MB/s][A
model-00001-of-00003.safetensors:  92%|█████████▏| 4.54G/4.94G [02:04<00:11, 33.3MB/s][A
model-00002-of-00003.safetensors:  91%|█████████ | 4.50G/4.95G [02:04<00:27, 16.5MB/s][A
model-00002-of-00003.safetensors:  91%|█████████ | 4.51G/4.95G [02:04<00:17, 24.3MB/s][A
model-00001-of-00003.safetensors:  92%|█████████▏| 4.56G/4.94G [02:04<00:12, 29.8MB/s][A
model-0000

CommitInfo(commit_url='https://huggingface.co/jay11125/Llama-2-7b-chat-finetune/commit/7f9a53b006aab91feed0fe70855ec251b7b938a7', commit_message='Upload tokenizer', commit_description='', oid='7f9a53b006aab91feed0fe70855ec251b7b938a7', pr_url=None, pr_revision=None, pr_num=None)

### Load our fine-tuned model from the Hugging Face Hub

In [None]:
# Fetch the uploaded tokenizer and model from the Hub repository
tokenizer = AutoTokenizer.from_pretrained("jay11125/Llama-2-7b-chat-finetune")
model = AutoModelForCausalLM.from_pretrained(
    "jay11125/Llama-2-7b-chat-finetune",
    device_map={"": 0},
)

Loading checkpoint shards: 100%|██████████| 3/3 [00:13<00:00,  4.62s/it]


### Test the model with an example

In [None]:
# Hand-written example prompt: instructions, a contract excerpt as context and a
# question, ending at [/INST] with no answer so the model has to produce it.
prompt = '''<s>[INST] You are a helpful assistant. Answer the given question from the context provided.

Context:
MODEL BUSINESS ASSOCIATE AGREEMENT
This BUSINESS ASSOCIATE AGREEMENT (the "BAA") is made and entered into as of
by and between
organized under the laws of the
a
("Covered Entity") and
organized under the laws of
a
("Business Associate", in accordance with the meaning given to those terms at 45 CFR §164. 501). In this BAA, Covered Entity and Business Associate are each a "Party" and, collectively, are the "Parties".
BACKGROUND
1.
Covered Entity is either a "covered entity" or "business associate" of a covered entity as each are defined under the Health Insurance Portability and Accountability Act of 1996, Public Law 104-191, as amended by the HITECH Act (as defined below) and the related regulations promulgated by HHS (as defined below) (collectively, "HIPAA”) and, as such, is required to comply with HIPAA's provisions regarding the confidentiality and privacy of Protected Health Information (as defined below);
II. The Parties have entered into or will enter into one or more agreements under which Business Associate provides or will provide certain specified services to Covered Entity (collectively, the "Agreement");
III. In providing services pursuant to the Agreement, Business Associate will have access to Protected Health Information;
IV. By providing the services pursuant to the Agreement, Business Associate will become a "business associate" of the Covered Entity as such term is defined under HIPAA;
V. Both Parties are committed to complying with all federal and state laws governing the confidentiality and privacy of health information, including, but not limited to, the Standards for Privacy of Individually Identifiable Health Information found at 45 CFR Part 160 and Part 164, Subparts A and E (collectively, the "Privacy Rule"); and
VI. Both Parties intend to protect the privacy and provide for the security of Protected Health Information disclosed to Business Associate pursuant to the terms of this Agreement, HIPAA and other applicable laws.
AGREEMENT

Question:
What is the context about?

If you cannot answer the question from given context then don't try to make up an answer.[/INST]'''

# Text-generation pipeline that returns only the newly generated text
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=300,
    return_full_text=False,
)

# Generate a completion for the example prompt and show it without outer whitespace
result = pipe(prompt)
generated_text = result[0]['generated_text']
print(generated_text.strip())

Answer:

The context is about a Model Business Associate Agreement between two parties i.e Covered Entity and Business Associate. The parties have entered into one or more agreements under which Business Associate provides or will provide certain specified services to Covered Entity. Both Parties are committed to complying with all federal and state laws governing the confidentiality and privacy of health information including the Standards for Privacy of Individually Identifiable Health Information found at 45 CFR Part 160 and Part 164, Subparts A and E (collectively, the "Privacy Rule").


### Pre-process the test dataset to evaluate our fine-tuned model on unseen data

In [None]:
# Build the evaluation set: answerable test examples whose full prompt fits in
# max_seq_length tokens, with the question reduced to its "Details: " part.
# The dataset id comes from `dataset_name` rather than a repeated literal.
test_dataset = load_dataset(dataset_name, split="test")

# Drop examples that have no answer text
test_dataset = test_dataset.filter(lambda x: len(x['answers']["text"]) != 0)
filter_ex = []

for example in tqdm(test_dataset):
    # Keep only what follows "Details: ". When the marker is missing, keep the
    # whole question: str.find() returns -1 in that case, and slicing from
    # -1 + len("Details: ") would silently drop the first eight characters.
    _, marker, details = example["question"].partition("Details: ")
    example["question"] = (details if marker else example["question"]).strip()

    # Create the prompt from the first listed answer
    prompt = create_prompt(example["context"], example["question"], example["answers"]["text"][0])

    # Only the length is needed, so count the plain list of token ids.
    num_tokens = len(tokenizer(prompt).input_ids)

    # Keep the example only if it fits within the maximum sequence length
    if num_tokens <= max_seq_length:
        filter_ex.append(example)

# Save the filtered examples locally for later evaluation runs
test_dataset = Dataset.from_list(filter_ex)
os.makedirs("/data/scratch/ec23781/filtered-test-dataset-llama", exist_ok=True)
test_dataset.save_to_disk("/data/scratch/ec23781/filtered-test-dataset-llama")

100%|██████████| 1244/1244 [00:50<00:00, 24.53it/s]
Saving the dataset (1/1 shards): 100%|██████████| 149/149 [00:00<00:00, 2907.20 examples/s]


In [None]:
# Load the pre-processed test dataset from the local directory it was saved to
test_dataset = load_from_disk("/data/scratch/ec23781/filtered-test-dataset-llama")

In [None]:
# Text-generation pipeline used for evaluation; returns only the newly generated text
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=300,
    return_full_text=False,
)

def create_test_prompt(context, question):
    """Build an inference prompt that stops at [/INST], leaving the answer to the model.

    Args:
        context: Text the question must be answered from.
        question: The question to ask about the context.

    Returns:
        The prompt string: instructions, context section, question section and
        the closing instruction, separated by blank lines.
    """
    sections = (
        "<s>[INST] You are a helpful assistant. Answer the given question from the context provided.",
        f"Context:\n{context}",
        f"Question:\n{question}",
        "If you cannot answer the question from given context then don't try to make up an answer.[/INST]",
    )
    # Each section is separated from the next by exactly one blank line.
    return "\n\n".join(sections)

# Paired lists for metric calculation: entry i of each refers to the same example
predictions, references = [], []

for example in tqdm(test_dataset):
    # Ask the model the example's question against the example's context
    prompt = create_test_prompt(example["context"], example["question"])
    result = pipe(prompt)
    answer_text = result[0]['generated_text'].strip()

    # Record the generated answer and the reference answers under the same id
    example_id = example["id"]
    predictions.append({"id": example_id, "prediction_text": answer_text})
    references.append({"id": example_id, "answers": example["answers"]})

  7%|▋         | 10/149 [02:05<29:19, 12.66s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 149/149 [25:23<00:00, 10.22s/it]


### Evaluate the model using SQuAD metrics to get the F1 and Exact Match scores.


In [None]:
# NOTE(review): `datasets.load_metric` is reported as deprecated in favour of the
# separate `evaluate` package — confirm against the installed `datasets` version.
from datasets import load_metric

# Load the metric registered under the name "squad"
metric = load_metric("squad")

# Score all predictions against their references in a single call and print the result
results = metric.compute(predictions=predictions, references=references)
print(results)

{'exact_match': 0.0, 'f1': 20.23131178035994}


In [None]:
# Score every prediction on its own against the reference at the same position
for prediction, reference in zip(predictions, references):
    print(metric.compute(predictions=[prediction], references=[reference]))

{'exact_match': 0.0, 'f1': 1.3793103448275863}
{'exact_match': 0.0, 'f1': 9.937888198757763}
{'exact_match': 0.0, 'f1': 15.068493150684933}
{'exact_match': 0.0, 'f1': 17.647058823529413}
{'exact_match': 0.0, 'f1': 71.2}
{'exact_match': 0.0, 'f1': 21.62162162162162}
{'exact_match': 0.0, 'f1': 80.0}
{'exact_match': 0.0, 'f1': 2.094240837696335}
{'exact_match': 0.0, 'f1': 71.42857142857143}
{'exact_match': 0.0, 'f1': 2.1390374331550803}
{'exact_match': 0.0, 'f1': 16.304347826086957}
{'exact_match': 0.0, 'f1': 6.0606060606060606}
{'exact_match': 0.0, 'f1': 11.627906976744187}
{'exact_match': 0.0, 'f1': 10.476190476190476}
{'exact_match': 0.0, 'f1': 43.24324324324324}
{'exact_match': 0.0, 'f1': 26.60098522167488}
{'exact_match': 0.0, 'f1': 66.66666666666666}
{'exact_match': 0.0, 'f1': 1.4598540145985401}
{'exact_match': 0.0, 'f1': 85.71428571428571}
{'exact_match': 0.0, 'f1': 50.0}
{'exact_match': 0.0, 'f1': 16.666666666666664}
{'exact_match': 0.0, 'f1': 0.0}
{'exact_match': 0.0, 'f1': 6.17