## Fine-tuning of Llama-2-13b on Legal CUAD

### Install and Load Required Libraries

In [1]:
! pip3 install -q -U transformers
! pip3 install -q -U datasets
! pip3 install -q -U peft
! pip3 install -q -U trl
! pip3 install -q -U auto-gptq
! pip3 install -q -U optimum
! pip3 install -q -U bitsandbytes

In [1]:
import os

# Point the Hugging Face cache at the large scratch area. This must run before
# `transformers` is imported. `setdefault` keeps any cache location that is
# already configured in the environment, so the notebook stays usable on
# machines where this user-specific path does not exist.
os.environ.setdefault('TRANSFORMERS_CACHE', '/home/kmb85/rds/hpc-work/huggingface')

In [None]:
import transformers
import torch
from datasets import load_dataset
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from trl import SFTTrainer

### Load Llama-2-13b and Tokenizer

In [None]:
model_name_or_path = "meta-llama/Llama-2-13b-hf"

# 4-bit quantization settings.
# The option names must be `bnb_4bit_quant_type` / `bnb_4bit_compute_dtype`:
# misspelled or unknown keyword arguments are not applied, which would leave
# the quantization type and compute dtype at their defaults instead of
# NF4 / float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the quantized base model, sharded automatically across available devices.
# - The access token is read from the HF_TOKEN environment variable instead of
#   being hard-coded; when it is unset, `None` lets the library fall back to
#   the locally stored login.
# - `trust_remote_code` is not passed: Llama is implemented inside
#   `transformers`, so no repository-provided code needs to be executed.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    use_safetensors=True,
    device_map="auto",
    quantization_config=bnb_config,
    token=os.environ.get("HF_TOKEN"),
)

In [5]:
# Load the (fast) tokenizer that matches the base model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
# Reuse the end-of-sequence token for padding.
# NOTE(review): presumably needed because the checkpoint's tokenizer defines no
# pad token of its own — confirm.
tokenizer.pad_token=tokenizer.eos_token

In [6]:
# Trade compute for memory: recompute activations during the backward pass.
model.gradient_checkpointing_enable()
# Make the quantized model trainable with adapters; rebinds `model`.
# NOTE(review): prepare_model_for_kbit_training may already enable gradient
# checkpointing by default, which would make the call above redundant — confirm
# against the installed peft version.
model = prepare_model_for_kbit_training(model)

### Configure LoRA Adapter

In [7]:
# LoRA adapter hyper-parameters.
# `task_type` must be the exact string "CAUSAL_LM". A misspelled value is not a
# recognised PEFT task type, so the model would not be wrapped with the
# causal-LM-specific PEFT class.
config = LoraConfig(
    r=32,             # rank of the low-rank update matrices
    lora_alpha=16,    # scaling numerator (effective scale is lora_alpha / r)
    bias="none",      # leave bias terms frozen
    task_type="CAUSAL_LM",
)

In [8]:
# Wrap the prepared base model with the adapter described by `config`;
# `model` is rebound to the wrapped model from here on.
model=get_peft_model(model, config)

### Dataset preparation

In [3]:
# Configuration names of the CUAD tasks used in this notebook. Every name is
# the task suffix below with the common 'cuad_' prefix prepended.
cuad_names = [
    'cuad_' + task_suffix
    for task_suffix in (
        'affiliate_license-licensee',
        'affiliate_license-licensor',
        'anti-assignment',
        'audit_rights',
        'cap_on_liability',
        'change_of_control',
        'competitive_restriction_exception',
        'covenant_not_to_sue',
        'effective_date',
        'exclusivity',
        'expiration_date',
        'governing_law',
        'insurance',
        'ip_ownership_assignment',
        'irrevocable_or_perpetual_license',
        'joint_ip_ownership',
        'license_grant',
        'liquidated_damages',
        'minimum_commitment',
        'most_favored_nation',
        'no-solicit_of_customers',
        'no-solicit_of_employees',
        'non-compete',
        'non-disparagement',
        'non-transferable_license',
        'notice_period_to_terminate_renewal',
        'post-termination_services',
        'price_restrictions',
        'renewal_term',
        'revenue-profit_sharing',
        'rofr-rofo-rofn',
        'source_code_escrow',
        'termination_for_convenience',
        'third_party_beneficiary',
        'uncapped_liability',
        'unlimited-all-you-can-eat-license',
        'volume_restriction',
        'warranty_duration',
    )
]

In [None]:
# Per-task train/test splits, keyed by the task's configuration name.
cuad_datasets_train = {}
cuad_datasets_test = {}

for cuad_name in cuad_names:
    # Load each task once and take both splits from the same object instead of
    # calling load_dataset twice per task.
    cuad_splits = load_dataset('nguha/legalbench', cuad_name)
    cuad_datasets_train[cuad_name] = cuad_splits['train']
    cuad_datasets_test[cuad_name] = cuad_splits['test']

In [None]:
from datasets import concatenate_datasets

# Merge the per-task splits into one training set and one test set.
# concatenate_datasets is documented to take a list, so the dict views are
# materialised explicitly rather than relying on a `dict_values` object being
# accepted. Insertion order (the order of `cuad_names`) is preserved.
combined_cuad_dataset_train = concatenate_datasets(list(cuad_datasets_train.values()))
combined_cuad_dataset_test = concatenate_datasets(list(cuad_datasets_test.values()))

In [None]:
# Display the first combined training example to check which fields it carries.
combined_cuad_dataset_train[0]

In [None]:
# Instruction that opens every prompt built in this notebook.
DEFAULT_PROMPT = "Below is an input on a legal topic and a contract clause document. Answer only with 'Yes' or 'No' whether the input is based on the clause of the document.Answer only based on the provided infromation:"

def generate_train_prompt(data_point):
    """Build the supervised training prompt for a single example.

    Args:
        data_point: mapping with the keys 'text', 'document_name' and 'answer'.

    Returns:
        A dict with 'text' (instruction, input, document name and the expected
        answer joined into one prompt string) and 'labels' (the answer alone).
    """
    clause = data_point['text']
    doc_name = data_point['document_name']
    label = data_point['answer']
    # Note: there is no line break between the document name and '###Output:'.
    prompt = (
        f'{DEFAULT_PROMPT}\n'
        f'###Input:\n{clause}\n'
        f'###Document:\n{doc_name}'
        f'###Output:\n{label}'
    )
    return {'text': prompt, 'labels': label}

In [None]:
# Shuffle with a fixed seed so the training order is reproducible across
# kernel restarts, then attach the formatted prompt to every example.
train_dataset = combined_cuad_dataset_train.shuffle(seed=42).map(generate_train_prompt)

In [None]:
def generate_test_prompt(data_point):
    """Build the evaluation prompt for a single example.

    Same layout as the training prompt, but the text stops right after
    '###Output:' so the model has to produce the answer itself.

    Args:
        data_point: mapping with the keys 'text' and 'document_name'.

    Returns:
        A dict with the single key 'text' holding the prompt string.
    """
    text = data_point['text']
    document_name = data_point['document_name']
    # The answer is deliberately not read here: it must not leak into the prompt.
    text = f'{DEFAULT_PROMPT}\n###Input:\n{text}\n###Document:\n{document_name}###Output:\n'
    return {'text': text}

In [None]:
# Shuffle with a fixed seed before building the evaluation prompts. The
# evaluation section reshuffles with seed=42 and scores only the first
# `num_samples` rows; that subset is reproducible only if this first shuffle
# is deterministic as well.
test_dataset = combined_cuad_dataset_test.shuffle(seed=42).map(generate_test_prompt)

### Training

In [17]:
# Hyper-parameters for the fine-tuning run, grouped by concern.
training_args = transformers.TrainingArguments(
    # Output and checkpointing
    output_dir='./experiments',
    save_strategy="epoch",
    save_safetensors=True,
    logging_strategy='epoch',
    # Batching: effective batch size is 1 x 4 accumulated steps
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    group_by_length=True,
    # Optimisation schedule
    num_train_epochs=8,
    learning_rate=0.001,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    optim="paged_adamw_8bit",
    fp16=True,
    # Dataset column handling
    remove_unused_columns=False,
    label_names=['labels'],
)

# Supervised fine-tuning trainer operating on the 'text' column of the
# training set.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    dataset_text_field='text',
    max_seq_length=4096,
    tokenizer=tokenizer,
    peft_config=config,
)

Map: 100%|██████████| 228/228 [00:00<00:00, 5745.52 examples/s]
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [18]:
# Disable the generation KV cache while training (gradient checkpointing is
# enabled on this model).
model.config.use_cache = False
# The former `trainer.state.log_history = True` assignment is dropped:
# `log_history` is the trainer's list of logged metric records, so replacing it
# with a bool is a type error waiting to happen, and it did not switch any
# logging on. Logging is already configured via `logging_strategy='epoch'`.
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkmb85[0m ([33mcam_kiril[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.






Step,Training Loss
57,1.2089
114,0.7687
171,0.5536
228,0.3636
285,0.2204
342,0.1284
399,0.0812
456,0.0643


Checkpoint destination directory ./experiments/checkpoint-57 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./experiments/checkpoint-114 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./experiments/checkpoint-171 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./experiments/checkpoint-228 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./experiments/checkpoint-285 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./experiments/checkpoint-342 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./experiments/checkpoint-399 already exists and is non-empty.Saving will proceed

TrainOutput(global_step=456, training_loss=0.4236430300955187, metrics={'train_runtime': 1024.7348, 'train_samples_per_second': 1.78, 'train_steps_per_second': 0.445, 'total_flos': 2.931725893312512e+16, 'train_loss': 0.4236430300955187, 'epoch': 8.0})

### Save the fine-tuned model

In [19]:
# Write the fine-tuned model to a directory named after the run.
# NOTE(review): `model` is the PEFT-wrapped model here, so presumably only the
# adapter weights are written — confirm before relying on the output.
model.save_pretrained('Llama-2-13b_legal_CUAD_8_epochs')

### Evaluate Fine-Tuned Model

In [None]:
import requests

# HTTP text-generation endpoint on this machine (port 5000).
# NOTE(review): presumably a separately started inference server that serves
# the fine-tuned model — it is not launched anywhere in this notebook; confirm
# it is running before executing the evaluation cells.
url = "http://127.0.0.1:5000/api/v1/generate"

In [None]:
# Reshuffle the evaluation prompts with a fixed seed.
# NOTE: this rebinds `test_dataset` to a shuffled copy of itself, so running
# the cell more than once changes which rows come first.
test_dataset = test_dataset.shuffle(seed=42)

In [None]:
# Generation parameters sent with every request; the loop adds 'prompt'.
# Generation stops at the first space, i.e. after a single word.
request = dict(
    max_new_tokens=5,
    temperature=0.1,
    repetition_penalty=1,
    top_p=0.7,
    stopping_strings=[' '],
)
# JSON content-type header for the generation endpoint.
headers = dict([('Content-Type', 'application/json')])

In [None]:
# Running count of correct predictions, and how many test rows to score.
total_correct, num_samples = 0, 1000

In [15]:
# Score the first `num_samples` evaluation prompts against the generation
# endpoint. A prediction counts as correct when the expected answer occurs
# (case-insensitively) in the generated text.
for i in range(num_samples):
    example = test_dataset[i]
    request['prompt'] = example['text']

    response = requests.post(url, json=request, timeout=120)
    # Fail loudly on HTTP errors: an error page such as "Not Found" contains
    # "no" and would otherwise be counted as a correct "No" prediction.
    response.raise_for_status()

    # Match against the generated text only, not the raw response body.
    # The body has the same `results[0].text` structure that the RAG
    # evaluation loop in this notebook reads.
    prediction = response.json()['results'][0]['text'].lower()
    if example['answer'].lower() in prediction:
        total_correct += 1

In [16]:
# Accuracy over the scored samples, as a percentage. Multiplying before
# dividing and printing one decimal place avoids binary floating-point
# artefacts such as 52.400000000000006 in the report.
correct_percentage = 100 * total_correct / num_samples
print(f'Correctness percentage {correct_percentage:.1f}%')

Correctness percentage 49.9%


### Evaluate RAG Model

In [None]:
import requests

# Search endpoint of the retrieval service used for the RAG evaluation.
# NOTE(review): this is a hard-coded tunnel URL and is likely short-lived —
# confirm it is still reachable or make it configurable.
url_rag = "https://b1b6-131-111-184-110.ngrok-free.app/search"

# Request body for the retrieval service. 'text' is only a placeholder here:
# the evaluation loop overwrites it with the current test example.
payload = {
    "text": combined_cuad_dataset_test[0]['text'],
    "number_documents": 10,
    "collection": "legal_cuad"
}

In [None]:
# Shuffle the combined test set with a fixed seed before sampling from it.
# NOTE: this rebinds the dataset to a shuffled copy of itself, so running the
# cell more than once changes which rows come first.
combined_cuad_dataset_test = combined_cuad_dataset_test.shuffle(seed=42)

In [None]:
# Reset the correct-prediction counter and the sample budget for the RAG run.
total_correct, num_samples = 0, 1000

In [None]:
# Generation parameters for the RAG evaluation; the loop adds 'prompt'.
# Generation stops at the first newline or '###' marker.
request = dict(
    max_new_tokens=100,
    temperature=0.1,
    repetition_penalty=1,
    top_p=0.7,
    stopping_strings=['\n', '###'],
)
# JSON content-type header for the generation endpoint.
headers = dict([('Content-Type', 'application/json')])

In [None]:
def generate_rag_prompt(data_point):
    """Format one retrieved record as a solved few-shot example.

    Args:
        data_point: mapping with the keys 'text' and 'answer'.

    Returns:
        The example as a string: the text under a '###Court case:' marker,
        followed by the answer under '###Output:' and a trailing newline.
    """
    case_text, label = data_point['text'], data_point['answer']
    return f'###Court case:\n{case_text}\n###Output:\n{label}\n'

In [None]:
import ast

def extract_after_output(text):
    """Return the part of `text` that follows the first 'output' marker.

    The match is case-sensitive. If the marker does not occur, `text` is
    returned unchanged.
    """
    _, marker, remainder = text.partition('output')
    # `marker` is empty exactly when 'output' was not found.
    return remainder if marker else text

In [34]:
# Score the first `num_samples` test examples with retrieval-augmented prompts:
# fetch similar records, present them as solved examples, append the current
# example as the open question, and compare the generated answer.
for i in range(num_samples):
    example = combined_cuad_dataset_test[i]
    expected = example['answer'].lower()

    # Retrieve records related to the current example.
    payload['text'] = example['text']
    response_rag = requests.get(url_rag, json=payload, timeout=120)
    response_rag.raise_for_status()  # do not build prompts from an error page
    data_rag = response_rag.json()

    # Instruction + retrieved few-shot examples + the open question.
    few_shot_examples = ''.join(generate_rag_prompt(record) for record in data_rag)
    request['prompt'] = (
        DEFAULT_PROMPT + '\n'
        + few_shot_examples
        + f'###Court case:\n{example["text"]}\n###Output:\n'
    )

    response = requests.post(url, json=request, timeout=120)
    response.raise_for_status()
    # The endpoint returns JSON, so parse it as JSON. `ast.literal_eval` only
    # understands Python literals and breaks on JSON's true/false/null.
    generated = response.json()['results'][0]['text']
    prediction = extract_after_output(generated.lower())

    if expected in prediction:
        total_correct += 1
    # NOTE(review): the two phrase checks below look like carry-over from a
    # different (diversity-jurisdiction) task; kept to preserve the scoring
    # rule — confirm whether they are still wanted here.
    elif expected == 'yes' and 'diversity jurisdiction exists' in prediction:
        total_correct += 1
    elif expected == 'no' and 'diversity jurisdiction does not exists' in prediction:
        total_correct += 1

In [35]:
# Accuracy over the scored samples, as a percentage. Multiplying before
# dividing and printing one decimal place avoids binary floating-point
# artefacts such as 52.400000000000006 in the report.
correct_percentage = 100 * total_correct / num_samples
print(f'Correctness percentage {correct_percentage:.1f}%')

Correctness percentage 52.400000000000006%
