## Fine-tuning of WizardLM-13B on Med_QA med_qa_en_4options_bigbio_qa

Install and Load Required Libraries

In [1]:
! pip3 install -q -U transformers
! pip install -q -U datasets
! pip3 install -q -U peft
! pip install -q -U trl
! pip3 install -q -U auto-gptq
! pip3 install -q -U optimum
! pip3 install -q -U bitsandbytes

In [2]:
import os
os.environ['TRANSFORMERS_CACHE'] = '/home/kmb85/rds/hpc-work/huggingface'

In [3]:
import transformers
import torch
from datasets import load_dataset
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


### Load WizardLM-13B and Tokenizer

In [5]:
model_name_or_path = "WizardLM/WizardLM-13B-V1.2"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4b_quant_type='nf4',
    torch_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    use_safetensors=True,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config
)



In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
tokenizer.pad_token=tokenizer.eos_token

In [7]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

### Load LoRA Adapter

In [8]:
config = LoraConfig(
    r=32,
    lora_alpha=16,
    bias="none",
    task_type="CASUAL_LM",
)

In [9]:
model=get_peft_model(model, config)

### Dataset preparation

In [4]:
dataset = load_dataset('bigbio/med_qa', 'med_qa_en_4options_bigbio_qa')

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question_id', 'document_id', 'question', 'type', 'choices', 'context', 'answer'],
        num_rows: 10178
    })
    test: Dataset({
        features: ['id', 'question_id', 'document_id', 'question', 'type', 'choices', 'context', 'answer'],
        num_rows: 1273
    })
    validation: Dataset({
        features: ['id', 'question_id', 'document_id', 'question', 'type', 'choices', 'context', 'answer'],
        num_rows: 1272
    })
})

In [6]:
DEFAULT_PROMPT = "Below is a medical question and four choices for answer. Output the correct choice to answer the question."

def generate_train_prompt(data_point):
    question = data_point[ 'question']
    choices = data_point['choices']
    choices_str = ''
    answer = data_point['answer']
    for choice in choices:
        choices_str += choice + "\n"
    answer = data_point[ 'answer'][0]
    text = f'{DEFAULT_PROMPT}\n###Question:\n{question}\n###Choices:\n{choices_str}###Output:\n{answer}'
    return {'text': text, 'labels': answer}

In [7]:
train_dataset = dataset['train'].shuffle().map(generate_train_prompt)

Map: 100%|██████████| 10178/10178 [00:01<00:00, 7327.53 examples/s]


In [8]:
validation_dataset = dataset['validation'].shuffle().map(generate_train_prompt)

Map: 100%|██████████| 1272/1272 [00:00<00:00, 4078.28 examples/s]


In [9]:
def generate_test_prompt(data_point):
    question = data_point[ 'question']
    choices = data_point['choices']
    choices_str = ''
    for choice in choices:
        choices_str += choice + "\n"
    text = f'{DEFAULT_PROMPT}\n###Question:\n{question}\n###Choices:\n{choices_str}###Output:\n'
    return {'text':text}

In [10]:
test_dataset = dataset['test'].shuffle().map(generate_test_prompt)

Map: 100%|██████████| 1273/1273 [00:00<00:00, 6976.46 examples/s]


### Training

In [18]:
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=32,
    gradient_accumulation_steps=32,
    learning_rate=0.0001,
    fp16=True,
    num_train_epochs=16,
    save_strategy="epoch",
    save_safetensors=True,
    lr_scheduler_type="cosine",
    optim="paged_adamw_8bit",
    output_dir='./experiments',
    remove_unused_columns=False,
    warmup_ratio=0.03,
    logging_strategy='steps',
    evaluation_strategy='steps',
    logging_steps=15,
    label_names=['labels'],
    eval_steps=15,
    group_by_length=True
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    args=training_args,
    tokenizer=tokenizer,
    dataset_text_field='text',
    peft_config=config,
    max_seq_length=4096
)

Map: 100%|██████████| 10178/10178 [00:01<00:00, 9004.32 examples/s]
Map: 100%|██████████| 1272/1272 [00:00<00:00, 8478.63 examples/s]
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [19]:
model.config.use_cache = False
trainer.state.log_history = True
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkmb85[0m ([33mcam_kiril[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.






Step,Training Loss,Validation Loss
15,1.7687,1.540286
30,1.4162,1.284978
45,1.2614,1.218524
60,1.2221,1.195351
75,1.2006,1.180034
90,1.1884,1.169667
105,1.1782,1.163396
120,1.1749,1.160368
135,1.1709,1.159395


Checkpoint destination directory ./experiments/checkpoint-39 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./experiments/checkpoint-79 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./experiments/checkpoint-119 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=144, training_loss=1.2797388898001776, metrics={'train_runtime': 26192.9491, 'train_samples_per_second': 6.217, 'train_steps_per_second': 0.005, 'total_flos': 3.3541348535151206e+18, 'train_loss': 1.2797388898001776, 'epoch': 14.45})

### Save the fine-tuned model

In [21]:
model.save_pretrained(f'WizardLM-13B-v0.1_med_qa_en_4options_bigbio_qa_batch_size_32')

### Evaluate the fine-tuned model

In [11]:
import requests

url = "http://127.0.0.1:5000/api/v1/generate"

In [12]:
test_dataset = test_dataset.shuffle(seed=42)

In [13]:
import textdistance
import tiktoken
import ast

def num_of_tokens_from_text(text):
    encoding=tiktoken.encoding_for_model(model_name='gpt-3.5-turbo')
    num_tokens=len(encoding.encode(text=text))
    return num_tokens


def similiary(str1, str2):
    return textdistance.hamming.normalized_similarity(str1, str2)

In [14]:
request = {
    'max_new_tokens': 200,
    'temperature': 0.1,
    'repetition_penalty': 1,
    'top_p': 0.7,
    'stopping_strings': ['\n', '###']
}
headers = {'Content-Type': 'application/json'}

In [15]:
total_correct = 0
num_samples = 1000

In [16]:
for i in range(num_samples):
    request['prompt'] = test_dataset[i]['text']
    response = requests.post(url, json=request)

    prediction = ast.literal_eval(response.text)["results"][0]['text'].lower()
    total_correct += similiary(test_dataset[i]['answer'][0].lower(), prediction)

In [17]:
correct_percentage = (total_correct / num_samples) * 100
print(f'Correctness percentage {correct_percentage}%')

Correctness percentage 44.73601886421212%


### Evaluate the RAG model

In [35]:
import requests

url_rag = "https://b1b6-131-111-184-110.ngrok-free.app/search"

payload = {
    "text": '',
    "number_documents": 5,
    'collection': 'med_qa_4options'
}

In [36]:
dataset['test'] = dataset['test'].shuffle(seed=42)

In [37]:
total_correct = 0
num_samples = 200

In [38]:
request = {
    'max_new_tokens': 200,
    'temperature': 0.1,
    'repetition_penalty': 1,
    'top_p': 0.7,
    'stopping_strings': ['\n', '###']
}
headers = {'Content-Type': 'application/json'}

In [39]:
def generate_rag_prompt(data_point):
    question = data_point['text']
    answer = data_point['answer']
    text = f'{question}{answer}\n'
    return text

In [40]:
DEFAULT_PROMPT = "Below are some medical questions wtih four choices and answers. Output the correct choice to answer the last question only based on the provided choices."


for i in range(num_samples):
    request['prompt'] = DEFAULT_PROMPT+'\n'

    payload['question'] = dataset['test'][i]['question']

    response_rag = requests.get(url_rag, json=payload)
    data_rag = response_rag.json()

    for record in data_rag:
        request['prompt'] += generate_rag_prompt(record)

    question = dataset['test'][i]['question']
    choices = dataset['test'][i]['choices']
    choices_str = ''
    for choice in choices:
        choices_str += choice + "\n"
    request['prompt'] += f'###Question:\n{question}\n###Choices:\n{choices_str}###Output:\n'
    response = requests.post(url, json=request)

    prediction = ast.literal_eval(response.text)["results"][0]['text'].lower()
    prediction = prediction.replace("'", '')
    prediction = prediction.replace('[', '')
    prediction = prediction.replace(']', '')

    total_correct += similiary(dataset['test'][i]['answer'][0].lower(), prediction)

In [41]:
correct_percentage = (total_correct / num_samples) * 100
print(f'Correctness percentage {correct_percentage}%')

Correctness percentage 49.60761509250624%
