# Training Llama2-7b

Training a lightweight model on the visiondataset we created.

In [None]:
!python -m pip install "transformers==4.34.0" "datasets==2.13.0" "peft==0.4.0" "accelerate==0.23.0" "bitsandbytes==0.41.1" "trl==0.4.7" "safetensors>=0.3.1" "scipy==1.11.4" --upgrade


In [None]:
# for semantic similarity metrics
!python -m pip install sentence-transformers

In [2]:
import multiprocessing
multiprocessing.cpu_count()

32

In [None]:
import multiprocessing
multiprocessing.cpu_count()

!python -c "import torch; assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'"
!python -m pip install ninja packaging
!MAX_JOBS=16
!python -m pip install flash-attn --no-build-isolation


In [None]:
!python -m pip install scipy

In [None]:
!python -m pip install transformers

In [None]:
!python -m pip install sentencepiece

In [None]:
!python -m pip install protobuf

In [None]:
!python -m pip install ipywidgets

In [11]:
gs_ratio=0.15
va_ratio=0.30
sr_ratio=0.40
cv_ratio=0.15

from math import floor, ceil
from random import randint
samp_size = 100
gs_rng = floor(gs_ratio * samp_size)
va_rng = ceil(va_ratio*samp_size)
sr_rng = ceil(sr_ratio*samp_size)
cv_rng = floor(cv_ratio*samp_size)
print(gs_rng + va_rng + sr_rng + cv_rng == samp_size)

gs_samps = []
va_samps = []
sr_samps = []
cv_samps = []
samps = [gs_samps, va_samps, sr_samps, cv_samps]

for i in range(samp_size):
    num = randint(1,samp_size)
    if num < gs_rng:
        gs_samps.append(num)
    if num > gs_rng and num < va_rng:
        va_samps.append(num)
    if num > va_rng and num < cv_rng:
        sr_samps.append(num)
    if num < sr_rng:
        cv_samps.append(num)

for s in samps:
    print(len(s))

True
14
18
0
33


In [1]:
from vision_dataset import VisionDatasetCreator, VisionDataset

# avg percentages of exam phase lengths
gs_len=0.14
sp_len=0.25
ac_len=0.50
cv_len=0.11
# percentage of samples taken from each exam phase
gs_ratio=0.15
sp_ratio=0.30
ac_ratio=0.40
cv_ratio=0.15
# minumum dialogue lengths
gs_min = 2
sp_min = 4
ac_min = 8
cv_min = 6
# maximum dialogue lengths
gs_max = 5
sp_max = 14
ac_max = 18
cv_max = 10

sampling_strategy = dict(
    gs=[gs_len, gs_ratio, gs_min, gs_max],
    sp=[sp_len, sp_ratio, sp_min, sp_max],
    ac=[ac_len, ac_ratio, ac_min, ac_max],
    cv=[cv_len, cv_ratio, cv_min, cv_max]
)

data_dir = '/data/datasets/Exam_v3/'
# set seed to get randomization with reprducible results
dataset_creator = VisionDatasetCreator(sampling_strategy, seed=42)

# identify the number of samples from each file in the training set, which has 21 files total
samples = 15
# 25 samples from each file in the training set, which has 21 files total
size = (32*samples)
dataset_creator.load(data_dir, 'train', size)
# 25 samples from each validation file, 3 files total
size = (5*samples)
dataset_creator.load(data_dir, 'val', size)
# 25 samples from each test file, 6 files total
size = (8*samples)
dataset_creator.load(data_dir, 'test', size)

for i in ['train', 'val', 'test']:
    print('\n', i, len(dataset_creator.dataset[i]))

32 files found. Sampling 15 times per file.
sampling file: /data/datasets/Exam_v3/train/000000.txt
sampling file: /data/datasets/Exam_v3/train/000001.txt
sampling file: /data/datasets/Exam_v3/train/000002.txt
sampling file: /data/datasets/Exam_v3/train/000003.txt
sampling file: /data/datasets/Exam_v3/train/000004.txt
sampling file: /data/datasets/Exam_v3/train/000005.txt
sampling file: /data/datasets/Exam_v3/train/000006.txt
sampling file: /data/datasets/Exam_v3/train/000007.txt
sampling file: /data/datasets/Exam_v3/train/000008.txt
sampling file: /data/datasets/Exam_v3/train/000009.txt
sampling file: /data/datasets/Exam_v3/train/000010.txt
sampling file: /data/datasets/Exam_v3/train/000011.txt
sampling file: /data/datasets/Exam_v3/train/000012.txt
sampling file: /data/datasets/Exam_v3/train/000013.txt
sampling file: /data/datasets/Exam_v3/train/000014.txt
sampling file: /data/datasets/Exam_v3/train/000015.txt
sampling file: /data/datasets/Exam_v3/train/000016.txt
sampling file: /data/

In [35]:
def format_mixtral(data, model_input=True):
    instruction = 'You are an Optician conducting a vision exam. Use the dialogue below to create an Assistant response that guides the Patient__ through a vision exam.'
    dialogue_string = ''
    for i in data['dialogue']:
        dialogue_string += f'''{i['role']}: {i['content']}\n'''
    response_string = ''
    for i in data['response']:
        response_string += f'''{i['role']}: {i['content']}\n'''
    if model_input:
        return f'''<s>[INST]{instruction}\n{dialogue_string}[/INST]{response_string}</s>'''
    else:
        return f'''<s>[INST]{instruction}\n{dialogue_string}[/INST]'''

def format_inference(data, model_input=True):
    instruction = 'You are an Optician conducting a vision exam. Use the dialogue below to create an Assistant response that guides the Patient__ through a vision exam.'
    dialogue_string = ''
    for i in data['dialogue']:
        dialogue_string += f'''{i['role']}: {i['content']}\n'''
    if model_input:
        return f'''### Instruction:\n{instruction}\n### Input:\n{dialogue_string}\n### Response:\n'''
    else:
        response_string = ''
        for i in data['response']:
            response_string += f'''{i['role']}: {i['content']}\n'''
        return f'''### Response:\n{response_string}'''

train = VisionDataset(dataset_creator.dataset["train"])
val = VisionDataset(dataset_creator.dataset["val"])
test = VisionDataset(dataset_creator.dataset["test"])

for i in [train, val, test]:
    print(f'dataset length: {i.__len__()}  max sequence length: {i.get_max_len(format_inference)}')
train_len = str(train.__len__())

dataset length: 480  max sequence length: 1338
dataset length: 75  max sequence length: 1053
dataset length: 120  max sequence length: 1423


In [36]:
train.data[0]

{'dialogue': [{'role': 'LocalTech',
   'content': "Alright, and I'm gonna actually, let him look at me, my nose specifically."},
  {'role': 'LocalTech', 'content': 'And can her right PD come in, please?'}],
 'response': [{'role': 'Assistant',
   'content': 'Adjusting the right PD. Is that better?'}]}

In [37]:
from random import randrange

def format_mixtral(data, model_input=True):
    instruction = 'You are an Optician conducting a vision exam. Use the dialogue below to create an Assistant response that guides the Patient__ through a vision exam.'
    dialogue_string = ''
    for i in data['dialogue']:
        dialogue_string += f'''{i['role']}: {i['content']}\n'''
    response_string = ''
    for i in data['response']:
        response_string += f'''{i['role']}: {i['content']}\n'''
    if model_input:
        return f'''<s>[INST]{instruction}\n{dialogue_string}[/INST]{response_string}</s>'''
    else:
        return f'''<s>[INST]{instruction}\n{dialogue_string}[/INST]'''

def format_inference(data, model_input=True):
    instruction = 'You are an Optician conducting a vision exam. Use the dialogue below to create an Assistant response that guides the Patient__ through a vision exam.'
    dialogue_string = ''
    for i in data['dialogue']:
        dialogue_string += f'''{i['role']}: {i['content']}\n'''
    if model_input:
        return f'''### Instruction:\n{instruction}\n### Input:\n{dialogue_string}\n### Response:\n'''
    else:
        response_string = ''
        for i in data['response']:
            response_string += f'''{i['role']}: {i['content']}\n'''
        return f'''### Response:\n{response_string}'''
idx = randrange(test.__len__())
sample = test.data[idx]
print(format_inference(sample))
print(f"Ground truth:\n{format_inference(sample, model_input=False)}")
print('MIXTRAL TEMPLATE')
print(format_mixtral(sample))
print(f"Without response:\n{format_mixtral(sample, model_input=False)}")

### Instruction:
You are an Optician conducting a vision exam. Use the dialogue below to create an Assistant response that guides the Patient__ through a vision exam.
### Input:
Assistant: OK, so we'll start without any prescription using both eyes. It's going to be a little blurry, but try to read the smallest line that you can without squinting.
Patient__: kdnro
Assistant: OK, perfect. Now I'll cover the left eye for you, remember not to squint. What's the smallest row you can read?
Patient__: Everything's blurry.
Assistant: I see. Can you read the top line at all?
Patient__: Yeah, RKDHC, but it's all blurred.

### Response:

Ground truth:
### Response:
Assistant: OK, blurring is to be expected because there's no prescription. I'll cover your right eye. Using your left eye, what's the smallest row you can read?

MIXTRAL TEMPLATE
<s>[INST]You are an Optician conducting a vision exam. Use the dialogue below to create an Assistant response that guides the Patient__ through a vision exam

In [38]:
def get_max_len(dataset,format_func):
    max_len = 0
    for sample in train.data:
        inpt = format_func(sample)
        if len(inpt) > max_len:
            max_len = len(inpt)
    return max_len

max_len_train = get_max_len(train, format_mixtral)
max_len_val = get_max_len(val, format_mixtral)
max_len_test = get_max_len(test, format_mixtral)
print('Mixtral max len')
print(f'train: {max_len_train}\nval: {max_len_val}\ntest:{max_len_test}')
max_len_train = get_max_len(train, format_inference)
max_len_val = get_max_len(val, format_inference)
max_len_test = get_max_len(test, format_inference)
print('\nNormal max len')
print(f'train: {max_len_train}\nval: {max_len_val}\ntest:{max_len_test}')

Mixtral max len
train: 1427
val: 1427
test:1427

Normal max len
train: 1338
val: 1338
test:1338


In [39]:
for dset in [train, val, test]:
    for idx in range(len(dset.data)):
        length = len(dset.data[idx]["dialogue"])
        if length > 14:
            print("max len exceeded", idx, len(dset.data[idx]["dialogue"]))
        if length < 2:
            print("min len not met", idx, len(dset.data[idx]["dialogue"]))

max len exceeded 24 17
max len exceeded 27 16
max len exceeded 34 16
max len exceeded 39 17
max len exceeded 62 15
max len exceeded 68 17
max len exceeded 73 17
max len exceeded 101 16
max len exceeded 139 17
max len exceeded 149 16
max len exceeded 159 15
max len exceeded 170 17
max len exceeded 173 16
max len exceeded 181 16
max len exceeded 184 16
max len exceeded 191 16
max len exceeded 203 17
max len exceeded 205 17
max len exceeded 233 15
max len exceeded 248 17
max len exceeded 254 17
max len exceeded 257 17
max len exceeded 268 16
max len exceeded 281 17
max len exceeded 284 15
max len exceeded 289 17
max len exceeded 298 15
max len exceeded 305 16
max len exceeded 308 17
max len exceeded 322 17
max len exceeded 328 15
max len exceeded 336 17
max len exceeded 358 17
max len exceeded 366 17
max len exceeded 369 15
max len exceeded 380 15
max len exceeded 384 16
max len exceeded 395 16
max len exceeded 401 15
max len exceeded 404 17
max len exceeded 409 17
max len exceeded 416 17

In [40]:
unique_responses = dict(
    train=[],
    val=[],
    test=[]
)

splits = ['train', 'val', 'test']
datasets = [train, val, test]
for i in range(3):
    dset = datasets[i]
    for d in dset.data:
        response = d["response"][0]["content"]
        if response not in unique_responses[splits[i]]:
            unique_responses[splits[i]].append(response)

for k in unique_responses.keys():
    print(k, f'unique responses: {len(unique_responses[k])}')

train unique responses: 201
val unique responses: 34
test unique responses: 48


In [41]:
from random import randrange
def unpack(dialogue):
    new_string = ''
    for i in dialogue:
        new_string += f'''{i['role']}: {i['content']}\n'''
    return new_string
idx = randrange(train.__len__())
sample = train.data[idx]
print(sample['dialogue'])
print(unpack(sample['dialogue']))

[{'role': 'Assistant', 'content': "Okay, good. So you're all set for distance. What we'll check next is the near vision. [LOCALTECH] will place a card in front of you. Let me know when the card is in place."}, {'role': 'Patient__', 'content': "The card's already there."}, {'role': 'Assistant', 'content': 'Great! On this card, read the smallest line you can.'}, {'role': 'Patient__', 'content': 'A P E O R P D Z.'}, {'role': 'Assistant', 'content': "Perfect. So I'll send that to the doctor. They're going to be on screen with you next. It was a pleasure working with you!"}, {'role': 'Patient__', 'content': 'Okay.'}]
Assistant: Okay, good. So you're all set for distance. What we'll check next is the near vision. [LOCALTECH] will place a card in front of you. Let me know when the card is in place.
Patient__: The card's already there.
Assistant: Great! On this card, read the smallest line you can.
Patient__: A P E O R P D Z.
Assistant: Perfect. So I'll send that to the doctor. They're going t

In [42]:
from random import randrange

extra = "A vision exam has several phases. Phase 1 consists of a greeting and the Local Tech will help the Patient__ get comfortable. Phase 2 consists of a visual acuity test. Phase 3 consists of a subjective refraction, where Optician switches lenses and the patient evaluates the choices offered. Phase 4 consists of a close vision test where a card is placed in front of the patient for a final evaluation."
def format_instruction(data):
    dialogue_string = ''
    for i in data['dialogue']:
        dialogue_string += f'''{i['role']}: {i['content']}\n'''
    response_string = '' 
    for i in data['response']:
        response_string += f'''{i['role']}: {i['content']}\n'''
    return f'''### Instruction:
You are an Optician conducting a vision exam. Use the dialogue below to create the Assistant's response that best guides Patient__ through a vision exam.

### Input:
{dialogue_string}

### Response:
{response_string}
'''
idx = randrange(train.__len__())
print(f'Before formatting: \n{train.data[idx]}\n')
print(f'After formatting: \n{format_instruction(train.data[idx])}')

Before formatting: 
{'dialogue': [{'role': 'Patient__', 'content': 'Yeah.'}, {'role': 'Assistant', 'content': 'Okay, the green side it better. Is it better on the red side, green side, or are they about the same?'}, {'role': 'Patient__', 'content': 'the same'}, {'role': 'Assistant', 'content': 'Okay. Blink a few times. Read the smallest row you can with your right eye.'}, {'role': 'Patient__', 'content': 'H Z. C, maybe K O.'}, {'role': 'Assistant', 'content': "OK, now I'll test your left eye and we'll start again. Is it better with one? With two? Or the same?"}, {'role': 'Patient__', 'content': 'One'}, {'role': 'Assistant', 'content': "Alright, and same thing with dots on the screen. There's one. And two. Or are they the same?"}, {'role': 'Patient__', 'content': 'to'}, {'role': 'Assistant', 'content': 'Are they better with one? Or two?'}, {'role': 'Patient__', 'content': "i can't tell a difference."}, {'role': 'Assistant', 'content': 'OK. Which color looks better? Red? green? Or are th

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

use_flash_attention = False
# COMMENT IN TO USE FLASH ATTENTION
# replace attention with flash attention
if torch.cuda.get_device_capability()[0] >= 8:
    from llama_patch import replace_attn_with_flash_attn
    print("Using flash attention")
    replace_attn_with_flash_attn()
    use_flash_attention = True


###################### Hugging Face model ids ##################
# model_id = "NousResearch/Llama-2-7b-hf" # non-gated
# model_id = "meta-llama/Llama-2-7b-hf" # gated
# model_id = "NousResearch/Llama-2-7b-chat-hf"

# model_id = "meta-llama/Llama-2-7b-chat-hf"
# cache_dir = '/data/chat-models/local/NousResearch/Llama-2-7b-chat-hf'
    
# model_id = "NousResearch/Llama-2-13b-hf"
    
# model_id = "NousResearch/Llama-2-13b-chat-hf"
# cache_dir = '/data/chat-models/foundation/NousResearch/Llama-2-13b-chat-hf'
# model_id = "meta-llama/Llama-2-70b-chat-hf"

model_id = "Mistral-7B-Instruct-v0.2-int4-Exam_v3-20_480-early-stop"
cache_dir = '/data/chat-models/foundation/mistralai/Mistral-7B-Instruct-v0.2'
# model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"


# model_id = "openchat/openchat-3.5-1210"
# cache_dir = '/data/chat-models/foundation/openchat/openchat-3.5-1210'


##################### Local Models #######################
# model_id = "/data/localModels/Llama-2-13b-chat-hf"
# model_id =  '/data/chat-models/llama2/Llama-2-13b-chat-hf/'

################## model names ###########################
epochs = 30
data_nm= data_dir.split('/')[-2]
# model_name = "llama2-7b-int4-Exam_v1-30_2100-early_stop" #epochs_trainsize
model_name = model_id.split('/')[-1] + f"-int4-{data_nm}-{epochs}_{train_len}-early_stop"

# model_name = "Mistral-7B-Instruct-v0.2-int4-Exam_v1-30_2100-early_stop"
mistral_model = False
if model_id.split('/')[0] == "mistralai":
    mistral_model = True
# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map="auto",
                                             use_flash_attention_2=True,
                                             trust_remote_code=False,
                                             resume_download=True)

model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_id,
                                          trust_remote_code=False,
                                          resume_download=True
                                          )
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
model.save_pretrained('/data/chat_models/foundation/' + model_id)
tokenizer.save_pretrained('/data/chat_models/foundation/' + model_id)

  from .autonotebook import tqdm as notebook_tqdm


Using flash attention


NameError: name 'train_len' is not defined

In [None]:
# if mistral_model:
#     idx = randrange(train.__len__())
#     print(f'Before formatting: \n{train.data[idx]}\n')
#     encodeds = tokenizer.apply_chat_template(train.data[idx], return_tensors="pt")
#     decoded = tokenizer.batch_decode(encodeds)
#     print(f'After formatting: \n{format_instruction(decoded)}')
for name, mod in model.named_modules():
    print(name, mod)


In [None]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
# specify the linear layers of the mistral-7b model per the PEFT paper
target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
]
# LoRA config based on QLoRA paper
peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.1,
        r=64,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules= target_modules if mistral_model else None
)


# prepare model for training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [None]:
from transformers import EarlyStoppingCallback

# Early stopping patience (number of epochs without improvement)
early_stopping_patience = 3

# Early stopping threshold (minimum relative improvement to continue training)
early_stopping_threshold = -0.001

# Create the callback
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience, early_stopping_threshold)


In [None]:
epochs=30
use_flash_attention=False

In [None]:
from transformers import TrainingArguments


args = TrainingArguments(
    output_dir=model_name, 
    num_train_epochs=epochs,
    per_device_train_batch_size=6 if use_flash_attention else 4,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    logging_steps=10,
    save_strategy="epoch",
    evaluation_strategy='epoch',
    learning_rate=2e-4,
    bf16=True,
    tf32=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    disable_tqdm=True, # disable tqdm since with packing values are incorrect
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss'
)


In [None]:
from trl import SFTTrainer

max_seq_length = 2048 # max sequence length for model and packing of the dataset

trainer = SFTTrainer(
    model=model,
    train_dataset=train.data,
    eval_dataset=val.data,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    formatting_func=format_instruction,
    args=args,
    callbacks=[early_stopping_callback]
)

trainer.save_model(model_id.split('/')[-1])




In [None]:
# train
trainer.train() # there will not be a progress bar since tqdm is disabled

# save model
trainer.save_model()


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in float16.


{'loss': 1.1644, 'learning_rate': 0.0002, 'epoch': 0.06}
{'eval_loss': 0.9461890459060669, 'eval_runtime': 24.6886, 'eval_samples_per_second': 12.151, 'eval_steps_per_second': 1.539, 'epoch': 0.07}




{'loss': 0.9117, 'learning_rate': 0.0002, 'epoch': 1.04}
{'eval_loss': 0.7030491232872009, 'eval_runtime': 24.1935, 'eval_samples_per_second': 12.4, 'eval_steps_per_second': 1.571, 'epoch': 1.08}




{'loss': 0.7592, 'learning_rate': 0.0002, 'epoch': 2.02}
{'loss': 0.6194, 'learning_rate': 0.0002, 'epoch': 2.07}
{'eval_loss': 0.5447949767112732, 'eval_runtime': 24.3763, 'eval_samples_per_second': 12.307, 'eval_steps_per_second': 1.559, 'epoch': 2.07}




{'loss': 0.5152, 'learning_rate': 0.0002, 'epoch': 3.05}
{'eval_loss': 0.46092841029167175, 'eval_runtime': 24.3663, 'eval_samples_per_second': 12.312, 'eval_steps_per_second': 1.56, 'epoch': 3.08}




{'loss': 0.45, 'learning_rate': 0.0002, 'epoch': 4.03}
{'eval_loss': 0.42840448021888733, 'eval_runtime': 24.3885, 'eval_samples_per_second': 12.301, 'eval_steps_per_second': 1.558, 'epoch': 4.07}




{'loss': 0.3998, 'learning_rate': 0.0002, 'epoch': 5.01}
{'loss': 0.3492, 'learning_rate': 0.0002, 'epoch': 5.07}
{'eval_loss': 0.421701580286026, 'eval_runtime': 24.4018, 'eval_samples_per_second': 12.294, 'eval_steps_per_second': 1.557, 'epoch': 5.08}




{'loss': 0.3166, 'learning_rate': 0.0002, 'epoch': 6.05}
{'eval_loss': 0.425231397151947, 'eval_runtime': 24.4608, 'eval_samples_per_second': 12.265, 'eval_steps_per_second': 1.554, 'epoch': 6.07}




{'loss': 0.2901, 'learning_rate': 0.0002, 'epoch': 7.03}
{'eval_loss': 0.433732271194458, 'eval_runtime': 24.2426, 'eval_samples_per_second': 12.375, 'eval_steps_per_second': 1.567, 'epoch': 7.08}




{'loss': 0.2516, 'learning_rate': 0.0002, 'epoch': 8.01}
{'loss': 0.2234, 'learning_rate': 0.0002, 'epoch': 8.07}
{'eval_loss': 0.4603998064994812, 'eval_runtime': 23.0981, 'eval_samples_per_second': 12.988, 'eval_steps_per_second': 1.645, 'epoch': 8.07}
{'train_runtime': 5500.9742, 'train_samples_per_second': 11.453, 'train_steps_per_second': 0.954, 'train_loss': 0.5191331606758528, 'epoch': 8.07}


In [None]:
trainer.save_model()
trainer.args

In [13]:
model.modules

<bound method Module.modules of GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 3072, padding_idx=0)
    (layers): ModuleList(
      (0-27): 28 x GemmaDecoderLayer(
        (self_attn): GemmaFlashAttention2(
          (q_proj): Linear4bit(in_features=3072, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=3072, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=24576, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=24576, bias=False)
          (down_proj): Linear4bit(in_features=24576, out_features=3072, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attent

# Test Model

In [4]:
import os
# os.environ['CUDA_VISIBLE_DEVICES']="0"

import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, TrainingArguments, AutoModelForCausalLM, BitsAndBytesConfig

model_name = ''
args = TrainingArguments(
    output_dir=model_name, 
)
###################### DEV MODEL
# args.output_dir ="/home/digitalopt/proj/chatbots/chatbot-v1/Llama-2-13b-chat-hf-int4-Exam_v2-30_420-early_stop"
# cache_dir = '/data/chat-models/foundation/NousResearch/Llama-2-13b-chat-hf'
######################
# args.output_dir = '/data/chat-models/local/' + model_name + '/'
# args.output_dir = 'Llama-2-7b-chat-hf-int4-Exam_v2-30_525-early_stop'
# args.output_dir = "Llama-2-13b-chat-hf-int4-Exam_v2-30_525-early_stop"

# cache_dir = '/data/chat-models/foundation/NousResearch/Llama-2-13b-chat-hf'
# args.output_dir = 'Llama-2-13b-chat-hf-int4-Exam_v3-10_320-early_stop'

# cache_dir = '/data/chat-models/foundation/openchat/openchat-3.5-1210'
# args.output_dir = 'openchat-3.5-1210-int4-Exam_v3-15_480-early_stop'
# args.output_dir ='openchat-3.5-1210-int4-Exam_v2-15_420-early_stop' # <--- could be really good
# args.output_dir = 'openchat-3.5-1210-int4-Exam_v3-3_480-early_stop'# <--- maybe pretty good?

# cache_dir = '/data/chat-models/foundation/mistralai/Mixtral-8x7B-Instruct-v0.1'
# args.output_dir = 'Mixtral-8x7B-Instruct-v0.1-int4-Exam_v3-10_320-early_stop'

# cache_dir = '/data/chat-models/foundation/openchat/openchat-3.5-0106'
# args.output_dir = 'openchat-3.5-0106-int4-Exam_v3-15_480-early-stop'

# cache_dir = '/data/chat-models/foundation/microsoft/phi-2'
# args.output_dir = 'phi-2-int4-phoropter_v3-20_480-early-stop' # 'microsoft/phi-2'

# cache_dir = '/data/chat-models/foundation/Intel/neural-chat-7b-v3-3'
# args.output_dir = 'Intel/neural-chat-7b-v3-3'

args.output_dir = "google/gemma-7b-it"
cache_dir = '/data/chat-models/foundation/google/gemma-7b-it'

phoropterModel = False
foundation_model_nm = cache_dir.split('/')[4]
# if args.output_dir[:3] != foundation_model_nm[:3]:
#     raise Exception(f"model in output_dir doesn't match model in cache dir\noutput_dir:{args.output_dir}\ncache_dir:{foundation_model_nm}")

# load base LLM model and tokenizer
# model = AutoPeftModelForCausalLM.from_pretrained(
#     args.output_dir,
#     low_cpu_mem_usage=True,
#     torch_dtype=torch.bfloat16,
#     load_in_4bit=True,
#     # attn_implementation="flash_attention_2",# use_flash_attention_2=True,
#     temperature=0,
#     cache_dir = cache_dir,
#     trust_remote_code=True
#     # device_map="auto",
# )
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(args.output_dir,
                                            quantization_config=bnb_config,
                                            # device_map='cuda',
                                            device_map="auto",
                                            attn_implementation="flash_attention_2",#use_flash_attention_2=True,
                                            temperature=.9,
                                            do_sample=True,
                                            cache_dir= cache_dir,
                                            torch_dtype=torch.bfloat16,
                                            trust_remote_code=False)
tokenizer = AutoTokenizer.from_pretrained(args.output_dir, cache_dir=cache_dir, padding_side='left')


Downloading shards: 100%|██████████| 4/4 [00:00<00:00,  8.33it/s]
Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.25s/it]
You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.
tokenizer_config.json: 100%|██████████| 2.16k/2.16k [00:00<00:00, 24.9MB/s]


In [5]:
from vision_dataset import VisionDatasetCreator, VisionDataset
import os 
os.environ['PYTORCH_CUDA_ALLOC_CONF']='max_split_size_mb:50'
# avg percentages of exam phase lengths
gs_len=0.25
sp_len=0.25
ac_len=0.25
cv_len=0.25
# percentage of samples taken from each exam phase
gs_ratio=0.25
sp_ratio=0.25
ac_ratio=0.25
cv_ratio=0.25
if phoropterModel:
    # minumum dialogue lengths
    gs_min = 2
    sp_min = 4
    ac_min = 4
    cv_min = 4
    # maximum dialogue lengths
    gs_max = 6
    sp_max = 6
    ac_max = 6
    cv_max = 6
else:
    # minumum dialogue lengths
    gs_min = 2
    sp_min = 4
    ac_min = 8
    cv_min = 6
    # maximum dialogue lengths
    gs_max = 5
    sp_max = 14
    ac_max = 18
    cv_max = 10


sampling_strategy = dict(
    gs=[gs_len, gs_ratio, gs_min, gs_max],
    sp=[sp_len, sp_ratio, sp_min, sp_max],
    ac=[ac_len, ac_ratio, ac_min, ac_max],
    cv=[cv_len, cv_ratio, cv_min, cv_max]
)

if phoropterModel:
    data_dir = '/data/datasets/phoropter_v3/'
    dataset_creator = VisionDatasetCreator(sampling_strategy, seed=42, assistant=False)
else:
    data_dir = '/data/datasets/Exam_v3/'
    # random seed for testing
    dataset_creator = VisionDatasetCreator(sampling_strategy)



num_samples = 3
# just use test set
size = (6*num_samples)
dataset_creator.load(data_dir, 'test', size)
size = (5*num_samples)
dataset_creator.load(data_dir, 'val', size)

print(len(dataset_creator.dataset['test']))
test = VisionDataset(dataset_creator.dataset["test"])
print(len(dataset_creator.dataset['val']))
val = VisionDataset(dataset_creator.dataset["val"])

8 files found. Sampling 2 times per file.
sampling file: /data/datasets/Exam_v3/test/000030.txt
sampling file: /data/datasets/Exam_v3/test/000033.txt
sampling file: /data/datasets/Exam_v3/test/000036.txt
sampling file: /data/datasets/Exam_v3/test/000039.txt
sampling file: /data/datasets/Exam_v3/test/000041.txt
sampling file: /data/datasets/Exam_v3/test/000042.txt
sampling file: /data/datasets/Exam_v3/test/000043.txt
sampling file: /data/datasets/Exam_v3/test/000044.txt
Expected length of data: 18
Actual length: 16
5 files found. Sampling 3 times per file.
sampling file: /data/datasets/Exam_v3/val/000034.txt
sampling file: /data/datasets/Exam_v3/val/000035.txt
sampling file: /data/datasets/Exam_v3/val/000037.txt
sampling file: /data/datasets/Exam_v3/val/000038.txt
sampling file: /data/datasets/Exam_v3/val/000040.txt
Expected length of data: 15
Actual length: 15
16
15


In [6]:
big_test = test.data + val.data

max_len = 0
for i in test.data:
    data = i['dialogue']
    lenD = 0
    for d in data:
        lenD += len(d["content"])
    if lenD > max_len:
        max_len = lenD
print(f'max sequence length: {max_len}\nlength of test set: {len(big_test)}')

max sequence length: 625
length of test set: 31


In [7]:
from random import randrange

def format_mixtral(data, model_input=True):
    instruction = 'You are an Optician conducting a vision exam. Use the dialogue below to create an Assistant response that guides the Patient__ through a vision exam.'
    dialogue_string = ''
    for i in data['dialogue']:
        dialogue_string += f'''{i['role']}: {i['content']}\n'''
    response_string = ''
    for i in data['response']:
        response_string += f'''{i['role']}: {i['content']}\n'''
    if model_input:
        return f'''<s>[INST]{instruction}\n{dialogue_string}[/INST]{response_string}</s>'''
    else:
        return f'''<s>[INST]{instruction}\n{dialogue_string}[/INST]'''

def format_inference(data, model_input=True):
    dialogue_string = ''
    for i in data['dialogue']:
        dialogue_string += f'''{i['role']}: {i['content']}\n'''
    if model_input:
        return f'''### Instruction:
You are an Optician conducting a vision exam. Use the dialogue below to create an Assistant response that guides the Patient__ through a vision exam.

### Input:
{dialogue_string}

### Response:
'''
    else:
        response_string = ''
        for i in data['response']:
            response_string += f'''{i['role']}: {i['content']}\n'''
        return f'''### Response:\n{response_string}'''
idx = randrange(test.__len__())
sample = test.data[idx]
# print(format_inference(sample))
# print(f"Ground truth:\n{format_inference(sample, model_input=False)}")
print(format_mixtral(sample))
print(f"Ground truth:\n{format_mixtral(sample, model_input=False)}")

<s>[INST]You are an Optician conducting a vision exam. Use the dialogue below to create an Assistant response that guides the Patient__ through a vision exam.
Patient__: Yes.
Assistant: Okay, open your eyes. I put in your current prescription. Just like before, we'll start with both eyes. When you're ready, read the smallest row you can, ok?
Patient__: OZRSN.
Assistant: Good job. Read the smallest line you can with the right eye.
Patient__: O R K S E.
Assistant: Good. What's the smallest row you can read with your left eye?
Patient__: NCKHD.
Assistant: Thank you. Now close your eyes again for me. Are they closed?
Patient__: Yes.
Assistant: Now open your eyes. I'm testing your right eye first. I'll show you two choices and you will pick the one that looks most clear. Which is better, choice one? Choice two? Or are they similar?
Patient__: Can you go back?
[/INST]Assistant: Of course. This is number one. This is number two. Or are they about the same?
</s>
Ground truth:
<s>[INST]You are 

In [8]:
max_len = 0
for sample in test.data:
    inpt = format_mixtral(sample)
    if len(inpt) > max_len:
        max_len = len(inpt)
max_len


968

In [9]:

from sentence_transformers import SentenceTransformer, util
from datasets import load_metric
from contextlib import contextmanager
from time import process_time
import numpy as np
import random

@contextmanager
def timer():
    start = process_time()
    try:
        yield
    finally:
        end = process_time()
        elapsed = end - start
        times.append(elapsed)
        print(f'elapsed time: {elapsed:.3f} seconds')

class SimilarityMetrics(object):
    def __init__(self,model='sentence-transformers/all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model)
    def compute_similarity(self,pred, truth):
        #Compute embedding for both lists
        pred_embedding= self.model.encode(pred, convert_to_tensor=True)
        truth_embedding = self.model.encode(truth, convert_to_tensor=True)
        cos_sim = util.pytorch_cos_sim(pred_embedding, truth_embedding)
        return cos_sim.cpu().numpy()
    
    def compute_bleu(pred:list, truth:list):
        metric = load_metric('bleu')
        # predictions (list of strs): Translations to score.
        # references (list of lists of strs): references for each translation.
        metric.add(predictions=pred, references=[truth])
        report = metric.compute()
        return report['bleu'] *100, report

times = []
tok_ps = []
metrics = SimilarityMetrics()
bad = 0
good = 0

if phoropterModel:
    threshold = .96
else:
    threshold = .45
big_test = test.data + val.data

tests = 30

if tests > len(big_test):
    print(f"Inadequate test set length: {len(big_test)}")
else:
    sample_idxs = random.sample(range(0,len(big_test)), tests)

for i in range(len(sample_idxs)):
    sample = big_test[sample_idxs[i]]

    prompt = format_inference(sample)
    # prompt = format_mixtral(sample, model_input=False)
    with timer():
        input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
        # with torch.inference_mode():
        outputs = model.generate(input_ids=input_ids, max_new_tokens=100, do_sample=True, top_p=0.9,temperature=0.3)
        response = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]
    tok_ps.append(outputs.detach().cpu().numpy().shape[1] / times[i])
    result = metrics.compute_similarity(response[11:],sample["response"][0]["content"])
    if np.round(result, 3) < threshold:
        bad += 1
    else: good +=1
    print(f"Prompt:\n{prompt}\n")
    print('____________________')
    print(f"Generated response:\n{response}")
    print(f"Ground truth:\n{format_inference(sample, model_input=False)}")
    print(f'Similarity Score: {np.round(result, 3)}_______')
    print('____________________')
print(f'\n\n\n\n OVERALL RESULTS\nGood predictions: {good}\nBad predictions:{bad}\nPrecision:{good/(good+bad)}')

  return self.fget.__get__(instance, owner)()
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


elapsed time: 4.046 seconds
Prompt:
### Instruction:
You are an Optician conducting a vision exam. Use the dialogue below to create an Assistant response that guides the Patient__ through a vision exam.

### Input:
Assistant: Sure. Adjusting the right PD by one step. Is that good?
Patient__: Looks good and I think we're ready.
Assistant: Great! We'll start without any prescription using both eyes. The letters might look a little blurry. Without squinting, read the smallest line you can.
Patient__: It's blurry, but ZHC.


### Response:


____________________
Generated response:
Sure, here is the Assistant's response:

"Sure, I understand you're ready. Let's begin the exam without any prescription. I'll show you the smallest line you can read. Just focus on the letters and let me know if they're clear or if they're blurry. If they're blurry, I'll need you to say which letters are blurry so I can make adjustments."


**Additional notes:**

- The Assistant should be friendly and
Ground tru

In [10]:
def get_av(nums):
    sum = 0
    for i in nums:
        sum += i
    return sum / len(nums)

print(f'Average inference speed: {get_av(times):.2f} seconds')
print(f'Tokens / response: {get_av(tok_ps):.2f}')

Average inference speed: 3.70 seconds
Tokens / response: 68.00


In [11]:
strings =[]
for i in test.data:
    content = i["response"][0]["content"]
    if content not in strings:
        strings.append(content)
print(len(strings))

14
