In [1]:
import torch
from peft import PeftModel
from evaluate import load
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
import numpy as np
from tqdm.auto import tqdm
import json
from datasets import load_dataset

class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, sp.Integer):
            return int(obj)
        return json.JSONEncoder.default(self, obj)

# Define a function to apply templates to conversations
def apply_template(examples):
    messages = examples["conversations"]
    text = [tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=False) for message in messages]
    return {"text": text}

# Define a function for prompting the model
def prompt_model(prompt, model, tokenizer):

    messages = [
        {"from":"human", "value":prompt}
    ]

    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to("cuda")
    generation = model.generate(input_ids=inputs, max_new_tokens=400, use_cache=True)
    decoded_seq = tokenizer.decode(generation[0],
                                   skip_special_tokens=True,
                                   do_sample=False)
    return decoded_seq.split("assistant")[1].replace(prompt, "").replace("Derivation:","")

model_name = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"
#model_name = "MathLLaMa-3.1-8B"

# Set parameters
max_seq_length = 1024
batch_size = 1
learning_rate = 5e-5


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,
)

# Configure the tokenizer with chat templates
tokenizer = get_chat_template(
    tokenizer,
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
    chat_template="chatml",
)

  from .autonotebook import tqdm as notebook_tqdm
2024-10-08 22:45:41.985959: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-08 22:45:41.997627: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-08 22:45:42.001303: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-08 22:45:42.010160: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 3080. Max memory: 10.0 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards: 100%|██████████| 4/4 [00:17<00:00,  4.44s/it]
Unsloth: Will map <|im_end|> to EOS = <|im_end|>.


In [2]:
# model = PeftModel.from_pretrained(model, "best_loras/epoch_6")
# model.push_to_hub_merged("jmeadows17/MathLLaMa-3.1-8B", tokenizer, save_method="merged_16bit")

In [3]:
def evaluate_model(epoch, eval, few_shot, model, tokenizer, data, lora=False):
    # saves {prompt, prediction, reference, rouge, bleu, gleu} per example in data as a list (json)
    # outputs average {rouge, bleu, gleu} as a dict

    all_results = []
    metrics = ["rouge","bleu","gleu"]

    #zero_shot_preamble = 'Derive the final equation using the premise equations from the following prompt (denoted by "Prompt:"). Give only the equations involved in the derivation. Do not include any text other than equations each separated by "and". Prompt: '

    # keys for dataset specific values
    if eval != "static":
        prompt_key = f"{eval} prompt"
        derivation_key = f"{eval} derivation"
    else:
        prompt_key = "prompt"
        derivation_key = "derivation"

    # load LoRA and set up model for inference
    if lora == True:
        path = f"best_loras/epoch_{epoch}"
        lora_model = PeftModel.from_pretrained(model, path)
    else:
        lora_model = model 
    with torch.no_grad():
        FastLanguageModel.for_inference(lora_model)
        tokenizer.padding_side = "left"

        # begin inference and evaluation per example
        for example in tqdm(data):
            if few_shot is True:
                prompt = example[prompt_key]
            else:
                prompt = example[prompt_key].split("Prompt: ")[-1]
            reference = example[derivation_key]
            prediction = prompt_model(prompt, lora_model, tokenizer)
            
            # initialise results dictionary per example
            results = {
                "few-shot " + prompt_key if few_shot is True else prompt_key : prompt,
                derivation_key + " prediction" : prediction,
                derivation_key + " reference" : reference,
            }

            # calculate scores
            for metric_name in metrics:

                if metric_name == "gleu":
                    metric = load("google_bleu")
                elif metric_name == "bleurt":
                    metric = load("bleurt", "bleurt-large-512")
                else:
                    metric = load(metric_name)
                try:
                    m = metric.compute(predictions=[prediction], references=[reference])

                    if metric_name == "rouge":
                        score = m["rouge2"]
                    elif metric_name == "bleu":
                        score = m["bleu"]
                    elif metric_name == "gleu":
                        score = m["google_bleu"]
                    elif metric_name == "bleurt":
                        score = m["scores"][0]
                except:
                    score = 0.0

                results[metric_name] = score
            all_results.append(results)

            # save current results to json after appending
            with open(f"{eval}_epoch={epoch}_few-shot={few_shot}.json","w") as f:
                json.dump(all_results, f, cls=NpEncoder)
    
    # calculate average scores
    averages = {}
    for metric_name in metrics:
        averages[metric_name] = round(np.mean([i[metric_name] for i in all_results])*100, 1)

    return averages

In [3]:
with open("gpt-4_results.json") as f:
    data = json.load(f)
data = data[:2] #for testing code works

eval_modes = ["static"]
few_shot_modes = [False]

for eval in tqdm(eval_modes):
    for few_shot in few_shot_modes:

        # evaluation
        scores = evaluate_model(
            epoch=1000,
            eval=eval, 
            few_shot=few_shot, 
            model=model, 
            tokenizer=tokenizer, 
            data=data,
            lora=False,
        )
        print(scores)

  0%|          | 0/1 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 2/2 [00:38<00:00, 19.39s/it]
100%|██████████| 1/1 [00:38<00:00, 38.78s/it]

{'rouge': 41.5, 'bleu': 40.1, 'gleu': 40.3}





In [3]:
# model results for picking lora

# data
with open("gpt-4_results.json") as f:
    data = json.load(f)
#data = data[:2] #for testing code works

eval_modes = ["static","VR","EE","AG","SR"]
few_shot_modes = [False, True]
results = []

for eval in tqdm(eval_modes):
    for few_shot in few_shot_modes:
        for epoch in [1,6]:

            # evaluation
            scores = evaluate_model(
                epoch=epoch,
                eval=eval, 
                few_shot=few_shot, 
                model=model, 
                tokenizer=tokenizer, 
                data=data,
                lora=True,
                save=False,
            )
            print(f"eval = {eval}", f"few-shot = {few_shot}")
            dict = {"epoch":epoch} | scores
            results.append(dict)
            print(dict)
            with open("lora_results.json","w") as f:
                json.dump(results,f,cls=NpEncoder)

  0%|          | 0/5 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 100/100 [23:32<00:00, 14.12s/it]


eval = static few-shot = False
{'epoch': 1, 'rouge': 89.4, 'bleu': 84.1, 'gleu': 85.1}


100%|██████████| 100/100 [24:14<00:00, 14.55s/it]


eval = static few-shot = False
{'epoch': 6, 'rouge': 91.4, 'bleu': 87.5, 'gleu': 88.2}


100%|██████████| 100/100 [28:01<00:00, 16.82s/it]


eval = static few-shot = True
{'epoch': 1, 'rouge': 81.8, 'bleu': 76.1, 'gleu': 77.3}


100%|██████████| 100/100 [28:31<00:00, 17.12s/it]
 20%|██        | 1/5 [1:44:24<6:57:38, 6264.59s/it]

eval = static few-shot = True
{'epoch': 6, 'rouge': 85.7, 'bleu': 79.2, 'gleu': 80.1}


100%|██████████| 100/100 [21:44<00:00, 13.05s/it]


eval = VR few-shot = False
{'epoch': 1, 'rouge': 89.4, 'bleu': 85.4, 'gleu': 86.2}


100%|██████████| 100/100 [21:56<00:00, 13.17s/it]


eval = VR few-shot = False
{'epoch': 6, 'rouge': 91.9, 'bleu': 88.0, 'gleu': 88.7}


100%|██████████| 100/100 [24:25<00:00, 14.65s/it]


eval = VR few-shot = True
{'epoch': 1, 'rouge': 83.5, 'bleu': 78.9, 'gleu': 79.9}


100%|██████████| 100/100 [26:13<00:00, 15.73s/it]
 40%|████      | 2/5 [3:18:47<4:55:31, 5910.46s/it]

eval = VR few-shot = True
{'epoch': 6, 'rouge': 83.8, 'bleu': 78.4, 'gleu': 79.1}


100%|██████████| 100/100 [25:45<00:00, 15.45s/it]


eval = EE few-shot = False
{'epoch': 1, 'rouge': 85.1, 'bleu': 78.9, 'gleu': 79.9}


100%|██████████| 100/100 [26:00<00:00, 15.61s/it]


eval = EE few-shot = False
{'epoch': 6, 'rouge': 85.1, 'bleu': 78.6, 'gleu': 79.5}


100%|██████████| 100/100 [27:10<00:00, 16.30s/it]


eval = EE few-shot = True
{'epoch': 1, 'rouge': 82.2, 'bleu': 75.3, 'gleu': 76.6}


100%|██████████| 100/100 [30:27<00:00, 18.27s/it]
 60%|██████    | 3/5 [5:08:13<3:26:59, 6209.73s/it]

eval = EE few-shot = True
{'epoch': 6, 'rouge': 80.7, 'bleu': 73.6, 'gleu': 74.2}


100%|██████████| 100/100 [23:58<00:00, 14.39s/it]


eval = AG few-shot = False
{'epoch': 1, 'rouge': 88.1, 'bleu': 81.8, 'gleu': 83.2}


100%|██████████| 100/100 [24:04<00:00, 14.45s/it]


eval = AG few-shot = False
{'epoch': 6, 'rouge': 91.3, 'bleu': 87.0, 'gleu': 87.9}


100%|██████████| 100/100 [27:55<00:00, 16.75s/it]


eval = AG few-shot = True
{'epoch': 1, 'rouge': 79.4, 'bleu': 72.2, 'gleu': 74.0}


100%|██████████| 100/100 [28:51<00:00, 17.31s/it]
 80%|████████  | 4/5 [6:53:05<1:44:02, 6242.37s/it]

eval = AG few-shot = True
{'epoch': 6, 'rouge': 84.0, 'bleu': 76.7, 'gleu': 77.7}


100%|██████████| 100/100 [22:47<00:00, 13.67s/it]


eval = SR few-shot = False
{'epoch': 1, 'rouge': 79.4, 'bleu': 70.5, 'gleu': 73.5}


100%|██████████| 100/100 [24:09<00:00, 14.50s/it]


eval = SR few-shot = False
{'epoch': 6, 'rouge': 81.0, 'bleu': 72.6, 'gleu': 74.8}


100%|██████████| 100/100 [25:42<00:00, 15.43s/it]


eval = SR few-shot = True
{'epoch': 1, 'rouge': 75.9, 'bleu': 67.8, 'gleu': 70.3}


100%|██████████| 100/100 [27:20<00:00, 16.41s/it]
100%|██████████| 5/5 [8:33:08<00:00, 6157.69s/it]  

eval = SR few-shot = True
{'epoch': 6, 'rouge': 76.4, 'bleu': 68.5, 'gleu': 70.7}





**Merging LoRA**

In [3]:
# model results for picking lora

# data
with open("gpt-4_results.json") as f:
    data = json.load(f)
data = data #for testing code works

eval = "static"
few_shot = False
results = []
for epoch in tqdm(range(6, 11)):

    # evaluation
    scores = evaluate_model(
        epoch=epoch,
        eval=eval, 
        few_shot=few_shot, 
        model=model, 
        tokenizer=tokenizer, 
        data=data
    )

    dict = {"epoch":epoch} | scores
    results.append(dict)
    print(dict)
    with open("lora_results.json","w") as f:
        json.dump(results,f,cls=NpEncoder)

  0%|          | 0/5 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 100/100 [23:46<00:00, 14.26s/it]
 20%|██        | 1/5 [23:47<1:35:08, 1427.17s/it]

{'epoch': 6, 'rouge': 91.9, 'bleu': 88.2, 'gleu': 88.8}


100%|██████████| 100/100 [23:51<00:00, 14.31s/it]
 40%|████      | 2/5 [47:39<1:11:29, 1429.92s/it]

{'epoch': 7, 'rouge': 91.0, 'bleu': 86.8, 'gleu': 87.3}


100%|██████████| 100/100 [23:27<00:00, 14.07s/it]
 60%|██████    | 3/5 [1:11:06<47:19, 1419.86s/it]

{'epoch': 8, 'rouge': 91.9, 'bleu': 87.8, 'gleu': 88.5}


100%|██████████| 100/100 [23:30<00:00, 14.11s/it]
 80%|████████  | 4/5 [1:34:38<23:36, 1416.55s/it]

{'epoch': 9, 'rouge': 90.5, 'bleu': 85.8, 'gleu': 86.5}


100%|██████████| 100/100 [22:57<00:00, 13.77s/it]
100%|██████████| 5/5 [1:57:36<00:00, 1411.25s/it]

{'epoch': 10, 'rouge': 91.6, 'bleu': 87.4, 'gleu': 88.1}



