### Install Dependencies

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q trl xformers wandb datasets einops gradio sentencepiece

### Set Up Main Settings and Imports

In [None]:
import os, platform, warnings

import gradio
import torch
import wandb
from datasets import Dataset, load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline, logging, TextStreamer
from transformers.trainer_utils import get_last_checkpoint
from trl import SFTTrainer

# Passed to trainer.train(); True asks the Trainer to continue from a saved checkpoint.
resume_from_checkpoint = True

# Base checkpoint to fine-tune, and the directory that receives the LoRA
# checkpoints and the final adapter.
base_model = "../base_models/HuggingFaceH4_zephyr-7b-beta"
new_model = "output/FC_Zephyr7B_base_model/FC_Zephyr7B_LoRA"

# Hub dataset identifier handed to load_dataset().
dataset_name = "rizerphe/glaive-function-calling-v2-zephyr"

# Training system message.
# NOTE(review): not referenced by any of the cells visible in this notebook.
system_message = (
    "You are an unreal engine development assistant "
    "that is proficient explaining unreal engine to a beginner"
)

# Special tokens applied to the tokenizer after it is loaded.
bos_token, eos_token, pad_token = "<s>", "</s>", "<pad>"

In [None]:
# Authenticate this notebook session with the Hugging Face Hub.
# notebook_login() is interactive: it asks for an access token when the cell runs.
from huggingface_hub import notebook_login
notebook_login()

### Get Dataset

In [None]:
# Load the training split of the dataset named in the configuration cell.
dataset = load_dataset(dataset_name, split="train")

# Preview the first training example. Index the row first and then the column:
# `dataset["text"][0]` selects the entire "text" column just to show one entry.
dataset[0]["text"]

### Load and Configure Model

In [None]:
# Load the base model from `base_model`, quantized to 4-bit NF4 with bfloat16
# as the compute dtype (no double quantization).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map={"": 0}  # put every module on device 0
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

# Configure tokenizer
tokenizer.bos_token = bos_token
tokenizer.eos_token = eos_token

# Every special token must already exist in the base vocabulary: the adapter is
# later re-attached to the unmodified base model and tokenizer, so growing the
# vocabulary here (which would also require resizing the model's embeddings)
# is deliberately avoided.
vocab = tokenizer.get_vocab()
missing_tokens = [tok for tok in (bos_token, eos_token) if tok not in vocab]
if missing_tokens:
    raise ValueError(
        f"Special tokens {missing_tokens} are not in the vocabulary of {base_model!r}; "
        "adjust bos_token/eos_token to match the base tokenizer."
    )

# The previous guard only tested eos_token, so a pad token that is absent from
# the vocabulary was never added and resolved to no id of its own. Make the
# choice explicit by padding with a token that does exist.
if pad_token in vocab:
    tokenizer.pad_token = pad_token
else:
    fallback_pad = tokenizer.unk_token if tokenizer.unk_token is not None else tokenizer.eos_token
    print(f"pad_token {pad_token!r} is not in the vocabulary; padding with {fallback_pad!r} instead.")
    tokenizer.pad_token = fallback_pad

tokenizer.padding_side = 'right'  # Set padding side


In [None]:
# Wrap the quantized model with LoRA adapters.
# NOTE(review): this cell reassigns `model` from itself, so running it twice
# without reloading the model would wrap an already-wrapped model.
model = prepare_model_for_kbit_training(model)

# Projection layers that receive an adapter.
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"]

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

model = get_peft_model(model, peft_config)

### Configure Training

In [None]:
# W and B monitoring
# Credentials must never be committed in a notebook: the API key is read from
# the WANDB_API_KEY environment variable. When it is unset, `key` is None and
# wandb.login() falls back to its own credential lookup / interactive prompt.
# The key that used to be hard-coded in this cell has been exposed and should
# be revoked in the W&B account settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))
run = wandb.init(project='Zephyr7B FC Base', job_type="training", anonymous="allow")

In [None]:
# Trainer hyperparameters
# NOTE(review): per the TrainingArguments documentation a positive `max_steps`
# takes precedence over `num_train_epochs`, and `warmup_ratio` appears to have
# no effect with the plain "constant" scheduler ("constant_with_warmup" is the
# variant that warms up) -- confirm which behaviour is intended.
training_arguments = TrainingArguments(
    # Output and reporting
    output_dir=new_model,
    report_to="wandb",
    logging_steps=20,
    save_steps=250,
    # Run length
    max_steps=5000,
    num_train_epochs=1,
    # Batching
    per_device_train_batch_size=10,
    gradient_accumulation_steps=2,
    group_by_length=True,
    # Optimizer and schedule
    optim="paged_adamw_8bit",
    learning_rate=5e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    lr_scheduler_type="constant",
    warmup_ratio=0.1,
    # Mixed precision is switched off for both half-precision modes
    fp16=False,
    bf16=False,
)

# Supervised fine-tuning trainer: trains on the dataset's "text" column,
# one example per sequence (no packing).
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=None,
    packing=False,
)

### TRAIN!

In [None]:
# Resolve what to resume from before starting. Passing True straight through
# makes the Trainer raise when the output directory holds no checkpoint yet
# (i.e. on the very first run), so look the checkpoint up explicitly and fall
# back to a fresh run when there is none. Any other value (False, None or an
# explicit checkpoint path) is forwarded unchanged.
checkpoint = resume_from_checkpoint
if checkpoint is True:
    output_dir = training_arguments.output_dir
    checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
    if checkpoint is None:
        print(f"No checkpoint found in {output_dir!r}; starting training from scratch.")
    else:
        print(f"Resuming training from {checkpoint!r}.")

trainer.train(resume_from_checkpoint=checkpoint)

In [None]:
# Persist the fine-tuned model into the output directory, then close the W&B run.
trainer.model.save_pretrained(save_directory=new_model)
wandb.finish()
# Training is over: turn the cache back on (it was disabled when the model was
# loaded) and switch the model to evaluation mode.
model.config.use_cache = True
model.eval()

In [None]:
# Define the stream function for testing
def stream(user_prompt, temp, tokens):
    """Generate a reply to `user_prompt` and stream it to stdout as it is decoded.

    The prompt consists of a fixed system line, a single `search_internet`
    function definition and the user's message. It is printed before
    generation so the exact model input can be inspected.

    Uses the notebook-level `model` and `tokenizer`.

    Args:
        user_prompt: Text placed after the `<|user|>` tag.
        temp: Sampling temperature forwarded to `model.generate`; sampling is
            always enabled, so pass a positive value.
        tokens: Maximum number of new tokens to generate.

    Returns:
        None. The generated text is written to stdout by the streamer.
    """
    # NOTE(review): the prompt ends right after the user turn and its first two
    # tag lines are indented; confirm this matches the format of the training
    # examples (e.g. whether an assistant tag is expected).
    prompt = """
    <|system|>You are a function calling assistant.
    <|definition|>
{
    "name": "search_internet",
    "description": "When the user requests the assistant to search the internet or do research, the assistant will search the internet for the answer and return the results.",
    "parameters": {
        "type": "object",
        "properties": {
            "search_term": {
                "type": "string",
                "description": "Search term to search the internet for and analyze results."
            }
        },
        "required": [
            "search_term"
        ]
    }
}</s>
<|user|>"""+user_prompt+"</s>"
    print(prompt)
    # Send the inputs to wherever the model lives rather than assuming "cuda:0".
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    # Print only the newly generated text, without the prompt or special tokens.
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    _ = model.generate(**inputs, streamer=streamer, max_new_tokens=tokens, do_sample=True, temperature=temp)

In [None]:
# Smoke-test the fine-tuned model with a request that should trigger a function call.
stream(
    user_prompt="Can you find me a Pizza Place in Iowa City",
    temp=0.1,
    tokens=500,
)

In [None]:
# Reload the model and fold the trained adapter into it

# Quantization settings: 4-bit NF4, bfloat16 compute, no double quantization.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
    load_in_4bit=True,
)

# Fresh copy of the base model, placed entirely on device 0.
base = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map={"": 0},
    low_cpu_mem_usage=True,
    return_dict=True,
)

tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

# Attach the saved LoRA adapter and make sure it sits on the same device as the base.
model = PeftModel.from_pretrained(base, new_model)
model = model.to("cuda:0")

# Merge the adapter weights into the base model and drop the PEFT wrapper.
# NOTE(review): the base is loaded 4-bit quantized here; confirm that merging
# into quantized weights (rather than a half-precision copy) is what is wanted.
model = model.merge_and_unload()