In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline, logging, TextStreamer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os, torch, wandb, platform, gradio, warnings
from datasets import Dataset
from trl import SFTTrainer

# System message used for training.
# NOTE(review): no visible cell reads `system_message` — confirm it is still needed.
system_message = "You are a "

# Model selection: the base checkpoint, the LoRA adapter merged into it,
# and the directory the merged result is written to.
base_model = "output/merged/Zephyr7B_FunctionCaller_v1"
fc_lora = "output/Zephyr_HUSH_General_Assistant_v0.01/checkpoint-1959"
save_path = "output/merged/Zephyr7B_HUSH_General_Assistant_FC_v0.01"

### Unquantized

In [None]:
# Read the base checkpoint in plain float32 precision (no quantization).
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float32,
    return_dict=True,
    low_cpu_mem_usage=True,
)

# The tokenizer comes from the same checkpoint as the base model.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

# Pin the base weights to the CPU before attaching the adapter.
model.to("cpu")

# Wrap the base model with the LoRA adapter and keep the wrapper on the CPU too.
model = PeftModel.from_pretrained(model, fc_lora)
model = model.to("cpu")

# Merge the adapter into the base model and drop the PEFT wrapper.
model = model.merge_and_unload()

# Write the merged model and its tokenizer to the same output directory.
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

### Quantized

In [None]:
# Load the checkpoint named by `base_model` with 4-bit quantization and merge
# the `fc_lora` adapter into it.

# Quantization configuration: 4-bit NF4 weights, bfloat16 compute,
# double quantization disabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

# Load the base model with the quantization config applied.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.bfloat16,
)

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

# Do NOT call `.to('cpu')` on the quantized model or on the PEFT wrapper:
# transformers places bitsandbytes-quantized weights on their device at load
# time and, in many releases, raises ValueError when `.to()` is called on such
# a model. Leaving the model where it was loaded also keeps it on the same
# device the test cell sends its inputs to.
# NOTE(review): exact `.to()` behaviour depends on the installed
# transformers/bitsandbytes versions — confirm against your environment.
model = PeftModel.from_pretrained(model, fc_lora)

# Merge the adapter into the (quantized) base model and drop the PEFT wrapper.
model = model.merge_and_unload()

# Saving is left disabled in this cell, as in the original.
# model.save_pretrained(save_path)
# tokenizer.save_pretrained(save_path)

In [None]:
# Finalize the merged model: optionally fold in further adapters, place the
# model on the inference device, and save it together with the tokenizer.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# The merge cells above already fold `fc_lora` into `model`, so it is not
# applied again here (that would add the same LoRA delta a second time).
# The original cell also loaded `hush_lora`, a name that no visible cell
# defines, so a fresh Restart & Run All stopped with NameError. Any further
# adapters to stack are listed explicitly here instead of relying on
# leftover kernel state.
extra_lora_paths = []  # e.g. ["output/<run_name>/checkpoint-<step>"]

for lora_path in extra_lora_paths:
    # Merge each adapter before loading the next one so PEFT wrappers are
    # never nested inside each other.
    model = PeftModel.from_pretrained(model, lora_path).merge_and_unload()

# bitsandbytes-quantized models are already on their device and transformers
# may refuse `.to()` on them, so only move models that are not quantized.
is_bnb_quantized = (
    getattr(model, "is_loaded_in_4bit", False)
    or getattr(model, "is_loaded_in_8bit", False)
)
if not is_bnb_quantized:
    model = model.to(device)

# NOTE(review): this writes to the same `save_path` as the unquantized merge
# cell — confirm overwriting that output is intended.
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

In [None]:
# Define the stream function for testing
def stream(user_prompt, temp, tokens):
    """Generate a function-calling reply for ``user_prompt`` and stream it out.

    Builds a prompt containing a fixed system message, the ``search_internet``
    function definition and the user's message, prints that prompt, then runs
    ``model.generate`` with a ``TextStreamer`` attached.

    Relies on the notebook-level ``model`` and ``tokenizer`` objects.

    Args:
        user_prompt: Text of the user's request; inserted after ``<|user|>``.
        temp: Sampling temperature. A positive value enables sampling at that
            temperature; ``0``, a negative value or ``None`` selects
            non-sampling decoding, because ``generate`` rejects a
            non-positive temperature when sampling is on.
        tokens: Passed to ``generate`` as ``max_new_tokens``.

    Returns:
        None. The generated text is emitted through the streamer.
    """
    prompt = """
    <|system|>You are a function calling assistant.
    <|definition|>
{
    "name": "search_internet",
    "description": "When the user requests the assistant to search the internet or do research, the assistant will search the internet for the answer and return the results.",
    "parameters": {
        "type": "object",
        "properties": {
            "search_term": {
                "type": "string",
                "description": "Search term to search the internet for and analyze results.",
            }
        },
        "required": [
            "search_term"
        ]
    }
}</s>
    <|user|>"""+user_prompt+"</s>"
    print(prompt)

    # Send the inputs to whichever device the model lives on instead of a
    # hard-coded "cuda:0", so this also works on CPU-only hosts and when the
    # model was placed on a different device.
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Only enable sampling for a strictly positive temperature.
    if temp is not None and temp > 0:
        sampling_kwargs = {"do_sample": True, "temperature": temp}
    else:
        sampling_kwargs = {"do_sample": False}

    _ = model.generate(**inputs, streamer=streamer, max_new_tokens=tokens, **sampling_kwargs)

In [None]:
# Smoke test: ask for an internet search with a low temperature and a
# generous cap on the number of new tokens.
stream(
    user_prompt="can you search the internet for pizza places in iowa city?",
    temp=0.1,
    tokens=3000,
)