In [None]:
!git clone https://github.com/artidoro/qlora.git
!pip install -r qlora/requirements.txt
!pip install peft

In [None]:
# Use huggingface account details here.
# The values are prompted for instead of being typed into the cell source:
# `%env NAME=value` echoes the value into the cell output, and a token written in
# the source or shown in the output is saved with the notebook.
import os
from getpass import getpass

# Exported as environment variables because the shell commands in the download
# cell read $USER and $TOKEN.
os.environ["USER"] = input("Hugging Face username: ").strip()
os.environ["TOKEN"] = getpass("Hugging Face access token: ").strip()

In [None]:
!GIT_LFS_SKIP_SMUDGE=1 git clone https://$USER:$TOKEN@huggingface.co/meta-llama/Meta-Llama-3-8B

!wget --header="Authorization: Bearer ${TOKEN}" https://huggingface.co/meta-llama/Meta-Llama-3-8B/resolve/main/model-00001-of-00004.safetensors -O Meta-Llama-3-8B/model-00001-of-00004.safetensors
!wget --header="Authorization: Bearer ${TOKEN}" https://huggingface.co/meta-llama/Meta-Llama-3-8B/resolve/main/model-00002-of-00004.safetensors -O Meta-Llama-3-8B/model-00002-of-00004.safetensors
!wget --header="Authorization: Bearer ${TOKEN}" https://huggingface.co/meta-llama/Meta-Llama-3-8B/resolve/main/model-00003-of-00004.safetensors -O Meta-Llama-3-8B/model-00003-of-00004.safetensors
!wget --header="Authorization: Bearer ${TOKEN}" https://huggingface.co/meta-llama/Meta-Llama-3-8B/resolve/main/model-00004-of-00004.safetensors -O Meta-Llama-3-8B/model-00004-of-00004.safetensors

In [None]:
# Pick an acceptable max_steps value for experiments. 100 is set by default and is reasonable for a 2 hour example experiment.
!python3 qlora/qlora.py --model_name_or_path Meta-Llama-3-8B/ --max_steps 100

The original and fine-tuned models can't both be loaded at the same time due to Colab memory constraints. To prompt the original Llama 3 instead, uncomment the original-model lines and comment out the fine-tuned-model lines in the cells below.

In [None]:
from pathlib import Path

import torch
from transformers import LlamaForCausalLM, AutoTokenizer, GenerationConfig, BitsAndBytesConfig, pipeline
from peft import PeftModel

MODEL_DIR = "Meta-Llama-3-8B"  # base model
OUTPUT_DIR = "output"  # directory holding the training run's checkpoint-<step> folders


def find_latest_adapter(output_dir, default=None):
    """Return the adapter directory of the highest-numbered checkpoint.

    Looks for ``<output_dir>/checkpoint-<step>/adapter_model`` directories and picks
    the one with the largest numeric ``<step>`` (compared as integers, so 100 beats 9).

    Args:
        output_dir: Directory that contains the ``checkpoint-<step>`` folders.
        default: Value returned when no matching directory exists.

    Returns:
        The adapter directory as a string, or ``default`` if none was found.
    """
    adapters_by_step = {}
    for adapter_dir in Path(output_dir).glob("checkpoint-*/adapter_model"):
        step = adapter_dir.parent.name.rpartition("-")[2]
        # Ignore folders whose suffix is not a step number (e.g. "checkpoint-best").
        if step.isdigit():
            adapters_by_step[int(step)] = adapter_dir
    if not adapters_by_step:
        return default
    return str(adapters_by_step[max(adapters_by_step)])


# adapter weights: the checkpoint folder name depends on how many steps the training
# run took, so resolve it from disk instead of hard-coding a step number. The
# fallback matches the training cell's `--max_steps 100`.
ADAPTER_PATH = find_latest_adapter(OUTPUT_DIR, default="output/checkpoint-100/adapter_model")

device = "cuda" if torch.cuda.is_available() else "cpu"

# Quantization settings passed to the base-model load: 8-bit is enabled, 4-bit is off.
# NOTE(review): the bnb_4bit_* values presumably only matter once load_in_4bit is
# switched on — confirm against the BitsAndBytesConfig documentation.
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    load_in_4bit=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

# Keyword arguments shared by every base-model load in this cell.
base_model_kwargs = dict(
    return_dict=True,
    quantization_config=quantization_config,
    torch_dtype=torch.float16,
)

# To prompt the original model instead, uncomment the next line and comment out the
# fine-tuned load that follows it.
#original_model = LlamaForCausalLM.from_pretrained(MODEL_DIR, **base_model_kwargs)

# Load the base model, wrap it with the adapter stored at ADAPTER_PATH, then switch
# the wrapped model to eval mode.
finetuned_model = PeftModel.from_pretrained(
    LlamaForCausalLM.from_pretrained(MODEL_DIR, **base_model_kwargs),
    ADAPTER_PATH,
    offload_folder="/content/sample_data",
)
finetuned_model.eval()

# Sampling settings for generation. They are forwarded to the pipeline as keyword
# arguments; building a GenerationConfig alone has no effect unless it is handed
# to the pipeline or to generate().
generation_kwargs = dict(
    do_sample=True,
    temperature=0.9,
    max_new_tokens=20,
    top_p=1.0,
)
# Kept so that code referring to `config` keeps working.
config = GenerationConfig(**generation_kwargs)

task = "text-generation"
#original_pipe = pipeline(task, model=original_model, tokenizer=tokenizer, **generation_kwargs)
finetuned_pipe = pipeline(task, model=finetuned_model, tokenizer=tokenizer, **generation_kwargs)

In [None]:
# Read a prompt from the user and print the model's continuation.
print("Input:")
prompt = input()

# return_full_text=False asks the pipeline for the continuation only. Splitting the
# full text on the prompt raises IndexError whenever the decoded output does not
# contain the prompt verbatim.
#original_output = original_pipe(prompt, return_full_text=False)[0]['generated_text']
finetuned_output = finetuned_pipe(prompt, return_full_text=False)[0]['generated_text']

#print("\nOriginal Model Output:\n", original_output)
print("\nFine-tuned Model Output:\n", finetuned_output)