In [None]:
%pip install -U bitsandbytes transformers accelerate peft trl

In [None]:
from huggingface_hub import login
from google.colab import userdata

# Fetch the Hugging Face access token stored under the name "HF_TOKEN" in the
# Colab `userdata` store, so the token itself never appears in the notebook.
hf_token = userdata.get("HF_TOKEN")

# Authenticate this runtime against the Hugging Face Hub with that token.
login(token=hf_token)

In [None]:
# `new_model` below lives under /content/drive, which only exists once Google
# Drive is mounted in this runtime. Mount it explicitly so the notebook works
# from a fresh "Restart & Run All" instead of relying on a manual mount.
from google.colab import drive

drive.mount("/content/drive")

# Hub id of the base checkpoint that the fine-tuned adapter is applied to.
base_model = "meta-llama/Llama-3.1-8B"

# Directory on Drive that holds the fine-tuned PEFT adapter loaded further down.
new_model = "/content/drive/MyDrive/llama-3-8b-chat-doctor/"

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel
import torch
from trl import setup_chat_format

# Load the base checkpoint in float16 and let `device_map="auto"` decide where
# the weights are placed.
base_model_reload = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="auto",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    return_dict=True,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Apply TRL's chat-format setup to the model/tokenizer pair.
# NOTE(review): presumably this has to mirror what was done at fine-tuning time
# so the adapter lines up with the prepared model — confirm against the
# training notebook.
base_model_reload, tokenizer = setup_chat_format(base_model_reload, tokenizer)

# Attach the adapter stored at `new_model`, then merge it into the base model
# and drop the PEFT wrapper; `model` ends up as the merged model.
model = PeftModel.from_pretrained(base_model_reload, new_model).merge_and_unload()

In [None]:
from transformers import set_seed

# Generation below samples (`do_sample=True`), so fix the RNG seed to make the
# reply repeatable across re-runs in the same environment.
set_seed(42)

# Single-turn conversation used as a smoke test for the merged model.
messages = [{"role": "user", "content": "Hello doctor, I have bad acne. How do I get rid of it?"}]

# Render the conversation to a plain prompt string (not token ids) and append
# the generation prompt so the model continues with the assistant's turn.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# `model` is already an instantiated, device-placed model, so the loader-only
# options `torch_dtype` / `device_map` are not passed here: `pipeline()` only
# forwards them to `from_pretrained`, which is not called for a model object.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Sample up to 120 new tokens; the printed text is the pipeline's
# `generated_text` field for the first (only) result.
outputs = pipe(prompt, max_new_tokens=120, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
print(outputs[0]["generated_text"])

In [None]:
# Write the merged model to a local directory, then the tokenizer into the same
# directory. The tokenizer call stays last so its return value remains the
# cell's displayed output.
model.save_pretrained(save_directory="llama-3-8b-chat-doctor-full")
tokenizer.save_pretrained(save_directory="llama-3-8b-chat-doctor-full")

In [None]:
# Upload the merged model and then the tokenizer to the same Hub repository.
# `use_temp_dir=False` is kept as in the original; the tokenizer call stays
# last so its return value remains the cell's displayed output.
model.push_to_hub(repo_id="llama-3-8b-chat-doctor-full", use_temp_dir=False)
tokenizer.push_to_hub(repo_id="llama-3-8b-chat-doctor-full", use_temp_dir=False)