In [None]:
%%capture
# Install/upgrade every library this notebook uses; %%capture hides pip's output.
# NOTE(review): all packages are unpinned (-U), so the installed versions depend on
# the day the notebook is run — consider pinning versions for reproducibility.
%pip install -U transformers
%pip install -U datasets
%pip install -U accelerate
%pip install -U peft
%pip install -U trl
%pip install -U bitsandbytes
%pip install -U wandb

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

In [None]:
# Open a Weights & Biases run for this training session. anonymous="allow"
# lets the run proceed without an existing W&B account.
run = wandb.init(
    anonymous="allow",
    job_type="training",
    project='Fine-tune Llama 3.2 3B on Medical Dataset',
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: (1) Private W&B dashboard, no account required
[34m[1mwandb[0m: (2) Use an existing W&B account


[34m[1mwandb[0m: Enter your choice: 1


[34m[1mwandb[0m: You chose 'Private W&B dashboard, no account required'
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# Hub id of the base checkpoint that is loaded and fine-tuned in this notebook.
base_model = "meta-llama/Llama-3.2-3B-Instruct"
# Name reused for the training output directory, the saved adapter and the Hub repo.
new_model = "llama-3.2-3b-chat-doctor"

In [None]:
# Compute dtype handed to the 4-bit quantization config when the model is loaded for training.
torch_dtype = torch.float16
# Attention implementation passed to from_pretrained when the model is loaded for training.
attn_implementation = "eager"

In [None]:
from google.colab import userdata

# Text-generation pipeline around the un-tuned base checkpoint, used below to
# sample baseline answers. The Hub token is read from the Colab secret store.
# NOTE(review): this loads in bfloat16 while the notebook-level `torch_dtype`
# is float16 — confirm the mismatch is intentional.
pipe = pipeline(
    task="text-generation",
    model=base_model,
    token=userdata.get("HK_TOKEN_NEW"),
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
# Reuse the pipeline's tokenizer for the rest of the notebook and make the
# EOS token double as the padding token.
tokenizer = pipe.tokenizer
tokenizer.pad_token = tokenizer.eos_token

# Single-turn conversation used as a smoke test of the base model.
messages = [dict(role="user", content="Who is Lech Walesa?")]

# Render the conversation as a prompt string that ends with the assistant header.
prompt = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)

outputs = pipe(prompt, do_sample=True, max_new_tokens=120)

# The generated text contains the prompt followed by the model's answer.
print(outputs[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 06 Dec 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Who is Lech Walesa?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Lech Walesa is a Polish politician and former trade union leader who served as the President of Poland from 1990 to 1995. He is best known for his role in the Solidarity movement, a trade union that played a key part in the fall of communism in Poland in 1989.

Born on September 29, 1943, in Popowo, Poland, Walesa grew up in a working-class family and began his career as a shipyard worker. In 1980, he joined the Solidarity movement, which aimed to improve working conditions and rights for Polish workers


In [None]:
from IPython.display import Markdown, display

# Two-message conversation: a system persona plus the user's request.
messages = [
    dict(
        role="system",
        content="You are a skilled Python developer specializing in database management and optimization.",
    ),
    dict(
        role="user",
        content="I'm experiencing a sorting issue in my database. Could you please provide Python code to help resolve this problem?",
    ),
]

prompt = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)

outputs = pipe(prompt, do_sample=True, max_new_tokens=512)

# The generated text echoes the prompt, so keep only what follows the
# assistant header and render that part as Markdown.
assistant_header = "<|start_header_id|>assistant<|end_header_id|>"
reply = outputs[0]["generated_text"].split(assistant_header)[1]
display(Markdown(reply))

In [None]:
# QLoRA config: load the weights in 4-bit NF4 with double quantization and
# run the de-quantized computations in `torch_dtype`.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load the quantized base model for training.
# The repository is gated, so authenticate with the same Colab secret the
# pipeline cell uses; without it the Hub lookup is unauthenticated and only
# works while the files happen to be in the local cache.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation,
    token=userdata.get("HK_TOKEN_NEW"),
)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# LoRA config: rank-16 adapters (alpha 32, dropout 0.05, no bias terms) for
# causal language modelling, attached to the projection modules listed below.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "up_proj",
        "down_proj",
        "gate_proj",
        "k_proj",
        "q_proj",
        "v_proj",
        "o_proj",
    ],
)

# Wrap the quantized model so that it carries the LoRA adapters.
model = get_peft_model(model, peft_config)

In [None]:
dataset_name = "ruslanmv/ai-medical-chatbot"

# Load the train split, shuffle it with a fixed seed and keep only the first
# 1000 rows so the demo runs quickly.
dataset = load_dataset(dataset_name, split="train")
dataset = dataset.shuffle(seed=65)
dataset = dataset.select(range(1000))

def format_chat_template(row):
    """Add a chat-formatted ``"text"`` field to one dataset row.

    Args:
        row: Mapping with a ``"Patient"`` entry (used as the user turn) and a
            ``"Doctor"`` entry (used as the assistant turn).

    Returns:
        The same ``row``, with ``row["text"]`` set to the conversation
        rendered as a string by the module-level ``tokenizer``'s chat template.
    """
    conversation = [
        {"role": "user", "content": row["Patient"]},
        {"role": "assistant", "content": row["Doctor"]},
    ]
    row["text"] = tokenizer.apply_chat_template(conversation, tokenize=False)
    return row

# Apply the chat formatting to every row, using 4 worker processes.
dataset = dataset.map(format_chat_template, num_proc=4)

# Show one formatted example as the cell output.
dataset['text'][3]

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 06 Dec 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nFell on sidewalk face first about 8 hrs ago. Swollen, cut lip bruised and cut knee, and hurt pride initially. Now have muscle and shoulder pain, stiff jaw(think this is from the really swollen lip),pain in wrist, and headache. I assume this is all normal but are there specific things I should look for or will I just be in pain for a while given the hard fall?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello and welcome to HCM,The injuries caused on various body parts have to be managed.The cut and swollen lip has to be managed by sterile dressing.The body pains, pain on injured site and jaw pain should be managed by pain killer and muscle relaxant.I suggest you to consult your primary healthcare provider for clinical assessment.In case there is evidence of infection in any of the injured si

In [None]:
# Hold out 10% of the rows for evaluation. The split is seeded so the
# train/test partition (and therefore the reported eval loss) is the same on
# every run, matching the seeded shuffle used when the subset was drawn.
dataset = dataset.train_test_split(test_size=0.1, seed=65)

In [None]:
# Hyper-parameters for the fine-tuning run.
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated in recent transformers
    # releases, which is what the unpinned install at the top pulls in.
    eval_strategy="steps",
    # A float below 1 is a fraction of the total training steps: the run
    # evaluates five times, at every 20% of training.
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    # Both mixed-precision modes are left off.
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)



In [None]:
# Supervised fine-tuning trainer over the chat-formatted "text" column.
# `model` is already a PeftModel (it was wrapped with get_peft_model right
# after the LoRA config was created), so the LoRA config is NOT passed a
# second time via `peft_config`: one place owns the adapter setup.
# NOTE(review): the recorded output of this cell reports that some of these
# keyword arguments are deprecated in favour of SFTConfig — migrate them
# together with the TrainingArguments cell when pinning a TRL version.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    max_seq_length=512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Run the fine-tuning loop; the returned TrainOutput is shown as the cell result.
trainer.train()



Step,Training Loss,Validation Loss
90,5.3469,2.463182
180,4.8088,2.440625
270,4.2393,2.413564
360,5.3639,2.401378
450,4.4757,2.39263


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct/resolve/main/config.json.
Access to model meta-lla

TrainOutput(global_step=450, training_loss=4.9104933224784, metrics={'train_runtime': 577.4622, 'train_samples_per_second': 1.559, 'train_steps_per_second': 0.779, 'total_flos': 3894050013425664.0, 'train_loss': 4.9104933224784, 'epoch': 1.0})

In [None]:
# Enable use_cache on the model config for the generation cells that follow,
# and close the Weights & Biases run (which prints the run summary).
model.config.use_cache = True
wandb.finish()

0,1
eval/loss,█▆▃▂▁
eval/runtime,▁█▄▅▅
eval/samples_per_second,█▁▅▄▄
eval/steps_per_second,█▁▅▄▄
train/epoch,▁▁▁▁▁▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇█
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇█████
train/grad_norm,▅▇▆▆▂▄▆▄▄▄▆▅▅▆▅▁▃▁▃█▁▃▃▂▃▇█▂▄▄▃▅▄▃▃▅▆▅▃▅
train/learning_rate,▄▆█▇▇▇▇▇▆▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁
train/loss,▇█▅▂▄▃▃▄▅▄▆▅▂▅▄▂▄▆▃▃▅▄▂▃▁▂▅▄▃▅▁▂▄▄▅▃▃▃▁▃

0,1
eval/loss,2.39263
eval/runtime,22.6091
eval/samples_per_second,4.423
eval/steps_per_second,4.423
total_flos,3894050013425664.0
train/epoch,1.0
train/global_step,450.0
train/grad_norm,3.17894
train/learning_rate,0.0
train/loss,4.4757


In [None]:
# Ask the fine-tuned model a sample patient question.
messages = [
    {
        "role": "user",
        "content": "Hello doctor, I have a headache."
    }
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False,
                                       add_generation_prompt=True)

# The templated prompt already begins with <|begin_of_text|>, so special
# tokens are not added again (that would feed the model a doubled BOS).
# Inputs follow the model's device instead of a hard-coded "cuda".
inputs = tokenizer(prompt, return_tensors='pt', padding=True,
                   truncation=True, add_special_tokens=False).to(model.device)
prompt_length = inputs["input_ids"].shape[1]

# max_new_tokens bounds the reply itself; max_length would also count the
# prompt tokens, so longer prompts would silently shorten the answer.
# pad_token_id is passed explicitly instead of relying on generate()'s fallback.
outputs = model.generate(**inputs, max_new_tokens=150,
                         num_return_sequences=1,
                         pad_token_id=tokenizer.pad_token_id)

# Decode only the newly generated tokens. Splitting the full decoded text on
# the word "assistant" would cut the reply short whenever the answer itself
# contains that word.
text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

print(text)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.




Hi, I have gone through your query. For further information consult a neurologist online -->>>> http://healthcaremagic.com/consult/neurologist.aspx Regards, Dr. Sumanth Gaddam, General & Family Physician, USA. I have gone through your query. For further information consult a neurologist online -->>>> http://healthcaremagic.com/consult/neurologist.aspx Regards, Dr. Sumanth Gaddam, General & Family Physician, USA. I have gone through your query


In [None]:
# Write the trained model held by the trainer to the local `new_model`
# directory, then upload it to the Hub under the same name using the token
# from the Colab secret store.
finetuned = trainer.model
finetuned.save_pretrained(new_model)
finetuned.push_to_hub(
    new_model,
    use_temp_dir=False,
    token=userdata.get("HK_TOKEN_NEW"),
)


Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-3B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-3B-Instruct.


README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]


Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-3B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-3B-Instruct.


adapter_model.safetensors:   0%|          | 0.00/97.3M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jduda/llama-3.2-3b-chat-doctor/commit/212718c63ad94c9636885f54b5e01f245d6caf8f', commit_message='Upload model', commit_description='', oid='212718c63ad94c9636885f54b5e01f245d6caf8f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jduda/llama-3.2-3b-chat-doctor', endpoint='https://huggingface.co', repo_type='model', repo_id='jduda/llama-3.2-3b-chat-doctor'), pr_revision=None, pr_num=None)

In [None]:
!pip install -U accelerate
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel
import torch
from trl import setup_chat_format
# Reload tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(base_model)

base_model_reload = AutoModelForCausalLM.from_pretrained(
        base_model,
        return_dict=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
)

# Merge adapter with base model
model = PeftModel.from_pretrained(base_model_reload, new_model)

model = model.merge_and_unload()



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



KeyError: 'base_model.model.model.model.embed_tokens'

In [None]:
# Write the model and its tokenizer side by side into one local directory.
merged_dir = "llama-3.2-3b-chat-doctor"
model.save_pretrained(merged_dir)
tokenizer.save_pretrained(merged_dir)

In [None]:
!git clone https://github.com/ggerganov/llama.cpp

In [None]:
!cd llama.cpp
!pip install -r /content/llama.cpp/requirements.txt

In [None]:
# Convert the model directory saved above to a single GGUF file with q8_0 output type.
!python /content/llama.cpp/convert_hf_to_gguf.py /content/llama-3.2-3b-chat-doctor --outfile /content/llama-3.2-3b-chat-doctor.gguf --outtype q8_0