In [1]:
%%capture
# Install / upgrade the libraries used by this notebook into the kernel's
# environment; the %%capture cell magic keeps pip's output out of the notebook.
# NOTE(review): none of these installs pin a version, so a later re-run may pull
# different releases — consider pinning (e.g. `pkg==x.y.z`) for reproducibility.
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U accelerate
%pip install -U peft
%pip install -U trl
%pip install datasets

In [2]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
# Read the Hugging Face access token from Kaggle's secret store (saved under
# the label "ft_hfhub") and authenticate this session with the Hub, so the
# token itself never appears in the notebook.
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("ft_hfhub")
login(token=hf_token)

In [3]:
import os

# Show which files the checkpoint dataset directory provides.
dataset_dir = "/kaggle/input/checkpoint-2700-llama3-2-3b-it"
# One print call; sep="\n" puts the heading and the listing on separate lines.
print("Contents of the dataset directory:", os.listdir(dataset_dir), sep="\n")


Contents of the dataset directory:
['adapter_model.safetensors', 'trainer_state.json', 'training_args.bin', 'adapter_config.json', 'README.md', 'tokenizer.json', 'tokenizer_config.json', 'scheduler.pt', 'special_tokens_map.json', 'optimizer.pt', 'rng_state.pth']


In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch

# Quantization setup: weights are loaded in 4-bit using the "nf4" quantization
# type, with float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Location of the base model (adjust to your base model path).
base_model_path = "/kaggle/input/llama-3.2/transformers/3b-instruct/1"

# Load the base model with the quantization config applied; device_map="auto"
# leaves device placement to the library.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    device_map="auto",
    quantization_config=bnb_config,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# Grow the base model's token-embedding table to 128258 entries so its size
# matches the fine-tuned checkpoint (adjust the number if your checkpoint differs).
# As the cell's last expression, the returned embedding module is displayed.
base_model.resize_token_embeddings(new_num_tokens=128258)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(128258, 3072)

In [8]:
# Load the tokenizer stored alongside the base model.
# `trust_remote_code` is intentionally not passed: it permits executing custom
# code shipped with a checkpoint, and the tokenizer is loaded without it later
# in this notebook, so the flag is not needed here.
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token_id = tokenizer.eos_token_id
# NOTE(review): the adapter checkpoint directory also contains tokenizer files —
# confirm whether that tokenizer, rather than the base one, should be used and saved.

In [9]:
from peft import PeftConfig

# Location of the fine-tuned LoRA adapter checkpoint.
adapter_path = "/kaggle/input/checkpoint-2700-llama3-2-3b-it"

# Read the adapter's configuration from the checkpoint directory.
adapter_config = PeftConfig.from_pretrained(adapter_path)

# Wrap the base model with the adapter weights from the checkpoint.
model = PeftModel.from_pretrained(model=base_model, model_id=adapter_path)

# Switch to evaluation mode; as the cell's last expression, the returned model
# is displayed.
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128258, 3072)
        (layers): ModuleList(
          (0-27): 28 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lor

In [10]:
# Merge the adapter weights into the base model
# The result is rebound to `model`, replacing the adapter-wrapped model.
# NOTE(review): the base model was loaded in 4-bit, so this merge presumably
# goes through quantized weights and may introduce rounding error — confirm
# against the PEFT documentation whether merging into a half-precision base is
# preferable for the published weights.
model = model.merge_and_unload()



In [11]:
# Write the merged model and the tokenizer into the same output directory so
# that the folder can be reloaded or uploaded as a unit.
merged_model_path = "/kaggle/working/merged_ft_llama_model"
model.save_pretrained(merged_model_path)
# Left as the cell's last expression, so its return value is displayed.
tokenizer.save_pretrained(merged_model_path)

('/kaggle/working/merged_ft_llama_model/tokenizer_config.json',
 '/kaggle/working/merged_ft_llama_model/special_tokens_map.json',
 '/kaggle/working/merged_ft_llama_model/tokenizer.json')

In [12]:
# Reload the merged checkpoint from disk to check that the saved artifacts are
# usable on their own.
# The saved weights are bitsandbytes-quantized, so device placement is handed to
# `device_map` (as in the initial base-model load) instead of calling
# `.to("cuda")`: moving a quantized model with `.to()` is redundant and is
# rejected by some transformers releases.
model = AutoModelForCausalLM.from_pretrained(merged_model_path, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(merged_model_path)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token_id = tokenizer.eos_token_id

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


In [13]:
# Conversation passed to the model: a system turn carrying the persona
# description, followed by the user's opening line.
messages = [
    dict(
        role="system",
        content=(
            "Persona B's characteristics: My name is David, and I'm a 35-year-old math teacher. "
            "I like to hike and spend time in nature. I'm married with two kids."
        ),
    ),
    dict(
        role="user",
        content="Morning! I think I saw you at the parent meeting, what's your name?",
    ),
]

In [14]:
# Render the chat messages into a single prompt string: tokenize=False returns
# text instead of token ids, and add_generation_prompt=True appends the opening
# of the assistant turn so the model continues with a reply.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [15]:
# Tokenize the rendered prompt into PyTorch tensors and move them to the GPU.
# add_special_tokens=False: the chat template has already written the special
# tokens (including the beginning-of-sequence token) into `prompt`, and the
# tokenizer's default would prepend them a second time.
inputs = tokenizer(
    prompt,
    return_tensors="pt",
    padding=True,
    truncation=True,
    add_special_tokens=False,
).to("cuda")

In [30]:
# Sample one reply for the prompt, with gradient tracking disabled.
with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        # Budget for the reply only; `max_length` would also count the prompt
        # tokens, so a longer prompt would silently shrink the reply.
        max_new_tokens=200,
        num_return_sequences=1,
        # temperature / top_p only take effect when sampling is on, so enable it
        # explicitly instead of depending on the checkpoint's generation defaults.
        do_sample=True,
        temperature=0.8,  # Adjust the randomness
        top_p=0.9,        # Nucleus sampling
        # Pass the pad token explicitly to avoid the "Setting `pad_token_id`" warning.
        pad_token_id=tokenizer.pad_token_id,
    )

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [31]:
# Decode the response from the model
# outputs[0] is the first returned sequence; skip_special_tokens=True leaves
# special tokens out of the decoded string.
# NOTE(review): the decoded string presumably contains the prompt text as well
# as the generated reply — verify before using it directly.
decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

In [32]:
# Extract the assistant's reply
# The generated sequence starts with the prompt tokens, so drop exactly that
# many tokens and decode only what the model produced. This avoids searching the
# decoded text for the word "assistant", which would cut in the wrong place
# whenever the system or user message itself contains that word.
prompt_length = inputs.input_ids.shape[-1]
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

In [33]:
# Show the extracted reply, prefixed with a label.
print(f"Assistant's Reply: {response}")

Assistant's Reply: Good morning! Yeah, I was at the parent meeting. My name's David, nice to meet you. I'm a math teacher here at the school. How about you, do you have kids in the school?


In [20]:
from huggingface_hub import HfApi, HfFolder

# Target model repository on the Hugging Face Hub, written as
# "<namespace>/<repository>". Replace with your own repo's path.
repo_name = "ishitas2365/llama-3.2-3b-instruct-finetunedToPersona"

In [21]:
# Upload the model
api = HfApi()
api.upload_folder(
    folder_path=merged_model_path, 
    repo_id=repo_name, 
    commit_message="Initial model upload"
)

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.37G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/ishitas2365/llama-3.2-3b-instruct-finetunedToPersona/commit/aef2b84c3e2a287ceeb3d4d86940cb996bac71b9', commit_message='Initial model upload', commit_description='', oid='aef2b84c3e2a287ceeb3d4d86940cb996bac71b9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ishitas2365/llama-3.2-3b-instruct-finetunedToPersona', endpoint='https://huggingface.co', repo_type='model', repo_id='ishitas2365/llama-3.2-3b-instruct-finetunedToPersona'), pr_revision=None, pr_num=None)