In [1]:
%%capture
%pip install -U torch 
%pip install -U transformers 
%pip install -U datasets 
%pip install -U accelerate 
%pip install -U peft 
%pip install -U trl 
%pip install -U bitsandbytes 
%pip install -U wandb

In [2]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

In [3]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("hf")
login(token = hf_token)

In [4]:
wb_token = user_secrets.get_secret("wandb")

wandb.login(key=wb_token)
run = wandb.init(
    project='train2', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Currently logged in as: [33mishitas2365[0m ([33mishitas2365-indian-institute-of-technology-indore[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [5]:
base_model = "/kaggle/input/llama-3.2/transformers/3b-instruct/1"
new_model = "llama-3-3b-it-personaB"
torch_dtype = torch.float16
attn_implementation = "eager"

In [6]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float32,   
    attn_implementation="sdpa"    
)

#eager and torch_dtype tha hi nahi

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

In [8]:
tokenizer.pad_token_id = tokenizer.eos_token_id

In [9]:
tokenizer.chat_template = None
model, tokenizer = setup_chat_format(model, tokenizer)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [10]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)
print(modules)

['k_proj', 'gate_proj', 'down_proj', 'v_proj', 'q_proj', 'up_proj', 'o_proj']


In [11]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
model = get_peft_model(model, peft_config)

In [12]:
# Load the dataset and shuffle it
ds = load_dataset("Cynaptics/persona-chat")
ds = ds.shuffle(seed=65)  # Shuffle the dataset to randomize the rows

# Select 6,000 random rows from the train split
ds = ds['train'].select(range(6000))  # Select the first 6,000 rows after shuffling

# Verify the sampled dataset
print(ds)


README.md:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/11.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Dataset({
    features: ['conv_id', 'persona_b', 'dialogue', 'reference', '__index_level_0__'],
    num_rows: 6000
})


In [13]:
def format_chat_template(row):
    persona_statements = " ".join(row["persona_b"])  # Join the statements into a single string
    # Use only the current row's Persona B as the system message
    persona = {"role": "system", "content": f"Persona B's characteristics: {row['persona_b']}"}
    
    # Process the conversation column
    conversation = row["dialogue"]
    chat_history = []
    for i, turn in enumerate(row["dialogue"]):
        turn_cleaned = turn.replace("Persona A: ", "").replace("Persona B: ", "")
        role = "user" if i % 2 == 0 else "assistant"  # Alternating roles
        chat_history.append({"role": role, "content": turn_cleaned})
    
    # Final assistant response from the answer column
    final_response = {"role": "assistant", "content": row["reference"]}
    
    # Combine everything
    row_json = [persona] + chat_history + [final_response]
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

In [14]:
# Apply the formatting to the entire dataset
processed_dataset = ds.map(format_chat_template, num_proc=4)

# Check a sample row
print(processed_dataset[0]["text"])

  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/6000 [00:00<?, ? examples/s]

<|im_start|>system
Persona B's characteristics: ["I've been working at my current job for a year.", 'My vintage cars are my pride and joy.', 'i am a very short woman.', 'I want to inspire people with my webcomic and music.', 'i m a girl.']<|im_end|>
<|im_start|>user
What is your name?<|im_end|>
<|im_start|>assistant
Hi, I'm Dillon How are you today?<|im_end|>
<|im_start|>user
I'm doing well, thanks for asking!<|im_end|>
<|im_start|>assistant
I like to draw and write, but also as a big fan of vintage cars.<|im_end|>
<|im_start|>user
Oh that's cool, I have always wanted to learn what it is you can draw?<|im_end|>
<|im_start|>assistant
Just keep practicing and you will get better. You could also take some lessons or research tutorials online to practise with!<|im_end|>
<|im_start|>user
Thanks for the advice, I will definitely look into it.<|im_end|>
<|im_start|>assistant
What else would you like to do?<|im_end|>
<|im_start|>user
I like to listen and watch movies, also has love for a horse

In [15]:
from datasets import DatasetDict

# Assuming `ds` is your DatasetDict object
# Apply train_test_split to the "train" portion of the DatasetDict
train_test_split = processed_dataset.train_test_split(test_size=0.1, seed=42)

# Update the DatasetDict to include the new train and validation splits
processed_dataset = DatasetDict({
    "train": train_test_split["train"],
    "test": train_test_split["test"]
})

In [16]:
new_model_base_dir = "llama-3-3b-it-personaB"  # Base directory for the model
checkpoint_prefix = "step_checkpoint"  # Add a prefix for each checkpoint
output_dir = os.path.join(new_model_base_dir, checkpoint_prefix)  # Final output directory

In [17]:
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",   #16bit
    num_train_epochs=1,
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=1,
    save_steps=200,  # 100
    save_total_limit=3,  # Keep only the 3 most recent checkpoints
    save_strategy="steps",  
    warmup_steps=50,
    logging_strategy="steps",
    learning_rate=1e-4,  # 3e-5
    fp16=False,
    bf16=True, #true  
    group_by_length=True,
    report_to="wandb"
)
# Log the output directory for clarity
print(f"Model checkpoints will be saved in: {training_arguments.output_dir}")

Model checkpoints will be saved in: llama-3-3b-it-personaB/step_checkpoint




In [18]:
trainer = SFTTrainer(
    model=model,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["test"],
    peft_config=peft_config,
    processing_class=tokenizer,  # Use the tokenizer object
    args=training_arguments,
)



Map:   0%|          | 0/5400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [19]:
model.config.use_cache = False
trainer.train()



Step,Training Loss,Validation Loss
50,5.1868,2.241447
100,4.1627,2.049918
150,3.9797,2.022768
200,3.7388,1.987654
250,4.2841,1.963702
300,4.0986,1.944645
350,4.1589,1.932925
400,3.4073,1.918638
450,4.0144,1.905796
500,4.2031,1.897264


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=2700, training_loss=3.5836684241118255, metrics={'train_runtime': 31711.2662, 'train_samples_per_second': 0.17, 'train_steps_per_second': 0.085, 'total_flos': 2.840165305812787e+16, 'train_loss': 3.5836684241118255, 'epoch': 1.0})

In [20]:
wandb.finish()
model.config.use_cache = True

0,1
eval/loss,█▆▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▇▆▅█▇▇▇▇▆▆▇█▇▇█▇▇▇▇▇▃▂▂▂▁▂▁▂▁▂▁▅▅▅▆▆▆▆▅▆
eval/samples_per_second,▁▁▁▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄▄▄▄▄▄██▄▄▄▄▄▄▄▄▄
eval/steps_per_second,▁▁▁▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄▄▄█▄▄▄▄▄▄█▄▄▄▄▄▄▄
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇███
train/global_step,▁▁▁▁▁▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇█████
train/grad_norm,▃▂▂▂▂▂▂▃▁▂▂▂▅▂▂▂▄▃▂▂▃▅▃▆▃▅█▆▃▆▄▄▄▅▆▅▃▇▄▄
train/learning_rate,████▇▇▇▇▇▇▇▇▇▇▇▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▃▂▂▂▁▁▁▁
train/loss,▇▅▅▆█▂▅▄▃▅▅▅▄▄▄▅▁▆▅▆▂▄▄▂▆▄▄▂▃▄▃▆▄▄▃▆▅▄▂▃

0,1
eval/loss,1.66205
eval/runtime,435.2444
eval/samples_per_second,1.379
eval/steps_per_second,1.379
total_flos,2.840165305812787e+16
train/epoch,1.0
train/global_step,2700.0
train/grad_norm,6.45393
train/learning_rate,0.0
train/loss,3.7512


In [21]:
import os
import zipfile
from tqdm import tqdm

# Define the directory to zip and output zip file
dir_to_zip = "/kaggle/working/llama-3-3b-it-personaB/step_checkpoint/checkpoint-2700"
output_zip_file = "checkpoint-2700_3b_ft.zip"

# Get all files to zip
file_paths = []
for root, _, files in os.walk(dir_to_zip):
    for file in files:
        file_paths.append(os.path.join(root, file))

# Zip the files with tqdm progress bar
with zipfile.ZipFile(output_zip_file, 'w', zipfile.ZIP_STORED) as zipf:
    for file in tqdm(file_paths, desc="Zipping files", unit="file"):
        zipf.write(file, os.path.relpath(file, dir_to_zip))

print(f"Zipping completed: {output_zip_file}")

Zipping files: 100%|██████████| 11/11 [00:08<00:00,  1.34file/s]

Zipping completed: checkpoint-2700_3b_ft.zip





In [23]:
from IPython.display import FileLink

# Create a download link for the zip file
FileLink('/kaggle/working/checkpoint-2700_3b_ft.zip')

In [22]:
checkpoint_path = "/kaggle/working/llama-3-3b-it-personaB/step_checkpoint/checkpoint-2700"

In [24]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path, trust_remote_code=True)
tokenizer.pad_token_id = tokenizer.eos_token_id


In [31]:
vocab_size = len(tokenizer)
print(vocab_size)

128258


In [35]:
# Load the fine-tuned PEFT model from checkpoint
model = PeftModel.from_pretrained(model, checkpoint_path)



In [37]:
# Resize token embeddings if necessary
model.resize_token_embeddings(len(tokenizer))

Embedding(128258, 3072)

In [77]:
# Define the conversation messages
messages = [
    {
        "role": "system",
        "content": "Persona B's characteristics: My name is David, and I'm a 35-year-old math teacher. "
                   "I like to hike and spend time in nature. I'm married with two kids."
    },
    {
        "role": "user",
        "content": "Morning! I think I saw you at the parent meeting, what's your name?"
    }
]

In [78]:
# Prepare the prompt using the chat template
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [79]:
# Tokenize the input
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

In [80]:
# Run inference with the model
with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=250,
        num_return_sequences=1
    )

In [81]:
# Decode the response from the model
decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

In [82]:
# Extract the assistant's reply (if available)
if "assistant" in decoded_text:
    response = decoded_text.split("assistant", 1)[1].strip()
else:
    response = decoded_text.strip()

In [83]:
# Print the assistant's reply
print(response)

Morning! My name is David, nice to meet you. Yeah, I was at the parent meeting, just discussing the upcoming math tests with the other teachers. How about you, how's your day going so far?


In [84]:
trainer.model.save_pretrained(new_model)



In [86]:
newhf_token = user_secrets.get_secret("ft_hfhub")
login(token = newhf_token)

In [87]:
trainer.model.push_to_hub(new_model, use_temp_dir=False)

adapter_model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ishitas2365/llama-3-3b-it-personaB/commit/10d5f9a99e95d012eec37e0e272a0cbfd6cd4b17', commit_message='Upload model', commit_description='', oid='10d5f9a99e95d012eec37e0e272a0cbfd6cd4b17', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ishitas2365/llama-3-3b-it-personaB', endpoint='https://huggingface.co', repo_type='model', repo_id='ishitas2365/llama-3-3b-it-personaB'), pr_revision=None, pr_num=None)