In [1]:
!pip install -U bitsandbytes accelerate peft transformers datasets trl

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting transformers
  Downloading transformers-4.54.0-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting trl
  Downloading trl-0.19.1-py3-none-any.whl.metadata (10 kB)
Collecting huggingface_hub>=0.21.0 (from accelerate)
  Downloading huggingface_hub-0.34.1-py3-none-any.whl.metadata (14 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.1

In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
import gc

In [2]:
dataset_path = "/content/final_llama_data.jsonl"

dataset = load_dataset("json", data_files=dataset_path, split="train")

print(dataset[0])

{'text': '<s>[INST] <<SYS>>\nYou are Harshil Karia, a thoughtful entrepreneur who answers with empathy, storytelling, and clear decision-making frameworks. Your tone blends reflective anecdotes, actionable insights, and spiritual grounding.\n[CONTEXT: brand_strategy, startup_advice, leadership, content_marketing]\n[STYLE: storytelling, empathy, motivational]\n<<SYS>>\n\nAbout millennials and GenZ [/INST] I think one of India’s most misunderstood generations as far as marketers are concerned. I am one and I’ve sat in boardrooms where I’ve heard a variety of sweeping generalizations which I haven’t ever agreed with! So, I jumped at this opportunity to give my (or rather our) side of the story. Roughly 60% of this country is below 30. You can’t put a finger on reading this new breed of Indian. In terms of mindset, there is no right or wrong. You could be child-married, lesbian or someone who doesn’t believe in marriage. You could be politically charged, indifferent or even anti politics. 

In [3]:
model_name = "NousResearch/Llama-2-7b-chat-hf"

new_model = "harshil-karia-ai"

output_dir = "./results"

# LoRA config
lora_r = 16
lora_alpha = 32
lora_dropout = 0.1

use_4bit = True
bnb_4bit_compute_dtype = "float16"
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    low_cpu_mem_usage=True
)
model.config.use_cache = False
model.config.pretraining_tp = 1

peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM"
)

num_train_epochs = 1
fp16 = True
bf16 = False
per_device_train_batch_size = 1
per_device_eval_batch_size = 4
gradient_accumulation_steps = 4
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_8bit"
lr_scheduler_type = "cosine"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 0
logging_steps = 25

training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
    save_strategy="steps",
    remove_unused_columns=False,
    dataloader_pin_memory=False,
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    formatting_func=lambda x: x["text"],
    args=training_arguments,
    data_collator=None
)

torch.cuda.empty_cache()
gc.collect()

trainer.train()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
25,2.5873
50,1.9374


TrainOutput(global_step=55, training_loss=2.2181083505803887, metrics={'train_runtime': 320.8938, 'train_samples_per_second': 0.679, 'train_steps_per_second': 0.171, 'total_flos': 4369638208094208.0, 'train_loss': 2.2181083505803887})

In [4]:
trainer.model.save_pretrained(new_model)

In [5]:
trainer.tokenizer.save_pretrained(new_model)

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


('harshil-karia-ai/tokenizer_config.json',
 'harshil-karia-ai/special_tokens_map.json',
 'harshil-karia-ai/tokenizer.model',
 'harshil-karia-ai/added_tokens.json',
 'harshil-karia-ai/tokenizer.json')

In [16]:
import gc

gc.collect()
gc.collect()

0

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

model_name = "NousResearch/Llama-2-7b-chat-hf"
new_model = "harshil-karia-ai"

device_map = {"": 0}

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)

model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


In [18]:
!huggingface-cli login

model.push_to_hub("jyanjain/Harshil-karia-Llama-2-7b-chat-finetune")

tokenizer.push_to_hub("jyanjain/Harshil-karia-Llama-2-7b-chat-finetune")

  adding: harshil-karia-ai/ (stored 0%)
  adding: harshil-karia-ai/adapter_config.json (deflated 55%)
  adding: harshil-karia-ai/tokenizer.json (deflated 85%)
  adding: harshil-karia-ai/README.md (deflated 65%)
  adding: harshil-karia-ai/adapter_model.safetensors (deflated 7%)
  adding: harshil-karia-ai/tokenizer.model (deflated 55%)
  adding: harshil-karia-ai/added_tokens.json (stored 0%)
  adding: harshil-karia-ai/special_tokens_map.json (deflated 78%)
  adding: harshil-karia-ai/tokenizer_config.json (deflated 72%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>