In [None]:
!pip install transformers datasets accelerate peft bitsandbytes

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
#Libraries
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
from transformers import DataCollatorForLanguageModeling
from transformers import BitsAndBytesConfig #For quantization

#Base Model
# Hub identifier of the checkpoint that gets fine-tuned
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Quantization settings passed to from_pretrained() below:
# 4-bit loading, 'nf4' quant type, double quantization enabled, float16 compute dtype
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype='float16'
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the EOS token as the padding token; tokenize() pads every example to max_length
# (presumably this tokenizer has no pad token of its own — TODO confirm)
tokenizer.pad_token = tokenizer.eos_token

# Load the base model with the quantization settings defined above
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")



# LoRA adapter settings for a causal language model
lora_config = LoraConfig(
    r=8, # A higher value of r allows more complex updates, potentially leading to better performance on complex tasks.
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"], # names of the modules targeted by LoRA
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Wrap the loaded model with the LoRA configuration; `model` is rebound to the wrapped model
model = get_peft_model(model, lora_config)

#Dataset: in my case I used a ChatML-style dataset in JSONL format. In JSONL format every line contains one JSON object.
# NOTE(review): the file name ends in .json although the layout described above is JSONL — confirm the file is line-delimited
data = load_dataset("json", data_files="Final_dataset.json")

# Prompt construction and tokenization of a single record
def tokenize(example):
    """Render one dataset record as a tagged prompt and tokenize it.

    The record's 'instruction', 'input' and 'output' fields are each wrapped in
    <|im_start|>...<|im_end|> tags and concatenated, then passed to the
    notebook-level `tokenizer` with truncation and padding to 512 tokens.

    Returns whatever the tokenizer call returns.
    """
    prompt_text = "".join([
        f"<|im_start|>instruction\n{example['instruction']}<|im_end|>\n",
        f"<|im_start|>input\n{example['input']}<|im_end|>\n",
        f"<|im_start|>output\n{example['output']}<|im_end|>",
    ])
    encoded = tokenizer(prompt_text, max_length=512, padding='max_length', truncation=True)
    return encoded
# Apply tokenize() to every record of the "train" split
tokenized_data = data["train"].map(tokenize)

# Increasing num_train_epochs, per_device_train_batch_size or gradient_accumulation_steps will increase the training time
training_args = TrainingArguments(
    output_dir="./tiny-assistant",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4, # with a per-device batch size of 1, gradients from 4 examples are accumulated
    learning_rate=2e-5,
    num_train_epochs=1,
    logging_steps=10,
    save_strategy="epoch",
    bf16=False,
    fp16=True, # float16 mixed precision on, bfloat16 off
    report_to="none"
)

# Collator for causal language modeling (mlm=False disables masked-language-model batching)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# NOTE(review): newer transformers releases rename Trainer's `tokenizer` argument to
# `processing_class` — confirm against the installed version
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Start fine-tuning
trainer.train()


SyntaxError: invalid syntax (ipython-input-1586301333.py, line 68)

In [None]:
# Mount your Google Drive and save the trained weights on Drive
from google.colab import drive

# Mount first so that a save_path under /content/drive is actually written to Drive
drive.mount('/content/drive')

save_path = "path"  # placeholder: replace with the target folder, e.g. one under /content/drive/MyDrive
trainer.save_model(save_path)

In [None]:
# The weights you trained are now saved on Google Drive, with files like adapter_config.json and training_args.bin

In [None]:
# This script merges the trained LoRA weights into the base model and stores the result on Google Drive
from google.colab import drive
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

drive.mount('/content/drive')

base_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # base model
lora_path = "/content/drive/MyDrive/fine_tune_outputs/checkpoint-5000"  # LoRA checkpoint in Google Drive
save_path = "/content/drive/MyDrive/merged_tinyllama_model"  # where the merged model is saved in Drive

# Tokenizer and base model both come from the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
model = AutoModelForCausalLM.from_pretrained(base_model_id)

# Load the LoRA checkpoint on top of the base model, then merge it in
model = PeftModel.from_pretrained(model, lora_path).merge_and_unload()

# Save the merged model and the tokenizer to the same Drive folder
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Merged model saved at {save_path}")


In [None]:
# The merged model saved on Drive should now contain files like model.safetensors, tokenizer.json and tokenizer.model. This process usually takes 3 to 5 minutes.

In [1]:
#The next step is to create a Modelfile and convert the model into a file with the .gguf extension.
#Ollama uses GGUF as the backend format for LLaMA-family models (including TinyLlama).

# You can use this video for reference - https://youtu.be/NirQJr85Qgc?si=dk_FgbUstckUUGxz

# I recommend doing this step inside Colab because that is much easier.

In [None]:
#Step to convert model to gguf format
!git clone https://github.com/ggerganov/llama.cpp #clone the repo

In [None]:
%cd "llama.cpp"
!git checkpoint 46c69e0

In [None]:
!pip install -r requirements.txt

In [None]:
# Go back to the parent directory; the conversion command refers to the script as llama.cpp/convert_hf_to_gguf.py
%cd ..

In [None]:
!python llama.cpp/convert_hf_to_gguf.py \
    /content/drive/MyDrive/deployed \
    --outfile path_to_gguf_file \
    --outtype auto

In [2]:
#Now create the Modelfile
# This video is helpful -  https://youtu.be/pTaSDVz0gok?si=bpdTwifBJv1_Xju4
#Make a folder and put your gguf file in it
#Open a terminal, go to the folder where your model file is, and run these commands

touch Modelfile # this command creates the Modelfile

nano Modelfile # opens the Modelfile in nano

# Paste this

# GGUF file to build the Ollama model from (replace with the path to your .gguf file)
FROM ./path_to_gguffile.gguf


# Sampling parameters
PARAMETER temperature 0.7
PARAMETER top_p 0.9


# Prompt layout; uses the same instruction/input/output tags as the training prompts
TEMPLATE """<|im_start|>instruction
{{ .System }}<|im_end|>
<|im_start|>input
{{ .Prompt }}<|im_end|>
<|im_start|>output
"""

# The closing triple quote is required; without it the SYSTEM text swallows the lines that follow
SYSTEM """You are Vulcan, an uncensored and unbiased AI assistant."""

# Stop generation at the end-of-turn tag
PARAMETER stop "<|im_end|>"


SyntaxError: invalid syntax (ipython-input-2044834616.py, line 5)

In [None]:
#In nano, press Ctrl+X, then Y, then Enter
# Now the final command
ollama create model_name -f Modelfile

#Then
ollama list