# Fine tuning OPT350m on a single GPU

**GOAL:** fine tune opt350m for code assistance 
**Method:**  train on response only using code-alpaca-20k dataset


### Import libraries

In [11]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments)
from trl import SFTTrainer,DataCollatorForCompletionOnlyLM
from pynvml import *

### Some useful functions for analyzing the GPU

In [12]:
def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [13]:
print_gpu_utilization()

GPU memory occupied: 2443 MB.


### Bits and Bytes configuration for using quantization

In [14]:
compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)


### Load the model

In [5]:

model = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-350m",
        #quantization_config=bnb_config,
        device_map={"": 0},
        trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m", trust_remote_code=True)
#tokenizer.pad_token = tokenizer.eos_token

In [6]:
print(model)

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=409

### Try the model

In [7]:
from transformers import pipeline, set_seed
from transformers import TextStreamer, pipeline
streamer = TextStreamer(tokenizer, skip_prompt=True)

set_seed(32)
generator = pipeline('text-generation', 
                     model=model, 
                     tokenizer=tokenizer, 
                     streamer = streamer, 
                     do_sample=True,
                     max_length= 256)
prompt = """Code a function for calculating prime numbers"""
_=generator(prompt)

, and write the code to define the first four number elements a given number of times, or the definition of the last 4 number elements a given number of times (or the third number element is an infinite number of times). The code can be used to compute the prime numbers independently for arbitrary values of the number; for example, if the function returns a single prime number, the calculation is performed by using the value of the first four number element of the first parameter alone, and only two prime numbers are required for the computation (i.e., the first 4 number element).
In one preferred embodiment, the function to calculate prime numbers may be a function to determine how many prime numbers a given number of times are included in a set of prime numbers. This function also preferably has the capability of calculating the prime numbers and prime numbers in successive numbers, and calculating the prime numbers and prime numbers in successive numbers independently of each other.

## TRAINING

### LoRa Setup

In [18]:

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.5,
    r=32,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj","k_proj"] # obtained by the output of the model
)

model.config.use_cache = False
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

### Trainer Arguments

In [20]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [21]:

training_arguments = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    optim='adamw_bnb_8bit',
    save_steps=250,
    fp16=True,
    logging_steps=10,
    learning_rate=2e-5,
    max_grad_norm=0.3,
    #max_steps=5000,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    evaluation_strategy=    
)

trainable params: 4718592 || all params: 335915008 || trainable%: 1.4046981788917272


### Load Dataset

In [22]:
train_dataset = load_dataset("lucasmccabe-lmi/CodeAlpaca-20k", split="train")

Found cached dataset parquet (/home/harpo/.cache/huggingface/datasets/lucasmccabe-lmi___parquet/lucasmccabe-lmi--CodeAlpaca-20k-b92d1194a2c963a0/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [23]:
train_dataset[:-1]

{'instruction': ['Create a function that takes a specific input and produces a specific output using any mathematical operators. Write corresponding code in Python.',
  'Generate a unique 8 character string that contains a lowercase letter, an uppercase letter, a numerical digit, and a special character. Write corresponding code in Python.',
  'Given a course consisting of 5 lessons, write a program to display the first lesson. Write corresponding code in Python.',
  'Create an algorithm to encourage work balance and productivity in remote work.',
  'Write a JavaScript that changes the text of a paragraph element, from "Hello, World!" to "Goodbye, World!"',
  'Implement a sorting algorithm which takes an array of integers as input and sorts it in ascending order.',
  'Generate a C code snippet to print a given string with a width of 20 characters. Write corresponding code in Python.',
  'Construct a loop in Swift to find duplicate numbers in an array. Write corresponding code in Python

### Format data for training

In [24]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['instruction'])):
        text = f"### Question: {example['instruction'][i]}\n ### Answer: {example['output'][i]}"
        output_texts.append(text)
    return output_texts

response_template = " ### Answer:"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

### Train model

In [25]:
import os
#os.environ["WANDB_DISABLED"] = "true"
#os.environ["WANDB_NOTEBOOK_NAME"] = "opt350m-codealpaca-20k2"
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    peft_config=peft_config,
    #dataset_text_field="instruction",
    formatting_func= formatting_prompts_func,
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    data_collator= collator
)

#for name, module in trainer.model.named_modules():
#    if "norm" in name:
#        module = module.to(torch.float32)

trainer.train()


Loading cached processed dataset at /home/harpo/.cache/huggingface/datasets/lucasmccabe-lmi___parquet/lucasmccabe-lmi--CodeAlpaca-20k-b92d1194a2c963a0/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-504195368976ddd5.arrow
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


skipped Embedding(50272, 512, padding_idx=1): 24.546875M params
skipped OPTLearnedPositionalEmbedding(2050, 1024): 26.548828125M params
skipped: 26.548828125M params


[34m[1mwandb[0m: Currently logged in as: [33mharpomaxx[0m ([33mlabsin[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,2.1707
20,1.9967


TrainOutput(global_step=5000, training_loss=1.4813128192901612, metrics={'train_runtime': 1220.433, 'train_samples_per_second': 32.775, 'train_steps_per_second': 4.097, 'total_flos': 1.44801829134336e+16, 'train_loss': 1.4813128192901612, 'epoch': 2.0})

In [26]:
model.save_pretrained("models/")

## INFERENCE

In [2]:
import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments)
from transformers import pipeline, set_seed
from transformers import TextStreamer, pipeline

compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)


model = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-350m",
        quantization_config=bnb_config,
        device_map={"": 0},
        trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m", trust_remote_code=True)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from peft import PeftModel
# You can comment and un comment this line to either use base model 
# or the peft model during the inference.
model = PeftModel.from_pretrained(model,'../../../models/',local_files_only=True)


In [4]:
set_seed(32)
streamer = TextStreamer(tokenizer, skip_prompt=True)
generator = pipeline('text-generation', 
                     model=model, 
                     tokenizer=tokenizer, 
                     streamer = streamer, 
                     do_sample=True,
                     max_length= 256)


The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MvpForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausalLM', 'RobertaForCausalLM', 'RobertaPreLayerNormForCausalLM', 'RoCBer

In [5]:
prompt = """Code a function in python for calculating prime numbers ### Answer:"""
_=generator(prompt)

 def prime_ numbers(): 
  return 0, 1, 2, 3, 4, 5, 6
  end

# print number result

total = (total #1, total #2, total #3, total #4) + 1
pipys = []
result = n1, result = pips2, result_placement = kpips1, result_value = kpips2>= kpips2 

print(result) # print (0) 
# print (1) 
# print (2) 
print(pipys) # print (3) 

# print (4) 
print(result_placement) # print (5) 
console.log(result) # Output: 
"1" 
console.log(result) # Output: "8"

# print (5) 
console.log(result_value) # Output: "11"
 
# print (6)
console.log(result_value) # Output: "13"
 
pipys_input = "1


### Gradio UI

In [None]:

import gradio as gr

demo = gr.Interface.from_pipeline(generator)
demo.launch(share=True)

## MERGE LoRa

In [7]:
merged_model = model.merge_and_unload()
merged_model.save_pretrained("../../../models/opt350m-codealpaca-20k/")

### Try new model

In [1]:
import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments)
from transformers import pipeline, set_seed
from transformers import TextStreamer, pipeline

model_code = AutoModelForCausalLM.from_pretrained(
        "../../../models/opt350m-codealpaca-20k/",
        local_files_only= True,
        #quantization_config=bnb_config,
        device_map={"": 0},
        trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m", trust_remote_code=True)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(model_code)

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=409

In [9]:
set_seed(32)
streamer = TextStreamer(tokenizer, skip_prompt=True)
generator = pipeline('text-generation', 
                     model=model_code, 
                     tokenizer=tokenizer, 
                     streamer = streamer, 
                     do_sample=True,
                     max_length= 256)

In [10]:
prompt = """Code a function in python for calculating prime numbers ### Answer:"""
_=generator(prompt)

 def prime_ numbers(): 
  return 0, 1, 2, 3, 4, 5, 6
  end

# print number result

total = (total #1, total #2, total #3, total #4) + 1
pipys = []
result = n1, result = pips2, result_placement = kpips1, result_value = kpips2>= kpips2 

print(result) # print (0) 
# print (1) 
# print (2) 
print(pipys) # print (3) 

# print (4) 
print(result_placement) # print (5) 
console.log(result) # Output: 
"1" 
console.log(result) # Output: "8"

# print (5) 
console.log(result_value) # Output: "11"
 
# print (6)
console.log(result_value) # Output: "13"
 
pipys_input = "1


## REFERENCES

[1] https://towardsdatascience.com/fine-tuning-large-language-models-llms-23473d763b91

[2] https://huggingface.co/docs/trl/sft_trainer#train-on-completions-only

[3] https://huggingface.co/facebook/opt-350m

[4] https://medium.com/@rohit.pegallapati/fine-tune-falcon-7b-instruct-model-on-single-commodity-gpu-cf65a86c043a

[5] https://huggingface.co/docs/transformers/v4.23.1/en/perf_train_gpu_one

[6] [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/pdf/2205.01068.pdf)

[7] https://huggingface.co/docs/peft/conceptual_guides/lora