# CodeMind fine tuning
## Methods:
* model: gemma-2b-it

In [1]:
import datasets
import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTTrainer

In [2]:
import wandb

wandb.login()

wandb.init(
    project='google gemma 2b it',
    name='peft-qlora devocean',
)

wandb: Currently logged in as: jehwan-kim (codemind). Use `wandb login --relogin` to force relogin


In [3]:
import os

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_id = 'google/gemma-1.1-2b-it'
token = os.getenv('HF_READ')

model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map={"": 0},
                                             token=token)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

tokenizer = AutoTokenizer.from_pretrained(model_id)
# FP16 issue in gemma tokenizer.
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
tokenizer.add_bos_token, tokenizer.add_eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

(True, True)

In [4]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
    

In [5]:
# Find the modules in the model for qlora target modules.
import bitsandbytes as bnb


def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit  #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
        if 'lm_head' in lora_module_names:  # needed for 16-bit
            lora_module_names.remove('lm_head')
    return list(lora_module_names)


modules = find_all_linear_names(model)
modules

['up_proj', 'k_proj', 'down_proj', 'q_proj', 'v_proj', 'gate_proj', 'o_proj']

In [6]:
lora_config = LoraConfig(
    r=6,
    lora_alpha=8,
    lora_dropout=0.05,
    target_modules=modules,
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)

trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable / total * 100:.4f}%")

Trainable: 7354368 | total: 2513526784 | Percentage: 0.2926%


In [7]:
submission_dataset = datasets.load_dataset('kreimben/leetcode_user_submissions_only_python', split='train').to_pandas()
submission_dataset = submission_dataset[['title', 'question_hints', 'question_content', 'content']]
captions_dataset = datasets.load_dataset('kreimben/leetcode_with_youtube_captions', split='train').to_pandas()
captions_dataset = captions_dataset[['title', 'question_hints', 'question_content', 'cc_content']]
captions_dataset.rename(columns={'cc_content': 'content'}, inplace=True)

dataset = pd.concat([submission_dataset, captions_dataset])

del submission_dataset, captions_dataset

In [8]:
dataset.sample(10)

Unnamed: 0,title,question_hints,question_content,content
6558,Critical Connections in a Network,"After dividing the array into K+1 sub-arrays, ...",There are `n` servers numbered from `0` to `n ...,hello dear how are you doing oh hello hello he...
1651,Longest Increasing Subsequence,,"Given an integer array nums, return the length...","Basically, we just neet an array to keep track..."
6883,Subsets II,,Given an integer array `nums` that may contain...,a we're looking at lead code 90 it's called su...
17252,Check If String Is a Prefix of Array,It is always optimal to buy the least expensiv...,Given a string `s` and an array of strings `wo...,that's all the code 1961 check if string is a ...
3351,Maximum XOR With an Element From Array,Suppose the first digit you need is 'd'. How c...,You are given an array `nums` consisting of no...,More Amal Tomato Channels Rating Speed Manual ...
1566,Integer to English Words,Did you see a pattern in dividing the number i...,Convert a non-negative integer num to its Engl...,***Hello it would be my pleasure to introduce ...
2923,Number of Matching Subsequences,,Given a string s and an array of strings words...,The naive way of checking if a word is subsequ...
14071,Range Addition II,,You are given an `m x n` matrix `M` initialize...,hey hey everybody this is larry this is dave d...
14703,Unique Paths III,,You are given an `m x n` integer array `grid` ...,A Firewall Welcome To Bank Ko DP Tubelight Eng...
9521,Path Sum,,Given the `root` of a binary tree and an integ...,we have given a tree and we have given an inte...


In [9]:
from datasets import Dataset

dataset = Dataset.from_pandas(dataset)

In [10]:
GEMMA_2B_IT_MODEL_PREFIX_TEXT = "You are a kind coding test teacher. Solve the given question and give the insight and approach."

In [11]:
# messages = [
#     {"role": "user",
#      "content": f"{GEMMA_2B_IT_MODEL_PREFIX_TEXT}\n\n{dataset['title'][300]}"},
#     {"role": "assistant",
#      "content": f'testtesttest'}
# ]
# chat_message = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
# chat_message

In [12]:
def generate_prompt(data_point):
    return f"""<bos><start_of_turn>user
{GEMMA_2B_IT_MODEL_PREFIX_TEXT}

{data_point["title"]}<end_of_turn>
<start_of_turn>model
{data_point["content"]} <end_of_turn><eos>"""

In [13]:
# add the "prompt" column in the dataset
text_column = [generate_prompt(data_point) for data_point in dataset]
dataset = dataset.add_column("prompt", text_column)

In [14]:
# Sample the data.
N = len(dataset)

import random

idx = random.randint(0, N)

df = dataset.to_pandas()
test = df.loc[idx, 'prompt']
test

"<bos><start_of_turn>user\nYou are a kind coding test teacher. Solve the given question and give the insight and approach.\n\nCounting Bits<end_of_turn>\n<start_of_turn>model\nhey guys persistent programmer here and welcome back to my channel so in this welcome back to my channel so in this channel we solve a lot of algorithms and channel we solve a lot of algorithms and go over legal questions so if you go over legal questions so if you haven't subscribed already go ahead and haven't subscribed already go ahead and hit the subscribe button smash that like hit the subscribe button smash that like button because that helps me create this button because that helps me create this content for you guys so without further content for you guys so without further ado let's go ahead and look at today's ado let's go ahead and look at today's problem problem counting bits and what we're given here counting bits and what we're given here is a number n is a number n and what we need to do is return

In [15]:
import transformers

tokenizer.pad_token = tokenizer.eos_token
torch.cuda.empty_cache()

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    dataset_text_field="prompt",
    peft_config=lora_config,
    max_seq_length=512,
    # packing=False,
    # data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),

    args=transformers.TrainingArguments(
        output_dir='out',
        max_steps=200,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        optim="paged_adamw_8bit",
        warmup_steps=0.03,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=20,
        report_to='wandb',
    ),
)

Map:   0%|          | 0/23627 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [16]:
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss
20,3.1789
40,2.2382
60,2.1024
80,2.0372
100,1.819
120,2.0988
140,1.8068
160,1.975
180,1.8548
200,1.8184


TrainOutput(global_step=200, training_loss=2.092965488433838, metrics={'train_runtime': 283.4532, 'train_samples_per_second': 2.822, 'train_steps_per_second': 0.706, 'total_flos': 4606599606435840.0, 'train_loss': 2.092965488433838, 'epoch': 0.03})

In [17]:
wandb.finish()
model.config.use_cache = True

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▃▃▃▆▆▆████
train/global_step,▁▂▃▃▄▅▆▆▇██
train/grad_norm,▇▂▁▁▄▂▁▂█▂
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▃▃▂▁▂▁▂▁▁

0,1
total_flos,4606599606435840.0
train/epoch,0.03
train/global_step,200.0
train/grad_norm,1.20525
train/learning_rate,0.0
train/loss,1.8184
train_loss,2.09297
train_runtime,283.4532
train_samples_per_second,2.822
train_steps_per_second,0.706


In [18]:
from dotenv import load_dotenv

load_dotenv()

# upload the trained model to huggingface.
peft_model_id = 'kreimben/CodeMind-gemma-2b'
write_token = os.getenv('HF_WRITE')

In [19]:
%cd out/

!echo %cd%

  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


C:\Users\aksid\PycharmProjects\CodeMind\fine-tuning\out
C:\Users\aksid\PycharmProjects\CodeMind\fine-tuning\out


In [20]:
from peft import PeftModel

trainer.model.save_pretrained(peft_model_id + 'Peft')

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map='auto',
)
merged_model = PeftModel.from_pretrained(
    base_model,
    peft_model_id + 'Peft',
    torch_dtype=torch.float16,
    device_map='auto',
)
merged_model = merged_model.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [21]:
# Save the merged model
merged_model.save_pretrained(peft_model_id + 'Merged')  #, safe_serialization=True)
tokenizer.save_pretrained(peft_model_id + 'Merged')

('kreimben/CodeMind-gemma-2bMerged\\tokenizer_config.json',
 'kreimben/CodeMind-gemma-2bMerged\\special_tokens_map.json',
 'kreimben/CodeMind-gemma-2bMerged\\tokenizer.json')

In [22]:
# from transformers import GenerationConfig
# 
# gen_config = GenerationConfig(
#     do_sample=True,
#     temperature=0.2,
#     top_k=50,
#     top_p=0.95,
#     add_special_tokens=True
# )
# 
# gen_config.validate()
# 
# gen_config.push_to_hub(peft_model_id, token=write_token)
# merged_model.generation_config = gen_config

In [23]:
merged_model.push_to_hub(peft_model_id, token=write_token, use_temp_dir=True)

README.md:   0%|          | 0.00/4.21k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kreimben/CodeMind-gemma-2b/commit/ea2e9bc2120a551dea19d09be9def8b505b4fbe5', commit_message='Upload GemmaForCausalLM', commit_description='', oid='ea2e9bc2120a551dea19d09be9def8b505b4fbe5', pr_url=None, pr_revision=None, pr_num=None)

In [24]:
tokenizer.push_to_hub(peft_model_id, token=write_token, use_temp_dir=True)

README.md:   0%|          | 0.00/4.21k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kreimben/CodeMind-gemma-2b/commit/4b433f41e5093d2d066afde593cbfe93582bbdf0', commit_message='Upload tokenizer', commit_description='', oid='4b433f41e5093d2d066afde593cbfe93582bbdf0', pr_url=None, pr_revision=None, pr_num=None)