# CodeMind fine tuning
## Methods:
* model: gemma-2b-it

In [1]:
import datasets
import pandas as pd
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTTrainer

In [2]:
import wandb

wandb.login()

wandb.init(
    project='google gemma 2b it',
    name='peft-qlora dataset concat',
)

wandb: Currently logged in as: jehwan-kim (codemind). Use `wandb login --relogin` to force relogin


In [3]:
import os

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_id = 'google/gemma-1.1-2b-it'
token = os.getenv('HF_READ')

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"": 0},
                                             token=token)
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True, token=token)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
    

In [5]:
# Find the modules in the model for qlora target modules.
import bitsandbytes as bnb


def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit  #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
        if 'lm_head' in lora_module_names:  # needed for 16-bit
            lora_module_names.remove('lm_head')
    return list(lora_module_names)


modules = find_all_linear_names(model)
modules

['up_proj', 'down_proj', 'k_proj', 'q_proj', 'gate_proj', 'o_proj', 'v_proj']

In [6]:
lora_config = LoraConfig(
    r=64,
    lora_alpha=32,
    target_modules=modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable / total * 100:.4f}%")

Trainable: 78446592 | total: 2584619008 | Percentage: 3.0351%


In [12]:
submission_dataset = datasets.load_dataset('csv', data_files='../user_submission_only_python.csv', split='train').to_pandas()
submission_dataset

Unnamed: 0,title_slug,question_content,tag,level,question_hints,view_count,vote_count,content
0,two-sum,Given an array of integers nums and an integer...,"Array,Hash Table",Easy,A really brute force way would be to search fo...,630455,3674,# Intuition\n<!-- Describe your first thoughts...
1,two-sum,Given an array of integers nums and an integer...,"Array,Hash Table",Easy,A really brute force way would be to search fo...,480307,1288,# Beginner doubt - Where is main function?\n- ...
2,two-sum,Given an array of integers nums and an integer...,"Array,Hash Table",Easy,A really brute force way would be to search fo...,6898,16,# **Read article Explaination and codes : \n\n...
3,two-sum,Given an array of integers nums and an integer...,"Array,Hash Table",Easy,A really brute force way would be to search fo...,51383,263,# Intuition\n<!-- Describe your first thoughts...
4,two-sum,Given an array of integers nums and an integer...,"Array,Hash Table",Easy,A really brute force way would be to search fo...,107540,1045,If you\'re a newbie and sometimes have a hard ...
...,...,...,...,...,...,...,...,...
17856,root-equals-sum-of-children,You are given the root of a binary tree that c...,"Tree,Binary Tree",Easy,,2950,19,***Happy Coding..!* Feel free to ask Q\'s...**...
17857,root-equals-sum-of-children,You are given the root of a binary tree that c...,"Tree,Binary Tree",Easy,,762,3,"```\ndef checkTree(self, root: Optional[TreeNo..."
17858,root-equals-sum-of-children,You are given the root of a binary tree that c...,"Tree,Binary Tree",Easy,,3618,17,```\n# Definition for a binary tree node.\n# c...
17859,root-equals-sum-of-children,You are given the root of a binary tree that c...,"Tree,Binary Tree",Easy,,5828,19,"```\nclass Solution:\n def checkTree(self, ..."


In [13]:
submission_dataset = submission_dataset[['title_slug', 'question_hints', 'question_content', 'content']]
captions_dataset = datasets.load_dataset('kreimben/leetcode_with_youtube_captions', split='train').to_pandas()[['title_slug', 'question_hints', 'question_content', 'cc_content']]

dataset = pd.concat([submission_dataset, captions_dataset])

In [14]:
dataset.sample(10)

Unnamed: 0,title_slug,question_hints,question_content,content,cc_content
251,longest-common-prefix,,Write a function to find the longest common pr...,# Please UPVOTE\uD83D\uDE0A\n![image.png]()\n\...,
13528,most-visited-sector-in-a-circular-track,For each round increment the visits of the sec...,Given an integer n and an integer array rounds...,![Screenshot 2022-12-28 at 13.38.21.png]()\n\n...,
6636,implement-trie-prefix-tree,,A [**trie**](https://en.wikipedia.org/wiki/Tri...,,hey so welcome back and today is March 17th an...
5764,find-all-numbers-disappeared-in-an-array,This is a really easy problem if you decide to...,Given an array `nums` of `n` integers where `n...,,problem so let's go to the problem statement u...
6592,next-greater-element-iii,,"Given a positive integer n, find the smallest ...",# Intuition\n<!-- Describe your first thoughts...,
12925,people-whose-list-of-favorite-companies-is-not...,Use hashing to convert company names in number...,Given the array favoriteCompanies where favori...,# Complexity\n- Time complexity: O(N*N)\n- Spa...,
1146,basic-calculator,,Given a string `s` representing a valid expres...,,in this video when you look at an equal proble...
959,rotate-list,,"Given the head of a linked list, rotate the li...",# Intuition\n<!-- Describe your first thoughts...,
13299,maximum-product-subarray,,"Given an integer array `nums`, find a subarray...",,hello everyone so today we will be doing lead ...
6261,string-compression-ii,Use dynamic programming. The state of the DP c...,Given `n` `points` on a 2D plane where `points...,,Hello friends today I'm going to solve L Cod p...


In [17]:
from datasets import Dataset

dataset = Dataset.from_pandas(dataset)

In [18]:
GEMMA_2B_IT_MODEL_PREFIX_TEXT = """Below is an coding test problem. Solve the question."""


def generate_prompt(data_point):
    return f"""<start_of_turn>user {GEMMA_2B_IT_MODEL_PREFIX_TEXT}\n
I don't know {data_point['title_slug']} problem. give me the insight or appoach.\n
this is problem's hint.\n{data_point["question_hints"]}\n
here are some content of question.\n{data_point["question_content"]}<end_of_turn>
<start_of_turn>model {data_point["cc_content"]}<end_of_turn>"""

In [19]:
# add the "prompt" column in the dataset
text_column = [generate_prompt(data_point) for data_point in dataset]
dataset = dataset.add_column("prompt", text_column)
dataset = dataset.shuffle(seed=42)  # Shuffle dataset here
dataset = dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)

dataset = dataset.train_test_split(test_size=0.2)
train_dataset = dataset["train"]
test_dataset = dataset["test"]

train_dataset, test_dataset

Map:   0%|          | 0/35997 [00:00<?, ? examples/s]

(Dataset({
     features: ['title_slug', 'question_hints', 'question_content', 'content', 'cc_content', '__index_level_0__', 'prompt', 'input_ids', 'attention_mask'],
     num_rows: 28797
 }),
 Dataset({
     features: ['title_slug', 'question_hints', 'question_content', 'content', 'cc_content', '__index_level_0__', 'prompt', 'input_ids', 'attention_mask'],
     num_rows: 7200
 }))

In [20]:
# Sample the data.
N = len(test_dataset)

import random

idx = random.randint(0, N)

df = test_dataset.to_pandas()
test = df.loc[idx, 'prompt']
test

"<start_of_turn>user Below is an coding test problem. Solve the question.\n\nI don't know the-most-recent-orders-for-each-product problem. give me the insight or appoach.\n\nthis is problem's hint.\nNone\n\nhere are some content of question.\nYou are given an integer `n`, the number of teams in a tournament that has strange rules:\n\n*   If the current number of teams is **even**, each team gets paired with another team. A total of `n / 2` matches are played, and `n / 2` teams advance to the next round.\n*   If the current number of teams is **odd**, one team randomly advances in the tournament, and the rest gets paired. A total of `(n - 1) / 2` matches are played, and `(n - 1) / 2 + 1` teams advance to the next round.\n\nReturn _the number of matches played in the tournament until a winner is decided._\n\n**Example 1:**\n\n**Input:** n = 7\n**Output:** 6\n**Explanation:** Details of the tournament: \n- 1st Round: Teams = 7, Matches = 3, and 4 teams advance.\n- 2nd Round: Teams = 4, Ma

In [21]:
import transformers

tokenizer.pad_token = tokenizer.eos_token
torch.cuda.empty_cache()

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    dataset_text_field="prompt",
    peft_config=lora_config,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=0.03,
        # num_train_epochs=3,
        max_steps=500,
        logging_steps=10,
        output_dir="out",
        optim="paged_adamw_8bit",
        save_strategy="epoch",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)



Map:   0%|          | 0/28797 [00:00<?, ? examples/s]

Map:   0%|          | 0/7200 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [22]:
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss
10,4.071
20,2.5586
30,2.0648
40,1.7388
50,1.6423
60,1.744
70,1.4827
80,1.618
90,1.4373
100,1.4806


TrainOutput(global_step=500, training_loss=1.4551381168365478, metrics={'train_runtime': 1185.713, 'train_samples_per_second': 1.687, 'train_steps_per_second': 0.422, 'total_flos': 1.4871506301702144e+16, 'train_loss': 1.4551381168365478, 'epoch': 0.07})

In [23]:
from dotenv import load_dotenv

load_dotenv()

# upload the trained model to huggingface.
peft_model_id = 'kreimben/CodeMind-gemma'
write_token = os.getenv('HF_WRITE')

In [24]:
%cd out/

!echo %cd%

  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


C:\Users\aksid\PycharmProjects\CodeMind\fine-tuning\out
C:\Users\aksid\PycharmProjects\CodeMind\fine-tuning\out


In [25]:
from peft import PeftModel

trainer.model.save_pretrained(peft_model_id + 'Peft')

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)
merged_model = PeftModel.from_pretrained(base_model, peft_model_id + 'Peft')
merged_model = merged_model.merge_and_unload()

# Save the merged model
merged_model.save_pretrained(peft_model_id + 'Merged', safe_serialization=True)
tokenizer.save_pretrained(peft_model_id + 'Merged')
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [26]:
from huggingface_hub import notebook_login

# notebook_login()

merged_model.push_to_hub(peft_model_id, token=write_token, use_temp_dir=True)

README.md:   0%|          | 0.00/657 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kreimben/CodeMind-gemma/commit/2b33db405ba00e6e31db34c04c38994e2a5c572b', commit_message='Upload GemmaForCausalLM', commit_description='', oid='2b33db405ba00e6e31db34c04c38994e2a5c572b', pr_url=None, pr_revision=None, pr_num=None)

In [27]:
tokenizer.push_to_hub(peft_model_id, token=write_token, use_temp_dir=True)

README.md:   0%|          | 0.00/657 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kreimben/CodeMind-gemma/commit/7b3d35c64bfa314e597c576acdb895450c2fa870', commit_message='Upload tokenizer', commit_description='', oid='7b3d35c64bfa314e597c576acdb895450c2fa870', pr_url=None, pr_revision=None, pr_num=None)