# CodeMind fine tuning
## Methods:
* model: gemma-2b-it

In [1]:
import datasets
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTTrainer

In [2]:
import wandb

wandb.login()

wandb.init(
    project='google gemma 2b it',
    name='peft-qlora',
)

wandb: Currently logged in as: jehwan-kim (codemind). Use `wandb login --relogin` to force relogin


In [3]:
import os

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_id = 'google/gemma-1.1-2b-it'
token = os.getenv('HF_READ')

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"": 0},
                                             token=token)
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True, token=token)

config.json:   0%|          | 0.00/618 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/40.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [4]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
    

In [5]:
# Find the modules in the model for qlora target modules.
import bitsandbytes as bnb


def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit  #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
        if 'lm_head' in lora_module_names:  # needed for 16-bit
            lora_module_names.remove('lm_head')
    return list(lora_module_names)


modules = find_all_linear_names(model)
modules

['down_proj', 'v_proj', 'q_proj', 'k_proj', 'up_proj', 'o_proj', 'gate_proj']

In [6]:
lora_config = LoraConfig(
    r=64,
    lora_alpha=32,
    target_modules=modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable / total * 100:.4f}%")

Trainable: 78446592 | total: 2584619008 | Percentage: 3.0351%


In [7]:
# dataset = datasets.load_dataset('kreimben/leetcode_user_submissions', split='train')
dataset = datasets.load_dataset('kreimben/leetcode_with_youtube_captions', split='train')
# dataset = datasets.load_dataset('csv', data_files='../user_submission_only_python.csv', split='train')
dataset.features

{'cc_content': Value(dtype='string', id=None),
 'id': Value(dtype='int64', id=None),
 'thumbnail': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None),
 'question_content': Value(dtype='string', id=None),
 'java': Value(dtype='string', id=None),
 'c++': Value(dtype='string', id=None),
 'python': Value(dtype='string', id=None),
 'javascript': Value(dtype='string', id=None),
 'title_slug': Value(dtype='string', id=None),
 'tag': Value(dtype='string', id=None),
 'level': Value(dtype='string', id=None),
 'success_rate': Value(dtype='float64', id=None),
 'total_submission': Value(dtype='float64', id=None),
 'total_accepted': Value(dtype='float64', id=None),
 'question_likes': Value(dtype='float64', id=None),
 'question_dislikes': Value(dtype='float64', id=None),
 'question_hints': Value(dtype='string', id=None),
 'similar_question_ids': Value(dtype='string', id=None)}

In [8]:
GEMMA_2B_IT_MODEL_PREFIX_TEXT = """Below is an coding test problem. Solve the question."""

def generate_prompt(data_point):
    return f"""<start_of_turn>user {GEMMA_2B_IT_MODEL_PREFIX_TEXT}\n
I don't know {data_point['title_slug']} problem. give me the insight or appoach.\n
this is problem's hint.\n{data_point["question_hints"]}\n
here are some content of question.\n{data_point["question_content"]}<end_of_turn>
<start_of_turn>model {data_point["cc_content"]}<end_of_turn>"""

In [9]:
# add the "prompt" column in the dataset
text_column = [generate_prompt(data_point) for data_point in dataset]
dataset = dataset.add_column("prompt", text_column)
dataset = dataset.shuffle(seed=42)  # Shuffle dataset here
dataset = dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)

dataset = dataset.train_test_split(test_size=0.2)
train_dataset = dataset["train"]
test_dataset = dataset["test"]

train_dataset, test_dataset

Map:   0%|          | 0/18136 [00:00<?, ? examples/s]

(Dataset({
     features: ['cc_content', 'id', 'thumbnail', 'title', 'question_content', 'java', 'c++', 'python', 'javascript', 'title_slug', 'tag', 'level', 'success_rate', 'total_submission', 'total_accepted', 'question_likes', 'question_dislikes', 'question_hints', 'similar_question_ids', 'prompt', 'input_ids', 'attention_mask'],
     num_rows: 14508
 }),
 Dataset({
     features: ['cc_content', 'id', 'thumbnail', 'title', 'question_content', 'java', 'c++', 'python', 'javascript', 'title_slug', 'tag', 'level', 'success_rate', 'total_submission', 'total_accepted', 'question_likes', 'question_dislikes', 'question_hints', 'similar_question_ids', 'prompt', 'input_ids', 'attention_mask'],
     num_rows: 3628
 }))

In [10]:
# Sample the data.
N = len(test_dataset)

import random

idx = random.randint(0, N)

df = test_dataset.to_pandas()
test = df.loc[idx, 'prompt']
test

'<start_of_turn>user Below is an coding test problem. Solve the question.\n\nI don\'t know design-log-storage-system problem. give me the insight or appoach.\n\nthis is problem\'s hint.\nNone\n\nhere are some content of question.\nYou are given several logs, where each log contains a unique ID and timestamp. Timestamp is a string that has the following format: `Year:Month:Day:Hour:Minute:Second`, for example, `2017:01:01:23:59:59`. All domains are zero-padded decimal numbers.\n\nImplement the `LogSystem` class:\n\n*   `LogSystem()` Initializes the `LogSystem` object.\n*   `void put(int id, string timestamp)` Stores the given log `(id, timestamp)` in your storage system.\n*   `int[] retrieve(string start, string end, string granularity)` Returns the IDs of the logs whose timestamps are within the range from `start` to `end` inclusive. `start` and `end` all have the same format as `timestamp`, and `granularity` means how precise the range should be (i.e. to the exact `Day`, `Minute`, etc

In [11]:
import transformers

tokenizer.pad_token = tokenizer.eos_token
torch.cuda.empty_cache()

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    dataset_text_field="prompt",
    peft_config=lora_config,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=0.03,
        # num_train_epochs=3,
        max_steps=1000,
        logging_steps=10,
        output_dir="out",
        optim="paged_adamw_8bit",
        save_strategy="epoch",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)



Map:   0%|          | 0/14508 [00:00<?, ? examples/s]

Map:   0%|          | 0/3628 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [12]:
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss
10,3.2012
20,2.3221
30,1.9312
40,1.8417
50,1.7951
60,1.7141
70,1.6708
80,1.7253
90,1.6694
100,1.6907


TrainOutput(global_step=1000, training_loss=1.4347172193527222, metrics={'train_runtime': 3467.4149, 'train_samples_per_second': 1.154, 'train_steps_per_second': 0.288, 'total_flos': 4.962371927161651e+16, 'train_loss': 1.4347172193527222, 'epoch': 0.28})

In [13]:
from dotenv import load_dotenv

load_dotenv()

# upload the trained model to huggingface.
peft_model_id = 'kreimben/CodeMind-gemma'
write_token = os.getenv('HF_WRITE')

In [14]:
%cd out/

!echo %cd%

  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


C:\Users\aksid\PycharmProjects\CodeMind\fine-tuning\out
C:\Users\aksid\PycharmProjects\CodeMind\fine-tuning\out


In [15]:
from peft import PeftModel

trainer.model.save_pretrained(peft_model_id + 'Peft')

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)
merged_model = PeftModel.from_pretrained(base_model, peft_model_id + 'Peft')
merged_model = merged_model.merge_and_unload()

# Save the merged model
merged_model.save_pretrained(peft_model_id + 'Merged', safe_serialization=True)
tokenizer.save_pretrained(peft_model_id + 'Merged')
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
from huggingface_hub import notebook_login

# notebook_login()

merged_model.push_to_hub(peft_model_id, token=write_token, use_temp_dir=True)

README.md:   0%|          | 0.00/689 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kreimben/CodeMind-gemma/commit/155d773ddad701b3b7b45336b06009d2641e0dcd', commit_message='Upload GemmaForCausalLM', commit_description='', oid='155d773ddad701b3b7b45336b06009d2641e0dcd', pr_url=None, pr_revision=None, pr_num=None)

In [17]:
tokenizer.push_to_hub(peft_model_id, token=write_token, use_temp_dir=True)

README.md:   0%|          | 0.00/689 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kreimben/CodeMind-gemma/commit/eefce1ba52efc2769760fe95fdc617d57ddaca8d', commit_message='Upload tokenizer', commit_description='', oid='eefce1ba52efc2769760fe95fdc617d57ddaca8d', pr_url=None, pr_revision=None, pr_num=None)