# CodeMind fine tuning
## Methods:
* model: tiiuae/falcon-7b
* PEFT, QLoRA

In [1]:
# Mount Google Drive inside the Colab VM at /content/drive so that files stored on Drive
# can be accessed through the regular file system.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Make the CodeMind folder on the mounted Drive the working directory; relative paths used
# later in the notebook (e.g. requirements.txt, the 'out' training directory) resolve against it.
%cd /content/drive/MyDrive/CodeMind

/content/drive/MyDrive/CodeMind


In [None]:
!pip install -r requirements.txt --upgrade

In [44]:
import datasets
import evaluate
import numpy as np
import torch
from google.colab import userdata
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    DataCollatorWithPadding,
    GenerationConfig,
    Trainer,
    TrainingArguments,
)

In [5]:
import wandb

# Authenticate against Weights & Biases; prompts for an API key when none is stored yet.
wandb.login()

# Open the run that receives the training logs (TrainingArguments reports to 'wandb').
wandb.init(name='4 bit qlora', project='falcon-7b')

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjehwan-kim[0m ([33mcodemind[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [50]:
# --- Base checkpoint and Hugging Face read token (taken from Colab's secret store) ---
model_id = 'tiiuae/falcon-7b'
token = userdata.get('HF_READ')

# --- 4-bit NF4 quantization settings handed to the model loader; matmuls compute in bfloat16 ---
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# --- LoRA adapter hyper-parameters for causal language modelling (training mode) ---
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

# Tokenizer and quantized base model are both loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    token=token,
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [51]:
# Prepare the 4-bit quantized base model for training before attaching adapters, as the PEFT
# quantization guide recommends, then wrap it with the LoRA adapters described by `lora_config`.
# NOTE: this cell rebinds `model`. Re-run the model-loading cell before executing it a second
# time; otherwise the already-wrapped model would presumably be wrapped again.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# Report how many parameters are trainable versus the total parameter count.
model.print_trainable_parameters()

trainable params: 2,359,296 || all params: 6,924,080,000 || trainable%: 0.03407378308742822


In [57]:
# The data collator needs a pad token to batch sequences; the EOS token is reused for that.
tokenizer.pad_token = tokenizer.eos_token

# Load the training split and drop the columns that are not used to build the prompt.
df = datasets.load_dataset('kreimben/leetcode_with_youtube_captions', split='train')
df = df.remove_columns(['id', 'thumbnail', 'title_x', 'title_y', 'java', 'c++', 'python', 'javascript', 'similar_question_ids', 'success_rate', 'total_submission', 'total_accepted', 'question_likes', 'question_dislikes',])


def formatting_func(d):
    """Build one training text from a dataset row.

    The question, tag, level and hints form the `<human>` turn and the caption text
    (`cc_content`) forms the `<assistant>` turn. Leading/trailing whitespace is stripped.

    Args:
        d: mapping with the keys 'question_content', 'tag', 'level', 'question_hints'
           and 'cc_content'.

    Returns:
        The formatted prompt string.
    """
    return f"<human>: {d['question_content']}\nTag: {d['tag']}\nLevel: {d['level']}\nHints: {d['question_hints']}\n<assistant>: {d['cc_content']}".strip()


def tokenize(example):
    """Format a dataset row and tokenize it.

    Sequences are truncated to the tokenizer's maximum length but NOT padded here:
    padding every row to the maximum length makes the mapped dataset and every training
    step as large as the longest possible sequence. The data collator used by the Trainer
    pads each batch to its longest member instead.

    Args:
        example: a single dataset row (see `formatting_func`).

    Returns:
        The tokenizer output for the formatted row.
    """
    full_prompt = formatting_func(example)
    return tokenizer(full_prompt, truncation=True)


# Tokenize row by row and drop the raw text columns in the same pass, leaving only the
# tokenizer outputs in the dataset.
df = df.map(
    tokenize,
    remove_columns=['cc_content', 'question_content', 'tag', 'level', 'question_hints'],
)
df

Map:   0%|          | 0/18136 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [9]:
# Hyper-parameters for the Trainer run.
# NOTE(review): fp16 mixed precision is enabled here while the 4-bit compute dtype in the
# model-loading cell is bfloat16 — confirm this combination is intended for the target GPU.
ta = TrainingArguments(
    # Checkpoints are written to 'out' (relative to the working directory) and may overwrite it.
    output_dir='out',
    overwrite_output_dir=True,

    # Run length and batching: 2000 steps, micro-batches of 1 accumulated over 4 steps.
    max_steps=2000,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,

    # Optimizer, precision and learning-rate schedule.
    optim="paged_adamw_8bit",
    fp16=True,
    lr_scheduler_type='cosine',
    warmup_ratio=0.05,

    # Log every step to Weights & Biases.
    report_to='wandb',
    logging_steps=1,
)

In [10]:
# Collator for causal language modelling (mlm=False): batches the tokenized rows for the Trainer.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Wire the PEFT-wrapped model, the tokenized dataset, the training arguments and the collator
# together. Training itself is started in the next cell.
trainer = Trainer(
    model=model,
    args=ta,
    train_dataset=df,
    data_collator=data_collator,
)


In [None]:
# Start fine-tuning with the Trainer built in the previous cell. The run length is governed
# by the TrainingArguments (`max_steps`), and metrics are reported to Weights & Biases.
trainer.train()

In [36]:
# upload the trained model to huggingface.
# peft_model_id: Hub repository the model and generation config are pushed to / loaded from.
# write_token:   Hugging Face token read from Colab's secret store under the name 'HF_WRITE'.
# revision_id:   value passed as `revision=` when pushing to and loading from the Hub.
peft_model_id = 'kreimben/CodeMind'
write_token = userdata.get('HF_WRITE')
revision_id = 'falcon-7b-20240404'

In [None]:
# Generation settings that are stored in the same Hub repo/revision as the model.
# NOTE(review): `penalty_alpha` is a contrastive-search parameter; combined with
# `do_sample=True` it presumably has no effect — confirm which decoding strategy is wanted.
generation_config = GenerationConfig(
    max_new_tokens=512,
    do_sample=True,
    temperature=0.5,
    top_k=5,
    penalty_alpha=0.6,
    repetition_penalty=1.2,
    pad_token_id=tokenizer.eos_token_id,
)

# Publish the model first, then the generation config, to the same repository and revision.
model.push_to_hub(repo_id=peft_model_id, revision=revision_id, token=write_token)

generation_config.push_to_hub(repo_id=peft_model_id, revision=revision_id, token=write_token)

In [25]:
# del trainer
# del data_collator
# del ta
# del df
# del model
# del tokenizer
# del model_id
# del bnb_config
# del lora_config

# import gc
# import torch

# gc.collect()
# torch.cuda.empty_cache()

In [40]:
# test the model

from transformers import pipeline
import time

# Optional instructions placed at the start of the <human> turn (currently just a newline).
system_prompt = """
"""


def format_inference_prompt(user_input):
    """Wrap raw user text in the `<human>: ... <assistant>: ` layout used for inference.

    Deliberately NOT named `formatting_func`: the dataset cell defines a function of that
    name that takes a dataset row and is called by `tokenize`; redefining it here would
    silently break re-running the tokenization cell.

    Args:
        user_input: the question text typed by the user.

    Returns:
        The prompt string, ending with the open `<assistant>: ` turn.
    """
    return f"<human>: {system_prompt}\n\n{user_input}\n<assistant>: "


def generate_response(user_input):
    """Generate a completion for `user_input` with the current `model` and `generation_config`.

    Args:
        user_input: the question text typed by the user.

    Returns:
        dict with
          'consumed_time': seconds spent inside `model.generate`,
          'response': the decoded output sequence (special tokens removed). For a causal LM
                      the generated sequence starts with the prompt tokens, so the prompt is
                      part of this text.
    """
    prompt = format_inference_prompt(user_input)

    # The input tensors must live on the same device as the model.
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

    tic = time.perf_counter()

    # `generate` (not a plain forward call) runs the decoding loop and returns token ids.
    outputs = model.generate(**inputs, generation_config=generation_config)

    return {
        'consumed_time': time.perf_counter() - tic,
        'response': tokenizer.decode(outputs[0], skip_special_tokens=True)
    }


In [41]:
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM

# Fetch the adapter's configuration from the Hub; it records which base checkpoint to load.
config = PeftConfig.from_pretrained(peft_model_id, revision=revision_id)

# Tokenizer of the base checkpoint named in the adapter config.
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the base model, then attach the adapter weights from the same repo/revision.
# Both steps rebind the notebook-level `model` name.
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, peft_model_id, revision=revision_id)

# Switch to evaluation mode; as the last expression this also displays the module tree.
model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): FalconForCausalLM(
      (transformer): FalconModel(
        (word_embeddings): Embedding(65024, 4544)
        (h): ModuleList(
          (0-31): 32 x FalconDecoderLayer(
            (self_attention): FalconAttention(
              (rotary_emb): FalconRotaryEmbedding()
              (query_key_value): lora.Linear(
                (base_layer): FalconLinear(in_features=4544, out_features=4672, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4544, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4672, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (dense): FalconLin

In [42]:
# with torch.no_grad():
#     generate_response('leetcode number 1 add sum')

# Display the generation config attached to the reloaded model (the cell's output).
model.generation_config

GenerationConfig {
  "bos_token_id": 11,
  "eos_token_id": 11
}

In [59]:
%cd /content/drive/MyDrive/CodeMind
!git config --global user.name 'Jehwan '
!git add .
!git commit -m 'PEFT done.'

/content/drive/MyDrive/CodeMind
Author identity unknown

*** Please tell me who you are.

Run

  git config --global user.email "you@example.com"
  git config --global user.name "Your Name"

to set your account's default identity.
Omit --global to set the identity only in this repository.

fatal: unable to auto-detect email address (got 'root@f363caa35651.(none)')
