# CodeMind fine-tuning
## Methods:
* model: falcon-7b
* PEFT, QLoRA

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive so files stored on
# Drive are reachable through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the project folder on the mounted Drive so the
# relative paths used by later cells (requirements.txt, the 'out' directory) resolve there.
%cd /content/drive/MyDrive/CodeMind

/content/drive/MyDrive/CodeMind


In [None]:
!pip install -r requirements.txt --upgrade

In [None]:
import datasets
import evaluate
import numpy as np
import torch
from google.colab import userdata
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    DataCollatorWithPadding,
    GenerationConfig,
    Trainer,
    TrainingArguments,
)

In [None]:
import wandb

# Authenticate with Weights & Biases before opening a run.
wandb.login()

# Open the run that receives the training logs (the Trainer is configured
# with report_to='wandb').
run_settings = {'project': 'falcon-7b', 'name': '4 bit qlora'}
wandb.init(**run_settings)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjehwan-kim[0m ([33mcodemind[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Base checkpoint to fine-tune and the Hub read token stored in Colab secrets.
model_id = 'tiiuae/falcon-7b'
token = userdata.get('HF_READ')

# LoRA adapter hyper-parameters for causal language modelling (training mode).
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

# 4-bit NF4 weight quantization; matrix multiplications are computed in bfloat16.
# NOTE(review): training below runs with fp16=True while the compute dtype here is
# bfloat16 — confirm this mix is intended for the target GPU.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)

# The quantization config makes the checkpoint load directly in 4-bit form.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    token=token,
)

tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


pytorch_model.bin.index.json:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

In [None]:
# Prepare the quantized base model for training before attaching adapters.
# The PEFT quantization guide prescribes this step for k-bit models; per the PEFT
# docs it upcasts layer norms to fp32, makes the input embeddings require grads and
# enables gradient checkpointing (default use_gradient_checkpointing=True).
model = prepare_model_for_kbit_training(model)

# Wrap the base model with LoRA adapters; only the adapter weights are trainable.
# NOTE: this cell rebinds `model`, so re-running it without reloading the base
# model would wrap the model a second time.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 2,359,296 || all params: 6,924,080,000 || trainable%: 0.03407378308742822


In [None]:
# Reuse the end-of-sequence token for padding (presumably because the tokenizer
# defines no pad token of its own — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token

# Columns dropped right after loading; the remaining columns feed the prompt template.
unused_columns = [
    'id', 'thumbnail', 'title_x', 'title_y',
    'java', 'c++', 'python', 'javascript',
    'similar_question_ids', 'success_rate', 'total_submission', 'total_accepted',
    'question_likes', 'question_dislikes',
]

df = datasets.load_dataset(
    'kreimben/leetcode_with_youtube_captions',
    split='train',
).remove_columns(unused_columns)

def formatting_func(d):
    """Render one dataset record as a single training prompt.

    Args:
        d: Mapping with the keys 'question_content', 'tag', 'level',
            'question_hints' and 'cc_content'.

    Returns:
        The prompt text: a ``<human>:`` turn holding the question and its metadata,
        followed by an ``<assistant>:`` turn holding the caption text, with leading
        and trailing whitespace stripped.
    """
    prompt_lines = [
        f"<human>: {d['question_content']}",
        f"Tag: {d['tag']}",
        f"Level: {d['level']}",
        f"Hints: {d['question_hints']}",
        f"<assistant>: {d['cc_content']}",
    ]
    return "\n".join(prompt_lines).strip()

def tokenize(example):
    """Convert one dataset record into token ids for causal-LM training.

    Args:
        example: A dataset record accepted by ``formatting_func``.

    Returns:
        The tokenizer output for the formatted prompt (``input_ids`` and
        ``attention_mask``), truncated to the tokenizer's maximum length.
    """
    full_prompt = formatting_func(example)
    # Do not pad here: padding every example to the maximum length at map time
    # stores and trains on mostly padding. The DataCollatorForLanguageModeling used
    # by the Trainer pads each batch dynamically to its longest sequence and masks
    # the padded label positions, so the loss is computed over the same tokens.
    return tokenizer(full_prompt, truncation=True)

# Tokenize every record, then drop the raw text columns so that only the
# tokenizer outputs remain in the dataset.
prompt_source_columns = ['cc_content', 'question_content', 'tag', 'level', 'question_hints']
df = df.map(tokenize).remove_columns(prompt_source_columns)
df

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18136 [00:00<?, ? examples/s]

Map:   0%|          | 0/18136 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 18136
})

In [None]:
# Training configuration. One sample per device with 4 accumulation steps gives
# an effective batch of 4 samples per optimizer step on a single device.
ta = TrainingArguments(
    # Where checkpoints are written (existing contents may be overwritten).
    output_dir='out',
    overwrite_output_dir=True,

    # Run length and batching.
    max_steps=1000,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,

    # Optimizer, learning-rate schedule and mixed precision.
    optim="paged_adamw_8bit",
    lr_scheduler_type='cosine',
    warmup_ratio=0.05,
    fp16=True,

    # Log every step to Weights & Biases.
    logging_steps=1,
    report_to='wandb',
)

In [None]:
# Causal-LM collator: with mlm=False the labels are a copy of the inputs with the
# padding positions ignored, and batches are padded dynamically to a common length.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Trainer over the LoRA-wrapped model and the tokenized dataset.
trainer = Trainer(
    model=model,
    args=ta,
    train_dataset=df,
    data_collator=data_collator,
)


In [None]:
# Run the fine-tuning loop configured above (stops after max_steps=1000 optimizer
# steps); metrics are reported to Weights & Biases at every step.
trainer.train()

In [None]:
# Destination on the Hugging Face Hub for the trained adapter: the repository
# and the revision name the upload is pushed under.
peft_model_id = 'kreimben/CodeMind'
revision_id = 'falcon-7b-20240404'

# Hub token with write access, read from Colab secrets.
write_token = userdata.get('HF_WRITE')

In [None]:
# Both uploads go to the same repository and revision with the same token.
hub_target = dict(repo_id=peft_model_id, token=write_token, revision=revision_id)

# Upload the trained adapter.
model.push_to_hub(**hub_target)

# Decoding settings used for inference with the fine-tuned model; EOS doubles as
# the padding id, matching the tokenizer setup used for training.
generation_config = GenerationConfig(
    max_new_tokens=512,
    do_sample=True,
    temperature=0.5,
    top_k=5,
    penalty_alpha=0.6,
    repetition_penalty=1.2,
    pad_token_id=tokenizer.eos_token_id,
)

# Upload the generation settings next to the adapter.
generation_config.push_to_hub(**hub_target)

README.md:   0%|          | 0.00/192 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/9.45M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kreimben/CodeMind/commit/d4a38f7030d33e26a86f3e9b6d223d4dbc23a254', commit_message='Upload config', commit_description='', oid='d4a38f7030d33e26a86f3e9b6d223d4dbc23a254', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
import gc
import torch

# Drop every reference created for training so the objects can be reclaimed.
# `generation_config`, `peft_model_id` and `revision_id` are kept on purpose:
# the inference cells below still use them.
del trainer, data_collator, ta, df
del model, tokenizer, model_id
del bnb_config, lora_config

# Collect the now-unreferenced objects, then release cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# test the model

from transformers import pipeline
import time

# System-level instructions placed at the start of every human turn.
# Currently just a newline; fill in between the quotes to steer the model.
system_prompt = """
"""


def formatting_func(user_input):
    """Build the inference prompt for one user request.

    NOTE: this redefines the `formatting_func` used earlier for training data;
    from this cell on the name refers to the inference version.

    Args:
        user_input: The text of the user's request.

    Returns:
        A ``<human>:`` turn holding the module-level ``system_prompt`` and the
        request, followed by an open ``<assistant>: `` turn for the model to
        complete.
    """
    human_turn = f"<human>: {system_prompt}\n\n{user_input}"
    return f"{human_turn}\n<assistant>: "

def generate_response(user_input):
    """Generate a model reply for one user request and time the generation.

    Uses the module-level ``tokenizer``, ``model`` and ``generation_config``.

    Args:
        user_input: The text of the user's request.

    Returns:
        A dict with:
            'consumed_time': seconds spent inside ``model.generate``.
            'response': the decoded first output sequence with special tokens
                removed (for a decoder-only model this includes the prompt text).
    """
    prompt = formatting_func(user_input)

    # Tokenize and place the tensors on the same device as the model; otherwise
    # generation fails when the model lives on the GPU.
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

    tic = time.time()

    # `generate` performs the autoregressive decoding loop and returns token ids.
    # Calling the model directly would only run a single forward pass, which does
    # not produce a token sequence that can be decoded.
    outputs = model.generate(**inputs, generation_config=generation_config)

    return {
        'consumed_time': time.time() - tic,
        'response': tokenizer.decode(outputs[0], skip_special_tokens=True)
    }


In [None]:
from transformers import AutoModelForCausalLM
from peft import PeftModel, PeftConfig

# The adapter config on the Hub records which base checkpoint it was trained on.
config = PeftConfig.from_pretrained(peft_model_id, revision=revision_id)
base_checkpoint = config.base_model_name_or_path

tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

# Load the base model (no quantization config is passed here), then attach the
# uploaded adapter weights from the same revision.
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(base_checkpoint),
    peft_model_id,
    revision=revision_id,
)

# Switch to inference mode (e.g. disables dropout).
model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): FalconForCausalLM(
      (transformer): FalconModel(
        (word_embeddings): Embedding(65024, 4544)
        (h): ModuleList(
          (0-31): 32 x FalconDecoderLayer(
            (self_attention): FalconAttention(
              (rotary_emb): FalconRotaryEmbedding()
              (query_key_value): lora.Linear(
                (base_layer): FalconLinear(in_features=4544, out_features=4672, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4544, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4672, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (dense): FalconLin

In [None]:
# Example invocation, currently disabled:
# with torch.no_grad():
#     generate_response('leetcode number 1 add sum')

# Display the generation settings attached to the reloaded model.
model.generation_config

GenerationConfig {
  "bos_token_id": 11,
  "eos_token_id": 11
}