# CodeMind fine tuning
## Methods:
* model: gemma-7b-it
* PEFT, QLoRA

In [1]:
import os

import datasets
import evaluate
import numpy as np
import torch
from dotenv import load_dotenv
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorWithPadding, \
    BitsAndBytesConfig
from trl import SFTTrainer

load_dotenv()

import wandb

wandb.login()

wandb.init(
    project='CodeMind',
)

wandb: Currently logged in as: jehwan-kim (codemind). Use `wandb login --relogin` to force relogin


In [13]:
lora_config = LoraConfig(
    r=8,
    # target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_id = 'google-bert/bert-large-uncase'

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.getenv('HF_API'))

# Load the model and modify the configuration for approximate GeLU  
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=os.getenv('HF_API'),
    quantization_config=bnb_config
)

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

ValueError: Unrecognized configuration class <class 'transformers.models.longformer.configuration_longformer.LongformerConfig'> for this kind of AutoModel: AutoModelForCausalLM.
Model type should be one of BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BloomConfig, CamembertConfig, LlamaConfig, CodeGenConfig, CohereConfig, CpmAntConfig, CTRLConfig, Data2VecTextConfig, ElectraConfig, ErnieConfig, FalconConfig, FuyuConfig, GemmaConfig, GitConfig, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GPTJConfig, LlamaConfig, MambaConfig, MarianConfig, MBartConfig, MegaConfig, MegatronBertConfig, MistralConfig, MixtralConfig, MptConfig, MusicgenConfig, MusicgenMelodyConfig, MvpConfig, OpenLlamaConfig, OpenAIGPTConfig, OPTConfig, PegasusConfig, PersimmonConfig, PhiConfig, PLBartConfig, ProphetNetConfig, QDQBertConfig, Qwen2Config, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, Speech2Text2Config, StableLmConfig, Starcoder2Config, TransfoXLConfig, TrOCRConfig, WhisperConfig, XGLMConfig, XLMConfig, XLMProphetNetConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig.

In [14]:
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 786,432 || all params: 335,991,412 || trainable%: 0.23406312539916943


In [15]:
df = datasets.load_dataset('kreimben/leetcode_with_youtube_captions')['train']

In [16]:
train_dataset, test_dataset = df.train_test_split(test_size=0.2, seed=42)

In [17]:
ta = TrainingArguments(
    output_dir='out',
    overwrite_output_dir=True,
    evaluation_strategy='epoch',
    weight_decay=.01,
    fp16=True,
    optim="paged_adamw_8bit",
    
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    
    warmup_steps=2,
    max_steps=10,
    gradient_accumulation_steps=4,

    logging_steps=1,
    report_to='wandb',
)

In [18]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


def formatting_func(example):
    output_texts = []
    for i in range(len(example)):
        text = f"problem: {example['question_content'][i]}\nExplain: {example['cc_content'][i]}"[:512]
        output_texts.append(text)
    return output_texts


trainer = SFTTrainer(
    model=model,
    train_dataset=df,
    args=ta,
    peft_config=lora_config,
    formatting_func=formatting_func,
    data_collator=data_collator,
    max_seq_length=99_999,
)

In [19]:
trainer.train()

RuntimeError: The expanded size of the tensor (5722) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 5722].  Tensor sizes: [1, 512]