# CodeMind fine tuning
## Methods:
* model: gemma-7b-it
* PEFT, QLoRA

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
%cd /content/drive/MyDrive/CodeMind

/content/drive/MyDrive/CodeMind


In [None]:
!pip install -r requirements.txt --upgrade

In [3]:
import datasets
import evaluate
import numpy as np
import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorWithPadding, \
    BitsAndBytesConfig
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from google.colab import userdata

import wandb

wandb.login()

wandb.init(
    project='BERT-large',
    name = '4 bit qlora',
)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjehwan-kim[0m ([33mcodemind[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [28]:
lora_config = LoraConfig(
    r=8,
    # target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_id = 'google-bert/bert-large-uncased'
token = userdata.get('HF_READ')

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)

# Load the model and modify the configuration for approximate GeLU
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=token,
    quantization_config=bnb_config,
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.
If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of BertLMHeadModel were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 786,432 || all params: 335,991,412 || trainable%: 0.23406312539916943


In [30]:
# def tokenize(data):
#     input_ids = tokenizer(data['question_content'], truncation=True, padding=True, return_tensors='pt')['input_ids']
#     labels = tokenizer(data['cc_content'], truncation=True, padding=True, return_tensors='pt')['input_ids']
#     labels[labels == tokenizer.pad_token_id] = -100  # Replace padding tokens with -100 for loss calculation
#     return {'input_ids': input_ids, 'labels': labels}

In [31]:
df = datasets.load_dataset('kreimben/leetcode_with_youtube_captions', split='train')

# df = df.map(tokenize)

df = df.train_test_split(test_size=0.2, shuffle=True)
test_dataset, eval_dataset = df['train'], df['test']
test_dataset, eval_dataset

(Dataset({
     features: ['cc_content', 'id', 'thumbnail', 'title_x', 'question_content', 'java', 'c++', 'python', 'javascript', 'title_y', 'tag', 'level', 'success_rate', 'total_submission', 'total_accepted', 'question_likes', 'question_dislikes', 'question_hints', 'similar_question_ids'],
     num_rows: 14508
 }),
 Dataset({
     features: ['cc_content', 'id', 'thumbnail', 'title_x', 'question_content', 'java', 'c++', 'python', 'javascript', 'title_y', 'tag', 'level', 'success_rate', 'total_submission', 'total_accepted', 'question_likes', 'question_dislikes', 'question_hints', 'similar_question_ids'],
     num_rows: 3628
 }))

In [32]:
ta = TrainingArguments(
    output_dir='out',
    overwrite_output_dir=True,
    evaluation_strategy='epoch',
    weight_decay=.01,
    fp16=True,
    # optim="paged_adamw_8bit",

    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,

    # warmup_steps=2,
    # max_steps=10,
    # gradient_accumulation_steps=4,

    logging_steps=1,
    report_to=None,  # 'wandb',
)

In [33]:
response_template = " ### Answer:"
data_collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

# rouge_metric = evaluate.load('rouge')
# bleu_metric = evaluate.load('bleu')


# def compute_metrics(eval_pred):
#     predictions, references = eval_pred

#     return {
#         'rouge': rouge_metric.compute(predictions=predictions, references=references),
#         # 'bleu': bleu_metric.compute(predictions=decoded_preds, references=decoded_references),
#     }


def formatting_func(example):
    output_texts = []
    for i in range(len(example)):
        text = f"### Question: {example['question_content'][i]}\n ### Answer: {example['cc_content'][i]}"
        output_texts.append(text)
    return output_texts


trainer = SFTTrainer(
    model=model,
    train_dataset=test_dataset,
    eval_dataset=eval_dataset,
    # args=ta,
    peft_config=lora_config,
    formatting_func=formatting_func,
    data_collator=data_collator,
    # max_seq_length=9_999,
    # compute_metrics=compute_metrics,
)

Map:   0%|          | 0/14508 [00:00<?, ? examples/s]

Map:   0%|          | 0/3628 [00:00<?, ? examples/s]

In [34]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=108, training_loss=16.671399151837385, metrics={'train_runtime': 21.5379, 'train_samples_per_second': 39.697, 'train_steps_per_second': 5.014, 'total_flos': 799027256954880.0, 'train_loss': 16.671399151837385, 'epoch': 3.0})

In [35]:
# del trainer
# del data_collator
# del rouge_metric
# del bleu_metric
# del response_template
# del ta
# del test_dataset
# del eval_dataset
# del df
# del model
# del tokenizer
# del model_id
# del bnb_config
# del lora_config

# import gc
# import torch

# gc.collect()
# torch.cuda.empty_cache()