# CodeMind fine tuning
## Methods:
* model: bert-large-uncased

In [1]:
import datasets
import evaluate
import numpy as np
import torch
from dotenv import load_dotenv
from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig, \
    DataCollatorForLanguageModeling
import os

load_dotenv()

True

In [2]:
import wandb

wandb.login()

wandb.init(
    project='bert-large-uncased',
    name='full fine-tuning',
)

wandb: Currently logged in as: jehwan-kim (codemind). Use `wandb login --relogin` to force relogin


In [3]:
model_id = 'google-bert/bert-large-uncased'
token = os.getenv('HF_READ')

tokenizer = AutoTokenizer.from_pretrained(model_id,
                                          trust_remote_code=True,
                                          token=token)
tokenizer.pad_token = tokenizer.convert_ids_to_tokens(0)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=token,
    trust_remote_code=True,
)

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


In [4]:
df = datasets.load_dataset('kreimben/leetcode_user_submissions', split='train')

In [5]:
def formatting_func(d):
    return f"""
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction: You are kind teacher which teaches coding test questions like leetcode. Please provide step by step instruction and insight for users.

### Input: I don't know how to approach {d['title_slug']} problem.

### Response: {d['content']}
""".strip()

#  {d['question_content']}\nTag: {d['tag']}\nLevel: {d['level']}\nHints: {d['question_hints']} 


def tokenize(example):
    full_prompt = formatting_func(example)
    tokenized = tokenizer(full_prompt, padding='max_length', truncation=True)
    return tokenized


df = df.map(tokenize)
df = df.remove_columns(
    ['question_content', 'title_slug', 'tag', 'level', 'question_hints', 'view_count', 'vote_count', 'content'])
train_dataset, test_dataset = df.train_test_split(test_size=0.2, seed=42).values()
train_dataset, test_dataset

Map:   0%|          | 0/109309 [00:00<?, ? examples/s]

(Dataset({
     features: ['input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 87447
 }),
 Dataset({
     features: ['input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 21862
 }))

In [6]:
metric_bleu = evaluate.load("bleu")
metric_rouge = evaluate.load("rouge")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Decode the predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute BLEU score
    bleu_score = metric_bleu.compute(predictions=[decoded_preds], references=[decoded_labels])

    # Compute ROUGE scores
    rouge_scores = metric_rouge.compute(predictions=decoded_preds, references=decoded_labels)

    return {
        "bleu": bleu_score["bleu"],
        "rouge1": rouge_scores["rouge1"],
        "rouge2": rouge_scores["rouge2"],
        "rougeL": rouge_scores["rougeL"],
    }

In [7]:
from transformers import EarlyStoppingCallback

early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.01
)

In [8]:
ta = TrainingArguments(
    output_dir='out',
    overwrite_output_dir=True,

    fp16=True,
    # optim="paged_adamw_8bit",
    # lr_scheduler_type='cosine',
    # warmup_ratio=.05,

    logging_steps=1000,
    report_to='wandb',
    
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,

    # eval_steps=1000,
    num_train_epochs=3,
    
    do_train=True,
    do_eval=True,

    metric_for_best_model="train_loss",
    load_best_model_at_end=True,
    evaluation_strategy='epoch',
    save_strategy='epoch',
)

In [9]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    args=ta,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    callbacks=[early_stopping],
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [10]:
trainer.train()

Epoch,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 9.37 GiB. GPU 0 has a total capacity of 15.99 GiB of which 1.01 GiB is free. Of the allocated memory 13.14 GiB is allocated by PyTorch, and 251.25 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# upload the trained model to huggingface.
peft_model_id = 'kreimben/CodeMind'
write_token = os.getenv('HF_WRITE')
revision_id = 'bert-large-uncased-20240408'

In [None]:
model.push_to_hub(
    repo_id=peft_model_id,
    token=write_token,
    revision=revision_id,
)

# generation_config = GenerationConfig(
#     penalty_alpha=.6,
#     do_sample=True,
#     top_k=5,
#     temperature=.5,
#     repetition_penalty=1.2,
#     max_new_tokens=512,
#     pad_token_id=tokenizer.eos_token_id,
# )

# generation_config.push_to_hub(
#     repo_id=peft_model_id,
#     token=write_token,
#     revision=revision_id,
# )

In [None]:
del trainer
del data_collator
del ta
del train_dataset
del test_dataset
del model
del tokenizer
del model_id
del metric_bleu
del metric_rouge

import gc
import torch

gc.collect()
torch.cuda.empty_cache()

In [None]:
from transformers import AutoModelForCausalLM
from peft import PeftModel, PeftConfig

config = PeftConfig.from_pretrained(peft_model_id, revision=revision_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, peft_model_id, revision=revision_id, device_map='cuda', load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

model.eval()

In [None]:
# test the model
from transformers import pipeline

system_prompt = f"""
"""


def formatting_func(user_input):
    global system_prompt
    return f"<human>: {system_prompt}\n\n{user_input}\n<assistant>: "


def generate_response(user_input):
    prompt = formatting_func(user_input)

    pipe = pipeline(
        'text-generation',
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
    )

    return pipe(prompt)


In [None]:
%%time

with torch.no_grad():
    res = generate_response('leetcode number 1 add sum')

# model.generation_config

In [None]:
res