# CodeMind fine-tuning
## Methods:
* model: meta-llama/Meta-Llama-3-8B-Instruct

In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from this Colab runtime.
from google.colab import drive

drive.mount('/content/drive')

In [None]:
# Switch the working directory to the CodeMind folder on the mounted Drive;
# relative paths used afterwards (requirements.txt, the 'out' directory) resolve here.
%cd /content/drive/MyDrive/CodeMind/

In [None]:
# Uninstall TensorFlow (-y skips the confirmation prompt), then install/upgrade
# everything listed in requirements.txt.
# NOTE(review): TensorFlow is presumably removed to avoid dependency conflicts — confirm.
!pip uninstall tensorflow -y
!pip install -r requirements.txt --upgrade

In [None]:
import datasets
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import SFTTrainer

In [None]:
from google.colab import userdata

# Checkpoint to fine-tune, and the Hugging Face read token kept in the Colab
# secret named 'HF_READ'.
model_id = 'meta-llama/Meta-Llama-3-8B-Instruct'
token = userdata.get('HF_READ')

# Tokenizer and model come from the same checkpoint and use the same token.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=token,
    device_map='auto',
)

# Disable the model's `use_cache` flag and switch on gradient checkpointing
# before training.
model.config.use_cache = False
model.gradient_checkpointing_enable()

In [None]:
# Load the two source datasets from the Hugging Face Hub as DataFrames and
# reduce each to the same four columns so they can be stacked.
submission_dataset = datasets.load_dataset('kreimben/leetcode_user_submissions_only_python', split='train').to_pandas()
submission_dataset = submission_dataset[['title', 'question_hints', 'question_content', 'content']]

captions_dataset = datasets.load_dataset('kreimben/leetcode_with_youtube_captions', split='train').to_pandas()
# The caption text is stored in 'cc_content'; rename it so both frames share
# the 'content' column. rename() returns a new frame, which avoids mutating a
# column selection in place.
captions_dataset = captions_dataset[['title', 'question_hints', 'question_content', 'cc_content']].rename(
    columns={'cc_content': 'content'}
)

# ignore_index=True gives the combined frame a fresh 0..N-1 index. Without it
# the row labels of both inputs are kept, so labels repeat, and converting the
# frame with Dataset.from_pandas would carry them along as an extra index column.
dataset = pd.concat([submission_dataset, captions_dataset], ignore_index=True)

# The intermediate frames are no longer needed.
del submission_dataset, captions_dataset

In [None]:
# Display ten randomly chosen rows of the combined DataFrame as a quick check.
dataset.sample(10)

In [None]:
from datasets import Dataset

# Convert the combined pandas DataFrame into a Hugging Face Dataset; from here
# on `dataset` refers to the Dataset, not the DataFrame.
dataset = Dataset.from_pandas(dataset)

In [None]:
# System prompt used as the first message of every conversation built by
# generate_prompt.
# NOTE(review): the name mentions Gemma 2B although this notebook loads a
# Llama 3 checkpoint — presumably a leftover name; confirm before renaming,
# since generate_prompt references it.
GEMMA_2B_IT_MODEL_PREFIX_TEXT = "You are a kind coding test teacher and below is the coding test problem. Explain the approach for the questions."
# Alternative system prompt, currently unused:
# "Below is an coding test problem. Solve the question."

In [None]:
def generate_prompt(data_point):
    """Render one dataset row as a chat-formatted training string.

    The conversation consists of the system prompt, a user turn naming the
    problem and giving its hints, an assistant turn with the problem
    statement, and a final assistant turn holding the row's ``content``.
    The messages are passed to the tokenizer's chat template with
    ``tokenize=False``.

    Args:
        data_point: Mapping with the keys ``title``, ``question_hints``,
            ``question_content`` and ``content``.

    Returns:
        The value returned by ``tokenizer.apply_chat_template`` for the
        messages described above.
    """
    system_turn = {'role': 'system', 'content': GEMMA_2B_IT_MODEL_PREFIX_TEXT}

    title = data_point['title']
    hints = data_point['question_hints']
    user_turn = {
        'role': 'user',
        'content': f"I don't know {title} problem.\nthis is problem's hint.\n{hints}\n",
    }

    statement = data_point["question_content"]
    statement_turn = {
        'role': 'assistant',
        'content': f'here are some content of question.\n{statement}',
    }

    answer = data_point["content"]
    answer_turn = {'role': 'assistant', 'content': f'{answer}'}

    messages = [system_turn, user_turn, statement_turn, answer_turn]
    return tokenizer.apply_chat_template(messages, tokenize=False)

In [None]:
# Render every row through generate_prompt and attach the resulting strings to
# the dataset as a new "prompt" column.
text_column = list(map(generate_prompt, dataset))
dataset = dataset.add_column("prompt", text_column)

In [None]:
# Show one randomly chosen rendered prompt as a sanity check.
import random

N = len(dataset)

# randrange(N) draws from 0..N-1. randint(0, N) would also include N itself,
# which is one past the last row and makes the lookup fail intermittently.
idx = random.randrange(N)

df = dataset.to_pandas()
# Positional lookup, so the result does not depend on the frame's index labels.
test = df['prompt'].iloc[idx]
test

In [None]:
import transformers

# The collator below pads every batch to a common length, which requires the
# tokenizer to define a padding token. Fall back to the EOS token when none is
# set; an existing padding token is left untouched.
# NOTE(review): the language-modeling collator excludes padding-token positions
# from the loss, so with pad == EOS the EOS positions are excluded as well —
# confirm this is acceptable for the fine-tuned model's stopping behaviour.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Training configuration: checkpoints/logs go to 'out' (relative to the current
# working directory) and mixed-precision fp16 training is enabled.
args = transformers.TrainingArguments(
    output_dir='out',
    fp16=True,
    # Optional optimizer override, currently disabled:
    # optim="paged_adamw_8bit",
)

# Supervised fine-tuning on the rendered "prompt" column, truncated to 512 tokens.
trainer = SFTTrainer(
    model=model,
    # Pass the tokenizer explicitly so the trainer tokenizes with the same
    # instance that rendered the prompts and that the collator pads with,
    # instead of loading a separate one from the model's config.
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="prompt",
    max_seq_length=512,

    # Causal-LM collation (mlm=False): labels are derived from the input ids.
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),

    args=args,
)

In [None]:
# Run the fine-tuning loop with the trainer configured in the previous cell.
trainer.train()

In [None]:
from google.colab import userdata

# Target repository on the Hugging Face Hub and the write token (Colab secret
# 'HF_WRITE') used to upload the trained model and tokenizer.
# NOTE(review): the repository id says 70B, while the checkpoint loaded in this
# notebook is Meta-Llama-3-8B-Instruct — confirm the intended repository name.
finetuned_model_id = 'kreimben/CodeMind-llama-3-70B'
write_token = userdata.get('HF_WRITE')

In [None]:
# Upload the trained model held by the trainer to the Hub repository.
trainer.model.push_to_hub(finetuned_model_id, token=write_token, use_temp_dir=True)

In [None]:
# Upload the tokenizer to the same Hub repository as the model.
tokenizer.push_to_hub(finetuned_model_id, token=write_token, use_temp_dir=True)