In [1]:
import evaluate
import datasets
import pandas as pd

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = 'kreimben/CodeMind-gemma'

tokenizer = AutoTokenizer.from_pretrained('kreimben/CodeMind-gemma')
tokenizer.padding_side = 'left'
model = AutoModelForCausalLM.from_pretrained('kreimben/CodeMind-gemma')
model = model.to('cuda')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
submission_dataset = datasets.load_dataset('csv', data_files='./user_submission_only_python.csv',
                                           split='train').to_pandas()
submission_dataset = submission_dataset[['title_slug', 'question_hints', 'question_content', 'content']]
captions_dataset = datasets.load_dataset('kreimben/leetcode_with_youtube_captions', split='train').to_pandas()[
    ['title_slug', 'question_hints', 'question_content', 'cc_content']]
captions_dataset.rename(columns={'cc_content': 'content'}, inplace=True)

dataset = pd.concat([submission_dataset, captions_dataset])

del submission_dataset, captions_dataset

dataset.head(3)

Unnamed: 0,title_slug,question_hints,question_content,content
0,two-sum,A really brute force way would be to search fo...,Given an array of integers nums and an integer...,# Intuition\n<!-- Describe your first thoughts...
1,two-sum,A really brute force way would be to search fo...,Given an array of integers nums and an integer...,# Beginner doubt - Where is main function?\n- ...
2,two-sum,A really brute force way would be to search fo...,Given an array of integers nums and an integer...,# **Read article Explaination and codes : \n\n...


In [4]:
from datasets import Dataset

dataset = Dataset.from_pandas(dataset)

In [5]:
GEMMA_2B_IT_MODEL_PREFIX_TEXT = """Below is an coding test problem. Solve the question."""


def generate_prompt(data_point):
    return f"""<start_of_turn>user {GEMMA_2B_IT_MODEL_PREFIX_TEXT}\n
I don't know {data_point['title_slug']} problem. give me the insight or appoach.\n
this is problem's hint.\n{data_point["question_hints"]}\n
here are some content of question.\n{data_point["question_content"]}<end_of_turn>
<start_of_turn>model {data_point["content"]}<end_of_turn>"""

In [6]:
text_column = [generate_prompt(data_point) for data_point in dataset]
dataset = dataset.add_column("prompt", text_column)
dataset = dataset.shuffle(seed=42)  # Shuffle dataset here
dataset = dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)

Map:   0%|          | 0/35997 [00:00<?, ? examples/s]

In [7]:
dataset = dataset.remove_columns(['title_slug', 'question_hints', 'question_content', 'content'])
dataset

Dataset({
    features: ['__index_level_0__', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 35997
})

In [20]:
example_dataset = dataset['prompt'][:30]

In [None]:
%%time

from utils import get_completion

import torch

references = []
predictions = []

with torch.no_grad():
    for prompt in example_dataset:
        references.append(prompt.split('model')[-1])
        predict = get_completion(prompt, model, tokenizer, device='cuda', max_new_tokens=2048).split('model')[-1]
        predictions.append(predict)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


In [None]:
rouge = evaluate.load('rouge')

In [None]:
results = rouge.compute(predictions=predictions, references=references)
results