In [1]:
!pip install -q -U datasets

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tokenizers 0.14.1 requires huggingface_hub<0.18,>=0.16.4, but you have huggingface-hub 0.19.4 which is incompatible.[0m[31m
[0m

In [2]:
# import locale
# locale.getpreferredencoding = lambda: "UTF-8"

In [3]:
# You only need to run this once per machine
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U datasets scipy ipywidgets
!pip install -q -U trl

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cuml 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cuml 23.8.0 requires dask==2023.7.1, but you have dask 2023.10.1 which is incompatible.
cuml 23.8.0 requires distributed==2023.7.1, but you have distributed 2023.10.1 which is incompatible.
libpysal 4.9.2 requires packaging>=22, but you have packaging 21.3 which is incompatible.
libpysal 4.9.2 requires shapely>=2.0.1, but you have shapely 1.8.5.post1 which is incompatible.
momepy 0.6.0 requires shapely>=2, but you have shapely 1.8.5.post1 which is incompatible.
pymc3 3.11.5 requires numpy<1.22.2,>=1.15.0, but you have numpy 1.24.3 which is incompatible.
pymc3 3.11.5 requires scipy<1.8.0,>=1.7.3, but you have scipy 1.11.4 which is incompatible.
tensorflowjs 4.12.0 requires packaging~=23.1, but you have packaging 21.3 which is incompati

### Loading the model and tokenizer

In [4]:
import datasets

In [5]:
import transformers
import torch
from tqdm import notebook
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers.pipelines.pt_utils import KeyDataset
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import bitsandbytes as bnb



In [6]:
model_id = "codellama/CodeLlama-7b-Instruct-hf"

### Preparing the Pipeline

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device: ', device.type)

Device:  cuda


In [8]:
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype
)

In [9]:
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)

config.json:   0%|          | 0.00/646 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [10]:
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    model_max_length=512,
    padding_side="left",
    add_eos_token=True)

tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [11]:
# def tokenize(data):
#     result = tokenizer(
#         data['prompt'],
#         truncation=True,
#         max_length=512,
#         padding="max_length",
#     )
#     result["labels"] = result["input_ids"].copy()
#     return result

In [12]:
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float16,
    device_map="auto",
    tokenizer=tokenizer
)

In [13]:
pipeline.tokenizer.pad_token_id = tokenizer.eos_token_id

### Generating Code

In [14]:
def llama_v2_prompt(
    messages: list[dict]
):
    B_INST, E_INST = "[INST]", "[/INST]"
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
    BOS, EOS = "<s>", "</s>"
    DEFAULT_SYSTEM_PROMPT = f"""Provide answers in Python. Wrap around code in [PYTHON] and [/PYTHON]. Don't write any comments in generated code."""

    if messages[0]["role"] != "system":
        messages = [
            {
                "role": "system",
                "content": DEFAULT_SYSTEM_PROMPT,
            }
        ] + messages


    messages = [
        {
            "role": messages[1]["role"],
            "content": B_SYS + messages[0]["content"] + E_SYS + messages[1]["content"],
        }
    ] + messages[2:]

    messages_list = [
        f"{BOS}{B_INST}{(prompt['content']).strip()}{E_INST}{(answer['content']).strip()}{EOS}"
        for prompt, answer in zip(messages[::2], messages[1::2])
    ]
    messages_list.append(f"{BOS}{B_INST}{(messages[-1]['content']).strip()}{E_INST}\n")

    return "".join(messages_list)

In [15]:
def generate_code(dataset):
    responses = []
    batch_count = 0
    batch_size = 2
    print(f"Total batches = {len(dataset)//batch_size}")
    for out in pipeline(
      KeyDataset(dataset, "prompt"),
      do_sample=True,
      temperature=0.1,
      top_p=0.9,
      num_return_sequences=1,
      eos_token_id=tokenizer.eos_token_id,
      max_length=512,
      batch_size = batch_size
      ):
        batch_count += 1
        for o in out:
            responses.append(o['generated_text'])
        print('Processed batch[%d]'%batch_count, end='\r')
    return responses

In [16]:
def end_overlap(a, b):
    for i in range(0, len(a)):
        if b.startswith(a[i:]):
            return i
    return len(a)

def remove_prefix(s, prefix):
    prefix_end_idx = s.index(prefix) + len(prefix)
    return s[prefix_end_idx:]

def extract_code(prompt, response, code_context):
    B_PYTHON, E_PYTHON = '[PYTHON]', '[/PYTHON]'
    trimmed_response = remove_prefix(response, prompt)
    if trimmed_response.startswith(B_PYTHON):
        trimmed_response = remove_prefix(trimmed_response, B_PYTHON)
    try:
        suffix_idx = trimmed_response.index(E_PYTHON)
        trimmed_response = trimmed_response[:suffix_idx]
    except ValueError as ve:
          pass
    trimmed_response = trimmed_response.strip()
    code_context = code_context.strip()
    overlap_idx = end_overlap(code_context, trimmed_response)
    cleaned_response = code_context[:overlap_idx] + trimmed_response
    return cleaned_response

def save_file(path, code):
    with open(path, 'w') as file:
        file.write(code)

def run_code_generation(dataset, save_path, n_passes=5):
    responses = generate_code(dataset)
    return responses
  # for index, response in responses:
  #   question = row['question']
  #   code_context = row['code_context']
  #   for p in range(n_passes):
  #     prompt = f"# {question}\n{code_context}\n"
  #     instruction = {
  #       "role": "user",
  #       "content": prompt,
  #     }
  #     prompt, response = generate_code([instruction])
  #     code = extract_code(prompt, response, code_context)
  #     save_file(f"{save_path}/question-{index}-pass-{p}.py", code)

In [17]:

response = """<s>[INST]<<SYS>>
Provide answers in Python. Wrap around code in [PYTHON] and [/PYTHON]. Don't write any comments in generated code.
<</SYS>>

# Maximum Prefix Sum possible by merging two given arrays.
def maxPresum(a, b):[/INST]
[PYTHON]
def maxPresum(a, b):
    max_sum = 0
    for i in range(len(a)):
        for j in range(len(b)):
            max_sum = max(max_sum, a[i] + b[j])
    return max_sum
[/PYTHON]
"""

prompt = """<s>[INST]<<SYS>>
Provide answers in Python. Wrap around code in [PYTHON] and [/PYTHON]. Don't write any comments in generated code.
<</SYS>>

# Maximum Prefix Sum possible by merging two given arrays.
def maxPresum(a, b):[/INST]
"""

code_context = """def maxPresum(a, b):
"""

In [18]:
# extract_code(prompt, response, code_context)

In [19]:
CODE_GENERATION_PATH = "./generations/python"

In [20]:
train_df = pd.read_csv('/kaggle/input/xlcost-text-to-code/train_processed.csv', index_col=0)
train_df.head()

Unnamed: 0_level_0,text,code,question,functions,code_context
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Maximum Prefix Sum possible by merging two giv...,"def maxPresum(a, b):\n X = max(a[0], 0)\n ...",Maximum Prefix Sum possible by merging two giv...,"['def maxPresum(a, b):']","def maxPresum(a, b):"
1,Check if a number can be represented as sum of...,import math\n\n\ndef sumOfTwoCubes(n):\n lo...,Check if a number can be represented as sum of...,['def sumOfTwoCubes(n):'],import math\n\n\ndef sumOfTwoCubes(n):
2,Generate an N | Python3 program for the above ...,sieve = [1] * (1000000 + 1)\n\n\ndef sieveOfPr...,Generate an N.,"['def sieveOfPrimes():', 'def getArray(arr, N):']",sieve = [1] * (1000000 + 1)\n\n\ndef sieveOfPr...
3,Nth natural number after removing all numbers ...,def findNthNumber(N):\n result = 0\n p =...,Nth natural number after removing all numbers ...,['def findNthNumber(N):'],def findNthNumber(N):
4,Check if an integer is rotation of another giv...,"import math\n\n\ndef check(A, B):\n if (A =...",Check if an integer is rotation of another giv...,"['def check(A, B):']","import math\n\n\ndef check(A, B):"


In [21]:
def build_prompt(question, code_context):
    prompt = f"# {question}\n{code_context}\n"
    instruction = {
      "role": "user",
      "content": prompt,
    }
    llama_prompt = llama_v2_prompt([instruction])
    return llama_prompt

In [22]:
def build_question_context(text, context):
    q_strs = text.split('|')
    q_strs = q_strs[1].split(';')[1:]
    # comments = []
    # for i in range(1, len(q_strs)):
    #     comments.append(f'\\ {i}. {q_strs[i]}')

    # return ' '.join(comments) + '\n' + context
    return context

In [23]:
train_df_300 = train_df[0:300].copy()
train_df_300['context'] = train_df_300.apply(lambda row: build_question_context(row.text, row.code_context), axis=1)
train_df_300['llama_prompt'] = train_df_300.apply(lambda row: build_prompt(row.question, row.code_context), axis=1)
train_df_300.head()

Unnamed: 0_level_0,text,code,question,functions,code_context,context,llama_prompt
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Maximum Prefix Sum possible by merging two giv...,"def maxPresum(a, b):\n X = max(a[0], 0)\n ...",Maximum Prefix Sum possible by merging two giv...,"['def maxPresum(a, b):']","def maxPresum(a, b):","def maxPresum(a, b):",<s>[INST]<<SYS>>\nProvide answers in Python. W...
1,Check if a number can be represented as sum of...,import math\n\n\ndef sumOfTwoCubes(n):\n lo...,Check if a number can be represented as sum of...,['def sumOfTwoCubes(n):'],import math\n\n\ndef sumOfTwoCubes(n):,import math\n\n\ndef sumOfTwoCubes(n):,<s>[INST]<<SYS>>\nProvide answers in Python. W...
2,Generate an N | Python3 program for the above ...,sieve = [1] * (1000000 + 1)\n\n\ndef sieveOfPr...,Generate an N.,"['def sieveOfPrimes():', 'def getArray(arr, N):']",sieve = [1] * (1000000 + 1)\n\n\ndef sieveOfPr...,sieve = [1] * (1000000 + 1)\n\n\ndef sieveOfPr...,<s>[INST]<<SYS>>\nProvide answers in Python. W...
3,Nth natural number after removing all numbers ...,def findNthNumber(N):\n result = 0\n p =...,Nth natural number after removing all numbers ...,['def findNthNumber(N):'],def findNthNumber(N):,def findNthNumber(N):,<s>[INST]<<SYS>>\nProvide answers in Python. W...
4,Check if an integer is rotation of another giv...,"import math\n\n\ndef check(A, B):\n if (A =...",Check if an integer is rotation of another giv...,"['def check(A, B):']","import math\n\n\ndef check(A, B):","import math\n\n\ndef check(A, B):",<s>[INST]<<SYS>>\nProvide answers in Python. W...


In [24]:
prompts = train_df_300['llama_prompt'].values.tolist()
prompts_dic = {'prompt': prompts}
prompts_dataset = Dataset.from_dict(prompts_dic)

In [25]:
prompts_dataset[0]

{'prompt': "<s>[INST]<<SYS>>\nProvide answers in Python. Wrap around code in [PYTHON] and [/PYTHON]. Don't write any comments in generated code.\n<</SYS>>\n\n# Maximum Prefix Sum possible by merging two given arrays.\ndef maxPresum(a, b):[/INST]\n"}

In [26]:
train_dpo_df = pd.read_csv("/kaggle/input/xlcost-python-dpo/train.csv")
train_dpo_df.head()

Unnamed: 0,id,prompt,accepted_code,rejected_code
0,306,<s>[INST]<<SYS>>\nProvide answers in Python. W...,import math\n\n\ndef findSum(n):\n return m...,import math\n\n\ndef findSum(n):\n return (...
1,307,<s>[INST]<<SYS>>\nProvide answers in Python. W...,"def sum(x, y, n):\n sum1 = ((x ** 2) * (x *...","def sum(x, y, n):\n return sum([x * (x + y)..."
2,313,<s>[INST]<<SYS>>\nProvide answers in Python. W...,dp = [[-1 for j in range(500)]for i in range(5...,dp = [[-1 for j in range(500)]for i in range(5...
3,319,<s>[INST]<<SYS>>\nProvide answers in Python. W...,import sys\n\n\ndef maximumOccurrence(s):\n ...,import sys\n\n\ndef maximumOccurrence(s):\n ...
4,320,<s>[INST]<<SYS>>\nProvide answers in Python. W...,mod = 1000000007\ndp = [[[-1 for i in range(2)...,mod = 1000000007\ndp = [[[-1 for i in range(2)...


In [27]:
prompts_dpo_train = train_dpo_df['prompt'].values.tolist()
prompts_dpo_train_dic = {'prompt': prompts_dpo_train}
prompts_dpo_train_dataset = Dataset.from_dict(prompts_dpo_train_dic)

In [28]:
responses_dpo_train = run_code_generation(prompts_dpo_train_dataset, CODE_GENERATION_PATH, n_passes=1)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Total batches = 120


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[2]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[4]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[6]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[8]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[10]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[12]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[14]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[16]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[18]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[20]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[22]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[24]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[26]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[28]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[30]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[32]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[34]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[36]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[38]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[40]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[42]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[44]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[46]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[48]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[50]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[52]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[54]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[56]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[58]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[60]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[62]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[64]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[66]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[68]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[70]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[72]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[74]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[76]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[78]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[80]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[82]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[84]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[86]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[88]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[90]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[92]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[94]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[96]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[98]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[100]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[102]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[104]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[106]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[108]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[110]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[112]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[114]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[116]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[118]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[120]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[122]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[124]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[126]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[128]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[130]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[132]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[134]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[136]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[138]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[140]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[142]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[144]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[146]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[148]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[150]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[152]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[154]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[156]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[158]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[160]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[162]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[164]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[166]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[168]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[170]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[172]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[174]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[176]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[178]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[180]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[182]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[184]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[186]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[188]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[190]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[192]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[194]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[196]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[198]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[200]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[202]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[204]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[206]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[208]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[210]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[212]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[214]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[216]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[218]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[220]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[222]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[224]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[226]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[228]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[230]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[232]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[234]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[236]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[240]

In [29]:
train_dpo_df['response'] = responses_dpo_train
train_dpo_df.head()

Unnamed: 0,id,prompt,accepted_code,rejected_code,response
0,306,<s>[INST]<<SYS>>\nProvide answers in Python. W...,import math\n\n\ndef findSum(n):\n return m...,import math\n\n\ndef findSum(n):\n return (...,<s>[INST]<<SYS>>\nProvide answers in Python. W...
1,307,<s>[INST]<<SYS>>\nProvide answers in Python. W...,"def sum(x, y, n):\n sum1 = ((x ** 2) * (x *...","def sum(x, y, n):\n return sum([x * (x + y)...",<s>[INST]<<SYS>>\nProvide answers in Python. W...
2,313,<s>[INST]<<SYS>>\nProvide answers in Python. W...,dp = [[-1 for j in range(500)]for i in range(5...,dp = [[-1 for j in range(500)]for i in range(5...,<s>[INST]<<SYS>>\nProvide answers in Python. W...
3,319,<s>[INST]<<SYS>>\nProvide answers in Python. W...,import sys\n\n\ndef maximumOccurrence(s):\n ...,import sys\n\n\ndef maximumOccurrence(s):\n ...,<s>[INST]<<SYS>>\nProvide answers in Python. W...
4,320,<s>[INST]<<SYS>>\nProvide answers in Python. W...,mod = 1000000007\ndp = [[[-1 for i in range(2)...,mod = 1000000007\ndp = [[[-1 for i in range(2)...,<s>[INST]<<SYS>>\nProvide answers in Python. W...


In [30]:
train_dpo_df.to_csv('/kaggle/working/train_dpo_response.csv')

In [31]:
responses = run_code_generation(prompts_dataset, CODE_GENERATION_PATH, n_passes=1)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Total batches = 150


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[2]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[4]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[6]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[8]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[10]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[12]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[14]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[16]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[18]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[20]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[22]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[24]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[26]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[28]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[30]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[32]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[34]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[36]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[38]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[40]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[42]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[44]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[46]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[48]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[50]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[52]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[54]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[56]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[58]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[60]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[62]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[64]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[66]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[68]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[70]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[72]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[74]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[76]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[78]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[80]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[82]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[84]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[86]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[88]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[90]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[92]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[94]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[96]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[98]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[100]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[102]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[104]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[106]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[108]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[110]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[112]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[114]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[116]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[118]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[120]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[122]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[124]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[126]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[128]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[130]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[132]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[134]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[136]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[138]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[140]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[142]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[144]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[146]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[148]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[150]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[152]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[154]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[156]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[158]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[160]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[162]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[164]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[166]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[168]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[170]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[172]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[174]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[176]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[178]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[180]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[182]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[184]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[186]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[188]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[190]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[192]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[194]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[196]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[198]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[200]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[202]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[204]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[206]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[208]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[210]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[212]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[214]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[216]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[218]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[220]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[222]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[224]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[226]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[228]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[230]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[232]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[234]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[236]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[238]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[240]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[242]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[244]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[246]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[248]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[250]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[252]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[254]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[256]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[258]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[260]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[262]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[264]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[266]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[268]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[270]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[272]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[274]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[276]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[278]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[280]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[282]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[284]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[286]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[288]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[290]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[292]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[294]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[296]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processed batch[300]

In [32]:
train_df_300['response'] = responses
train_df_300.head()

Unnamed: 0_level_0,text,code,question,functions,code_context,context,llama_prompt,response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,Maximum Prefix Sum possible by merging two giv...,"def maxPresum(a, b):\n X = max(a[0], 0)\n ...",Maximum Prefix Sum possible by merging two giv...,"['def maxPresum(a, b):']","def maxPresum(a, b):","def maxPresum(a, b):",<s>[INST]<<SYS>>\nProvide answers in Python. W...,<s>[INST]<<SYS>>\nProvide answers in Python. W...
1,Check if a number can be represented as sum of...,import math\n\n\ndef sumOfTwoCubes(n):\n lo...,Check if a number can be represented as sum of...,['def sumOfTwoCubes(n):'],import math\n\n\ndef sumOfTwoCubes(n):,import math\n\n\ndef sumOfTwoCubes(n):,<s>[INST]<<SYS>>\nProvide answers in Python. W...,<s>[INST]<<SYS>>\nProvide answers in Python. W...
2,Generate an N | Python3 program for the above ...,sieve = [1] * (1000000 + 1)\n\n\ndef sieveOfPr...,Generate an N.,"['def sieveOfPrimes():', 'def getArray(arr, N):']",sieve = [1] * (1000000 + 1)\n\n\ndef sieveOfPr...,sieve = [1] * (1000000 + 1)\n\n\ndef sieveOfPr...,<s>[INST]<<SYS>>\nProvide answers in Python. W...,<s>[INST]<<SYS>>\nProvide answers in Python. W...
3,Nth natural number after removing all numbers ...,def findNthNumber(N):\n result = 0\n p =...,Nth natural number after removing all numbers ...,['def findNthNumber(N):'],def findNthNumber(N):,def findNthNumber(N):,<s>[INST]<<SYS>>\nProvide answers in Python. W...,<s>[INST]<<SYS>>\nProvide answers in Python. W...
4,Check if an integer is rotation of another giv...,"import math\n\n\ndef check(A, B):\n if (A =...",Check if an integer is rotation of another giv...,"['def check(A, B):']","import math\n\n\ndef check(A, B):","import math\n\n\ndef check(A, B):",<s>[INST]<<SYS>>\nProvide answers in Python. W...,<s>[INST]<<SYS>>\nProvide answers in Python. W...


In [33]:
train_df_300.to_csv('/kaggle/working/train_response.csv')

### Testing on Several Queries

### Code Infilling

For Future...

In [34]:
# from transformers import pipeline
# import torch

# generator = pipeline("text-generation",model="codellama/CodeLlama-7b-hf",torch_dtype=torch.float16, device_map="auto")
# # generator('def remove_non_ascii(s: str) -> str:\n    """ <FILL_ME>\n    return result', max_new_tokens = 128, return_type = 1)

In [35]:
# from transformers import AutoTokenizer, AutoModelForCausalLM
# import transformers
# import torch

# model_id = "codellama/CodeLlama-7b-hf"
# tokenizer2 = AutoTokenizer.from_pretrained(model_id)
# model2 = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     torch_dtype=torch.float16
# ).to("cuda")




In [36]:
# prompt = '''def remove_non_ascii(s: str) -> str:
#     """ <FILL_ME>
#     return result
# '''

# input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda")
# output = model.generate(
#     input_ids,
#     max_new_tokens=200,
# )
# output = output[0].to("cpu")

# filling = tokenizer.decode(output[input_ids.shape[1]:], skip_special_tokens=True)


In [37]:
# print(prompt.replace("<FILL_ME>", filling))