In [1]:
import os
import numpy as np
import pandas as pd 

import hydra
from omegaconf import DictConfig
import datetime
from tqdm import tqdm
import wandb

import sys

import torch
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_kbit_training,
    set_peft_model_state_dict,
)
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset

# Load the tokenizer for the `codellama/CodeLlama-7b-hf` checkpoint. This single
# module-level object is configured (pad token, EOS handling, padding side) and
# reused by the cells below, so changes made to it in one cell carry over to the next.
tokenizer =  AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
  

  from .autonotebook import tqdm as notebook_tqdm
  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [2]:
# Hand-written example pair for the "insert print statements" task.
# `initial_solution` is the program as it is shown to the model.
initial_solution = """def algorithm(n):
    n = train_samples.shape[1]
    aug_matrix = np.hstack((train_samples, train_parity.reshape(1, -1)))
    return test_parity
"""
# `assistant_response` is the same program with one added `print(n)` line,
# i.e. the completion the model is expected to produce for the program above.
assistant_response = """def algorithm(n):
    n = train_samples.shape[1]
    print(n)
    aug_matrix = np.hstack((train_samples, train_parity.reshape(1, -1)))
    return test_parity
"""

In [7]:
# messages = get_messages(initial_solution=initial_solution, assistant_response=assistant_response)
# example = format_llama_message(*messages)

'[INST] <<SYS>>\nYou are an expert computer science researcher and programmer, especially skilled at debugging algorithms.\n<</SYS>>\n\nYou are given the following Python program:\n```python\ndef algorithm(n):\n    n = train_samples.shape[1]\n    aug_matrix = np.hstack((train_samples, train_parity.reshape(1, -1)))\n    return test_parity\n\n```\nInsert print statements in the program that will help me debug and improve the program. [/INST] def algorithm(n):\n    n = train_samples.shape[1]\n    print(n)\n    aug_matrix = np.hstack((train_samples, train_parity.reshape(1, -1)))\n    return test_parity\n'

In [10]:


# Load the local JSON file as a dataset; its "train" split is indexed below.
dataset = load_dataset("json", data_files="printdata.json")
# dataset


# Use the "text" field of the first training record as the sample to inspect.
example = dataset["train"][0]["text"]

print(example)
# tokenizer.pad_token = "[PAD]"
# Tokenizer settings applied before tokenizing. These mutate the shared
# `tokenizer` object, so they stay in effect for every cell executed afterwards.
tokenizer.add_eos_token = True
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"
# NOTE(review): `tokenize` is not defined in any cell visible in this notebook —
# presumably it comes from a deleted cell or a project module; confirm before
# relying on Restart & Run All. A later cell reads tok["input_ids"] and tok["labels"].
tok = tokenize(example, tokenizer)
tok


[INST] <<SYS>>
You are an expert computer science researcher and programmer, especially skilled at debugging algorithms.
<</SYS>>

You are given the following Python program:
```python
def algorithm(n):
    n = train_samples.shape[1]
    aug_matrix = np.hstack((train_samples, train_parity.reshape(1, -1)))
    return test_parity

```
Insert print statements in the program that will help me debug and improve the program. [/INST] def algorithm(n):
    n = train_samples.shape[1]
    print(n)
    aug_matrix = np.hstack((train_samples, train_parity.reshape(1, -1)))
    return test_parity



{'input_ids': [1, 518, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 385, 17924, 6601, 10466, 5925, 261, 322, 27922, 29892, 7148, 2071, 24455, 472, 13490, 14009, 29889, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 3492, 526, 2183, 278, 1494, 5132, 1824, 29901, 13, 28956, 4691, 13, 1753, 5687, 29898, 29876, 1125, 13, 1678, 302, 353, 7945, 29918, 27736, 29889, 12181, 29961, 29896, 29962, 13, 1678, 11307, 29918, 5344, 353, 7442, 29889, 29882, 1429, 3552, 14968, 29918, 27736, 29892, 7945, 29918, 862, 537, 29889, 690, 14443, 29898, 29896, 29892, 448, 29896, 4961, 13, 1678, 736, 1243, 29918, 862, 537, 13, 13, 28956, 13, 17491, 1596, 9506, 297, 278, 1824, 393, 674, 1371, 592, 4744, 322, 11157, 278, 1824, 29889, 518, 29914, 25580, 29962, 822, 5687, 29898, 29876, 1125, 13, 1678, 302, 353, 7945, 29918, 27736, 29889, 12181, 29961, 29896, 29962, 13, 1678, 1596, 29898, 29876, 29897, 13, 1678, 11307, 29918, 5344, 353, 7442, 29889, 29882, 1429, 3552, 14968, 29918, 27736, 29892, 7945, 29918, 8

In [4]:
# Round-trip check: decode the token ids and the labels of `tok` back to text.
# NOTE(review): in the recorded output the decoded labels start with the full
# prompt, so the prompt tokens do not appear to be masked here — confirm this is intended.
print(tokenizer.decode(tok["input_ids"]))
print(tokenizer.decode(tok["labels"]))

<s> [INST] <<SYS>>
You are an expert computer science researcher and programmer, especially skilled at debugging algorithms.
<</SYS>>

You are given the following Python program:
```python
def algorithm(n):
    n = train_samples.shape[1]
    aug_matrix = np.hstack((train_samples, train_parity.reshape(1, -1)))
    return test_parity

```
Insert print statements in the program that will help me debug and improve the program. [/INST] def algorithm(n):
    n = train_samples.shape[1]
    print(n)
    aug_matrix = np.hstack((train_samples, train_parity.reshape(1, -1)))
    return test_parity
</s>
<s> [INST] <<SYS>>
You are an expert computer science researcher and programmer, especially skilled at debugging algorithms.
<</SYS>>

You are given the following Python program:
```python
def algorithm(n):
    n = train_samples.shape[1]
    aug_matrix = np.hstack((train_samples, train_parity.reshape(1, -1)))
    return test_parity

```
Insert print statements in the program that will help me debug 

In [None]:


# Load the train split of the text-to-SQL dataset and cut it down for a quick
# debugging run. This rebinds `dataset`, which an earlier cell used for the
# local JSON dataset.
dataset = load_dataset("b-mc2/sql-create-context", split="train")
subset_dataset = dataset.select(range(5))
# NOTE(review): train and eval both select row 0 of the subset, i.e. the same
# single example — presumably deliberate for a smoke test; confirm.
train_dataset = subset_dataset.select(range(1))
eval_dataset = subset_dataset.select(range(1))

# Special-token strings; neither is referenced in the rest of this cell.
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"

# add eos token
# tokenizer.add_eos_token = True
# tokenizer.pad_token_id = 0
# tokenizer.padding_side = "left"

# tokenize
# NOTE(review): `generate_and_tokenize_prompt` is not defined in the cells visible
# here; judging by the recorded output it adds input_ids / attention_mask / labels
# columns — confirm where it comes from.
tokenized_train_dataset = train_dataset.map(lambda data_point: generate_and_tokenize_prompt(data_point, tokenizer))
tokenized_val_dataset = eval_dataset.map(lambda data_point: generate_and_tokenize_prompt(data_point, tokenizer))

# torchrun --nproc_per_node=1 finetune.py model.codellama_7b
# python finetune.py --config-name=model/codellama_7b


In [None]:
# Print every column of the first tokenized training row next to its name, to
# see both the raw fields and the columns added by tokenization.
for feature in tokenized_train_dataset.features:
    print(feature, tokenized_train_dataset[0][feature])

answer SELECT COUNT(*) FROM head WHERE age > 56
context CREATE TABLE head (age INTEGER)
question How many heads of the departments are older than 56 ?
input_ids [1, 887, 526, 263, 13988, 1426, 29899, 517, 29899, 4176, 1904, 29889, 3575, 4982, 338, 304, 1234, 5155, 1048, 263, 2566, 29889, 887, 526, 2183, 263, 1139, 322, 3030, 11211, 697, 470, 901, 6131, 29889, 13, 13, 3492, 1818, 1962, 278, 3758, 2346, 393, 6089, 278, 1139, 29889, 13, 13, 2277, 29937, 10567, 29901, 13, 5328, 1784, 15883, 310, 278, 5840, 1860, 526, 9642, 1135, 29871, 29945, 29953, 1577, 13, 13, 2277, 29937, 15228, 29901, 13, 27045, 10911, 2343, 313, 482, 2672, 4330, 17070, 29897, 13, 13, 2277, 29937, 13291, 29901, 13, 6404, 21122, 22798, 3895, 2343, 5754, 5046, 1405, 29871, 29945, 29953, 13, 2]
attention_mask [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [None]:
# Decode the first row's input ids back to text to check the assembled prompt.
print(tokenizer.decode(tokenized_train_dataset[0]["input_ids"]))

<s> You are a powerful text-to-SQL model. Your job is to answer questions about a database. You are given a question and context regarding one or more tables.

You must output the SQL query that answers the question.

### Input:
How many heads of the departments are older than 56 ?

### Context:
CREATE TABLE head (age INTEGER)

### Response:
SELECT COUNT(*) FROM head WHERE age > 56
</s>


In [None]:
# Decode the first row's labels back to text. In the recorded output this is
# identical to the decoded input ids, i.e. the prompt part is not masked out.
print(tokenizer.decode(tokenized_train_dataset[0]["labels"]))

<s> You are a powerful text-to-SQL model. Your job is to answer questions about a database. You are given a question and context regarding one or more tables.

You must output the SQL query that answers the question.

### Input:
How many heads of the departments are older than 56 ?

### Context:
CREATE TABLE head (age INTEGER)

### Response:
SELECT COUNT(*) FROM head WHERE age > 56
</s>


In [None]:
import io 
import json
import copy

# Label value for positions that should be excluded from the loss
# (-100 is the default `ignore_index` of PyTorch's cross-entropy loss).
IGNORE_INDEX = -100
# Special-token strings; only DEFAULT_PAD_TOKEN is referenced later in this cell.
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"
# Two prompt templates filled with `str.format_map`:
#   - "prompt_input" expects the keys {instruction} and {input};
#   - "prompt_no_input" expects only {instruction}.
# Both end with "### Response:" and no trailing whitespace, so any text
# concatenated after the prompt starts directly behind the colon.
PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    ),
}


# Set the pad token on the shared tokenizer; `_tokenize_fn` below reads
# `tokenizer.pad_token_id` to count non-padding tokens.
# NOTE(review): whether "[PAD]" exists in this tokenizer's vocabulary is not
# shown here — confirm which id it resolves to.
tokenizer.pad_token = DEFAULT_PAD_TOKEN

def _tokenize_fn(strings, tokenizer):
    """Tokenize a list of strings."""
    tokenized_list = [
        tokenizer(
            text,
            return_tensors="pt",
            padding="longest",
            max_length=2000,
            truncation=True,
        )
        for text in strings
    ]
    input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
    input_ids_lens = labels_lens = [
        tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() for tokenized in tokenized_list
    ]
    return dict(
        input_ids=input_ids,
        labels=labels,
        input_ids_lens=input_ids_lens,
        labels_lens=labels_lens,
    )



def preprocess(
    sources,
    targets,
    tokenizer,
):
    """Tokenize prompt/response pairs and mask the prompt part of the labels.

    Each source is concatenated with its target and tokenized; the sources are
    also tokenized on their own to learn how many leading tokens belong to the
    prompt. Those leading positions are overwritten with ``IGNORE_INDEX`` in a
    deep copy of the ids, which becomes the labels.

    Args:
        sources: sequence of prompt strings.
        targets: sequence of response strings, paired with ``sources`` by position.
        tokenizer: tokenizer forwarded to ``_tokenize_fn``.

    Returns:
        dict with ``input_ids`` (list of id tensors for source + target) and
        ``labels`` (same shape, prompt positions set to ``IGNORE_INDEX``).
    """
    # zip stops at the shorter input, so surplus sources or targets are
    # dropped silently; callers should pass lists of equal length.
    examples = [source + target for source, target in zip(sources, targets)]
    examples_tokenized = _tokenize_fn(examples, tokenizer)
    sources_tokenized = _tokenize_fn(sources, tokenizer)

    input_ids = examples_tokenized["input_ids"]
    # Deep copy so that masking the labels does not modify the input ids.
    labels = copy.deepcopy(input_ids)
    for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
        # Use the shared constant rather than a bare -100 literal.
        label[:source_len] = IGNORE_INDEX
    return dict(input_ids=input_ids, labels=labels)

def _make_r_io_base(f, mode: str):
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode)
    return f


def jload(f, mode="r"):
    """Load a .json file and return the parsed content.

    Args:
        f: path to a JSON file, or an already-open ``io.IOBase`` stream.
        mode: mode used to open ``f`` when it is a path.

    Returns:
        Whatever ``json.load`` produces for the file: a dict, a list, or any
        other JSON value (the caller in this notebook slices the result as a list).

    Raises:
        json.JSONDecodeError: if the content is not valid JSON.
        OSError: if the path cannot be opened.

    Note:
        The stream is closed before returning, including a stream supplied by
        the caller.
    """
    f = _make_r_io_base(f, mode)
    # try/finally so the handle is released even when parsing fails; the
    # previous version leaked the open file on a JSON error.
    try:
        return json.load(f)
    finally:
        f.close()


list_data_dict = jload("data/debug_data.json")

# Work on the first 10 records only. Sources and targets are both built from
# this one slice so they stay aligned; previously only `sources` was sliced and
# the lists only matched up because `zip` inside `preprocess` truncates.
debug_examples = list_data_dict[:10]

prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]
# Records with a non-empty "input" field get the template that has an input section.
sources = [
    prompt_input.format_map(example) if example.get("input", "") != "" else prompt_no_input.format_map(example)
    for example in debug_examples
]
# Each target is the expected output followed by the tokenizer's EOS token.
targets = [f"{example['output']}{tokenizer.eos_token}" for example in debug_examples]
assert len(sources) == len(targets), "every prompt needs exactly one target"


data_dict = preprocess(sources, targets, tokenizer)


In [None]:
# Show the formatted prompt for example 6; it ends at "### Response:".
print(sources[6])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Explain why the following fraction is equivalent to 1/4

### Input:
4/16

### Response:


In [None]:
# Show the target text for the same example; it carries the EOS token at the end.
print(targets[6])

The fraction 4/16 is equivalent to 1/4 because both numerators and denominators are divisible by 4. Dividing both the top and bottom numbers by 4 yields the fraction 1/4.</s>


In [None]:
# Decode the tokenized prompt + target for example 6. In the recorded output the
# response follows "### Response:" with no separator, because the prompt
# template ends at the colon and the target is concatenated directly.
print(tokenizer.decode(data_dict["input_ids"][6]))

<s> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Explain why the following fraction is equivalent to 1/4

### Input:
4/16

### Response:The fraction 4/16 is equivalent to 1/4 because both numerators and denominators are divisible by 4. Dividing both the top and bottom numbers by 4 yields the fraction 1/4.</s>


In [None]:
# Program text substituted into the chat templates below.
solution = """def algorithm(train_samples, train_parity, test_samples):
    n = train_samples.shape[1]
    aug_matrix = np.hstack((train_samples, train_parity.reshape(1, -1)))
    return test_parity"""

# NOTE(review): `human_message` and `assistant_message` are not defined in any
# cell visible here — presumably templates containing a {solution} placeholder.
# Both names are rebound to their formatted result, so the cell is not
# idempotent: a second run formats the already-formatted strings.
human_message = human_message.format(solution=solution)
assistant_message = assistant_message.format(solution=solution)

<s>[INST] <<SYS>>
You are an expert computer science researcher and programmer, especially skilled at debugging algorithms.
<</SYS>>

You are given the following Python program:

```python
def algorithm(train_samples, train_parity, test_samples):
    n = train_samples.shape[1]
    aug_matrix = np.hstack((train_samples, train_parity.reshape(1, -1)))
    return test_parity
```
Insert print statements in the program that will help me debug and improve the program. [/INST] def algorithm(train_samples, train_parity, test_samples):
    n = train_samples.shape[1]
    aug_matrix = np.hstack((train_samples, train_parity.reshape(1, -1)))
    return test_parity </s>
