In [1]:
import json
import os
import re
import io
from rouge_score import rouge_scorer
import tqdm
import random
import dataclasses
from typing import Optional, Sequence, Union
import time
import sys
import logging
from dotenv import load_dotenv
import google.generativeai as genai
import math
import copy
import google.generativeai as genai

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load environment variables from the .env file
load_dotenv()

genai.configure(api_key=os.environ["GEMINI_API"])

# Helper functions

In [3]:
def encode_prompt(prompt_instructions):
    """Encode multiple prompt instructions into a single string."""
    prompt = open("./prompt.txt").read() + "\n"

    for idx, task_dict in enumerate(prompt_instructions):
        (instruction, input, output) = (
            task_dict["instruction"],
            task_dict["input"],
            task_dict["output"],
        )
        instruction = re.sub(r"\s+", " ", instruction).strip().rstrip(":")
        input = "<noinput>" if input.lower() == "" else input
        prompt += f"###\n"
        prompt += f"{idx + 1}. Instruction: {instruction}\n"
        prompt += f"{idx + 1}. Input:\n{input}\n"
        prompt += f"{idx + 1}. Output:\n{output}\n"
    prompt += f"###\n"

    # NOTE: Unbound warning
    prompt += f"{idx + 2}. Instruction:"
    return prompt

In [4]:
def seed_instruction_data_loader():
    seed_tasks_path = "./seed_tasks.jsonl"
    seed_tasks = [json.loads(l) for l in open(seed_tasks_path, "r")]

    seed_instruction_data = [
        {
            "instruction": t["instruction"],
            "input": t["instances"][0]["input"],
            "output": t["instances"][0]["output"],
        }
        for t in seed_tasks
    ]

    print(f"Loaded {len(seed_instruction_data)} human-written seed instructions")
    return seed_instruction_data

In [5]:
def _make_r_io_base(f, mode: str):
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode)
    return f

def jload(f, mode="r"):
    """Load a .json file into a dictionary."""
    f = _make_r_io_base(f, mode)
    jdict = json.load(f)
    f.close()
    return jdict

In [6]:
def machine_instruction_data_loader():
    """
    load generated instruction if available
    """
    machine_instruction_data = []
    if os.path.exists(os.path.join("./", "regen.json")):
        machine_instruction_data = jload(os.path.join("./", "regen.json"))
        print(f"Loaded {len(machine_instruction_data)} machine-generated instructions")

        return machine_instruction_data
    else:
        return {} 

# Arguments for Gemini

In [7]:
@dataclasses.dataclass
class GeminiGenerationArguments:
    max_output_tokens: int = 1800
    temperature: float = 0.2
    top_p: float = 1.0
    # top_k: int = 40  # NOTE: we don't know how many
    candidate_count: int = 1
    stop_sequences: Optional[Sequence[str]] = None
    presence_penalty: float = 0.0
    frequency_penalty: float = 0.0
    response_mime_type: str = "text/plain"  # additional params (json or text)

# Using helper functions 

In [8]:
# some input arguments
request_batch_size = 5
num_prompt_instructions = 3
model_name = "gemini-1.5-flash"

In [9]:
# is a list
seed_instruction_data = seed_instruction_data_loader()

# is a dict 
machine_instruction_data = machine_instruction_data_loader()

Loaded 21 human-written seed instructions


In [10]:
# init some variables
request_idx = 0


In [11]:
request_idx += 1

batch_inputs = []

# we have 5 prompt each time request
for _ in range(request_batch_size):
    # only sampling from the seed tasks
    # choose random 3 samples from seed_instruction_data
    prompt_instructions = random.sample(
        seed_instruction_data, num_prompt_instructions
    )
    prompt = encode_prompt(prompt_instructions)
    batch_inputs.append(prompt)

In [12]:
# batch_inputs is list of multiple prompts
print(batch_inputs[0])

You are asked to come up with a set of 20 diverse code generation task instructions. These task instructions will be given to a GPT model and we will evaluate the GPT model for completing the instructions.

Here are the requirements:
1. Try not to repeat the verb for each instruction to maximize diversity.
2. The language used for the instruction also should be diverse. For example, you should combine questions with imperative instrucitons.
3. The type of instructions should be diverse. The list should include diverse types of programming tasks like open-ended generation, classification, editing, optimization etc.
2. A GPT language model should be able to complete the instruction. For example, do not ask the assistant to create any visual or audio output. For another example, do not ask the assistant to wake you up at 5pm or set a reminder because it cannot perform any action.
3. The instructions should be in English.
4. The instructions should at least 1 to 2 sentences long. Either an

In [13]:
decoding_args = GeminiGenerationArguments(
    temperature=1.0,
    candidate_count=1,
    max_output_tokens=3072,
    top_p=1.0,
    stop_sequences=["\n20", "20.", "20."],
)

In [14]:
# init some variables
request_batch_size = 5
max_batches=sys.maxsize
max_instances=sys.maxsize

In [15]:
prompts = batch_inputs
is_single_prompt = isinstance(prompts, (str, dict))

In [16]:
# check prompt is single or not
is_single_prompt

False

In [17]:
if max_batches < sys.maxsize:
    logging.warning(
        "`max_batches` will be deprecated in the future, please use `max_instances` instead."
        "Setting `max_instances` to `max_batches * batch_size` for now."
    )
    max_instances = max_batches * batch_size

In [18]:
max_instances

9223372036854775807

In [19]:
# NOTE: wtf why i have to do that??
prompts = prompts[:max_instances] 

In [20]:
prompts

['You are asked to come up with a set of 20 diverse code generation task instructions. These task instructions will be given to a GPT model and we will evaluate the GPT model for completing the instructions.\n\nHere are the requirements:\n1. Try not to repeat the verb for each instruction to maximize diversity.\n2. The language used for the instruction also should be diverse. For example, you should combine questions with imperative instrucitons.\n3. The type of instructions should be diverse. The list should include diverse types of programming tasks like open-ended generation, classification, editing, optimization etc.\n2. A GPT language model should be able to complete the instruction. For example, do not ask the assistant to create any visual or audio output. For another example, do not ask the assistant to wake you up at 5pm or set a reminder because it cannot perform any action.\n3. The instructions should be in English.\n4. The instructions should at least 1 to 2 sentences long.

In [21]:
# init some variable
batch_size=1

In [22]:
num_prompts = len(prompts)

# list of list (str)
prompt_batches = [
    prompts[batch_id * batch_size : (batch_id + 1) * batch_size]
    for batch_id in range(int(math.ceil(num_prompts / batch_size)))
]

In [23]:
completions = []

# we use first comment 
for batch_id, prompt_batch in enumerate(prompt_batches):
    break

In [24]:
from dataclasses import asdict

model = genai.GenerativeModel(
    model_name=model_name,
    generation_config=genai.GenerationConfig(**asdict(decoding_args)),
)
# chat_session = model.start_chat(history=[])
# response = chat_session.send_message("Do you think earth is flat?")
response = model.generate_content("who are you")
print(response.text)

I am a large language model, trained by Google.  I'm an AI, and I don't have a name or personal experiences like humans do.  My purpose is to process information and respond to a wide range of prompts and questions in a comprehensive and informative way.



In [25]:
for chunk in response:
    print(chunk.text)

I am a large language model, trained by Google.  I'm an AI, and I don't have a name or personal experiences like humans do.  My purpose is to process information and respond to a wide range of prompts and questions in a comprehensive and informative way.



In [26]:
response.usage_metadata.total_token_count

62

In [27]:
result = {}
shared_kwargs = asdict(decoding_args)
# completion_batch = openai.Completion.create(
#     prompt=prompt_batch, **shared_kwargs
# )
# choices = completion_batch.choices

# for choice in choices:
#     choice["total_tokens"] = completion_batch.usage.total_tokens
# completions.extend(choices)

# NOTE: gemini api instead

model = genai.GenerativeModel(
    model_name=model_name,
    generation_config=genai.GenerationConfig(**shared_kwargs),
)
completion_batch = model.generate_content(prompt_batch)

result["text"] = completion_batch.text
result["total_tokens"] = completion_batch.usage_metadata.total_token_count

In [28]:
completions.append(result)

In [29]:
print(result["text"])

4. Instruction:  Develop a JavaScript function that checks if a given string is a palindrome (reads the same backward as forward), ignoring case and non-alphanumeric characters.
4. Input: "A man, a plan, a canal: Panama"
4. Output: 
```javascript
function isPalindrome(str) {
  // Remove non-alphanumeric characters and convert to lowercase
  str = str.toLowerCase().replace(/[^a-z0-9]/g, "");
  
  // Reverse the string
  const reversedStr = str.split("").reverse().join("");
  
  // Check if the string is equal to its reverse
  return str === reversedStr;
}

console.log(isPalindrome("A man, a plan, a canal: Panama")); // true
```

5. Instruction:  Construct a Python program that reads a CSV file and calculates the average of a specified column.  Assume the file uses a comma as a delimiter.
5. Input:  "data.csv: Name,Age,Score\nAlice,25,85\nBob,30,92\nCharlie,28,78"
5. Output:
```python
import csv

def calculate_average(filename, column_name):
    with open(filename, 'r') as file:
        

In [67]:
completions

[{'text': '###\n4. Instruction:  Develop a Python function that identifies and extracts all email addresses from a given text string.\n4. Input:  "Contact us at support@example.com for assistance, or email sales@example.org for inquiries.  Another email: help.desk@example.co.uk"\n4. Output:\n```python\nimport re\n\ndef extract_emails(text):\n  """Extracts all email addresses from a given text string."""\n  email_regex = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}"\n  emails = re.findall(email_regex, text)\n  return emails\n\ntext = "Contact us at support@example.com for assistance, or email sales@example.org for inquiries.  Another email: help.desk@example.co.uk"\nprint(extract_emails(text))\n```\n###\n5. Instruction:  Construct a JavaScript function to reverse a given string.\n5. Input: <noinput>\n5. Output:\n```javascript\nfunction reverseString(str) {\n  return str.split("").reverse().join("");\n}\n```\n###\n6. Instruction:  Can you create a Java method that checks if a given 

In [30]:
results = completions

# Helper functions

In [50]:
def post_process_gpt3_response(num_prompt_instructions, response):
    if response is None:
        return []
    raw_instructions = f"{num_prompt_instructions+1}. Instruction:" + response["text"]
    raw_instructions = re.split("###", raw_instructions)
    instructions = []
    for idx, inst in enumerate(raw_instructions):
        # if the decoding stops due to length, the last example is likely truncated so we discard it
        # if idx == len(raw_instructions) - 1 and response["finish_reason"] == "length":
        #     continue
        idx += num_prompt_instructions + 1
        splitted_data = re.split(f"{idx}\.\s+(Instruction|Input|Output):", inst)
        if len(splitted_data) != 7:
            continue
        else:
            inst = splitted_data[2].strip()
            input = splitted_data[4].strip()
            input = "" if input.lower() == "<noinput>" else input
            output = splitted_data[6].strip()
        # filter out too short or too long instructions
        if len(inst.split()) <= 3 or len(inst.split()) > 150:
            continue
        # filter based on keywords that are not suitable for language models.
        blacklist = [
            "image",
            "images",
            "graph",
            "graphs",
            "picture",
            "pictures",
            "file",
            "files",
            "map",
            "maps",
            "draw",
            "plot",
            "go to",
            "video",
            "audio",
            "music",
            "flowchart",
            "diagram",
        ]
        blacklist += []
        if any(find_word_in_string(word, inst) for word in blacklist):
            continue
        # We found that the model tends to add "write a program" to some existing instructions, which lead to a lot of such instructions.
        # And it's a bit comfusing whether the model need to write a program or directly output the result.
        # Here we filter them out.
        # Note this is not a comprehensive filtering for all programming instructions.
        if inst.startswith("Write a program"):
            continue
        # filter those starting with punctuation
        if inst[0] in string.punctuation:
            continue
        # filter those starting with non-english character
        if not inst[0].isascii():
            continue
        instructions.append({"instruction": inst, "input": input, "output": output})
    return instructions

In [49]:
instruction_data = []

for result in results:
    new_instructions = post_process_gpt3_response(
        num_prompt_instructions, result
    )
    instruction_data += new_instructions

NameError: name 'post_process_gpt3_response' is not defined

# WTF ?? 

In [72]:
j = post_process_gpt3_response(num_prompt_instructions, results[0])
a

[]

In [31]:
response = results[0]
response

{'text': '4. Instruction:  Develop a JavaScript function that checks if a given string is a palindrome (reads the same backward as forward), ignoring case and non-alphanumeric characters.\n4. Input: "A man, a plan, a canal: Panama"\n4. Output: \n```javascript\nfunction isPalindrome(str) {\n  // Remove non-alphanumeric characters and convert to lowercase\n  str = str.toLowerCase().replace(/[^a-z0-9]/g, "");\n  \n  // Reverse the string\n  const reversedStr = str.split("").reverse().join("");\n  \n  // Check if the string is equal to its reverse\n  return str === reversedStr;\n}\n\nconsole.log(isPalindrome("A man, a plan, a canal: Panama")); // true\n```\n\n5. Instruction:  Construct a Python program that reads a CSV file and calculates the average of a specified column.  Assume the file uses a comma as a delimiter.\n5. Input:  "data.csv: Name,Age,Score\\nAlice,25,85\\nBob,30,92\\nCharlie,28,78"\n5. Output:\n```python\nimport csv\n\ndef calculate_average(filename, column_name):\n    with

In [32]:
if response is None:
    output = []

In [33]:
# get all result instructions
# raw_instructions = f"{num_prompt_instructions+1}. Instruction:" + response["text"]
raw_instructions = response["text"]
raw_instructions = re.split("###", raw_instructions)

# Remember remove '' elemment from raw_instructions

In [34]:
raw_instructions

['4. Instruction:  Develop a JavaScript function that checks if a given string is a palindrome (reads the same backward as forward), ignoring case and non-alphanumeric characters.\n4. Input: "A man, a plan, a canal: Panama"\n4. Output: \n```javascript\nfunction isPalindrome(str) {\n  // Remove non-alphanumeric characters and convert to lowercase\n  str = str.toLowerCase().replace(/[^a-z0-9]/g, "");\n  \n  // Reverse the string\n  const reversedStr = str.split("").reverse().join("");\n  \n  // Check if the string is equal to its reverse\n  return str === reversedStr;\n}\n\nconsole.log(isPalindrome("A man, a plan, a canal: Panama")); // true\n```\n\n5. Instruction:  Construct a Python program that reads a CSV file and calculates the average of a specified column.  Assume the file uses a comma as a delimiter.\n5. Input:  "data.csv: Name,Age,Score\\nAlice,25,85\\nBob,30,92\\nCharlie,28,78"\n5. Output:\n```python\nimport csv\n\ndef calculate_average(filename, column_name):\n    with open(fi

In [35]:
instructions = []

for idx, inst in enumerate(raw_instructions):
    if idx == 1:
        break

In [36]:
inst

'4. Instruction:  Develop a JavaScript function that checks if a given string is a palindrome (reads the same backward as forward), ignoring case and non-alphanumeric characters.\n4. Input: "A man, a plan, a canal: Panama"\n4. Output: \n```javascript\nfunction isPalindrome(str) {\n  // Remove non-alphanumeric characters and convert to lowercase\n  str = str.toLowerCase().replace(/[^a-z0-9]/g, "");\n  \n  // Reverse the string\n  const reversedStr = str.split("").reverse().join("");\n  \n  // Check if the string is equal to its reverse\n  return str === reversedStr;\n}\n\nconsole.log(isPalindrome("A man, a plan, a canal: Panama")); // true\n```\n\n5. Instruction:  Construct a Python program that reads a CSV file and calculates the average of a specified column.  Assume the file uses a comma as a delimiter.\n5. Input:  "data.csv: Name,Age,Score\\nAlice,25,85\\nBob,30,92\\nCharlie,28,78"\n5. Output:\n```python\nimport csv\n\ndef calculate_average(filename, column_name):\n    with open(fil

In [37]:
# idx += num_prompt_instructions + 1
idx = 4

In [38]:
splitted_data = re.split(f"{idx}\.\s+(Instruction|Input|Output):", inst)

In [39]:
splitted_data[3]

'Input'

In [50]:
if len(splitted_data) != 7:
    pass
else:
    inst = splitted_data[2].strip()
    input = splitted_data[4].strip()
    input = "" if input.lower() == "<noinput>" else input
    output = splitted_data[6].strip()

In [41]:
blacklist = [
    "image",
    "images",
    "graph",
    "graphs",
    "picture",
    "pictures",
    "file",
    "files",
    "map",
    "maps",
    "draw",
    "plot",
    "go to",
    "video",
    "audio",
    "music",
    "flowchart",
    "diagram",
]
blacklist += []

In [42]:
def find_word_in_string(w, s):
    return re.compile(r"\b({0})\b".format(w), flags=re.IGNORECASE).search(s)

In [43]:
if any(find_word_in_string(word, inst) for word in blacklist):
    print("black list")

black list


In [44]:
if inst.startswith("Write a program"):
    print("black list")

In [46]:
import string

if inst[0] in string.punctuation:
    print("filter")

In [47]:
instructions.append({"instruction": inst, "input": input, "output": output})
instructions

NameError: name 'output' is not defined

In [51]:
inst

'4. Instruction:  Develop a JavaScript function that checks if a given string is a palindrome (reads the same backward as forward), ignoring case and non-alphanumeric characters.\n4. Input: "A man, a plan, a canal: Panama"\n4. Output: \n```javascript\nfunction isPalindrome(str) {\n  // Remove non-alphanumeric characters and convert to lowercase\n  str = str.toLowerCase().replace(/[^a-z0-9]/g, "");\n  \n  // Reverse the string\n  const reversedStr = str.split("").reverse().join("");\n  \n  // Check if the string is equal to its reverse\n  return str === reversedStr;\n}\n\nconsole.log(isPalindrome("A man, a plan, a canal: Panama")); // true\n```\n\n5. Instruction:  Construct a Python program that reads a CSV file and calculates the average of a specified column.  Assume the file uses a comma as a delimiter.\n5. Input:  "data.csv: Name,Age,Score\\nAlice,25,85\\nBob,30,92\\nCharlie,28,78"\n5. Output:\n```python\nimport csv\n\ndef calculate_average(filename, column_name):\n    with open(fil

# We already create results from Gemini API, now try to filter dataset 

In [1]:
# first load dataset
import json

with open('regen.json', 'r') as f:
    data = json.load(f)

print(data)

[{'text': '###\n4. Instruction:  Develop a JavaScript function to check if a given string is a palindrome.  Ignore case and non-alphanumeric characters.\n4. Input: "A man, a plan, a canal: Panama"\n4. Output: \n```javascript\nfunction isPalindrome(str) {\n  // Convert to lowercase and remove non-alphanumeric characters\n  str = str.toLowerCase().replace(/[^a-z0-9]/g, "");\n  // Reverse the string\n  const reversedStr = str.split("").reverse().join("");\n  // Check if the string is equal to its reverse\n  return str === reversedStr;\n}\n\nconsole.log(isPalindrome("A man, a plan, a canal: Panama")); // true\n```\n\n###\n5. Instruction:  Could you craft a C++ program to calculate the factorial of a given integer? Handle potential errors gracefully.\n5. Input: <noinput>\n5. Output:\n```cpp\n#include <iostream>\n\nlong long factorial(int n) {\n  if (n < 0) {\n    throw std::invalid_argument("Factorial is not defined for negative numbers.");\n  } else if (n == 0) {\n    return 1;\n  } else {

In [9]:
first_sample = data[0]

In [53]:
import re
import string

# try with post_process_gpt3_response first
num_prompt_instructions = 3
response = first_sample

In [42]:
raw_instructions = response["text"]
raw_instructions = re.split("###", raw_instructions)
raw_instructions = [s for s in raw_instructions if s!='']

In [43]:
raw_instructions 

['\n4. Instruction:  Develop a JavaScript function to check if a given string is a palindrome.  Ignore case and non-alphanumeric characters.\n4. Input: "A man, a plan, a canal: Panama"\n4. Output: \n```javascript\nfunction isPalindrome(str) {\n  // Convert to lowercase and remove non-alphanumeric characters\n  str = str.toLowerCase().replace(/[^a-z0-9]/g, "");\n  // Reverse the string\n  const reversedStr = str.split("").reverse().join("");\n  // Check if the string is equal to its reverse\n  return str === reversedStr;\n}\n\nconsole.log(isPalindrome("A man, a plan, a canal: Panama")); // true\n```\n\n',
 '\n5. Instruction:  Could you craft a C++ program to calculate the factorial of a given integer? Handle potential errors gracefully.\n5. Input: <noinput>\n5. Output:\n```cpp\n#include <iostream>\n\nlong long factorial(int n) {\n  if (n < 0) {\n    throw std::invalid_argument("Factorial is not defined for negative numbers.");\n  } else if (n == 0) {\n    return 1;\n  } else {\n    long

In [44]:
len(raw_instructions)

16

In [51]:
def find_word_in_string(w, s):
    return re.compile(r"\b({0})\b".format(w), flags=re.IGNORECASE).search(s)

def post_process_gemini_response(num_prompt_instructions, response):
    if response is None:
        return []

    # FIX: already change this code
    raw_instructions =  response["text"]
    raw_instructions = re.split("###", raw_instructions)

    # NOTE: add this code 
    raw_instructions = [s for s in raw_instructions if s!=''] # filter out empty string

    instructions = []
    for idx, inst in enumerate(raw_instructions):
        idx += num_prompt_instructions + 1
        splitted_data = re.split(f"{idx}\.\s+(Instruction|Input|Output):", inst)
        if len(splitted_data) != 7:
            continue
        else:
            inst = splitted_data[2].strip()
            input = splitted_data[4].strip()
            input = "" if input.lower() == "<noinput>" else input
            output = splitted_data[6].strip()

        # filter out too short or too long instructions
        if len(inst.split()) <= 3 or len(inst.split()) > 150:
            continue
        # filter based on keywords that are not suitable for language models.
        blacklist = [
            "image",
            "images",
            "graph",
            "graphs",
            "picture",
            "pictures",
            "file",
            "files",
            "map",
            "maps",
            "draw",
            "plot",
            "go to",
            "video",
            "audio",
            "music",
            "flowchart",
            "diagram",
        ]
        blacklist += []
        if any(find_word_in_string(word, inst) for word in blacklist):
            continue
        # We found that the model tends to add "write a program" to some existing instructions, which lead to a lot of such instructions.
        # And it's a bit comfusing whether the model need to write a program or directly output the result.
        # Here we filter them out.
        # Note this is not a comprehensive filtering for all programming instructions.
        if inst.startswith("Write a program"):
            continue
        # filter those starting with punctuation
        if inst[0] in string.punctuation:
            continue
        # filter those starting with non-english character
        if not inst[0].isascii():
            continue
        instructions.append({"instruction": inst, "input": input, "output": output})

    return instructions

In [54]:
instruction = post_process_gemini_response(3, first_sample)

In [56]:
instruction[0]

{'instruction': 'Develop a JavaScript function to check if a given string is a palindrome.  Ignore case and non-alphanumeric characters.',
 'input': '"A man, a plan, a canal: Panama"',
 'output': '```javascript\nfunction isPalindrome(str) {\n  // Convert to lowercase and remove non-alphanumeric characters\n  str = str.toLowerCase().replace(/[^a-z0-9]/g, "");\n  // Reverse the string\n  const reversedStr = str.split("").reverse().join("");\n  // Check if the string is equal to its reverse\n  return str === reversedStr;\n}\n\nconsole.log(isPalindrome("A man, a plan, a canal: Panama")); // true\n```'}

In [48]:
output

'```javascript\nfunction isPalindrome(str) {\n  // Convert to lowercase and remove non-alphanumeric characters\n  str = str.toLowerCase().replace(/[^a-z0-9]/g, "");\n  // Reverse the string\n  const reversedStr = str.split("").reverse().join("");\n  // Check if the string is equal to its reverse\n  return str === reversedStr;\n}\n\nconsole.log(isPalindrome("A man, a plan, a canal: Panama")); // true\n```'