In [138]:
from datasets import load_dataset
from transformers import AutoTokenizer
from rank_bm25 import BM25Okapi
from tqdm import tqdm
from openai import OpenAI
from dotenv import load_dotenv
import os
from ds1000 import DS1000Dataset
import random

In [None]:
class BM25Retriever:
    """BM25 retriever that can also assemble positive/negative document sets.

    ``corpus`` holds the texts that are tokenized and indexed with
    ``BM25Okapi``; ``contexts`` holds the texts that are returned. Entry ``i``
    of one is treated as belonging to entry ``i`` of the other.

    Throughout the class a "rank" is a position in the list of documents
    sorted by descending BM25 score for a query (rank 0 is the best match).
    """

    def __init__(self, contexts, corpus, tokenizer_name="bert-base-uncased"):
        """Tokenize ``corpus`` and build the BM25 index.

        Args:
            contexts: Sequence of texts returned by the retrieval methods.
            corpus: Iterable of texts to index, aligned with ``contexts``.
            tokenizer_name: Name passed to ``AutoTokenizer.from_pretrained``.
        """
        self.contexts = contexts
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

        tokenized_contexts = [self._tokenize(context) for context in tqdm(corpus)]
        self.bm25 = BM25Okapi(tokenized_contexts)
        self.length = len(contexts)

    def _tokenize(self, text):
        """Return the token ids of ``text`` without tokenizer special tokens."""
        # Special tokens would be added to every document and every query, so
        # they carry no retrieval signal and only distort BM25 length
        # normalisation; leave them out of the index and the queries.
        return self.tokenizer(text, add_special_tokens=False)["input_ids"]

    def _rank(self, query):
        """Return ``(document index, score)`` pairs sorted by descending score."""
        scores = self.bm25.get_scores(self._tokenize(query))
        return sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

    def _documents_at(self, doc_indices, ranks):
        """Map ``ranks`` to a ``{"document_<n>": text}`` dict, numbered from 1."""
        return {
            f"document_{position + 1}": self.contexts[doc_indices[rank]]
            for position, rank in enumerate(ranks)
        }

    def get_relevant_documents(self, query: str, topk=5):
        """Return the ``topk`` best-scoring contexts for ``query``.

        Returns:
            A ``(documents, scores)`` tuple of two equally long lists, ordered
            from the highest score to the lowest.
        """
        ranked = self._rank(query)[:topk]
        retrieved_documents = [self.contexts[idx] for idx, _ in ranked]
        doc_scores = [score for _, score in ranked]

        return retrieved_documents, doc_scores

    def generate_unique_numbers(self, min_num, max_num, k):
        """Draw ``k`` distinct random integers from ``[min_num, max_num]``.

        Returns:
            The drawn integers in ascending order (an empty list if ``k <= 0``).

        Raises:
            ValueError: If the range is empty or holds fewer than ``k`` values.
        """
        # Explicit checks instead of ``assert``: assertions are stripped under
        # ``python -O`` and must not be the only guard on the arguments.
        if max_num < min_num:
            raise ValueError("max_num should not be smaller than min_num")
        if (max_num - min_num + 1) < k:
            raise ValueError("k should be smaller than the range of numbers")
        if k <= 0:
            return []

        return sorted(random.sample(range(min_num, max_num + 1), k))

    def get_easy_example_indice(self, k=5):
        """Draw ``k`` sorted ranks from the lower-scoring half of the ranking."""
        return self.generate_unique_numbers(self.length // 2, self.length - 1, k)

    def get_hard_example_indice(self, k=5, t=2):
        """Draw ``t`` ranks from 3..10 plus ``k - t`` ranks from the second quarter.

        The second group comes from ``[length // 4, length // 2]``. Ranks already
        drawn for the first group are excluded, so the result never contains a
        duplicate even when the two ranges overlap (small corpora).

        Returns:
            The drawn ranks in ascending order.

        Raises:
            ValueError: If either range holds too few values.
        """
        hard = self.generate_unique_numbers(3, 10, t)
        taken = set(hard)
        pool = [
            rank
            for rank in range(self.length // 4, self.length // 2 + 1)
            if rank not in taken
        ]
        remaining = max(k - t, 0)
        if len(pool) < remaining:
            raise ValueError("k should be smaller than the range of numbers")

        return sorted(hard + random.sample(pool, remaining))

    def get_contexts(self, query, topk=5):
        """Build one positive and nine negative document sets for ``query``.

        The positive set holds the best-scoring document (the "gold" one) and
        four documents drawn from ranks ``3..topk + 4``, in shuffled order; its
        ``"gold_document_key"`` entry names the key of the gold document. The
        negative list holds seven sets drawn with ``get_easy_example_indice``
        followed by two drawn with ``get_hard_example_indice``.

        Returns:
            ``{"positive": dict, "negative": list of dict}``.
        """
        doc_indices = [idx for idx, _ in self._rank(query)]

        # Shuffle ranks rather than texts so the gold position stays known even
        # if another selected document has exactly the same text.
        positive_ranks = [0] + self.generate_unique_numbers(3, topk + 4, 4)
        random.shuffle(positive_ranks)
        positive = self._documents_at(doc_indices, positive_ranks)
        positive["gold_document_key"] = f"document_{positive_ranks.index(0) + 1}"

        negative = [
            self._documents_at(doc_indices, self.get_easy_example_indice())
            for _ in range(7)
        ]
        negative += [
            self._documents_at(doc_indices, self.get_hard_example_indice())
            for _ in range(2)
        ]

        return {"positive": positive, "negative": negative}

# ChatGPT with JudgeGPT Prompt
Judge whether each query can be answered from the retrieved documents.

In [2]:
class BM25Retriever:
    """BM25 retriever that can also assemble positive/negative document sets.

    This cell re-binds the name ``BM25Retriever``, so it exposes the same public
    methods as the earlier definition of the class in this notebook; instances
    created after this cell runs therefore still provide ``get_contexts``.

    ``corpus`` holds the texts that are tokenized and indexed with
    ``BM25Okapi``; ``contexts`` holds the texts that are returned. Entry ``i``
    of one is treated as belonging to entry ``i`` of the other.

    Throughout the class a "rank" is a position in the list of documents
    sorted by descending BM25 score for a query (rank 0 is the best match).
    """

    def __init__(self, contexts, corpus, tokenizer_name="bert-base-uncased"):
        """Tokenize ``corpus`` and build the BM25 index.

        Args:
            contexts: Sequence of texts returned by the retrieval methods.
            corpus: Iterable of texts to index, aligned with ``contexts``.
            tokenizer_name: Name passed to ``AutoTokenizer.from_pretrained``.
        """
        self.contexts = contexts
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

        tokenized_contexts = [self._tokenize(context) for context in tqdm(corpus)]
        self.bm25 = BM25Okapi(tokenized_contexts)
        self.length = len(contexts)

    def _tokenize(self, text):
        """Return the token ids of ``text`` without tokenizer special tokens."""
        # Special tokens would be added to every document and every query, so
        # they carry no retrieval signal and only distort BM25 length
        # normalisation; leave them out of the index and the queries.
        return self.tokenizer(text, add_special_tokens=False)["input_ids"]

    def _rank(self, query):
        """Return ``(document index, score)`` pairs sorted by descending score."""
        scores = self.bm25.get_scores(self._tokenize(query))
        return sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

    def _documents_at(self, doc_indices, ranks):
        """Map ``ranks`` to a ``{"document_<n>": text}`` dict, numbered from 1."""
        return {
            f"document_{position + 1}": self.contexts[doc_indices[rank]]
            for position, rank in enumerate(ranks)
        }

    def get_relevant_documents(self, query: str, topk=5):
        """Return the ``topk`` best-scoring contexts for ``query``.

        Returns:
            A ``(documents, scores)`` tuple of two equally long lists, ordered
            from the highest score to the lowest.
        """
        ranked = self._rank(query)[:topk]
        retrieved_documents = [self.contexts[idx] for idx, _ in ranked]
        doc_scores = [score for _, score in ranked]

        return retrieved_documents, doc_scores

    def generate_unique_numbers(self, min_num, max_num, k):
        """Draw ``k`` distinct random integers from ``[min_num, max_num]``.

        Returns:
            The drawn integers in ascending order (an empty list if ``k <= 0``).

        Raises:
            ValueError: If the range is empty or holds fewer than ``k`` values.
        """
        if max_num < min_num:
            raise ValueError("max_num should not be smaller than min_num")
        if (max_num - min_num + 1) < k:
            raise ValueError("k should be smaller than the range of numbers")
        if k <= 0:
            return []

        return sorted(random.sample(range(min_num, max_num + 1), k))

    def get_easy_example_indice(self, k=5):
        """Draw ``k`` sorted ranks from the lower-scoring half of the ranking."""
        return self.generate_unique_numbers(self.length // 2, self.length - 1, k)

    def get_hard_example_indice(self, k=5, t=2):
        """Draw ``t`` ranks from 3..10 plus ``k - t`` ranks from the second quarter.

        The second group comes from ``[length // 4, length // 2]``. Ranks already
        drawn for the first group are excluded, so the result never contains a
        duplicate even when the two ranges overlap (small corpora).

        Returns:
            The drawn ranks in ascending order.

        Raises:
            ValueError: If either range holds too few values.
        """
        hard = self.generate_unique_numbers(3, 10, t)
        taken = set(hard)
        pool = [
            rank
            for rank in range(self.length // 4, self.length // 2 + 1)
            if rank not in taken
        ]
        remaining = max(k - t, 0)
        if len(pool) < remaining:
            raise ValueError("k should be smaller than the range of numbers")

        return sorted(hard + random.sample(pool, remaining))

    def get_contexts(self, query, topk=5):
        """Build one positive and nine negative document sets for ``query``.

        The positive set holds the best-scoring document (the "gold" one) and
        four documents drawn from ranks ``3..topk + 4``, in shuffled order; its
        ``"gold_document_key"`` entry names the key of the gold document. The
        negative list holds seven sets drawn with ``get_easy_example_indice``
        followed by two drawn with ``get_hard_example_indice``.

        Returns:
            ``{"positive": dict, "negative": list of dict}``.
        """
        doc_indices = [idx for idx, _ in self._rank(query)]

        # Shuffle ranks rather than texts so the gold position stays known even
        # if another selected document has exactly the same text.
        positive_ranks = [0] + self.generate_unique_numbers(3, topk + 4, 4)
        random.shuffle(positive_ranks)
        positive = self._documents_at(doc_indices, positive_ranks)
        positive["gold_document_key"] = f"document_{positive_ranks.index(0) + 1}"

        negative = [
            self._documents_at(doc_indices, self.get_easy_example_indice())
            for _ in range(7)
        ]
        negative += [
            self._documents_at(doc_indices, self.get_hard_example_indice())
            for _ in range(2)
        ]

        return {"positive": positive, "negative": negative}

In [3]:
# Load variables from a local .env file into the process environment, then
# build the OpenAI client from the OPENAI_API_KEY variable.
load_dotenv()
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
)

In [4]:
# Standalone tokenizer instance plus the Stack Overflow PyTorch Q&A dataset
# that serves as the retrieval corpus below.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
)
dataset = load_dataset(
    "shrinath-suresh/stack_overflow_pytorch",
)

Found cached dataset json (/Users/jykim/.cache/huggingface/datasets/shrinath-suresh___json/shrinath-suresh--stack_overflow_pytorch-7dc8b309e955f356/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
# Walk the training split once, collecting the questions, the answers and the
# combined "Q:...\nA:..." text that gets indexed by BM25.
input_texts, output_texts, corpus = [], [], []
for record in dataset["train"]:
    question, answer = record["input"], record["output"]
    input_texts.append(question)
    output_texts.append(answer)
    corpus.append(f"Q:{question}\nA:{answer}")

In [6]:
# Index the combined question+answer text, but return only the answer text
# for each hit.
bm25 = BM25Retriever(contexts=output_texts, corpus=corpus)

  0%|          | 0/10763 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (988 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 10763/10763 [00:13<00:00, 792.08it/s]


In [7]:
# System prompt for the "JudgeGPT" role. It instructs the model to reply with
# "[YES]" or "[NO]" depending on whether the supplied documents can support an
# answer to the question.
# NOTE(review): the prompt refers to "the demonstration" and to labelled
# "Question" / "Candidate Documents" inputs -- confirm that the user message
# sent alongside this prompt actually provides them in that form.
judge_gpt_instruction = """
# Role: JudgeGPT
## Profile
- Language: English
- Description: You are JudgeGPT, capable of judging whether a specified number (k) of documents can maximally
support giving a direct, accurate, clear and engaging answer, similar to the answer of the demonstration, closely
related to the core of the user’s specific question(s).

### Input
- Question: The specific question(s).
- Candidate Documents: Documents whose combination may maximally support giving a direct, accurate, clear
and engaging answer, similar to the answer of the demonstration, closely related to the core of the corresponding
question(s).

### Skill
1. Analyzing the given question(s) and understanding the required information.
2. Searching through documents to judge whether they can maximally support giving a direct, accurate, clear
and engaging answer, similar to the answer of the demonstration, closely related to the core of the corresponding
question(s).

### Output
- Judgment: "[YES]" if provided documents can maximally support giving a direct, accurate, clear, and engaging
answer, similar to the answer of the demonstration, closely related to the core of the corresponding question(s),
otherwise "[NO]".

### Output Format
Judgment: [YES] or [NO]

### Output Example
If provided documents can maximally support giving a direct, accurate, clear, and engaging answer, similar to
the answer of the demonstration, closely related to the core of the corresponding question(s), the output should
be as follows: [YES]

## Rules
1. Don’t break character.
2. When outputting final verdict, only providing "[YES]" or "[NO]".
3. Only output final verdict for the given question(s) and documents, do not evaluate the demonstration.
4. Strictly follow the specified output format. Do not answer the given question. Just conduct the specified
judgment task.

## Judgment Criteria (Very Important)
1. Do not allow the length of the documents to influence your evaluation.
2. Be as objective as possible.
3. Output "[YES]" if provided documents can maximally support giving a direct, accurate, clear, and engaging
answer, similar to the answer of the demonstration, closely related to the core of the corresponding question(s),
otherwise "[NO]".

## Workflow
1. Read and understand the questions posed by the user.
2. Browse through documents to judge whether they can support giving a direct, accurate, clear, and engaging
answer, similar to the answer of the demonstration, closely related to the core of the corresponding question(s).
3. Output your final verdict.

## Reminder
You will always remind yourself of the role settings.
"""

In [47]:
def api_call(context, query, model="gpt-3.5-turbo-16k", temperature=0.2):
    """Ask the judge model whether ``context`` can answer ``query``.

    Sends ``judge_gpt_instruction`` as the system message and a single user
    message containing the context followed by the query.

    Args:
        context: Text of the retrieved documents, interpolated after "Context:".
        query: The question text, placed on the line after the context.
        model: Chat-completion model name.
        temperature: Sampling temperature passed to the API.

    Returns:
        The text of the first completion choice, or an empty string when the
        API returns no text content.
    """
    messages = [
        {"role": "system", "content": judge_gpt_instruction},
        {"role": "user", "content": f'Context:{context}\n{query}'},
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )
    # ``message.content`` is nullable in the API response; always hand back a
    # string so substring checks such as ``'[YES]' in response`` cannot raise
    # a TypeError.
    return response.choices[0].message.content or ""

In [48]:
# Ask the judge, for every DS-1000 PyTorch problem, whether the retrieved
# documents can answer it, and sort the problems into two lists accordingly.
ds1000_pytorch = DS1000Dataset("ds1000_data")['Pytorch']
yes_problems, no_problems = [], []
for problem in tqdm(ds1000_pytorch):
    # Split the prompt around its "[insert]" marker line(s); marker lines
    # themselves are dropped.
    before_insert, after_insert = [], []
    seen_insert = False
    for line in problem["prompt"].split("\n"):
        if "[insert]" in line:
            seen_insert = True
        elif seen_insert:
            after_insert.append(line)
        else:
            before_insert.append(line)

    prefix = "\n".join(before_insert)
    suffix = "\n".join(after_insert)
    # When text exists on both sides of the marker, the suffix starts with a
    # newline of its own (in addition to the separator added below).
    if before_insert and after_insert:
        suffix = "\n" + suffix

    query = prefix + '\n' + suffix
    context = bm25.get_relevant_documents(query)
    response = api_call(str(context[0]), query)
    bucket = yes_problems if '[YES]' in response else no_problems
    bucket.append(problem)

print(f"Judged to be answered: {len(yes_problems)}")
print(f"Judged not to be answered: {len(no_problems)}")

100%|██████████| 68/68 [01:51<00:00,  1.64s/it]

Judged to be answered: 36
Judged not to be answered: 32





In [49]:
import pickle

# Persist both problem lists so the judging step does not have to be re-run.
for pickle_path, problems in (
    ('./yes_problems.pkl', yes_problems),
    ('./no_problems.pkl', no_problems),
):
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(problems, pickle_file)

# Dataset

In [59]:
# Reload the judged problem lists from disk.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced by this notebook.
with open('./yes_problems.pkl', 'rb') as yes_file:
    yes_problems = pickle.load(yes_file)

with open('./no_problems.pkl', 'rb') as no_file:
    no_problems = pickle.load(no_file)

In [178]:
# Build one record per answerable problem: the query text, the reference
# solution, and the positive/negative context sets from the retriever.
dataset = {}
for number, problem in enumerate(tqdm(yes_problems), start=1):
    # Split the prompt around its "[insert]" marker line(s); marker lines
    # themselves are dropped.
    before_insert, after_insert = [], []
    seen_insert = False
    for line in problem["prompt"].split("\n"):
        if "[insert]" in line:
            seen_insert = True
        elif seen_insert:
            after_insert.append(line)
        else:
            before_insert.append(line)

    prefix = "\n".join(before_insert)
    suffix = "\n".join(after_insert)
    # When text exists on both sides of the marker, the suffix starts with a
    # newline of its own (in addition to the separator added below).
    if before_insert and after_insert:
        suffix = "\n" + suffix

    query = prefix + '\n' + suffix

    cur_data = {
        'query': query,
        'reference_code': problem['reference_code'],
        'contexts': bm25.get_contexts(query),
    }
    dataset[f'q{number}'] = cur_data

100%|██████████| 36/36 [00:27<00:00,  1.30it/s]


In [181]:
import json

# Write the assembled records to dataset.json in the working directory.
with open('dataset.json', 'w') as dataset_file:
    json.dump(dataset, dataset_file)