In [27]:
import json
from openai import OpenAI
import os
from dotenv import load_dotenv
from tqdm import tqdm

In [23]:
# Load the source dataset (question entries keyed by id) from JSON.
# The encoding is pinned to UTF-8: without it open() decodes with the platform's
# locale encoding, which can fail or garble non-ASCII text on some systems.
with open('dataset_with_ref.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

In [24]:
# Run load_dotenv() first so that OPENAI_API_KEY can be supplied through the
# environment, then build the client object shared by the API helper functions.
load_dotenv()
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [25]:
# System prompt used by make_answer(): asks the model to act as a StackOverflow
# answerer that explains the question and reproduces the given reference code.
# The role header previously read "JudgeGPT" (copied from the judge prompt),
# contradicting the profile description below; it now names the intended role.
so_gpt_instruction = """
# Role: StackOverflowGPT
## Profile
- Language: English
- Description: You are StackOverflowGPT, capable of writing down the answer with given query and the code.

### Input
- Question: User's question.
- Reference code: Codes that user wants to know.

### Skill
1. Analyzing the given question and understanding the required information.
2. Summarizing the question in natural language
3. Describing the process to get to the reference code from the question.

### Output
- Show your understanding about question and how the reference code will handle the question.
- You have to give exactly the same reference code posed by the user so that they can understand about the code.

### Output Format
Answer like StackOverflow user.

## Rules
1. Don’t break character.

## Workflow
1. Read and understand the question posed by the user.
2. Summarize your understandings about the question.
3. Explain why the reference code will solve the question.

## Reminder
You will always remind yourself of the role settings.
"""

In [26]:
def make_answer(query, reference_code, model="gpt-3.5-turbo-16k", temperature=0.2):
    """Generate a StackOverflow-style answer for a question and its reference code.

    Args:
        query: The question text; interpolated into the user message.
        reference_code: The code the answer should explain; interpolated
            into the user message after the question.
        model: Name of the chat model passed to the completions endpoint.
        temperature: Sampling temperature passed to the completions endpoint.

    Returns:
        The message content of the first choice in the API response.
    """
    system_message = {"role": "system", "content": so_gpt_instruction}
    user_message = {
        "role": "user",
        "content": f'Question:{query}\nReference code:{reference_code}',
    }

    completion = client.chat.completions.create(
        model=model,
        messages=[system_message, user_message],
        temperature=temperature,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [28]:
# For every entry, overwrite the gold document (the entry's reference code)
# with an answer generated by make_answer(). This makes one API call per entry
# and modifies `dataset` in place.
for key, entry in tqdm(dataset.items()):
    query = entry['query']
    positive_context = entry['contexts']['positive']
    # 'gold_document_key' names which document slot holds the reference code.
    gold_key = positive_context['gold_document_key']
    positive_context[gold_key] = make_answer(query, positive_context[gold_key])

100%|██████████| 68/68 [13:40<00:00, 12.07s/it]


## Example

In [55]:
# Show the question text of entry 'q10'; print() renders its embedded newlines.
print(dataset['q10']['query'])

Problem:

I'm trying to slice a PyTorch tensor using a logical index on the columns. I want the columns that correspond to a 1 value in the index vector. Both slicing and logical indexing are possible, but are they possible together? If so, how? My attempt keeps throwing the unhelpful error

TypeError: indexing a tensor with an object of type ByteTensor. The only supported types are integers, slices, numpy scalars and torch.LongTensor or torch.ByteTensor as the only argument.

MCVE
Desired Output

import torch

C = torch.LongTensor([[1, 3], [4, 6]])
# 1 3
# 4 6
Logical indexing on the columns only:

A_log = torch.ByteTensor([1, 0, 1]) # the logical index
B = torch.LongTensor([[1, 2, 3], [4, 5, 6]])
C = B[:, A_log] # Throws error
If the vectors are the same size, logical indexing works:

B_truncated = torch.LongTensor([1, 2, 3])
C = B_truncated[A_log]


A:

<code>
import numpy as np
import pandas as pd
import torch
A_log, B = load_data()
</code>
BEGIN SOLUTION
<code>

</code>
END SOLUTI

In [53]:
# Inspect the positive-context dict stored for entry 'q10'.
dataset['q10']['contexts']['positive']

{'document_1': "\n  In the context of NLP, that means that sequences with variable lengths do not necessarily need to be padded to the same length.\n\n\nThis means that you don't need to pad sequences unless you are doing data batching which is currently the only way to add parallelism in PyTorch. DyNet has a method called autobatching (which is described in detail in this paper) that does batching on the graph operations instead of the data, so this might be what you want to look into.\n\n\n  But, if I want to use PyTorch DataLoader, I need to pad my sequences anyway because the DataLoader only takes tensors - given that me as a total beginner does not want to build some customized collate_fn.\n\n\nYou can use the DataLoader given you write your own Dataset class and you are using batch_size=1. The twist is to use numpy arrays for your variable length sequences (otherwise default_collate will give you a hard time):\n\nfrom torch.utils.data import Dataset\nfrom torch.utils.data.dataloa

In [30]:
# Persist the dataset, including the generated gold documents, as JSON.
with open('./dataset_with_SO_GPT.json', 'w') as out_file:
    json.dump(obj=dataset, fp=out_file)

# Verification

In [56]:
# Reload the saved dataset from disk so verification runs on the persisted data.
with open('dataset_with_SO_GPT.json', 'r') as in_file:
    dataset = json.loads(in_file.read())

In [57]:
# System prompt used by api_call(): instructs the model to act as a judge that
# reads a question plus candidate documents and replies only "[YES]" or "[NO]".
# NOTE(review): the prompt repeatedly refers to "the demonstration", but the
# user message built in api_call() does not include one — confirm this is intended.
judge_gpt_instruction = """
# Role: JudgeGPT
## Profile
- Language: English
- Description: You are JudgeGPT, capable of judging whether a specified number (k) of documents can maximally
support giving a direct, accurate, clear and engaging answer, similar to the answer of the demonstration, closely
related to the core of the user’s specific question(s).

### Input
- Question: The specific question(s).
- Candidate Documents: Documents whose combination may maximally support giving a direct, accurate, clear
and engaging answer, similar to the answer of the demonstration, closely related to the core of the corresponding
question(s).

### Skill
1. Analyzing the given question(s) and understanding the required information.
2. Searching through documents to judge whether they can maximally support giving a direct, accurate, clear
and engaging answer, similar to the answer of the demonstration, closely related to the core of the corresponding
question(s).

### Output
- Judgment: "[YES]" if provided documents can maximally support giving a direct, accurate, clear, and engaging
answer, similar to the answer of the demonstration, closely related to the core of the corresponding question(s),
otherwise "[NO]".

### Output Format
Judgment: [YES] or [NO]

### Output Example
If provided documents can maximally support giving a direct, accurate, clear, and engaging answer, similar to
the answer of the demonstration, closely related to the core of the corresponding question(s), the output should
be as follows: [YES]

## Rules
1. Don’t break character.
2. When outputting final verdict, only providing "[YES]" or "[NO]".
3. Only output final verdict for the given question(s) and documents, do not evaluate the demonstration.
4. Strictly follow the specified output format. Do not answer the given question. Just conduct the specified
judgment task.

## Judgment Criteria (Very Important)
1. Do not allow the length of the documents to influence your evaluation.
2. Be as objective as possible.
3. Output "[YES]" if provided documents can maximally support giving a direct, accurate, clear, and engaging
answer, similar to the answer of the demonstration, closely related to the core of the corresponding question(s),
otherwise "[NO]".

## Workflow
1. Read and understand the questions posed by the user.
2. Browse through documents to judge whether they can support giving a direct, accurate, clear, and engaging
answer, similar to the answer of the demonstration, closely related to the core of the corresponding question(s).
3. Output your final verdict.

## Reminder
You will always remind yourself of the role settings.
"""

In [78]:
def api_call(context, query, model="gpt-3.5-turbo-16k", temperature=0.2):
    """Ask the judge prompt whether `context` supports answering `query`.

    Args:
        context: Candidate document text (or any object with a useful string
            form); interpolated into the user message.
        query: The question text; interpolated into the user message.
        model: Name of the chat model passed to the completions endpoint.
        temperature: Sampling temperature passed to the completions endpoint.

    Returns:
        The message content of the first choice in the API response.
    """
    # The "Candidate Documents" label needs its ":" separator, matching the
    # "Question:" label and the input spec in the system prompt; without it
    # the label ran straight into the document text.
    messages = [
        {"role": "system", "content": judge_gpt_instruction},
        {"role": "user", "content": f'Question:{query}\nCandidate Documents:{context}'},
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )
    return response.choices[0].message.content

## Example 1

In [62]:
# Build the judge inputs for entry 'q40'.
data_key = 'q40'
query = dataset[data_key]['query']
# Copy the positive contexts without the bookkeeping 'gold_document_key' entry.
# The previous pop() removed the key from `dataset` itself, so running this
# cell a second time raised KeyError; the filtered copy is safe to re-run.
positive_context = {
    doc_key: doc
    for doc_key, doc in dataset[data_key]['contexts']['positive'].items()
    if doc_key != 'gold_document_key'
}
positive_context_string = str(list(positive_context.values()))

# Use the first negative context set of the entry for comparison.
negative_context = dataset[data_key]['contexts']['negative'][0]
negative_context_string = str(list(negative_context.values()))

In [63]:
# Judge the stringified list of positive documents against the query.
api_call(positive_context_string, query)

'[YES]'

In [64]:
# Judge the stringified list of negative documents against the query.
api_call(negative_context_string, query)

'[NO]'

## Example 2

In [65]:
# Build the judge inputs for entry 'q15'.
data_key = 'q15'
query = dataset[data_key]['query']
# Copy the positive contexts without the bookkeeping 'gold_document_key' entry.
# The previous pop() removed the key from `dataset` itself, so running this
# cell a second time raised KeyError; the filtered copy is safe to re-run.
positive_context = {
    doc_key: doc
    for doc_key, doc in dataset[data_key]['contexts']['positive'].items()
    if doc_key != 'gold_document_key'
}
positive_context_string = str(list(positive_context.values()))

# Use the first negative context set of the entry for comparison.
negative_context = dataset[data_key]['contexts']['negative'][0]
negative_context_string = str(list(negative_context.values()))

In [66]:
# Judge the stringified list of positive documents against the query.
api_call(positive_context_string, query)

'[NO]'

In [68]:
# Judge the stringified list of negative documents against the query.
api_call(negative_context_string, query)

'[NO]'

## Example 3

In [69]:
# Build the judge inputs for entry 'q50'.
data_key = 'q50'
query = dataset[data_key]['query']
# Copy the positive contexts without the bookkeeping 'gold_document_key' entry.
# The previous pop() removed the key from `dataset` itself, so running this
# cell a second time raised KeyError; the filtered copy is safe to re-run.
positive_context = {
    doc_key: doc
    for doc_key, doc in dataset[data_key]['contexts']['positive'].items()
    if doc_key != 'gold_document_key'
}
positive_context_string = str(list(positive_context.values()))

# Use the first negative context set of the entry for comparison.
negative_context = dataset[data_key]['contexts']['negative'][0]
negative_context_string = str(list(negative_context.values()))

In [79]:
# Judge 'document_2' on its own against the query.
api_call(positive_context['document_2'], query)

'[YES]'

In [89]:
# Judge documents 2, 1 and 3 joined in that order by plain string concatenation
# (no separator is inserted between them).
api_call(positive_context['document_2']+positive_context['document_1']+positive_context['document_3'], query)

'[YES]'

In [84]:
# Judge documents 1, 2 and 3 joined in that order by plain string concatenation
# (no separator is inserted between them).
api_call(positive_context['document_1']+positive_context['document_2']+positive_context['document_3'], query)

'[NO]'

In [90]:
# Judge the stringified list of all positive documents against the query.
api_call(positive_context_string, query)

'[NO]'

In [71]:
# Judge the stringified list of negative documents against the query.
api_call(negative_context_string, query)

'[NO]'