In [1]:
import os
import pickle

import numpy as np
import openai
import pandas as pd
from transformers import GPT2TokenizerFast

# GPT-2 tokenizer, used below to count tokens in prompts and separators.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# Never commit API keys to a notebook: read the key from the environment instead.
# NOTE(review): the key that was previously hard-coded here has been exposed and should be revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this notebook.")

# Model used to generate the final answers.
COMPLETIONS_MODEL = "text-davinci-002"

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# The processed dataset is hosted for direct download, so it does not have to be recreated.
# It is already split into sections: one row per section of a Wikipedia page.

df = pd.read_csv(
    'https://cdn.openai.com/API/examples/data/olympics_sections_text.csv'
).set_index(["title", "heading"])


print(f"{len(df)} rows in the data.")
print(f"{df['tokens'].sum()} tokens in the data")

3964 rows in the data.
503661 tokens in the data


In [3]:
# Engine family shared by the document-side and query-side embedding models.
MODEL_NAME = "curie"

DOC_EMBEDDINGS_MODEL = "text-search-" + MODEL_NAME + "-doc-001"
QUERY_EMBEDDINGS_MODEL = "text-search-" + MODEL_NAME + "-query-001"

# Per-token prices in dollars, i.e. 0.02 per thousand tokens.
COST_PER_EMBEDDING = 0.02 * 1e-3
COST_PER_COMPLETION = 0.02 * 1e-3

# Estimated price of embedding every token in the dataset.
print(f"${df['tokens'].sum() * COST_PER_EMBEDDING:.2f} for creating embedding")


$10.07 for creating embedding


In [4]:
# We load the pre-computed embeddings 
def load_embeddings(fname: str) -> dict[tuple[str, str], list[float]]:
    """
    Load pre-computed document embeddings and their keys from a CSV file.

    fname is the path to a CSV whose header has exactly these named columns:
        "title", "heading", "0", "1", ... one per embedding dimension.

    Returns a dict mapping each (title, heading) pair to its embedding vector.
    """
    table = pd.read_csv(fname, header=0)

    # Every column other than the two key columns is named after a dimension index.
    key_columns = ("title", "heading")
    highest_dim = max([int(name) for name in table.columns if name not in key_columns])
    dim_columns = [str(i) for i in range(highest_dim + 1)]

    embeddings = {}
    for _, row in table.iterrows():
        embeddings[(row["title"], row["heading"])] = [row[col] for col in dim_columns]
    return embeddings

# Load the hosted, pre-computed embeddings directly to avoid wasting limited OpenAI credits.
document_embeddings = load_embeddings(
    "https://cdn.openai.com/API/examples/data/olympics_sections_document_embeddings.csv"
)

In [5]:
# Query embeddings

# Text inserted in front of every context section placed in the prompt.
SEPARATOR = "\n* "
# Token budget for the context sections of a prompt.
MAX_SECTION_LEN = 500

# Tokens consumed by the separator itself, charged once per chosen section.
separator_len = len(tokenizer.tokenize(SEPARATOR))

def count_tokens(text: str) -> int:
    """Return how many tokens the module-level tokenizer produces for `text`."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)


def get_embedding(text: str, model: str) -> list[float]:
    """Request an embedding of `text` from the given OpenAI model and return the vector."""
    response = openai.Embedding.create(input=text, model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

def get_query_embedding(text: str) -> list[float]:
    """
    Embed a query string with the query-side embeddings model.

    Returns the vector that QUERY_EMBEDDINGS_MODEL produces for `text`.
    """
    return get_embedding(text, QUERY_EMBEDDINGS_MODEL)

def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Score how similar two vectors are using their dot product.

    Cosine similarity would also work; in practice we have found the choice
    makes little difference.
    """
    left = np.array(x)
    right = np.array(y)
    return left.dot(right)

def order_document_sections_by_query_similarity(
    query: str,
    contexts: dict[tuple[str, str], list[float]],
) -> list[tuple[float, tuple[str, str]]]:
    """
    Rank document sections by how similar their embeddings are to the query.

    The query is embedded with the query embeddings model and compared against
    every pre-calculated document embedding in `contexts`.

    query: the text to search for.
    contexts: mapping from a section's key to its embedding vector.

    Returns a list of (similarity, section key) pairs, most similar first.
    """
    query_embedding = get_query_embedding(query)

    scored_sections = [
        (vector_similarity(query_embedding, doc_embedding), doc_index)
        for doc_index, doc_embedding in contexts.items()
    ]

    # Tuples compare on similarity first; reverse=True puts the best match at the front.
    return sorted(scored_sections, reverse=True)

def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> str:
    """
    Build the completion prompt for a question from the most relevant document sections.

    Sections are ranked by similarity to the question and added, most relevant first,
    until the next one would push the running total past MAX_SECTION_LEN tokens. Each
    section is charged its `tokens` value plus the separator's token count.

    question: the question to answer.
    context_embeddings: mapping from a section's index in `df` to its embedding.
    df: frame where `df.loc[section_index]` yields a row with `tokens` and `content`.

    Returns the instruction header, the chosen sections, and the question in
    "Q: ... A:" form, ready to send to the completions model.
    """
    most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)

    chosen_sections = []
    chosen_sections_len = 0

    for _, section_index in most_relevant_document_sections:
        document_section = df.loc[section_index]

        # Stop at the first section that does not fit in the budget.
        chosen_sections_len += document_section.tokens + separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            break

        # Flatten newlines inside the section before appending it after the separator.
        chosen_sections.append(SEPARATOR + document_section.content.replace("\n", " "))

    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""

    return header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"

In [6]:
# Keyword arguments passed along with the prompt on each completion request.
COMPLETIONS_API_PARAMS = dict(
    # A temperature of 0.0 gives the most predictable, factual answer.
    temperature=0.0,
    max_tokens=300,
    model=COMPLETIONS_MODEL,
    stop=["\n\n"],
)

def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict[tuple[str, str], list[float]],
    show_prompt: bool = False
) -> str:
    """
    Answer a question using the most relevant document sections as context.

    query: the question to answer.
    df: frame of document sections, passed through to construct_prompt.
    document_embeddings: mapping from a section's key to its embedding vector.
    show_prompt: when True, print the full prompt before sending it.

    Returns the completion text with surrounding spaces and newlines stripped.
    """
    prompt = construct_prompt(query, document_embeddings, df)

    if show_prompt:
        print(prompt)

    response = openai.Completion.create(prompt=prompt, **COMPLETIONS_API_PARAMS)

    return response["choices"][0]["text"].strip(" \n")


In [7]:
answer_query_with_context(
    query="Which country won the most medals in the 2020 olympics?",
    df=df,
    document_embeddings=document_embeddings,
)

'The United States won the most medals overall, with 113, and the most gold medals, with 39.'

In [8]:
answer_query_with_context(
    query="Where was the most expensive olympic games held?",
    df=df,
    document_embeddings=document_embeddings,
)

'The most expensive olympic games were held in Russia.'

In [9]:
answer_query_with_context(
    query="Which is the only city in asia to have hosted the olympics more than once?",
    df=df,
    document_embeddings=document_embeddings,
)

'Tokyo'

In [10]:
answer_query_with_context(
    query="Which new events were introduced for the 2020 Summer Olympics?",
    df=df,
    document_embeddings=document_embeddings,
)

'Karate, sport climbing, surfing, and skateboarding were introduced as new events for the 2020 Summer Olympics.'