In [196]:
import numpy as np
import openai
import pandas as pd
import pickle
import tiktoken

# Model name passed as `engine` to openai.Completion.create in answer_query.
COMPLETIONS_MODEL = "text-davinci-003"
# Default model name used by get_embedding.
EMBEDDING_MODEL = "text-embedding-ada-002"
# NOTE(review): presumably the openai client reads the API key from the file at
# this path — confirm the file holds only the key and is kept out of version control.
openai.api_key_path = ".env"

In [197]:
def load_embeddings(filename: str, max_dim: int = 1535) -> dict[tuple[str, str, str], list[float]]:
    """
    Read the document embeddings and their keys from a CSV.

    filename is the path to a CSV that has at least these named columns:
        "title", "mistress", "synopsis", "0", "1", ... up to str(max_dim).
    Any other columns are ignored.

    max_dim is the highest embedding-column index to read, so each vector has
    max_dim + 1 components. The default (1535) matches the value that used to
    be hard-coded here.

    Returns a dict mapping (title, mistress, synopsis) to that row's embedding
    as a list of floats. If two rows share the same key, the later row wins.

    Raises KeyError if any of the required columns is missing.
    """
    df = pd.read_csv(filename)

    # Select all embedding columns in one go rather than doing max_dim + 1
    # label lookups per row inside an iterrows() loop.
    dim_columns = [str(i) for i in range(max_dim + 1)]
    vectors = df[dim_columns].to_numpy().tolist()

    keys = zip(df["title"], df["mistress"], df["synopsis"])
    return dict(zip(keys, vectors))

In [198]:
# Load the pre-computed embeddings, keyed by (title, mistress, synopsis).
document_embeddings = load_embeddings("k9_story_vectors_500.csv")

In [199]:
def get_embedding(text: str, model: str=EMBEDDING_MODEL) -> list[float]:
    """
    Request an embedding for `text` from the OpenAI Embedding endpoint.

    Returns the "embedding" entry of the first item in the response's "data" list.
    """
    response = openai.Embedding.create(input=text, model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

In [200]:
def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Compute the dot product of two vectors and return it as their similarity.

    OpenAI embeddings are normalized to length 1, so for them the dot product
    equals the cosine similarity.
    """
    left = np.array(x)
    right = np.array(y)
    return left.dot(right)


In [201]:
def order_document_sections_by_query_similarity(query: str, contexts: dict[tuple[str, str, str], list[float]]) -> list[tuple[float, tuple[str, str, str]]]:
    """
    Embed the supplied query and compare it against every pre-calculated
    document embedding to find the most relevant sections.

    query is the text to embed.
    contexts maps each section key (a 3-tuple of strings) to its embedding.

    Returns a list of (similarity, key) pairs sorted by relevance in
    descending order. An empty `contexts` yields an empty list.
    """
    query_embedding = get_embedding(query)

    scored_sections = [
        (vector_similarity(query_embedding, doc_embedding), doc_index)
        for doc_index, doc_embedding in contexts.items()
    ]
    # The pairs are compared as whole tuples: similarity first, and on an exact
    # tie the keys themselves are compared.
    scored_sections.sort(reverse=True)

    return scored_sections

In [202]:
# Show the five highest-scoring sections for a sample question.
order_document_sections_by_query_similarity("What happened when you visited the beach?", document_embeddings)[:5]

[(0.8296421904618271,
  ('The Leisure Hive',
   'Romana',
   'The Doctor Master and Mistress decided to take a holiday to Brighton. The sea air is nice and the sights are interesting. My excitement was quickly washed away when I began to chase a ball and lost my traction. Before I knew it, I was submerged in cold seawater and I began to spark and explode. The Doctor Master and Mistress were horrified and tried their best to contain the damage. To cheer themselves up they continued their holiday on Argolis at the Leisure Hive.')),
 (0.778293563658361,
  ('The Stones of Blood',
   'Romana',
   'As we arrived in modern-day Cornwall, Doctor Master and mistress began their search for the third segment of the Key to Time. Doctor Master and mistress soon encountered Professor Emilia Rumford and her friend Vivien Fay, studying the "Nine Travellers" standing stones in Boscombe Moor. ')),
 (0.7729829344988937,
  ('Full Circle',
   'Romana',
   'I located the TARDIS in a cave, where the Marshmen 

In [203]:
# Budget for the context assembled by build_prompt, which measures the text
# with len() on strings, i.e. in characters.
MAX_SECTION_LEN = 500
# Text placed between context sections by build_prompt.
SEPARATOR = "\n"
# Name of the tiktoken encoding used to count separator tokens below.
# NOTE(review): tiktoken's model table may list a different encoding
# (p50k_base) for text-davinci-003 — confirm before relying on token counts.
ENCODING = "gpt2"  # encoding for text-davinci-003

encoding = tiktoken.get_encoding(ENCODING)
# Number of tokens SEPARATOR occupies under ENCODING.
separator_len = len(encoding.encode(SEPARATOR))

# Last expression of the cell, so the notebook displays it.
f"Context separator contains {separator_len} tokens"

'Context separator contains 1 tokens'

In [318]:
def build_prompt(question: str, context_embeddings: dict) -> str:
    """
    Build the completion prompt for `question` from the most relevant synopses.

    Sections are taken in descending order of similarity to the question and
    appended to the context until the next one would push it past
    MAX_SECTION_LEN. The budget is measured in characters throughout: the
    length of the context so far, of the candidate synopsis, and of SEPARATOR.
    Newlines in the context are flattened to spaces, so every included synopsis
    ends up preceded by a single space.

    question is the user's question; it is embedded to rank the sections and
    appended to the prompt after the context.
    context_embeddings maps (story, mistress, synopsis) keys to embeddings.

    Returns the instruction header, the context, and the "Q: ... A:" suffix as
    one string. If even the best-ranked synopsis exceeds the budget, the
    context is empty.
    """
    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "Insufficient data." Always use the word "affirmative" instead of "yes" and "negative" instead of "no".\n\nContext:\n"""

    context = ""
    ranked_sections = order_document_sections_by_query_similarity(question, context_embeddings)
    for _score, (_story, _mistress, synopsis) in ranked_sections:
        syn_length = len(synopsis)
        # Keep every term in characters; the token-based separator_len does not
        # belong in a sum of len() values.
        if len(context) + syn_length + len(SEPARATOR) > MAX_SECTION_LEN:
            # NOTE(review): bare debug print of the rejected synopsis length,
            # kept because the recorded cell outputs show it; consider removing.
            print(syn_length)
            break
        # Append separator + synopsis. The previous code prepended the separator
        # to the whole buffer and glued consecutive synopses together with
        # nothing between them.
        context = (context + SEPARATOR + synopsis).replace("\n", " ")

    return header + context + "\n\n Q: " + question + "\n A:"
    

In [324]:
# Build the prompt for a sample question against the loaded embeddings.
prompt = build_prompt(
    "Have you ever been to a marsh?",
    document_embeddings
)

469


In [325]:
# Display the assembled prompt string.
prompt

'Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "Insufficient data." Always use the word "affirmative" instead of "yes" and "negative" instead of "no".\n\nContext:\n As we attempted to travel to Gallifrey, the TARDIS travelled through a Charged Vacuum Emboitement and landed on the tropical planet of Alzarius in E-Space. Once there I followed the Doctor Master to a swamp, where we encountered the aggressive Marshmen. While the Doctor and Mistress Romana attempted to maneuver the situation, I followed the Marshmen and kept watch. As I continued to trail the Marshmen, I eventually reached a river bed that I was unable to cross so I returned to the TARDIS.\n\n Q: Have you ever been to a marsh?\n A:'

In [326]:
def answer_query(query):
    """
    Answer `query` with the completions model, using context drawn from the
    module-level `document_embeddings`.

    Builds the prompt with build_prompt, sends it to openai.Completion.create,
    and returns the "text" of the first choice in the response.
    """
    full_prompt = build_prompt(query, document_embeddings)
    completion_params = {
        "engine": COMPLETIONS_MODEL,
        "temperature": 0.0,
        "max_tokens": 300,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    }
    completion = openai.Completion.create(prompt=full_prompt, **completion_params)
    first_choice = completion["choices"][0]
    return first_choice["text"]

In [327]:
# Ask the completions model the sample question.
answer = answer_query("Have you ever been to a marsh?")

469


In [328]:
# Display the text returned by the model.
answer

' Affirmative.'