In [88]:
import os
import numpy as np
import pandas as pd

# Reusing the OpenAI Cookbook notebook [Question answering using embeddings](https://github.com/openai/openai-cookbook/blob/main/examples/Question_answering_using_embeddings.ipynb)


In [89]:
import numpy as np
import openai
import pandas as pd
import pickle
import tiktoken

# Model name passed to the completion call that produces the final answer.
COMPLETIONS_MODEL = "text-davinci-003"
# Default model used by get_embedding for both document sections and questions.
EMBEDDING_MODEL = "text-embedding-ada-002"

We preprocess the document sections by creating an embedding vector for each section. An embedding is a vector of numbers that helps us understand how semantically similar or different two texts are. The closer two embeddings are to each other, the more similar their contents are. See the documentation on OpenAI embeddings for more information.

This indexing stage can be executed offline and only runs once to precompute the indexes for the dataset so that each piece of content can be retrieved later. Since this is a small example, we will store and search the embeddings locally. If you have a larger dataset, consider using a vector search engine like Pinecone or Weaviate to power the search.

In [90]:
# Load the two parquet files and left-join them on their shared "hash" column,
# so every row of `ct` is kept even when `pg` has no matching hash.
ct = pd.read_parquet("data/content.parquet.gzip")
pg = pd.read_parquet("data/pages.parquet.gzip")
df = ct.merge(pg,on="hash",how="left")
# Keep a copy of the hash as a regular column before it becomes the index.
df["IDX"] = df["hash"]
df = df.set_index("hash")
# Keep only sections with more than 50 tokens.
df = df[df.tokens > 50]
df

Unnamed: 0_level_0,content,tokens,url,title,description,keywords,IDX
hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
a8414fff120d9c5f9073376b27f52dcf,"Mott MacDonald divisional director, Anne Kerr,...",111,https://www.mottmac.com/events/kpmg-global-pow...,KPMG Global Power and Utilities Conference - M...,,,a8414fff120d9c5f9073376b27f52dcf
3b1ca0a60662c15c1cd6e6fa0206f4a0,A consortium including Cheung Kong Infrastruct...,341,https://www.mottmac.com/article/103/mott-macdo...,Mott MacDonald advises CKI and United Utilitie...,,,3b1ca0a60662c15c1cd6e6fa0206f4a0
fb558366bffeb1834d6fea64b41236ba,"Mott MacDonald, the global engineering, manage...",503,https://www.mottmac.com/article/552/mott-macdo...,Mott MacDonald chosen for four-country African...,,,fb558366bffeb1834d6fea64b41236ba
b0b1a2444987ebf09f8fe22a170cc6dd,We were proud to be platinum sponsor of The Ec...,584,https://www.mottmac.com/article/72168/turning-...,Turning climate ambition into action - Mott Ma...,,,b0b1a2444987ebf09f8fe22a170cc6dd
e8f484cc2df87052b1617e79b3dee16a,Mott MacDonald offers a wide variety of traffi...,98,https://www.mottmac.com/en-US/transportation-p...,Traffic engineering - Mott MacDonald,,,e8f484cc2df87052b1617e79b3dee16a
...,...,...,...,...,...,...,...
24438270208be5df0f6c89e56b2b7926,"Mott MacDonald, assisted by Rand Europe, has b...",416,https://www.mottmac.com/article/154/mott-macdo...,Mott MacDonald to update Department for Transp...,,,24438270208be5df0f6c89e56b2b7926
7f41c6131fb54bc4522d9f760dee5c34,"Phil Vigor, principal airport planner The stea...",523,https://www.mottmac.com/views/use-no-build-sol...,Use no-build solutions to optimise airport cap...,"The steady pace of globalisation, improved pro...","air travel,airport,airport capacity,no-build",7f41c6131fb54bc4522d9f760dee5c34
d81b9d188f526726d84381fa7b041bff,Additional growth potential has been identifie...,140,https://www.mottmac.com/article/3368/tana-and-...,"Tana and Beles growth corridor strategy, Ethio...",,"World bank,Strategy,economic,strengthening,Gro...",d81b9d188f526726d84381fa7b041bff
8397cf71f1d5a56f2815b41bf2a61088,Mott MacDonald and ADP have been appointed by ...,559,https://www.mottmac.com/releases/mott-macdonal...,Mott MacDonald and ADP to restore Spanish City...,Mott MacDonald and ADP have been appointed to ...,"ADP,Heritage Lottery Fund,HLF,North Tyneside C...",8397cf71f1d5a56f2815b41bf2a61088


In [91]:
# Drop rows whose (title, content) pair already appeared earlier; the first occurrence is kept.
df = df[~df.duplicated(subset=["title","content"])]

In [92]:
# Save the filtered, de-duplicated frame to disk.
df.to_parquet("data/all_pages.parquet.gzip")

In [93]:
def get_embedding(text: str, model: str=EMBEDDING_MODEL) -> list[float]:
    """
    Request an embedding for `text` from the OpenAI Embeddings API.

    Sends a single input to `openai.Embedding.create` with the given model and
    returns the "embedding" field of the first item in the response's "data" list.
    """
    response = openai.Embedding.create(model=model, input=text)
    first_item = response["data"][0]
    return first_item["embedding"]

def compute_doc_embeddings(df: pd.DataFrame) -> dict[str, list[float]]:
    """
    Create an embedding for each row in the dataframe using the OpenAI Embeddings API.

    One API call is made per row, on the row's "content" value.

    Return a dictionary that maps the index label of each row to the embedding
    vector computed for that row. If the index contains duplicate labels, the
    last row with a given label wins.
    """
    # Iterate over the "content" column directly: iterrows() would build a full
    # Series for every row only to read a single field from it.
    return {idx: get_embedding(content) for idx, content in df["content"].items()}

In [94]:
def load_embeddings(fname: str) -> dict[str, list[float]]:
    """
    Read the document embeddings and their keys from a gzip-compressed CSV.

    fname is the path to a CSV with exactly these named columns:
        "hash", "0", "1", ... up to the length of the embedding vectors.

    Returns a dictionary mapping each row's "hash" value to its embedding, a list
    of floats ordered by column "0", "1", ... regardless of the column order in
    the file. If a hash appears on several rows, the last row wins.

    Raises ValueError if the file has no embedding column (or a non-numeric column
    name other than "hash"), and KeyError if one of the columns "0".."max" is missing.
    """
    df = pd.read_csv(fname, header=0, compression="gzip")
    max_dim = max(int(c) for c in df.columns if c != "hash")

    # Pick the embedding columns once, in numeric order, and convert the whole
    # block in one go instead of doing a Python-level lookup per row and dimension.
    dim_columns = [str(i) for i in range(max_dim + 1)]
    vectors = df[dim_columns].to_numpy().tolist()

    return dict(zip(df["hash"], vectors))

In [95]:
# Point the openai client at a local key file rather than embedding the key in the notebook.
# NOTE(review): make sure APIKEY.txt is excluded from version control.
openai.api_key_path = "APIKEY.txt"

In [125]:
# Compute the embeddings in packs of `slices` rows, one CSV per pack, so an
# interrupted run can resume without paying again for packs already on disk.
# NOTE: a pack is identified only by its number k; if the rows of `df` change,
# delete the existing pack files, otherwise stale embeddings are reused.
slices = 250
nbIter = len(df)//slices

# Create the output directory up front; otherwise the first to_csv fails only
# after the (slow, billed) embedding calls for that pack have completed.
os.makedirs("data/embeddings", exist_ok=True)

for k in range(nbIter+1):
    filename = "data/embeddings/"+str(k)+".csv.gzip"
    if os.path.isfile(filename):
        print("Pack",k,"done")
        continue

    DF = df[slices*k:slices*(k+1)]
    if not len(DF):
        # The last pack is empty when len(df) is an exact multiple of `slices`.
        continue

    doc_embeddings = compute_doc_embeddings(DF)
    # 2 mins for 456 elements
    # One row per document: the index label first, then one column per dimension.
    dfEmbed = pd.DataFrame(doc_embeddings).T.reset_index()
    dfEmbed.to_csv(filename,index=False,compression="gzip")

Pack 0 done
Pack 1 done
Pack 2 done
Pack 3 done
Pack 4 done
Pack 5 done
Pack 6 done


In [128]:
import glob

# Read every per-pack embeddings file and merge them into a single table.
# The paths are sorted so the row order of the merged file is reproducible;
# glob itself returns files in an unspecified order.
embedding_frames = [
    pd.read_csv(file, header=0, compression="gzip")
    for file in sorted(glob.glob("data/embeddings/*.gzip"))
]
# Named all_embeddings (not `all`) so the builtin all() stays usable in later cells.
all_embeddings = pd.concat(embedding_frames)
# Rename the first column to "hash" by position; load_embeddings looks it up under that name.
all_embeddings.columns = ["hash", *all_embeddings.columns[1:]]
print(len(all_embeddings),"articles found.")
all_embeddings.to_csv("data/embedding.csv.gzip",index=False,compression="gzip")
# Takes 50s to load and save 3k7 articles

3782 articles found.


In [129]:
# Load the merged embeddings file into a dict keyed by document hash.
document_embeddings = load_embeddings("data/embedding.csv.gzip")
# Takes 13s for 456 elements
# Takes 40s for ~3.7k elements

# Searching corpus for our questions

In [130]:
def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Score how similar two vectors are using their dot product.

    OpenAI embeddings are normalized to length 1, so for them the dot product
    equals the cosine similarity.
    """
    left = np.array(x)
    right = np.array(y)
    return left.dot(right)

def order_document_sections_by_query_similarity(query: str, contexts: dict[(str, str), np.array]) -> list[(float, (str, str))]:
    """
    Embed the query and score it against every pre-calculated document embedding.

    Returns a list of (similarity, document key) pairs, highest similarity first.
    """
    query_vector = get_embedding(query)

    scored = []
    for doc_key, doc_vector in contexts.items():
        scored.append((vector_similarity(query_vector, doc_vector), doc_key))

    # Descending tuple sort: similarity decides, the document key breaks exact ties.
    scored.sort(reverse=True)
    return scored

In [131]:
# Sanity check: show the five highest-scoring (similarity, hash) pairs for a sample question.
order_document_sections_by_query_similarity("What is Mott MacDonald?", document_embeddings)[:5]

[(0.8926961676936163, '7e91bff4250a179a5e23fe10be90560a'),
 (0.8916346624607806, 'f4c1d60ee32436c008f88a17ffaea6e2'),
 (0.8900219527055997, '2fc1eac005d11151543ad30a49ca3cb1'),
 (0.8897770956647693, 'c4855cb00f46aafdd864c2e7bc2f74e4'),
 (0.8891484619503782, 'ebcc0ed56eab7716aea8cc7542f8a3d9')]

# Now that we can rank sections by relevance, build the prompt

In [132]:
# Token budget for the context part of the prompt (section tokens plus separators).
MAX_SECTION_LEN = 500
# Prefix put in front of every section added to the context.
SEPARATOR = "\n* "
ENCODING = "cl100k_base"  # encoding for text-embedding-ada-002

encoding = tiktoken.get_encoding(ENCODING)
# Token cost of one separator; construct_prompt adds it to the budget for every section.
separator_len = len(encoding.encode(SEPARATOR))

# Bare expression: shown as the cell's output.
f"Context separator contains {separator_len} tokens"

'Context separator contains 3 tokens'

In [133]:
def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> str:
    """
    Build the completion prompt for a question from the most relevant document sections.

    Sections are ranked by similarity to the question and appended, best first,
    while the running total of their `tokens` plus the separator cost stays within
    MAX_SECTION_LEN. Selection stops at the first section that does not fit.

    `df` must be indexed by the keys of `context_embeddings` and provide the
    `tokens` and `content` columns. The number of selected sections and their
    keys are printed as diagnostics.

    Returns the instruction header, the selected sections, and the question
    followed by an answer marker.
    """
    ranked_sections = order_document_sections_by_query_similarity(question, context_embeddings)

    picked_texts = []
    picked_keys = []
    tokens_used = 0

    for _score, key in ranked_sections:
        row = df.loc[key]

        # Charge the section against the budget first; stop as soon as it overflows.
        tokens_used += row.tokens + separator_len
        if tokens_used > MAX_SECTION_LEN:
            break

        flattened = row.content.replace("\n", " ")
        picked_texts.append(SEPARATOR + flattened)
        picked_keys.append(str(key))

    # Useful diagnostic information
    print(f"Selected {len(picked_texts)} document sections:")
    print("\n".join(picked_keys))

    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""
    context_block = "".join(picked_texts)

    return header + context_block + "\n\n Q: " + question + "\n A:"

NameError: name 'df' is not defined

In [135]:
# Build the prompt for a sample question and print it to inspect the selected context.
prompt = construct_prompt(
    "Who is Mott MacDonald ?",
    document_embeddings,
    df
)

print("===\n", prompt)

Selected 3 document sections:
4b6acb0e7c1982e29fdd95ba27f422cc
ff814dd231d9a4993f6ea5dd4e38b80b
7e91bff4250a179a5e23fe10be90560a
===
 Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."

Context:

* Mott MacDonald is a recognized leader in the planning, design, and management of telematics. We offer immense experience delivering and managing data storage and networking solutions, variable message signing, adaptive signal system, and user information systems that involve open architecture, common protocols, integration with fleet management systems, and IT support.
* Mott MacDonald in North America is a practice-driven organization, led by a network of national and regional practice leaders who are experienced professionals. We are an equal opportunity employer that offers competitive benefits and is committed to developing the career of each employee. In 2014, Mott MacDonald was honored 

# Now onto prod!

In [136]:
# Keyword arguments unpacked into the openai.Completion.create call.
COMPLETIONS_API_PARAMS = dict(
    # A temperature of 0.0 gives the most predictable, factual answer.
    temperature=0.0,
    max_tokens=300,
    model=COMPLETIONS_MODEL,
)

In [137]:
def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict[(str, str), np.array],
    show_prompt: bool = False
) -> str:
    """
    Answer a question using the most relevant document sections as context.

    Builds the prompt with construct_prompt, optionally prints it when
    `show_prompt` is true, sends it to the completion endpoint with
    COMPLETIONS_API_PARAMS, and returns the text of the first choice with
    surrounding spaces and newlines stripped.
    """
    prompt = construct_prompt(query, document_embeddings, df)

    if show_prompt:
        print(prompt)

    completion = openai.Completion.create(prompt=prompt, **COMPLETIONS_API_PARAMS)
    first_choice = completion["choices"][0]

    return first_choice["text"].strip(" \n")

In [143]:
# End-to-end example: retrieve context for the question, query the completion model, and print the answer.
query = "Who is Cathy Travers?"
answer = answer_query_with_context(query, df, document_embeddings)

print(f"\nQ: {query}\nA: {answer}")

Selected 1 document sections:
e4aead92eed17577047f7d86c9154688

Q: Who is Cathy Travers?
A: Cathy Travers is the general manager of Mott MacDonald's UK and Europe built environment business. She began her career with Husband and Company nearly 30 years ago as a chartered engineer, before joining Mott MacDonald in 1990 following the consultancy’s acquisition of Husband and Company. Since then, she has held a range of operational and business development roles, most recently acting as development director for Mott MacDonald’s UK and Europe regional business.
