# Question Answering Using Embeddings
---
Now that we've scraped our mortgage knowledge from the CFPB, we are ready to ask questions. The responses will draw on all the information from the CFPB as well as the general knowledge the model already gained from its GPT-3 training.  
We will embed the text so that the passages most relevant to a question can be looked up quickly and passed to the completions model as context.

In [1]:
import openai
import pandas as pd
from transformers import GPT2TokenizerFast
import numpy as np
import tiktoken
import os
from ipywidgets import widgets
import warnings
# Suppress the "Unverified HTTPS request is being made to host ..." warning.
warnings.filterwarnings("ignore", message="Unverified HTTPS request is being made to host")
# NOTE(review): clearing CURL_CA_BUNDLE presumably turns off TLS certificate
# verification for the HTTP requests made from this notebook (which would
# explain the warning filter above) — confirm this is intentional and do not
# carry it over to a production environment.
os.environ["CURL_CA_BUNDLE"] = ""

In [2]:
# Define which models we are using.
# The commented-out lines are alternative completion models kept for quick switching.
# COMPLETION_MODEL = "text-ada-001" # cheapest model
# COMPLETION_MODEL = "text-babbage-001"
# COMPLETION_MODEL = "text-curie-001"
COMPLETION_MODEL = "text-davinci-003" # most advanced model

# Model used by get_embedding() for both the documents and the queries
EMBEDDING_MODEL = "text-embedding-ada-002"
# NOTE(review): TOKENIZER is not referenced anywhere in the visible code —
# confirm whether it is still needed.
TOKENIZER = GPT2TokenizerFast.from_pretrained("gpt2")
# Encoding used by construct_prompt() to count the tokens in the section separator
ENCODING = tiktoken.get_encoding("cl100k_base")

# You must have your own OpenAI api key and store it 
# as an environment variable (OPENAI_API_KEY) on your computer
openai.api_key = os.getenv("OPENAI_API_KEY")

In [3]:
def get_embedding(text):
    """
    Return the embedding that the configured OpenAI embedding
    model (EMBEDDING_MODEL) produces for the given text.
    """
    response = openai.Embedding.create(input=text, model=EMBEDDING_MODEL)
    # Only one input is sent, so the embedding is taken from the first data entry
    first_entry = response["data"][0]
    return first_entry["embedding"]

def compute_doc_embeddings(context_df):
    """
    Embed the 'content' value of every row of a dataframe.
    Returns a dictionary that maps each row's index label
    to the embedding of that row's content.
    """
    embeddings = {}
    for label, row in context_df.iterrows():
        embeddings[label] = get_embedding(row.content)
    return embeddings

def load_embeddings(filepath, index_names=None):
    """
    Read embeddings from a csv file and return them as a dictionary
    that maps each row's index label to that row's values as a list.
    index_names is handed to pandas as the column(s) to use as the index.
    """
    frame = pd.read_csv(filepath, index_col=index_names)
    return {label: list(row) for label, row in frame.iterrows()}

def documents_by_similarity(query, contexts):
    """
    Embed the query and score every document embedding against it
    with a dot product. Returns a list of (similarity, index) pairs
    ordered from most to least similar.
    """
    query_vector = get_embedding(query)
    scored = []
    for doc_id, doc_vector in contexts.items():
        scored.append((np.dot(query_vector, doc_vector), doc_id))
    # Pairs compare by similarity first, so a descending sort ranks best matches first
    scored.sort(reverse=True)
    return scored

In [4]:
def construct_prompt(question, context_embeddings, context_df, max_section_len=500):
    """
    Given a question, the context embeddings dictionary, and 
    the context dataframe, construct an appropriate prompt for
    a completion model.

    Passages are taken from the 'content' column of context_df in
    order of similarity to the question, and their sizes are read
    from the 'tokens' column (assumed to hold each passage's token
    count). Passages are added until the next one would push the
    running total past max_section_len; selection stops there.

    Returns the prompt string: instructions, the chosen context
    passages, and the question followed by an empty answer slot.
    """
    SEP = "\n* "
    # Each passage is prefixed with SEP, so its tokens count against the budget too
    separator_len = len(ENCODING.encode(SEP))

    # Rank the content that is most similar to the question
    ranked_docs = documents_by_similarity(question, context_embeddings)
    best_idxs = [idx for _, idx in ranked_docs]

    # Walk the passages from most to least similar until the budget is used up
    chosen_sections = []
    text_len = 0
    for _, row in context_df.reindex(index=best_idxs).iterrows():
        text_len += row.tokens + separator_len
        if text_len > max_section_len:
            break
        chosen_sections.append(SEP + row.content)

    # Create the prompt
    prompt = (
        "Answer this question truthfully using the provided context, "
        "and if the answer is not contained within the text below, "
        "say \"I don't know.\"\n\nContext:\n"
        f"{' '.join(chosen_sections)}\n\n Q: {question}\n A: "
    )
    return prompt


def answer_query_with_context(query, context_df, context_embeddings):
    """
    Build a context-backed prompt for the query, submit it to the
    OpenAI completion model, and return the answer text on one line.
    """
    MAX_TOKENS = 300  # upper bound passed to the completion request
    request = {
        "model": COMPLETION_MODEL,
        "prompt": construct_prompt(query, context_embeddings, context_df),
        "temperature": 0.0,
        "max_tokens": MAX_TOKENS,
    }
    completion = openai.Completion.create(**request)
    raw_answer = completion["choices"][0]["text"]
    # Flatten line breaks so the answer reads as a single line
    return raw_answer.replace("\n", " ").strip()

In [5]:
def start_mortybot(context_df_path, embedding_path, force_compute=False):
    """
    Load the file containing all of the context, then load or
    compute the embeddings for it.

    The embeddings file is reused when it exists, has as many rows
    as the context data, and force_compute is False. Otherwise the
    embeddings are computed from the context data and written to
    embedding_path, replacing any existing file.

    Returns a tuple of (context dataframe, embeddings dictionary).
    """
    # Load context df
    df = pd.read_csv(context_df_path)
    print("Loaded context")

    # Reuse the cached embeddings only if they match the context data
    if not force_compute and os.path.exists(embedding_path):
        emb = load_embeddings(embedding_path)
        if len(emb) == len(df):
            print("Loaded embeddings")
            return df, emb

    # Generate and save. The old file is only overwritten once the new
    # embeddings exist, so a failed computation does not destroy the cache.
    emb = compute_doc_embeddings(df)
    emb_df = pd.DataFrame.from_dict(emb, orient="index")
    emb_df.to_csv(embedding_path, index=False)
    print("Generated embeddings")
    return df, emb

In [6]:
# Try it out: load the mortgage context and its embeddings.
# With force_compute=False an existing embeddings file is reused
# when its size matches the context data.
context_df, context_embeddings = start_mortybot(
    context_df_path="../data/mortgage_context_text.csv", 
    embedding_path="../data/mortgage_text_embeddings.csv",
    force_compute=False
)

Loaded context
Loaded embeddings


In [7]:
# Ask a question and print the answer
# Text box for the question plus an output area that collects the printed answers
Q = widgets.Text(placeholder="Ask a mortgage question")
output = widgets.Output()
def print_my_question(q):
    """
    Callback for the text box: answer the question held in q.value
    using the loaded context and embeddings, then print the question
    and the answer inside the output area.
    """
    with output:
        A = answer_query_with_context(q.value, context_df, context_embeddings)
        print(f"Question:\n{q.value}\nAnswer:\n{A}\n")

# NOTE(review): `display` is not imported in this file — presumably it is
# supplied by the notebook environment; confirm before running as a script.
display(Q, output)
# NOTE(review): on_submit is reportedly deprecated in newer ipywidgets
# releases in favour of observe() — confirm against the installed version.
Q.on_submit(print_my_question)

Text(value='', placeholder='Ask a mortgage question')

Output()

### Example responses
**What does DTI stand for?**  
>Debt-to-Income Ratio

<br></br>

**Why am I asked for my social security number?**  
>Your Social Security number is requested so the lender can check your credit. 

<br></br>

**What is my name?**  
>I don't know.