In [1]:
import numpy as np
import openai
from openai.embeddings_utils import distances_from_embeddings
import pandas as pd
from pdb import set_trace

In [2]:
import faiss
# Load the prebuilt FAISS index from "knn.index". The flags request
# memory-mapped, read-only IO. `create_context` reads this module-level
# `index` directly, so this cell must run before that function is called.
index = faiss.read_index("knn.index", faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY)

In [3]:
# Mapping table used for retrieval: `create_context` selects rows of the frame
# it is given by the positions FAISS returns and reads their "source" and
# "row" columns. This `df` is also the default `df` argument of `answer_question`.
df = pd.read_csv("mappings.csv")

In [4]:
def create_context(
    question: str, df: pd.DataFrame, max_len: int = 1800, size: str = "ada", top_k: int = 5
) -> str:
    """
    Build a context string for a question from its nearest neighbours in the FAISS index.

    The question is embedded with the OpenAI 'text-embedding-ada-002' engine, the
    module-level FAISS ``index`` is searched for the ``top_k`` closest vectors, and
    each returned position is looked up in ``df`` to find the passage text on disk.

    Parameters
    ----------
    question : str
        The question to embed and search for.
    df : pandas.DataFrame
        Mapping table addressed by FAISS position. Must provide a "source" column
        (file name under "embeddings/") and a "row" column (row position in that file).
    max_len : int, optional
        Accepted for interface compatibility; not used by this implementation.
    size : str, optional
        Accepted for interface compatibility; not used by this implementation.
    top_k : int, optional
        Number of neighbours requested from the index (default 5).

    Returns
    -------
    str
        The "combined" text of every hit, in ranking order, joined by
        "\\n\\n###\\n\\n". Empty string when the index returns no valid hit.
    """

    # Get the embeddings for the question
    q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding']

    # FAISS works on float32 vectors; a plain list of Python floats would
    # otherwise become a float64 array. [None] adds the batch axis.
    query = np.asarray(q_embeddings, dtype="float32")[None]
    _, labels = index.search(query, top_k)

    # FAISS pads the label row with -1 when fewer than top_k neighbours exist.
    # Passing -1 to .iloc would silently select the LAST mapping row, so drop them.
    positions = [int(label) for label in labels[0] if label >= 0]
    context_sources = df.iloc[positions]

    # Several hits often live in the same file: read each source only once per call.
    loaded = {}
    returns = []
    for source, row in zip(context_sources["source"], context_sources["row"]):
        if source not in loaded:
            loaded[source] = pd.read_csv("embeddings/" + source)
        returns.append(loaded[source]["combined"].iloc[int(row)])

    # Return the context
    return "\n\n###\n\n".join(returns)

In [10]:
def answer_question(
    question="Am I allowed to publish model outputs to Twitter, without a human review?",
    df=df,
    model="text-davinci-003",
    max_len=1800,
    size="ada",
    debug=False,
    max_tokens=150,
    stop_sequence=None
):
    """
    Answer a question with a completion model, grounded in retrieved context.

    The context is obtained from `create_context(question, df, ...)` and embedded
    in a fixed prompt that tells the model to answer only from that context.

    Parameters:
        question: the question text placed in the prompt.
        df: mapping DataFrame forwarded to `create_context`.
        model: completion model name passed to `openai.Completion.create`.
        max_len, size: forwarded unchanged to `create_context`.
        debug: when true, print the retrieved context before querying the model.
        max_tokens: completion length limit passed to the API.
        stop_sequence: optional stop sequence passed to the API.

    Returns:
        The stripped text of the first completion choice, or "" if building or
        sending the completion request raised (the exception is printed).
        Errors raised while building the context are NOT caught here.
    """
    context = create_context(question, df, max_len=max_len, size=size)

    if debug:
        # Show the retrieved context that is about to be placed in the prompt
        print("Context:\n" + context)
        print("\n\n")

    try:
        prompt = f"Answer the question based on the context below, If got multiple answers, answer all questions split by |, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:"
        # Deterministic decoding: temperature 0, no sampling penalties
        request = dict(
            prompt=prompt,
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=stop_sequence,
            model=model,
        )
        completion = openai.Completion.create(**request)
        first_choice = completion["choices"][0]
        return first_choice["text"].strip()
    except Exception as err:
        # Best-effort: report the failure and fall back to an empty answer
        print(err)
        return ""

In [11]:
# Sample query: only the question is overridden; every other argument of
# answer_question (DataFrame, model, token limits) keeps its default.
out = answer_question("What is the cpi Food_beverage for jan 2022")

In [12]:
# Display the answer string returned by answer_question ("" means the request failed).
out

'141.0'