# Embedding FAQ questions for search

This notebook shows how to prepare the WITS FAQ dataset for search by embedding each FAQ entry. The resulting dataset is used for the competition.

ref: https://github.com/openai/openai-cookbook/blob/297c53430cad2d05ba763ab9dca64309cb5091e9/examples/Embedding_Wikipedia_articles_for_search.ipynb

In [25]:
# imports
import openai  # for generating embeddings
import pandas as pd  # for DataFrames to store article sections and embeddings
import tiktoken  # for counting tokens
import os

In [26]:
# --- Credentials ---------------------------------------------------------
# The key comes from the environment so it is never stored in the notebook.
openai.api_key = os.environ.get("OPENAI_WITS_API_KEY")
if not openai.api_key:
    raise Exception("OPENAI API Key is not set.")

# --- Input data ----------------------------------------------------------
SOURCE_DATA = '../data/FAQ_JOHN_REVIEW.xlsx'

# --- Model settings ------------------------------------------------------
EMBEDDING_MODEL = "text-embedding-ada-002"  # model used to embed every FAQ entry
BATCH_SIZE = 1000  # inputs per embeddings request (up to 2048 are accepted)
GPT_MODEL = "gpt-3.5-turbo"
MAX_TOKENS = 1600

In [27]:
# Load the raw FAQ spreadsheet into a DataFrame.
df = pd.read_excel(io=SOURCE_DATA)

## Prepare English version (optional)

In [28]:
# NOTE: this whole cell is commented out (optional English translation step).
# Review before re-enabling:
#   - the prompts interpolate {k} and {v}, but the loop only binds `index` and `row`;
#   - df_eng is rebuilt and the CSV rewritten on every iteration of the loop.
# faq_eng = []
# for index, row in df.iterrows():
#     print(row)
#     response_q = openai.Completion.create(
#         model="text-davinci-003",
#         prompt=f"Translate this into English:\n\n{k}\n\n1.",
#         temperature=0.3,
#         max_tokens=100,
#         top_p=1.0,
#         frequency_penalty=0.0,
#         presence_penalty=0.0
#     )
#     response_ans = openai.Completion.create(
#         model="text-davinci-003",
#         prompt=f"Translate this into English:\n\n{v}\n\n1.",
#         temperature=0.3,
#         max_tokens=100,
#         top_p=1.0,
#         frequency_penalty=0.0,
#         presence_penalty=0.0
#     )
#     q = response_q["choices"][0]["text"]
#     a = response_ans["choices"][0]["text"]
#     faq_eng.append({'Question': q, "Answer": a})
#     df_eng = pd.DataFrame(faq_eng)
#     df_eng.to_csv('../data/FAQ_ENG_CHATGPT.csv', index=False)
#     print(f"Finish translating {index}")
#     print(q)
#     print(a)

In [29]:
# Build one "Question / Answer" text per FAQ entry; these texts are what gets embedded.
# Entries are keyed by question, so a repeated question keeps only its last answer.
faq_dict = {}
for _, row in df.iterrows():
    if pd.isna(row.Question):
        continue  # a row without a question cannot be matched by a search
    # Empty spreadsheet cells are read as NaN. Drop them instead of letting
    # str() turn them into the literal text "nan" inside the embedded string.
    parts = [
        str(cell)
        for cell in (row.Answer, row["Contact/Phone/Mail"])
        if pd.notna(cell)
    ]
    faq_dict[row.Question] = "\n".join(parts)

faq_strings = [f"Question: {k}\nAnswer: {v}\n\n" for k, v in faq_dict.items()]

In [70]:
# Embed the FAQ texts in batches of BATCH_SIZE and collect one vector per text,
# in the same order as faq_strings.
embeddings = []
for batch_start in range(0, len(faq_strings), BATCH_SIZE):
    # Clamp the end so a short final batch is reported with its real bounds
    # rather than indices that do not exist.
    batch_end = min(batch_start + BATCH_SIZE, len(faq_strings))
    batch = faq_strings[batch_start:batch_end]
    print(f"Batch {batch_start} to {batch_end-1}")
    response = openai.Embedding.create(model=EMBEDDING_MODEL, input=batch)
    for i, be in enumerate(response["data"]):
        assert i == be["index"]  # double check embeddings are in same order as input
    batch_embeddings = [e["embedding"] for e in response["data"]]
    embeddings.extend(batch_embeddings)

# NOTE: this rebinds `df` (previously the raw FAQ sheet) to the text/embedding table.
df = pd.DataFrame({"text": faq_strings, "embedding": embeddings})

Batch 0 to 999


In [71]:
# Persist the text/embedding table as CSV, without the row index.
embadding_path = "../data/embadding_v1.csv"
df.to_csv(path_or_buf=embadding_path, index=False)