# Embedding FAQ questions for search

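This notebook shows how to prepare the WITS FAQ dataset for search: each question–answer pair is turned into a text string, embedded with an OpenAI embedding model, and saved to a CSV file. The dataset is used for the competition.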

ref: https://github.com/openai/openai-cookbook/blob/297c53430cad2d05ba763ab9dca64309cb5091e9/examples/Embedding_Wikipedia_articles_for_search.ipynb

In [8]:
# imports
import json
import os

import openai  # for generating embeddings
import pandas as pd  # for DataFrames to store article sections and embeddings
import tiktoken  # for counting tokens

In [9]:
# Credentials: the key is read from the environment so it never lives in the notebook.
openai.api_key = os.environ.get("OPENAI_WITS_API_KEY")
if not openai.api_key:
    raise Exception("OPENAI API Key is not set.")

# Tunables and paths used by the cells below.
MAX_TOKENS = 1600  # token budget; not referenced by the cells visible in this notebook
EMBEDDING_MODEL = "text-embedding-ada-002"  # model name passed to the embeddings request
BATCH_SIZE = 1000  # inputs per embeddings request (up to 2048 may be submitted at once)
SOURCE_DATA = "../data/FAQ_JOHN_REVIEW.xlsx"  # Excel workbook holding the FAQ rows

In [10]:
# Read the FAQ workbook into a DataFrame. The next cell reads its "Question",
# "Answer" and "Contact/Phone/Mail" columns.
df = pd.read_excel(io=SOURCE_DATA)

In [11]:
# Build one text blob per FAQ entry; these strings are what gets embedded.
#
# Empty spreadsheet cells arrive from pandas as NaN, and str(NaN) is the literal
# text "nan". Missing answer/contact cells are therefore left out instead of being
# stringified, so "nan" never ends up inside the embedded text.
faq_dict = {}
for _row_index, row in df.iterrows():
    question = row["Question"]
    if pd.isna(question):
        # Skip rows with no question text rather than emitting "Question: nan".
        continue
    answer_parts = [
        str(value)
        for value in (row["Answer"], row["Contact/Phone/Mail"])
        if pd.notna(value)
    ]
    # Keyed by question: if a question appears more than once, the last row wins.
    faq_dict[question] = "\n".join(answer_parts)

# One "Question: ...\nAnswer: ...\n\n" string per distinct question, in dict order.
faq_strings = [
    f"Question: {question}\nAnswer: {answer}\n\n"
    for question, answer in faq_dict.items()
]

In [12]:
# Embed the FAQ strings in slices of at most BATCH_SIZE inputs per request and
# collect the resulting vectors in input order.
embeddings = []
for batch_start in range(0, len(faq_strings), BATCH_SIZE):
    batch_end = batch_start + BATCH_SIZE
    print(f"Batch {batch_start} to {batch_end-1}")
    batch = faq_strings[batch_start:batch_end]
    response = openai.Embedding.create(model=EMBEDDING_MODEL, input=batch)
    results = response["data"]
    # Each result carries the index of the input it belongs to; make sure the
    # response order matches the order the inputs were sent in.
    for position, result in enumerate(results):
        assert position == result["index"]
    embeddings += [result["embedding"] for result in results]

# Pair every FAQ string with its vector, one row per entry.
# NOTE(review): this rebinds `df`, replacing the spreadsheet frame loaded earlier.
df = pd.DataFrame({"text": faq_strings, "embedding": embeddings})

Batch 0 to 999


In [13]:
# Persist the text/embedding table as CSV, leaving out the DataFrame index column.
embedding_path = "../data/embedding_v4.csv"
df.to_csv(path_or_buf=embedding_path, index=False)

In [19]:
# Sample webhook event (a text message sent in a group), kept as a raw JSON string
# and parsed into a dict. `json` is imported in the imports cell at the top of the
# notebook rather than here, so all dependencies are declared in one place.
# NOTE(review): the group/user IDs and reply token below look like real identifiers —
# confirm they are safe to publish, or replace them with placeholders.
body = """{
    "type":"message",
    "message":{
        "type":"text",
        "id":"18056354926880",
        "text":"也有可能"
    },
    "webhookEventId":"01GZ3MCSWVBF6FA48AH43VA14Z",
    "deliveryContext":{
        "isRedelivery":false
    },
    "timestamp":1682675492256,
    "source":{
        "type":"group",
        "groupId":"C3811ba6177abcfd26f584b22aa780e29",
        "userId":"U76c776db3485221021055ea82ed43a4a"
    },
    "replyToken":"80c4c3d5a68144a7b706700c0bd09461",
    "mode":"active"
}"""
# Decode the payload; JSON `false` becomes Python False and objects become dicts.
json_data = json.loads(body)

In [21]:
# Display the top-level "mode" field of the parsed payload.
json_data["mode"]

'active'