In [80]:
# imports
import ast  # for converting embeddings saved as strings back to arrays
import openai  # for calling the OpenAI API
import pandas as pd  # for storing text and embeddings data
import tiktoken  # for counting tokens
from scipy import spatial  # for calculating vector similarities for search


# models
# Embedding model used to embed search queries in strings_ranked_by_relatedness.
# NOTE(review): presumably the same model that produced the stored embeddings
# in the CSV -- confirm, since vectors from different models are not comparable.
EMBEDDING_MODEL = "text-embedding-ada-002"
# Chat model used by default for token counting (num_tokens) and answering (ask).
GPT_MODEL = "gpt-3.5-turbo"

In [81]:
# Location of the embeddings CSV that is searched by the cells below.
embeddings_path = "../data/embadding_v1.csv"

# The CSV stores each embedding as a string; parse every cell back into a
# Python list while loading, so `df` is ready for similarity search.
df = pd.read_csv(embeddings_path).assign(
    embedding=lambda frame: frame["embedding"].map(ast.literal_eval)
)

In [82]:
# search function
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100
) -> tuple[list[str], list[float]]:
    """Rank the texts in `df` by how related they are to `query`.

    Args:
        query: Text to search for; it is embedded with EMBEDDING_MODEL.
        df: DataFrame with a "text" column and an "embedding" column holding
            one embedding vector per row.
        relatedness_fn: Callable taking (query_embedding, row_embedding) and
            returning a score where larger means more related. Defaults to
            cosine similarity.
        top_n: Maximum number of results to return.

    Returns:
        A pair of lists (strings, relatednesses), both sorted from most to
        least related and truncated to `top_n`. Both lists are empty when
        `df` has no rows.
    """
    # Nothing to rank: return early instead of paying for an embedding call
    # and then failing on unpacking an empty zip().
    if len(df) == 0:
        return [], []

    query_embedding_response = openai.Embedding.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response["data"][0]["embedding"]

    # Iterate the two columns directly; this avoids building a Series per row
    # as DataFrame.iterrows() does.
    scored = [
        (text, relatedness_fn(query_embedding, embedding))
        for text, embedding in zip(df["text"], df["embedding"])
    ]
    # list.sort is stable, so rows with equal scores keep their original order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    top = scored[:top_n]
    return [text for text, _ in top], [score for _, score in top]

In [83]:
# Example: show the five FAQ entries most related to the query "特休",
# each preceded by its relatedness score.
strings, relatednesses = strings_ranked_by_relatedness(query="特休", df=df, top_n=5)
for relatedness, string in zip(relatednesses, strings):
    print(f"{relatedness=:.3f}")
    display(string)

relatedness=0.821


'Question: 我想知道特休假的規定？\nAnswer: 同仁於公司服務滿一定期間後，依法令規定以周年制給與特休假。同仁得以小時為單位申請：\n 1.半年以上一年未滿者，給假三日。\n 2. 一年以上未滿兩年者，給假七日。\n 3.兩年以上未滿三年者，給假十日。\n 4.三年以上未滿五年者，給假十四日。\n 5.五年以上未滿十年者，給假十五日。\n 6.十年以上者，每一年加給一日，加至三十日為止。\nYui/#8836/YuiWong@wistronits.com\n\n'

relatedness=0.817


'Question: 我的特休展延假天數沒用完怎麼辦？\nAnswer: 遞延假指的是特休假的展延，若於展延後一年仍未休完，會直接轉換為代金\nYui/#8836/YuiWong@wistronits.com\n\n'

relatedness=0.816


'Question: 我的特休假天數不夠用，該怎麼辦？\nAnswer: 改請個人假、事假，若您所有假期都請完，卻有不可抗因素一定要請假，請與主管及人資部溝通詢問\nYui/#8836/YuiWong@wistronits.com\n\n'

relatedness=0.813


'Question: 教召期間如適逢休息日或例假，公司是否會另給予補假？\nAnswer: 目前教召公假認定是針對工作日，如適逢休息日或例假，公司將不另給補假\nYui/#8836/YuiWong@wistronits.com\n\n'

relatedness=0.811


'Question: 我想知道事假的規定？\nAnswer: 有請假需求，又無特休或個人假可以使用時，可以以小時為單位提出申請，一年14天，不給薪\nYui/#8836/YuiWong@wistronits.com\n\n'

In [84]:
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Return the number of tokens in `text` for the given model.

    Args:
        text: String to tokenize.
        model: Model name used to select the tiktoken encoding.

    Returns:
        The length of the encoded token sequence.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken raises KeyError for model names it has no mapping for
        # (e.g. newly released or custom-named models). Fall back to
        # cl100k_base, the encoding used by the gpt-3.5-turbo family, so
        # token budgeting still works instead of crashing.
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))


def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Build the user prompt for GPT: instructions, FAQ excerpts, then the question.

    FAQ texts are appended in order of relatedness to `query` until adding the
    next one (together with the trailing question) would exceed `token_budget`
    tokens as counted by `num_tokens` for `model`.
    """
    ranked_strings, _ = strings_ranked_by_relatedness(query, df)
    header = '運用以下的FAQ來回答問題，並附上聯絡人資訊。如果無法利用FAQ來回答問題，請回答：很抱歉，我無法回答以上問題，請聯絡8855'
    question = f"\n\nQuestion: {query}"

    message = header
    for faq_text in ranked_strings:
        candidate = message + f'\n\nFAQ:\n"""\n{faq_text}\n"""'
        # Stop at the first excerpt that does not fit; later ones are less related.
        if num_tokens(candidate + question, model=model) > token_budget:
            break
        message = candidate
    return message + question

def try_answer_query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Return a fallback prompt for GPT that contains no FAQ context.

    The message is a fixed instruction followed by the question itself.

    Args:
        query: The user's question.
        df: Unused; accepted so the signature parallels `query_message`.
        model: Unused; accepted so the signature parallels `query_message`.
        token_budget: Unused; accepted so the signature parallels `query_message`.

    Returns:
        The instruction text followed by "\\n\\nQuestion: <query>".
    """
    # No similarity search here: the result was never used in the prompt, so
    # running it only cost an extra embedding API call.
    introduction = '請以社工的角度溫柔地回答問題'
    question = f"\n\nQuestion: {query}"
    return introduction + question


def ask(
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
    try_answer: bool = False
) -> str:
    """Answer a query using GPT and a dataframe of relevant texts and embeddings.

    Args:
        query: The user's question.
        df: DataFrame of FAQ texts and embeddings to search.
        model: Chat model used for both the FAQ answer and the fallback answer.
        token_budget: Maximum number of tokens allowed for the prompt built by
            `query_message`.
        print_message: When True, print the prompt before sending it.
        try_answer: When True and the FAQ-grounded reply is the refusal
            sentence, make a second, FAQ-free request and append its reply.

    Returns:
        The model's reply, optionally followed by the fallback attempt.
    """
    # The prompt built by query_message asks the model to reply with exactly
    # this sentence, without a trailing "。". Matching on the sentence without
    # the final punctuation catches the reply whether or not the model adds it.
    refusal_marker = '很抱歉，我無法回答以上問題，請聯絡8855'

    message = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(message)
    messages = [
        {"role": "system", "content": "請回答有關FAQ的問題"},
        {"role": "user", "content": message},
    ]
    # temperature=0 keeps the FAQ-grounded answer as deterministic as possible.
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0
    )
    response_message = response["choices"][0]["message"]["content"]

    if try_answer and refusal_marker in response_message:
        # Fallback: ask again with only the raw question and no FAQ context.
        try_answer_questions = [
            {"role": "system", "content": "以公司發言人的角度回答問題"},
            {"role": "user", "content": query},
        ]
        try_answer_response = openai.ChatCompletion.create(
            model=model,
            messages=try_answer_questions,
            temperature=0.8
        )
        try_answer_message = try_answer_response["choices"][0]["message"]["content"]
        response_message += f"\n嘗試解決您的問題: {try_answer_message}"
    return response_message

In [88]:
# Demo of the fallback path: with try_answer=True, a refusal from the
# FAQ-grounded request triggers a second request that sends only the raw
# question (no FAQ context, temperature 0.8), and its reply is appended.
# NOTE(review): the appended text is not grounded in the FAQ -- verify it
# before relying on it.
ask('緯創軟體股票代碼', try_answer=True)

'很抱歉，我無法回答以上問題，請聯絡8855。\n嘗試解決您的問題: 緯創軟體的股票代碼為3669。'