In [309]:
import ast
import json

import openai
import pandas as pd
import tiktoken

import constants

# Configure the OpenAI client with the key exposed by the imported `constants` module.
openai.api_key = constants.OPENAI_KEY

In [310]:
# Show full cell text instead of an ellipsis-truncated preview.
pd.options.display.max_colwidth = None
# Allow up to 150 rows before pandas collapses the rendered frame.
pd.options.display.max_rows = 150

In [311]:
# Token budget for one translation batch: populate_questions flushes the current
# batch once its running estimate would reach this value.
ARBITARY_MAX_TOKENS = 3500
# Reply tokens allowed per question: get_questions multiplies this by the batch size
# to set `max_tokens`, and populate_questions uses it in its batch-size estimate.
ARBITARY_QUESTION_TOKENS = 120

In [312]:
# Resume from an earlier run when the partially translated sample is already on
# disk; otherwise build a fresh sample from the raw question dump.
try:
    df_sample = pd.read_csv("questions_swedish.csv")
except FileNotFoundError:
    # Only a missing cache file means "first run". Any other failure (corrupt CSV,
    # permission problem, interrupt) must surface instead of silently rebuilding
    # the sample and discarding translations that were already produced.
    df = pd.read_csv("questions.csv")
    df = df[['qid1', 'question1']]
    df = df.rename(columns={'qid1': 'id', 'question1': 'question'})

    # Remove duplicates
    df = df.drop_duplicates(subset=['id'])

    # Placeholder column; populate_questions fills in rows that are still NA.
    df['question_swedish'] = pd.NA

    # Take 4000 random rows from the dataframe (or every row if there are fewer);
    # the fixed seed keeps the sample reproducible.
    df_sample = df.sample(n=min(4000, len(df)), random_state=1)

df_sample.head()

Unnamed: 0,id,question,question_swedish
0,776806,What is unusual or different about the food and cuisine in Slovakia?,Vad är ovanligt eller annorlunda med maten och köket i Slovakien?
1,339546,How do I speak English like celebrities?,Hur talar jag engelska som kändisar?
2,8139,Is being friendly a competitive advantage for engineers?,Är att vara vänlig en konkurrensfördel för ingenjörer?
3,39487,Who designed BHIM app?,Vem designade BHIM-appen?
4,94535,What do I do if I want to kill myself but don't have the courage?,Vad gör jag om jag vill döda mig själv men inte har modet?


In [313]:
# Tokenizer that tiktoken maps to gpt-3.5-turbo; read as a global by the token-counting helpers.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

# https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def calculate_tokens_conversation(messages):
    """Estimate how many tokens a list of chat messages occupies.

    Args:
        messages: Iterable of message dicts whose values are strings
            (e.g. ``{"role": ..., "content": ...}``).

    Returns:
        The sum of the encoded length of every value, plus 4 tokens of framing
        per message, minus 1 for each message carrying a ``"name"`` key, plus
        2 tokens for the primed assistant reply.
    """
    total = 0
    for message in messages:
        # Fixed framing cost: <im_start>{role/name}\n{content}<im_end>\n
        total += 4
        total += sum(len(encoding.encode(text)) for text in message.values())
        if "name" in message:
            # When a name is present the role is omitted, and the role is one token.
            total -= 1
    # Every reply is primed with <im_start>assistant.
    return total + 2

def calculate_tokens_string(text):
    """Return the number of tokens `text` encodes to with the module-level `encoding`."""
    token_ids = encoding.encode(text)
    return len(token_ids)

In [314]:
def get_questions(questions: dict) -> dict:
    """Translate a batch of questions to Swedish with gpt-3.5-turbo.

    Args:
        questions: Mapping of question id to the question text to translate.

    Returns:
        Mapping of question id (converted to ``int``) to the translated text
        returned by the model. An empty input yields an empty dict without
        calling the API.

    Raises:
        json.JSONDecodeError: If the reply can be parsed neither as JSON nor as
            a Python literal.
        ValueError: If a key in the reply cannot be converted to ``int``.
    """
    if not questions:
        # Nothing to translate; avoids a pointless request with max_tokens=0.
        return {}

    # Plain string (it has no placeholders) with the backslash escaped explicitly:
    # the runtime text is unchanged, but "\e" is no longer an invalid escape sequence.
    instructions = """Detta system kommer att översätta användarens frågor till svenska
    
    Returnera alla frågorna i ordning.
    
    Svaret ska vara formatterat som en python dictionary där nyckeln är frågans id och värdet är frågan på svenska. 
    
    Nycklarna har datatype python string. Nycklarna ska alltså omges av dubbla citattecken (""), undvik JSONDecodeError: Expecting property name enclosed in double quotes.
    
    Undvik JSONDecodeError: Invalid \\escape.
    
    Svaret ska gå att laddas in som en dictionary med eval().

    """

    # One "Fråga <id>:" header per question so the model can key its answer by id.
    questions_string = "".join(
        "Fråga " + str(question_id) + ":\n" + question_text + "\n\n"
        for question_id, question_text in questions.items()
    )

    prompt = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": questions_string},
    ]

    # Reply budget scales with the number of questions in the batch.
    max_token_answer = len(questions) * ARBITARY_QUESTION_TOKENS

    response = openai.ChatCompletion.create(
        model = "gpt-3.5-turbo",
        messages = prompt,
        temperature = 0.9,
        max_tokens = max_token_answer,
        stream = False,
    )

    answer_string = response["choices"][0]["message"]["content"]

    try:
        answer = json.loads(answer_string)
    except json.JSONDecodeError as json_error:
        # The prompt asks for a Python dict, so the reply may be valid Python but
        # not valid JSON (e.g. single-quoted strings). literal_eval only accepts
        # literals, so unlike eval() it cannot execute code from the reply.
        try:
            answer = ast.literal_eval(answer_string)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Keep the exception type existing callers would have seen.
            raise json_error from None

    # Transform dict keys to int
    return {int(key): value for key, value in answer.items()}

In [315]:
def handle_batch_of_questions(df_local: pd.DataFrame, questions: dict):
    """Translate one batch and persist the result.

    Sends `questions` through get_questions, writes each returned translation
    into the "question_swedish" column of the rows in `df_local` whose "id"
    matches, then saves the whole frame to "questions_swedish.csv".

    Args:
        df_local: Frame with "id" and "question_swedish" columns; modified in place.
        questions: Mapping of question id to the text to translate.
    """
    translations = get_questions(questions)

    for translated_id in translations:
        row_mask = df_local["id"] == translated_id
        df_local.loc[row_mask, "question_swedish"] = translations[translated_id]

    # Save after every batch so completed work is on disk before the next request.
    df_local.to_csv("questions_swedish.csv", index=False)

In [316]:
def populate_questions(df_local: pd.DataFrame):
    """Translate every row of `df_local` that has no Swedish question yet.

    Rows whose "question_swedish" is missing are grouped into batches that stay
    under ARBITARY_MAX_TOKENS (question tokens plus a reply allowance of
    ARBITARY_QUESTION_TOKENS per question). Each full batch is handed to
    handle_batch_of_questions, which updates `df_local` and saves it to disk.

    Args:
        df_local: Frame with "id", "question" and "question_swedish" columns.
    """
    total_data_points = len(df_local)

    added_questions = {}
    tokens_questions = 0
    for current_data_point, (_, row) in enumerate(df_local.iterrows(), start=1):
        if not pd.isna(row["question_swedish"]):
            # Already translated in this or an earlier run.
            continue

        tokens_question = calculate_tokens_string(row["question"])

        # Reserve reply tokens for the candidate question too, not only for the
        # questions already in the batch; otherwise the batch can overshoot the
        # budget by one reply allowance.
        reply_reserve = (len(added_questions) + 1) * ARBITARY_QUESTION_TOKENS
        if tokens_questions + tokens_question + reply_reserve < ARBITARY_MAX_TOKENS:
            added_questions[row["id"]] = row["question"]
            tokens_questions += tokens_question
            continue

        # The candidate does not fit: flush what has been collected. The guard
        # prevents sending an empty batch when a single question alone is too large.
        if added_questions:
            handle_batch_of_questions(df_local, added_questions)
            print(f"Processed {current_data_point} of {total_data_points} questions. ({current_data_point / total_data_points * 100:.2f} %)")

        # Start the next batch with the question that did not fit.
        added_questions = {row["id"]: row["question"]}
        tokens_questions = tokens_question

    # Flush the final, partially filled batch.
    if added_questions:
        handle_batch_of_questions(df_local, added_questions)

In [317]:
populate_questions(df_sample)