In [1]:

# System prompt for the Q&A generation request. It asks the model for three to
# ten customer-support style question/answer pairs drawn only from the supplied
# text, returned as a JSON list of {"question": ..., "answer": ...} objects.
# Note: this is a regular (non-raw) triple-quoted string, so every \' below
# reaches the model as a plain single quote character.
prompt = '''
Please write three to ten (depending on the amount of information provided) questions and answers based on the text given by the user. The questions should be something a customer of a fintech company would ask from its customer support. The answers should be clear, short and concise and to the point. Return the questions and answers in a json format. Example json:
[{"question": "How much money can I transfer to Estonia given the current \'situation\' ?", "answer":"A private person can transfer to one million EUR to an Euro account."}]
Please note the required \' \' around the word situation. These are needed in order to deserialize the JSON correctly later on.
Please use only the info provided in the given text to construct a question and answer it.
Return each question and answer pair in a json object. Put the json objects into a json list.
'''

In [9]:
# --- Imports: standard library first, then third-party ---
import json
import os
import time

import openai
import tiktoken
from openai.error import RateLimitError

# --- Configuration ---
MODEL = "gpt-4"
INDEX_PATH = "scraped-data/index.json"
MAX_ATTEMPTS = 10          # give up on one article's API call after this many tries
RETRY_DELAY_SECONDS = 10   # pause between failed attempts

tokenizer = tiktoken.encoding_for_model(MODEL)

# Credentials must not live in the notebook: read the key from the environment.
# Falls back to an empty string (the previous hard-coded value) when unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

# --- Generate Q&A pairs for every article that does not have them yet ---
# The index is re-saved after each article, so an interrupted run can simply be
# restarted: articles that already carry a 'QA' entry are skipped.
with open(INDEX_PATH, "r") as f:
    data = json.load(f)

counter = 0

for section in data:
    for subsection in section["subsections"]:
        for article in subsection["articles"]:
            counter += 1
            if article.get('QA') is not None:
                continue
            print(f'Processing {counter}')
            with open(article['folder_path'] + '/content.md', 'r') as f:
                article_md = f.read()
            # NOTE(review): both context fields come from the section, not from
            # the subsection or the article itself — confirm this is intended.
            items = [section["title"], section["heading"], article_md]
            content = '\n\n'.join(items)
            messages = [{"role": "system", "content": prompt}, {"role": "user", "content": content}]

            # Bounded retry: an unbounded loop would spin forever on permanent
            # failures (bad credentials, exhausted quota, oversized request).
            # On the final attempt the error is re-raised so the run stops
            # visibly; progress made so far is already on disk.
            for attempt in range(1, MAX_ATTEMPTS + 1):
                try:
                    chat = openai.ChatCompletion.create(
                        model=MODEL,
                        messages=messages
                    )
                    break
                except RateLimitError:
                    if attempt == MAX_ATTEMPTS:
                        raise
                    print(f'Rate limit error, waiting {RETRY_DELAY_SECONDS} seconds')
                    time.sleep(RETRY_DELAY_SECONDS)
                except Exception as e:
                    if attempt == MAX_ATTEMPTS:
                        raise
                    print(e)
                    time.sleep(RETRY_DELAY_SECONDS)

            reply = chat.choices[0].message.content
            try:
                QA = json.loads(reply)
            except json.JSONDecodeError as e:
                # One malformed reply should not abort the whole batch. The
                # article keeps no 'QA' entry, so the next run retries it.
                print(f'Skipping {counter}: reply is not valid JSON ({e})')
                continue
            article['QA'] = QA

            # Write to a temporary file and swap it in, so an interrupt during
            # the dump cannot leave a truncated index behind.
            tmp_path = INDEX_PATH + '.tmp'
            with open(tmp_path, 'w') as f:
                json.dump(data, f, indent=2)
            os.replace(tmp_path, INDEX_PATH)
            print(f'Done {counter}')
            time.sleep(1/10)



In [29]:
# Collect the distinct `id` value of every entry in ff['data']
# (a single entry is reachable as ff['data'][0]['id']).
# NOTE(review): `ff` is not defined in any visible cell — presumably an API
# listing response loaded earlier; confirm and define it before this cell.

set_of_ids = {entry['id'] for entry in ff['data']}

set_of_ids

{'ada',
 'ada-code-search-code',
 'ada-code-search-text',
 'ada-search-document',
 'ada-search-query',
 'ada-similarity',
 'ada:2020-05-03',
 'babbage',
 'babbage-code-search-code',
 'babbage-code-search-text',
 'babbage-search-document',
 'babbage-search-query',
 'babbage-similarity',
 'babbage:2020-05-03',
 'code-davinci-edit-001',
 'code-search-ada-code-001',
 'code-search-ada-text-001',
 'code-search-babbage-code-001',
 'code-search-babbage-text-001',
 'curie',
 'curie-instruct-beta',
 'curie-search-document',
 'curie-search-query',
 'curie-similarity',
 'curie:2020-05-03',
 'cushman:2020-05-03',
 'davinci',
 'davinci-if:3.0.0',
 'davinci-instruct-beta',
 'davinci-instruct-beta:2.0.0',
 'davinci-search-document',
 'davinci-search-query',
 'davinci-similarity',
 'davinci:2020-05-03',
 'gpt-3.5-turbo',
 'gpt-3.5-turbo-0301',
 'gpt-4',
 'gpt-4-0314',
 'if-curie-v2',
 'if-davinci-v2',
 'if-davinci:3.0.0',
 'text-ada-001',
 'text-ada:001',
 'text-babbage-001',
 'text-babbage:001',
 'tex

In [30]:
# Count how many distinct ids ended up in set_of_ids.
len(set_of_ids)

66