In [23]:
import openai
import os

# The OpenAI API key is taken from the OPENAI_API_KEY environment variable;
# a missing variable raises KeyError right here.
# Existing keys are listed at https://platform.openai.com/account/api-keys
# No account yet? Sign up at https://platform.openai.com/signup
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
openai.api_key = OPENAI_API_KEY

# Directory whose entries are processed one by one.
DATASET_PATH = "./dataset"
# Number of completions requested per file (sent as `n` in the chat request).
TOTAL_QUESTIONS_COUNT = 1
# Cap, in characters, on how much of each file is sent to the model.
# This is a simplification: counting tokens per text would be more accurate.
MAX_CONTENT_LENGTH = 2 * 4096

In [24]:
import re

def extract_qa_from_content(content):
  """Extract ``(question, answer)`` pairs from ``Q:`` / ``A:`` formatted text.

  The text is cut into one chunk per question. A run of blank lines only ends
  a chunk when the line that follows it contains ``Q:``; any other blank line
  is kept inside the current chunk. This lets an answer span several
  paragraphs and lets a blank line sit between the ``Q:`` and ``A:`` lines.

  Args:
    content: Text expected to contain ``Q: <question>`` lines, each followed
      by an ``A: <answer>`` section.

  Returns:
    A list of ``(question, answer)`` tuples of whitespace-stripped strings,
    in order of appearance. Chunks lacking either a question line or an
    answer are skipped, so the list is empty when nothing matches.
  """
  # Split on blank lines only where the next line contains "Q:". Trailing
  # text after the last answer that has no "Q:" line is therefore treated as
  # part of that answer.
  blocks = re.split(r'\n\s*\n(?=[^\n]*Q:)', content)
  qa = []

  for block in blocks:
    # Question: the rest of the line after the first "Q:" (must end in "\n").
    question_match = re.search(r'Q:(.+)\n', block)
    # Answer: everything after the first "A:" up to the end of the chunk;
    # DOTALL lets it run across line breaks.
    answer_match = re.search(r'A:(.+)', block, re.DOTALL)
    if question_match and answer_match:
      question = question_match.group(1).strip()
      answer = answer_match.group(1).strip()
      qa.append((question, answer))

  return qa

def read_file(filename, encoding="utf-8"):
  """Return the entire text content of a file.

  Args:
    filename: Path of the file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so the
      result does not depend on the platform's locale encoding.

  Returns:
    The file's content as a single string.

  Raises:
    OSError: If the file cannot be opened.
    UnicodeDecodeError: If the bytes are not valid in ``encoding``.
  """
  with open(filename, "r", encoding=encoding) as f:
    return f.read()

def generate_qa(filepath):
  """Ask the chat model for question/answer pairs about one file.

  Reads the file, keeps at most MAX_CONTENT_LENGTH characters of it, embeds
  that text in a prompt requesting ``Q:`` / ``A:`` output, and parses every
  returned choice with extract_qa_from_content.

  Args:
    filepath: Path of the file to read; it is also named inside the prompt.

  Returns:
    A list of ``(question, answer)`` tuples gathered from all choices.
  """
  # Character-based truncation; see the note next to MAX_CONTENT_LENGTH.
  article = read_file(filepath)[:MAX_CONTENT_LENGTH]
  prompt = f'''Content for {filepath}:
{article}

Instructions: Generate question and answer based on Content for {filepath}.
Structure it as:
Q: <question>
A: <answer>
'''

  messages = [
    {"role": "system", "content": "You are a helpful software developer who specialize in next.js and react."},
    {"role": "user", "content": prompt},
  ]
  # `n` asks for several independent completions in a single request.
  completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=messages,
    n=TOTAL_QUESTIONS_COUNT,
  )

  # Flatten the pairs parsed from each choice into one list.
  return [
    pair
    for choice in completion.choices
    for pair in extract_qa_from_content(choice.message.content.strip())
  ]

# Generate Q&A for every regular file directly inside DATASET_PATH.
# os.listdir returns entries in arbitrary order; sorting keeps the processing
# order (and therefore the printed output) stable between runs.
for filename in sorted(os.listdir(DATASET_PATH)):
  filepath = os.path.join(DATASET_PATH, filename)
  # Skip sub-directories (e.g. .ipynb_checkpoints) and other non-file
  # entries: they cannot be read as text.
  if not os.path.isfile(filepath):
    continue
  print(f"Processing {filepath}")
  try:
    print(generate_qa(filepath))
  except Exception as e:
    # Best effort: report the failure and carry on with the next file.
    print(f"Error: {e}")

Processing ./dataset/Blog - Next.js 13  Next.js.md
[('What are the new features introduced in Next.js 13?', 'Next.js 13 introduced the following new features:\n- `app` Directory (beta) for easier, faster, and less client JS.\n- Turbopack (alpha), a rust-based Webpack replacement for up to 700x faster performance.\n- New `next/image` with native browser lazy loading for faster performance.\n- New `@next/font` (beta) with automated self-hosted fonts and zero layout shift.\n- Improved `next/link` with a simplified API and automatic `<a>`.'), ('Is the `app` Directory ready for production use?', 'The `app` Directory is currently in beta and not yet recommended for production use. You can use Next.js 13 with the stable `pages` directory and opt into the `app` directory at your own pace.'), ('What are the benefits of using the `app` Directory?', 'The `app` Directory offers several benefits such as:\n- Layouts that enable easy sharing of UI between routes while preserving state and avoiding ex