In [46]:
import openai
import os

# You need to set your OpenAI API key as an environment variable
# You can find your API key here: https://beta.openai.com/account/api-keys
# If you do not have an API key, you can sign up for free here: https://beta.openai.com/signup
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
openai.api_key = OPENAI_API_KEY
DATASET_PATH="./dataset"
TOTAL_QUESTIONS_COUNT = 1
# For simplicity. Better to calculate tokens count for each text
MAX_CONTENT_LENGTH = 4096*2
TRANING_DATA_FILEPATH="./fine_tuning_dataset.jsonl"
PROMPT_END_SEPARATOR = "\n\n###\n\n"

In [47]:
import re

questions_answers = []

def extract_qa_from_content(content):
  blocks = re.split(r'\n\n', content)
  qa = []

  for block in blocks:
    question_match = re.search(r'Q:(.+)\n', block)
    answer_match = re.search(r'A:(.+)', block, re.DOTALL)
    if question_match and answer_match:
      question = question_match.group(1).strip()
      answer = answer_match.group(1).strip()
      qa.append((question, answer))
  
  return qa

def read_file(filename):
  with open(filename, "r") as f:
      content = f.read()
  return content

def generate_qa(filepath):
  article = read_file(filepath)[:MAX_CONTENT_LENGTH]
  content = f'''Content for {filepath}:
{article}

Instructions: Generate question and answer based on Content for {filepath}.
Structure it as:
Q: <question>
A: <answer>
'''

  questions_answers = []
  response = openai.ChatCompletion.create(
      model="gpt-3.5-turbo",
      messages=[
        {"role": "system", "content": "You are a helpful software developer who specialize in next.js and react."},
        {"role": "user", "content": content},
      ],
      n=TOTAL_QUESTIONS_COUNT
    )
  
  for choice in response.choices:
    qa = extract_qa_from_content(choice.message.content.strip())
    questions_answers.extend(qa)
  return questions_answers

for filename in os.listdir(DATASET_PATH):
  filepath = os.path.join(DATASET_PATH, filename)
  print(f"Processing '{filepath}'")
  try:
    new_questions_answers = generate_qa(filepath)
    questions_answers.extend(new_questions_answers)
    print(f"Generated {len(new_questions_answers)} questions and answers")
  except Exception as e:
    print(f"Error: {e}")

Processing ./dataset/Blog - Next.js 13  Next.js.md
Processing ./dataset/Getting started with NextUI and Next.js - LogRocket Blog.md
Processing ./dataset/Blog - Next.js 13.1  Next.js.md
Processing ./dataset/Blog - Next.js 13.2  Next.js.md
Processing ./dataset/NextAuth.js for client-side authentication in Next.js  LogRocket Blog.md
Processing ./dataset/Next.js 13 Working with the new app directory - LogRocket Blog.md
Processing ./dataset/The best new features in Next.js 13  InfoWorld.md


In [48]:
import json

def write_qa_to_fine_tuning_dataset(qa):
  data = [{"prompt": question + PROMPT_END_SEPARATOR, "completion": answer} for question, answer in qa]
  with open(TRANING_DATA_FILEPATH, 'a') as outfile:
    for i, item in enumerate(data):
      json.dump(item, outfile)
      if i < len(data) - 1:
        outfile.write('\n')

write_qa_to_fine_tuning_dataset(questions_answers)