In [68]:
import openai
import os

# The OpenAI API key is read from the OPENAI_API_KEY environment variable,
# so it must be set before this cell runs.
# Existing keys are listed here: https://beta.openai.com/account/api-keys
# No key yet? You can sign up for free here: https://beta.openai.com/signup
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
openai.api_key = OPENAI_API_KEY

# Folder with the source articles, and the JSONL file the fine-tuning records go to.
DATASET_PATH = "./dataset"
TRANING_DATA_FILEPATH = "./fine_tuning_dataset.jsonl"

# Number of completions requested per article and per "random" prompt.
TOTAL_QUESTIONS_COUNT_PER_RECORD = 25
RANDOM_QUESTIONS_COUNT = 5

# Character cap applied to each article before it is sent to the model.
# Kept simple on purpose; counting tokens for each text would be more accurate.
MAX_CONTENT_LENGTH = 3 * 4096

# Suffixes appended to every prompt and completion in the fine-tuning records.
PROMPT_END_SEPARATOR = "\n\n###\n\n"
COMPLETION_END_SEPARATOR = " END"

In [60]:
import re

def extract_qa_from_content(content):
    """Extract (question, answer) pairs from tagged model output.

    The text is scanned once, in document order, for ``<question>...</question>``
    and ``<answer>...</answer>`` elements. Each answer is paired with the most
    recent question that has not been answered yet, so a question that never
    gets an answer (or an answer with no question before it) is dropped instead
    of shifting every later pair onto the wrong partner.

    Args:
        content: Text that may contain <question>/<answer> elements.

    Returns:
        A list of ``(question, answer)`` tuples with surrounding whitespace
        stripped, in order of appearance. Empty when nothing matches.
    """
    qa = []
    pending_question = None

    # The backreference \1 forces the closing tag to match the opening one.
    for element in re.finditer(r'<(question|answer)>(.+?)</\1>', content, re.DOTALL):
        tag = element.group(1)
        text = element.group(2).strip()
        if tag == "question":
            # A newer question replaces an earlier one that was never answered.
            pending_question = text
        elif pending_question is not None:
            qa.append((pending_question, text))
            pending_question = None
        # An answer with no pending question is ignored.

    return qa

def read_file(filename, encoding="utf-8"):
  """Return the full text content of a file.

  Args:
    filename: Path of the file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so the
      result does not depend on the platform's locale encoding.

  Returns:
    The file content as a single string.
  """
  with open(filename, "r", encoding=encoding) as f:
    return f.read()

def generate_qa(filepath):
  """Generate question/answer pairs about one article with the chat model.

  The file is read and cut to MAX_CONTENT_LENGTH characters, embedded in a
  prompt that asks for <question>/<answer> tagged output, and sent to
  gpt-3.5-turbo with TOTAL_QUESTIONS_COUNT_PER_RECORD completions requested.

  Args:
    filepath: Path of the article file; it is also quoted inside the prompt.

  Returns:
    A flat list of (question, answer) tuples gathered from all completions.
  """
  article_text = read_file(filepath)[:MAX_CONTENT_LENGTH]
  user_prompt = f'''Content for {filepath}:
{article_text}

Instructions: Create a series of questions and answers based on Content ({filepath}). Include code blocks if possible. For example:
<question>How can I create application routes in Next.js?</question>
<answer>Creating routes inside `app/` requires a single file, `page.js`:
// app/page.js
// This file maps to the index route (/)
export default function Page() {{
  return <h1>Hello, Next.js!</h1>;
}}</answer>
'''

  chat_messages = [
    {"role": "system", "content": "You are a helpful software developer who specialize in next.js and react."},
    {"role": "user", "content": user_prompt},
  ]
  response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=chat_messages,
    n=TOTAL_QUESTIONS_COUNT_PER_RECORD,
  )

  # Every completion is parsed on its own; the pairs are concatenated in choice order.
  return [
    pair
    for choice in response.choices
    for pair in extract_qa_from_content(choice.message.content.strip())
  ]

In [61]:
# User prompt passed to generate_random_qa: asks the model for questions unrelated
# to Next.js, each answered with a statement that it is not related to Next.js.
# The example uses the <question>/<answer> tags that extract_qa_from_content parses.
NON_NEXTJS_Q_A_PROMPT = """Create a series of random questions and answers that are not related to the Next.js framework.
Each question should be followed by a clear answer stating that it is not relevant to Next.js. For example:

<question>What is the capital of Ukraine?</question>
<answer>This question is not related to Next.js.</answer>
<question>What is Spring Framework?</question>
<answer>It is not related to Next.js.</answer>

Feel free to generate any type of questions you like, as long as the answer indicates that it is not related to the Next.js framework."""

# User prompt passed to generate_random_qa: asks the model for questions whose
# answer is unknown, each answered with an "I do not know" style reply, using the
# same <question>/<answer> tags.
RANDOM_Q_A_PROMPT = """Create a series of random questions and answers we do not know answers for. For example:

<question>Who is the strongest man in the world?</question>
<answer>I do not know.</answer>
<question>When will the Next.js framework reach 1 billion users?</question>
<answer>I do not know.</answer>

Feel free to generate any type of questions you like, as long as the answer indicates that you do not know."""

def generate_random_qa(prompt):
  """Generate question/answer pairs from a free-form prompt.

  The prompt is sent as the user message to gpt-3.5-turbo with
  RANDOM_QUESTIONS_COUNT completions requested, and each completion is parsed
  for <question>/<answer> tagged pairs.

  Args:
    prompt: User message describing which questions and answers to produce.

  Returns:
    A flat list of (question, answer) tuples gathered from all completions.
  """
  chat_messages = [
    {"role": "system", "content": "You are a helpful software developer who specialize in next.js and react."},
    {"role": "user", "content": prompt},
  ]
  response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=chat_messages,
    n=RANDOM_QUESTIONS_COUNT,
  )

  # Parse every completion and concatenate the pairs in choice order.
  return [
    pair
    for choice in response.choices
    for pair in extract_qa_from_content(choice.message.content.strip())
  ]

In [72]:
import json
import random

def write_qa_to_fine_tuning_dataset(qa):
  """Append question/answer pairs to the fine-tuning dataset as JSON Lines.

  Duplicate pairs are removed and the remaining ones are shuffled. Each pair
  becomes one JSON object of the form
  {"prompt": question + PROMPT_END_SEPARATOR,
   "completion": " " + answer + COMPLETION_END_SEPARATOR}
  written on its own line of TRANING_DATA_FILEPATH.

  Args:
    qa: Iterable of hashable (question, answer) string tuples.
  """
  unique_pairs = list(set(qa))
  random.shuffle(unique_pairs)

  with open(TRANING_DATA_FILEPATH, 'a', encoding="utf-8") as outfile:
    for question, answer in unique_pairs:
      record = {
        "prompt": question + PROMPT_END_SEPARATOR,
        "completion": " " + answer + COMPLETION_END_SEPARATOR,
      }
      json.dump(record, outfile)
      # The file is opened in append mode, so every record, including the last
      # one, must end its line; otherwise the next call would glue its first
      # record onto this line and break the JSONL format.
      outfile.write('\n')

In [63]:
# Collects every (question, answer) pair generated in this cell.
questions_answers = []

# Article-based pairs: one generation request per entry in the dataset folder.
# A failure on one entry is reported and the loop moves on to the next.
for filename in os.listdir(DATASET_PATH):
  filepath = os.path.join(DATASET_PATH, filename)
  print(f"Processing '{filepath}'")
  try:
    new_questions_answers = generate_qa(filepath)
    questions_answers += new_questions_answers
    print(f"Generated {len(new_questions_answers)} questions and answers")
  except Exception as e:
    print(f"Error: {e}")

# Pairs that are not tied to an article: first the "not related to Next.js"
# prompt, then the "I do not know" prompt. Failures are reported the same way.
for prompt in (NON_NEXTJS_Q_A_PROMPT, RANDOM_Q_A_PROMPT):
  try:
    new_questions_answers = generate_random_qa(prompt)
    questions_answers += new_questions_answers
    print(f"Generated {len(new_questions_answers)} questions and answers")
  except Exception as e:
    print(f"Error: {e}")

write_qa_to_fine_tuning_dataset(questions_answers)

Processing './dataset/Next.js 13 complete guide to Server Components and the App Directory.md'
Generated 154 questions and answers
Processing './dataset/Blog - Next.js 13  Next.js.md'
Generated 149 questions and answers
Processing './dataset/Getting started with NextUI and Next.js - LogRocket Blog.md'
Generated 88 questions and answers
Processing './dataset/Blog - Next.js 13.1  Next.js.md'
Generated 122 questions and answers
Processing './dataset/Blog - Next.js 13.2  Next.js.md'
Generated 102 questions and answers
Processing './dataset/NextAuth.js for client-side authentication in Next.js  LogRocket Blog.md'
Generated 109 questions and answers
Processing './dataset/Full Stack Web App with Next.js 13 and the app Directory  by Lukas Wimhofer  Feb, 2023  Level Up Coding.md'
Generated 141 questions and answers
Processing './dataset/Next.js 13 Working with the new app directory - LogRocket Blog.md'
Generated 172 questions and answers
Processing './dataset/Blog  How to create a Next.js 13 & 