In [77]:
from openai import OpenAI
import os
from datasets import load_dataset
from langchain.text_splitter import RecursiveCharacterTextSplitter
import math
import json
import re
import tqdm
import pandas as pd


# Create the API client; the endpoint and key are read from the URL and KEY
# environment variables (either may be None when the variable is unset).
client = OpenAI(
    base_url=os.environ.get("URL"),
    api_key=os.environ.get("KEY"),
)

In [15]:
# Load the "nickeldime/khouryprogram" dataset. The generation cell below reads
# the "content" and "url" fields of each row in its "train" split.
khoury_dataset = load_dataset("nickeldime/khouryprogram")

In [None]:
# Generate question-answer pairs for every page in khoury_dataset["train"].
#
# For each page: split the text into chunks, group the chunks, ask the model
# for a share of the page's question budget per group, parse the fenced JSON in
# each reply, and write the resulting pairs to output/<index>.md. Anything that
# goes wrong for a page is written to output/<index>.errors.md, a separate file
# so that the final per-page write cannot erase the report.

CHUNKS_PER_GROUP = 8      # chunks sent to the model in a single request
QUESTIONS_PER_PAGE = 20   # question budget shared out across a page's groups
MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"
OUTPUT_DIR = "output"

# Load the prompt template from its markdown file.
# NOTE(review): `prompt` is not referenced again in this cell; confirm whether
# another cell needs it before removing this read.
with open("template-for-qa.md", "r") as f:
    prompt = f.read()


def _build_conversation(page_text, num_questions):
    """Return the chat messages requesting `num_questions` QA pairs about `page_text`.

    The page text is sent as an assistant message and the instructions as the
    following user message. The count is interpolated as a bare number.
    """
    return [
        {"role": "assistant", "content": page_text},
        {"role": "user",
         "content": f'''Generate {num_questions} detailed question-answer pairs about the content above that a prospective student might ask. Focus on key concepts, techniques, and best practices discussed in the content. Ensure the answers are comprehensive and informative.
                Return the results in JSON format, with each pair containing a "question" field and an "answer" field. Enclose the entire JSON output in triple backticks with the "json" language specifier.
                Example format:
                ```json
                [{{"question": "How are you?",
                 "answer": "I am doing well!"}}, 
                 ...]
                 ```'''},
    ]


def _extract_qa_pairs(reply):
    """Parse the fenced json block of a model reply and return it as a list.

    Raises:
        ValueError: if the reply contains no fenced json block, the block is
            not valid JSON (json.JSONDecodeError is a ValueError subclass), or
            the decoded value is not a list.
    """
    json_match = re.search(r'```json\s*([\s\S]*?)\s*```', reply or "")
    if json_match is None:
        raise ValueError("No JSON found in answer")
    parsed = json.loads(json_match.group(1))
    if not isinstance(parsed, list):
        raise ValueError(f"Expected a JSON list, got {type(parsed).__name__}")
    return parsed


# Every per-page file is written under OUTPUT_DIR, so make sure it exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# The splitter settings are the same for every page, so build it once.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512, chunk_overlap=50, length_function=len
)

all_questions_answers = []
train_pages = khoury_dataset["train"]

for i in tqdm.tqdm(range(len(train_pages))):
    page = train_pages[i]  # fetch the row once instead of once per field
    khoury_page = page["content"]
    url = page["url"]

    # Problems met while processing this page; written out once at the end.
    problems = []

    chunks = text_splitter.split_text(khoury_page)

    # Group consecutive chunks CHUNKS_PER_GROUP at a time; the last group may
    # hold fewer.
    chunk_list = [
        chunks[start:start + CHUNKS_PER_GROUP]
        for start in range(0, len(chunks), CHUNKS_PER_GROUP)
    ]

    # Each group gets a share of the budget proportional to its chunk count.
    # Shares are rounded up, so every group asks for at least one question and
    # the page total can exceed QUESTIONS_PER_PAGE.
    questions_needed = [
        math.ceil(len(group) / len(chunks) * QUESTIONS_PER_PAGE)
        for group in chunk_list
    ]

    # One request per chunk group; a failed request is recorded and skipped.
    answers = []
    for group, num_questions in zip(chunk_list, questions_needed):
        conversation = _build_conversation("".join(group), num_questions)
        try:
            resp = client.chat.completions.create(
                messages=conversation,
                model=MODEL_NAME,
                temperature=0)
            answers.append(resp.choices[0].message.content)
        except Exception as e:
            print("Error with", url, e)
            problems.append(str(e))

    # Extract question-answer pairs from the JSON in each reply.
    answers_for_this_page = []
    for answer in answers:
        try:
            parsed_pairs = _extract_qa_pairs(answer)
        except ValueError as e:
            # Skip this reply entirely: nothing from a failed parse (or from a
            # previous reply) may be added to the page's results.
            print("Error with", url, e)
            problems.append(f"{e}\n\n{answer}")
            continue

        for qa_pair in parsed_pairs:
            # Keep only pairs that carry both fields the writer below needs.
            if isinstance(qa_pair, dict) and "question" in qa_pair and "answer" in qa_pair:
                answers_for_this_page.append(qa_pair)
            else:
                problems.append(f"Malformed QA pair: {qa_pair!r}")

    all_questions_answers.append({"url": url, "qa_pairs": answers_for_this_page})

    # Save this page's pairs.
    with open(f"{OUTPUT_DIR}/{i}.md", "w") as f:
        f.write(f"# {url}\n\n")
        for qa_pair in answers_for_this_page:
            f.write(f"Question: {qa_pair['question']}\nAnswer: {qa_pair['answer']}\n\n")

    # Save this page's problem reports, if any, next to its results.
    if problems:
        with open(f"{OUTPUT_DIR}/{i}.errors.md", "w") as f:
            for details in problems:
                f.write("Mistake here \n\n")
                f.write(details)
                f.write("\n\n")

    print("Success with", url, len(answers_for_this_page))

all_questions_answers

In [86]:

# Flatten the per-page results into parallel question/answer lists, print the
# number of pairs per page and overall, and save the flattened pairs as
# output.parquet.
total = 0
questions = []
answers = []

for page in all_questions_answers:
    qa_pairs = page["qa_pairs"]

    total += len(qa_pairs)

    for qa_pair in qa_pairs:
        questions.append(qa_pair['question'])
        answers.append(qa_pair['answer'])

    print("URL:", page["url"], "Num QA pairs:", len(qa_pairs))

print(total)

# Average pairs per page, using the number of pages actually processed rather
# than a fixed count; guarded so an empty result list does not divide by zero.
num_pages = len(all_questions_answers)
print(total / num_pages if num_pages else 0.0)

# convert to parquet
data = pd.DataFrame({"questions": questions, "answers": answers})
data.to_parquet("output.parquet")

URL: https://catalog.northeastern.edu/undergraduate/computer-information-science/ Num QA pairs: 21
URL: https://catalog.northeastern.edu/undergraduate/computer-information-science/computer-science/ Num QA pairs: 159
URL: https://catalog.northeastern.edu/undergraduate/computer-information-science/computer-science/bscs/ Num QA pairs: 22
URL: https://catalog.northeastern.edu/undergraduate/computer-information-science/computer-science/bacs/ Num QA pairs: 26
URL: https://catalog.northeastern.edu/undergraduate/computer-information-science/computer-science/computing-law-bs/ Num QA pairs: 10
URL: https://catalog.northeastern.edu/undergraduate/computer-information-science/computer-science/interdisciplinary-studies-bs/ Num QA pairs: 22
URL: https://catalog.northeastern.edu/undergraduate/computer-information-science/computer-science/minor/ Num QA pairs: 10
URL: https://catalog.northeastern.edu/undergraduate/computer-information-science/cybersecurity/ Num QA pairs: 27
URL: https://catalog.northeas

NameError: name 'get_questions_answers_for_page' is not defined