In [22]:
import openai
import pandas as pd
import os
from tqdm import tqdm
import json

In [33]:
data = pd.read_csv('data/classFeatures.csv')
openai.api_key = "<CHANGE>"
client = openai.ChatCompletion

def generate_qa_for_class(row):
    context = (
        "Generate training data for a language model. Provide detailed answers to each question in the following format: \n"
        "[{input: <question>, output: <detailed answer>}, ...] \n"
        "ALWAYS respond with a valid json \n\n"
        f"Here is the data for this class: {row.to_dict()}"
    )
    
    questions = [
        f"What is {row['Class Title']}?",
        f"Can you describe {row['Class Code']}?",
        f"How many credit hours is {row['Class Title']} worth?",
        f"Who teaches {row['Class Title']}?",
        f"What is the academic career for {row['Class Title']}?",
        f"Are there any restrictions for enrolling in {row['Class Title']}?"
    ]

    messages = [
        {"role": "system", "content": context},
        *[
            {"role": "user", "content": question}
            for question in questions
        ]
    ]

    response = client.create(
        model="gpt-3.5-turbo-16k",
        messages=messages,
        temperature=1,
        max_tokens=4096,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )

    content_str = response['choices'][0]['message']['content']
    # remove markdown tokens
    content_str = content_str.replace('```json', '').replace('```', '').strip()

    try:
        answers = json.loads(content_str)
    except json.JSONDecodeError:
        # we need to take care of invalid json, probably save them somewhere and process them in a batch later on ..?
        print(f"Failed to decode JSON from response content: {content_str}")
        answers = []

    return answers


qa_data = []
for index, row in tqdm(data.iloc[:20].iterrows(), total=20):
    qa_pairs = generate_qa_for_class(row)
    qa_data.extend(qa_pairs)

qa_df = pd.DataFrame(qa_data)
qa_df.to_csv('class_qa.csv', index=False)

 65%|█████████████████████████████████████████████████████████████████████████████████▎                                           | 13/20 [01:17<00:41,  5.90s/it]

Failed to decode JSON from response content: [{'input': 'What is Exploring a Non-Western Culture: The Maya?', 'output': 'Exploring a Non-Western Culture: The Maya is a course with the class code ANTH 1140. It explores the culture of the Maya of Central America, emphasizing their material adaptations, social organizations, ideals and values, and artistic achievements in the past and the present.'}, {'input': 'Can you describe ANTH 1140?', 'output': 'ANTH 1140, also known as Exploring a Non-Western Culture: The Maya, is a course that focuses on studying the culture of the Maya civilization in Central America. The course examines various aspects such as their material adaptations, social organizations, ideals and values, and artistic achievements throughout history and in the present.'}, {'input': 'How many credit hours is Exploring a Non-Western Culture: The Maya worth?', 'output': 'The number of credit hours for Exploring a Non-Western Culture: The Maya may vary depending on the specifi

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [01:59<00:00,  5.98s/it]

Failed to decode JSON from response content: [{'input': 'What is Laboratory in Biological Anthropology 1?', 'output': 'Laboratory in Biological Anthropology 1 is a course with the class code ANTH 2030. It is a lab-based course that focuses on human osteology and the musculoskeletal system. The course emphasizes comparative primate morphology, adaptation, and the fossil record documenting the natural history of primates. It is recommended to take the course alongside ANTH 2010. When taken with ANTH 2010, it fulfills the MAPS requirement for natural science lab.'} ,
{'input': 'Can you describe ANTH 2030?', 'output': 'ANTH 2030 is a course with the class code ANTH 2030. It is a lab-based course in Biological Anthropology that focuses on human osteology and the musculoskeletal system. The course emphasizes comparative primate morphology, adaptation, and the fossil record documenting the natural history of primates. It is recommended to take the course alongside ANTH 2010. When taken with A


