In [38]:
import openai
import pandas as pd
import os
from tqdm import tqdm
import json
from concurrent.futures import ThreadPoolExecutor, as_completed

In [52]:
# Class catalogue; each row is later passed to generate_qa_for_class.
data = pd.read_csv('data/classFeatures.csv')
# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# saved inside the notebook; the original placeholder remains as the fallback.
openai.api_key = os.environ.get("OPENAI_API_KEY", "<CHANGE>")
# NOTE(review): openai.ChatCompletion is the pre-1.0 SDK interface — confirm the
# installed openai version still provides it.
client = openai.ChatCompletion

def generate_qa_for_class(row):
    """Ask the chat model for question/answer training pairs about one class.

    Builds a system prompt that embeds the whole row, sends six templated
    questions as user messages, and parses the reply as a JSON list of
    ``{"input": ..., "output": ...}`` objects.

    Args:
        row: Record for one class (a pandas Series from ``data.iterrows()`` in
            this notebook). Must support ``row['Class Title']``,
            ``row['Class Code']`` and ``row.to_dict()``.

    Returns:
        list[dict]: One ``{'input': ..., 'output': ...}`` dict per pair in the
        reply, or ``[]`` when the reply is not valid JSON or is not a list of
        objects carrying both keys. Rejected replies are appended to
        ``invalidData.txt`` for later inspection.

    Raises:
        Whatever ``client.create`` raises (the request itself is not guarded).
    """
    context = (
        "Generate training data for a language model. Provide detailed answers to each question in the following format: \n"
        # The example is written with quoted keys/values so that it is itself
        # valid JSON and does not contradict the instruction on the next line.
        '[{"input": "<question>", "output": "<detailed answer>"}, ...] \n'
        "ALWAYS respond with a valid json \n\n"
        f"Here is the data for this class: {row.to_dict()}"
    )

    questions = [
        f"What is {row['Class Title']}?",
        f"Can you describe {row['Class Code']}?",
        f"How many credit hours is {row['Class Title']} worth?",
        f"Who teaches {row['Class Title']}?",
        f"What is the academic career for {row['Class Title']}?",
        f"Are there any restrictions such as prerequisites for enrolling in {row['Class Title']}?"
    ]

    # One system message with the instructions, then each question as its own user turn.
    messages = [
        {"role": "system", "content": context},
        *[
            {"role": "user", "content": question}
            for question in questions
        ]
    ]

    response = client.create(
        model="gpt-3.5-turbo-16k",
        messages=messages,
        temperature=1,
        max_tokens=4096,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )

    content_str = response['choices'][0]['message']['content']
    # remove markdown tokens
    content_str = content_str.replace('```json', '').replace('```', '').strip()

    try:
        qa_pairs = json.loads(content_str)
        formatted_pairs = [{'input': pair['input'], 'output': pair['output']} for pair in qa_pairs]
    except (json.JSONDecodeError, KeyError, TypeError) as exc:
        # KeyError/TypeError cover replies that parse as JSON but are not a list
        # of {"input", "output"} objects; without them a single odd reply would
        # raise out of the worker and abort the whole batch.
        with open("invalidData.txt", "a", encoding="utf-8") as file:
            file.write(content_str + "\n")
        if isinstance(exc, json.JSONDecodeError):
            print("Failed to decode JSON from response content")
        else:
            print("Response JSON did not match the expected list of input/output pairs")
        formatted_pairs = []

    return formatted_pairs


# Limit to CSCI. na=False treats rows with a missing class code as non-matching
# instead of producing a NaN mask that pandas refuses to index with.
data = data[data['Class Code'].str.contains("CSCI", na=False)]
results = []
maxThreads = 15 # anything over 15 will hit the openai rate limit, so keep this to 15


with ThreadPoolExecutor(max_workers=maxThreads) as executor:
    # Remember which class each future belongs to so failures can be reported by code.
    class_code_by_future = {
        executor.submit(generate_qa_for_class, row): row['Class Code']
        for _, row in data.iloc[:100].iterrows()
    }
    futures = list(class_code_by_future)
    for future in tqdm(as_completed(futures), total=len(futures), desc="Generating Q&A"):
        try:
            results.extend(future.result())
        except Exception as exc:
            # A single failed request (rate limit, network error, ...) must not
            # discard the pairs already collected for the other classes.
            print(f"Request for {class_code_by_future[future]} failed: {exc}")


qa_df = pd.DataFrame(results)
# Create the output directory first so finished results are not lost to a missing folder.
os.makedirs('train_data', exist_ok=True)
qa_df.to_csv('train_data/class_qa_csci.csv', index=False)

91


Generating Q&A:   1%|█▏                                                                                                            | 1/91 [00:04<06:51,  4.57s/it]

Failed to decode JSON from response content


Generating Q&A:   5%|██████                                                                                                        | 5/91 [00:05<00:54,  1.57it/s]

Failed to decode JSON from response content


Generating Q&A:   7%|███████▎                                                                                                      | 6/91 [00:06<00:56,  1.51it/s]

Failed to decode JSON from response content


Generating Q&A:  30%|████████████████████████████████▎                                                                            | 27/91 [00:14<00:23,  2.71it/s]

Failed to decode JSON from response content


Generating Q&A:  34%|█████████████████████████████████████▏                                                                       | 31/91 [00:16<00:37,  1.60it/s]

Failed to decode JSON from response content


Generating Q&A:  36%|███████████████████████████████████████▌                                                                     | 33/91 [00:17<00:24,  2.34it/s]

Failed to decode JSON from response content


Generating Q&A:  37%|████████████████████████████████████████▋                                                                    | 34/91 [00:17<00:23,  2.42it/s]

Failed to decode JSON from response content


Generating Q&A:  41%|████████████████████████████████████████████▎                                                                | 37/91 [00:19<00:25,  2.12it/s]

Failed to decode JSON from response content
Failed to decode JSON from response content


Generating Q&A:  57%|██████████████████████████████████████████████████████████████▎                                              | 52/91 [00:25<00:17,  2.18it/s]

Failed to decode JSON from response content


Generating Q&A:  58%|███████████████████████████████████████████████████████████████▍                                             | 53/91 [00:25<00:15,  2.38it/s]

Failed to decode JSON from response content


Generating Q&A:  60%|█████████████████████████████████████████████████████████████████▉                                           | 55/91 [00:27<00:22,  1.61it/s]

Failed to decode JSON from response content
Failed to decode JSON from response content


Generating Q&A:  66%|███████████████████████████████████████████████████████████████████████▊                                     | 60/91 [00:28<00:09,  3.28it/s]

Failed to decode JSON from response content
Failed to decode JSON from response content


Generating Q&A:  68%|██████████████████████████████████████████████████████████████████████████▎                                  | 62/91 [00:29<00:13,  2.13it/s]

Failed to decode JSON from response content


Generating Q&A:  71%|█████████████████████████████████████████████████████████████████████████████▊                               | 65/91 [00:30<00:08,  3.16it/s]

Failed to decode JSON from response content


Generating Q&A:  75%|█████████████████████████████████████████████████████████████████████████████████▍                           | 68/91 [00:31<00:09,  2.51it/s]

Failed to decode JSON from response content


Generating Q&A:  79%|██████████████████████████████████████████████████████████████████████████████████████▏                      | 72/91 [00:32<00:05,  3.75it/s]

Failed to decode JSON from response content
Failed to decode JSON from response content


Generating Q&A:  85%|████████████████████████████████████████████████████████████████████████████████████████████▏                | 77/91 [00:34<00:04,  3.45it/s]

Failed to decode JSON from response content


Generating Q&A:  86%|█████████████████████████████████████████████████████████████████████████████████████████████▍               | 78/91 [00:34<00:04,  3.00it/s]

Failed to decode JSON from response content


Generating Q&A:  92%|████████████████████████████████████████████████████████████████████████████████████████████████████▌        | 84/91 [00:37<00:02,  2.55it/s]

Failed to decode JSON from response content


Generating Q&A:  99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 90/91 [00:39<00:00,  3.17it/s]

Failed to decode JSON from response content


Generating Q&A: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 91/91 [00:40<00:00,  2.24it/s]

Failed to decode JSON from response content



