In [1]:
import openai
import pandas as pd
import tqdm
import glob

import warnings
warnings.filterwarnings("ignore")

In [2]:
with open("../apikey/apikey.txt", "r") as f:
    openai.api_key = f.readline().replace("\n", "")

In [103]:
def query_prompt(prompt, max_tokens=5000):
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-16k",
        messages=[
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens,
        temperature=0.1,
    )
    return response["choices"][0]["message"]["content"]

In [161]:
def get_df(fname):
    def f(categories):
        for cat in categories:
            for word in [
                "geology",
                "physics",
                "chemistry",
                "mathematical",
                "biology",
                "astronomy",
                "ecology",
                "genetics",
                "statistics",
                "theoretical"
            ]:
                if word.lower() in cat.lower():
                    return True
            
        return False
    
    def text_preprocess(text):
        return text.replace("====", "\n\n").replace("===", "\n\n").replace("==", "\n\n")

    df = pd.read_parquet(fname)
    df_science = df[df["categories"].apply(f)]
    df_science["text"] = df_science["text"].apply(text_preprocess)
    df_science["text_length"] = df_science["text"].apply(len)
    
    df_science = df_science[(df_science["text_length"] > 10000) & (df_science["text_length"] < 50000)]
    return df_science.sample(len(df_science)//10)

In [162]:
files = glob.glob("../data/wikipedia/*.parquet")

In [163]:
import time

In [164]:
import pickle

In [165]:
import json

In [166]:
from datetime import datetime as dt
import os

In [167]:
texts = []

In [168]:
import traceback 
batch_size = 1

def make_prompt(series):
    prompt = f"""
You are a professional machine learning engineer who creates datasets for use in supervised learning of multiple choice questions and are very knowledgeable about science.
Please create five multiple-choice questions about the context below.

Context:
{series['text']}

Attention:
- The output should be an array of json format.
- Json format key is "prompt" as the question statement, "A," "B," "C," "D," and "E" as choices, "answer" as the answer choice (one of A through E).
- The last two questions should be longer text questions (more than 40 words).
"""
    return prompt

def f(series):
    if series["A"] != series["A"]:
        if type(series["answer"]) == dict:
            for key in ["A", "B", "C", "D", "E"]:
                series[key] = series["choices"][key]
        elif type(series["answer"] == list):
            for i, key in enumerate(["A", "B", "C", "D", "E"]):
                series[key] = series["choices"][i]
    return series

now_date = dt.now().strftime("%Y%m%d%H%M%S")

first = True
for file in files:
    if os.path.basename(file) in ["all.parquet"]:
        print(f"pass: {file}")
        continue
    df_science = get_df(file)
    
    for i in tqdm.tqdm(range(len(df_science)), desc=file):
        try:
            series = df_science.iloc[i]
            prompt = make_prompt(series)
            text = query_prompt(prompt)
            texts_json = json.loads(text)
            if first:
                print(texts_json)
                first = False
            for text_json in texts_json:
                text_json["wiki_id"] = series["id"]
                text_json["original_text"] = series["text"]
                texts.append(text_json)
        except Exception as e:
            print(e)
            traceback.print_exc()
            print(text)
        if i % 20 == 0:
            df_texts = pd.DataFrame(texts)
            df_texts = df_texts.apply(f, axis=1)

            df_texts.to_csv(f"output_gpt3.5_generate/{now_date}.csv")

../data/wikipedia/a.parquet:   0%|                                                                                                                                                                                                                          | 0/42 [02:07<?, ?it/s]

[{'prompt': 'Who holds the record for the most appearances for the Algeria national football team?', 'A': 'Lakhdar Belloumi', 'B': 'Yazid Mansouri', 'C': 'Djamel Menad', 'D': 'Rabah Madjer', 'E': 'Islam Slimani', 'answer': 'A'}, {'prompt': 'Who has the most appearances as a captain for the Algeria national football team?', 'A': 'Lakhdar Belloumi', 'B': 'Yazid Mansouri', 'C': 'Djamel Menad', 'D': 'Rabah Madjer', 'E': 'Islam Slimani', 'answer': 'B'}, {'prompt': 'Who holds the record for the most goals scored for the Algeria national football team?', 'A': 'Lakhdar Belloumi', 'B': 'Yazid Mansouri', 'C': 'Djamel Menad', 'D': 'Rabah Madjer', 'E': 'Islam Slimani', 'answer': 'E'}, {'prompt': 'Who is the oldest player to have played for the Algeria national football team?', 'A': 'Lakhdar Belloumi', 'B': 'Yazid Mansouri', 'C': 'Djamel Menad', 'D': 'Rabah Madjer', 'E': "Raïs M'Bolhi", 'answer': 'E'}, {'prompt': 'Who is the youngest player to have played for the Algeria national football team?', '




KeyError: 'choices'

In [None]:
print(df_science["text"].values[0])

In [156]:
df_texts = pd.DataFrame(texts)

In [157]:
df_texts["prompt"]

0              What is a charged particle accelerator?
1                         What is accelerator physics?
2    Which software package provides summaries of m...
3    Which code is used to simulate synchrotron rad...
4    What are wakefields in the context of particle...
5           What is the purpose of space charge codes?
6    What are some applications of particle acceler...
7    What are some challenges in developing unified...
8    What are some software packages used in the de...
9    What are some codes used in industrial and med...
Name: prompt, dtype: object

In [158]:
df_texts["prompt"].apply(len)

0    39
1    28
2    77
3    62
4    60
5    42
6    52
7    82
8    96
9    76
Name: prompt, dtype: int64

In [None]:
df_texts = pd.DataFrame(texts)
df_texts = df_texts.apply(f, axis=1)

df_texts.to_csv(f"output_gpt3.5_generate/{now_date}.csv")

In [None]:
 df_texts.to_csv(f"output_gpt3.5_generate/{now_date}.csv")

In [13]:

df_texts["answer"].value_counts()

A                                    113
B                                     44
D                                     34
C                                     31
E                                     20
Aurora Max                             2
寶瓶座 (bǎo píng zuò)                     1
Voronezh University                    1
Stanley Autler and Charles Townes      1
0.4-1.5 million years                  1
Name: answer, dtype: int64