In [1]:
import openai
import pandas as pd
import tqdm
import glob

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the OpenAI API key from a local text file (first line only) so the
# secret never appears in the notebook itself.
with open("../apikey/apikey.txt", "r") as key_file:
    # strip() also removes "\r" and stray spaces/tabs, which a plain
    # replace("\n", "") would leave inside the key (e.g. CRLF-edited files).
    openai.api_key = key_file.readline().strip()

# Fail fast: an empty key would otherwise only surface as an authentication
# error on every single API call made later in the notebook.
if not openai.api_key:
    raise ValueError("../apikey/apikey.txt does not contain an API key on its first line")

In [3]:
def query_prompt(prompt, max_tokens=5000):
    """Send `prompt` as a single user message and return the model's reply text.

    Args:
        prompt: Text placed in the one and only "user" message.
        max_tokens: Upper bound on generated tokens, forwarded to the API.

    Returns:
        The "content" field of the first choice in the API response.
    """
    conversation = [{"role": "user", "content": prompt}]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-16k",
        messages=conversation,
        max_tokens=max_tokens,
        temperature=0,  # temperature 0, as in the original request
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]

In [4]:
def get_df(fname, random_state=None):
    """Load one parquet file and return a 10% sample of its science articles.

    The file must provide the columns "categories" (an iterable of category
    strings per row) and "text". Rows are kept when any category contains one
    of the science keywords (case-insensitive substring match) and the
    cleaned text is strictly between 10,000 and 50,000 characters long.

    Args:
        fname: Path of the parquet file to read.
        random_state: Optional seed forwarded to `DataFrame.sample` so the
            sample can be reproduced. The default (None) keeps the original
            non-deterministic sampling.

    Returns:
        A DataFrame holding `len(filtered) // 10` randomly chosen rows, with
        "text" cleaned of wiki heading markers and an extra "text_length"
        column (character count of the cleaned text).
    """
    science_keywords = (
        "geology",
        "physics",
        "chemistry",
        "mathematical",
        "biology",
        "astronomy",
        "ecology",
        "genetics",
        "statistics",
        "theoretical",
    )

    def is_science(categories):
        # Rows without any category information cannot match a keyword.
        if categories is None:
            return False
        return any(
            keyword in category.lower()
            for category in categories
            for keyword in science_keywords
        )

    def text_preprocess(text):
        # Longest marker first, so "====" is not consumed as two "==".
        return text.replace("====", "\n\n").replace("===", "\n\n").replace("==", "\n\n")

    df = pd.read_parquet(fname)

    # astype(bool) keeps this a boolean mask even when the frame is empty
    # (apply() on an empty column does not return a bool dtype).
    science_mask = df["categories"].apply(is_science).astype(bool)

    # copy() so the column assignments below write to an independent frame
    # instead of a slice of `df` (chained assignment / SettingWithCopyWarning).
    df_science = df[science_mask].copy()
    df_science["text"] = df_science["text"].apply(text_preprocess)
    df_science["text_length"] = df_science["text"].apply(len)

    df_science = df_science[(df_science["text_length"] > 10000) & (df_science["text_length"] < 50000)]
    return df_science.sample(len(df_science) // 10, random_state=random_state)

In [5]:
# glob.glob returns matches in arbitrary (filesystem-dependent) order; sorting
# makes the processing order identical from run to run.
files = sorted(glob.glob("../data/wikipedia/*.parquet"))

In [6]:
import time

In [7]:
import pickle

In [8]:
import json

In [9]:
from datetime import datetime as dt
import os

In [10]:
# Accumulator for the generated question records; the generation loop appends
# one dict per parsed question. Re-running this cell discards what was collected.
texts = []

In [25]:
import traceback 
# NOTE(review): batch_size is not referenced anywhere in the visible cells —
# confirm whether it is still needed.
batch_size = 1

def make_prompt(series):
    """Build the question-generation prompt for one article.

    Args:
        series: Mapping-like row; only `series['text']` is read and it is
            embedded verbatim as the "Context" section.

    Returns:
        The full prompt string: role/instruction header, the article text,
        then the output-format rules.
    """
    header = (
        "\n"
        "You are a professional machine learning engineer who creates datasets for use in supervised learning of multiple choice questions and are very knowledgeable about science.\n"
        "Please make a multiple-choice questions. Number of choice is five.\n"
        "\n"
        "Context:\n"
    )
    rules = (
        "\n"
        "\n"
        "Attention:\n"
        "- The output should be json format.\n"
        '- Json format key is "prompt" as the question statement, "A," "B," "C," "D," and "E" as choices, "answer" as the answer choice (one of A through E).\n'
        "- The number of words in the question statement should be at least 50.\n"
    )
    return f"{header}{series['text']}{rules}"

def f(series):
    """Fill missing "A".."E" columns of one row from its "choices" field.

    Some model replies nest the options under a "choices" key instead of
    top-level "A".."E" keys. For a row whose "A" value is NaN, this copies the
    five options out of "choices", which may be a dict keyed by "A".."E" or a
    list/tuple in A..E order.

    Args:
        series: One row (pandas Series), as passed by `DataFrame.apply(axis=1)`.

    Returns:
        The same Series. It is returned unchanged when "A" is already set,
        when "A" or "choices" is absent, or when "choices" does not provide
        all five options (best-effort: malformed rows are never fatal).
    """
    choice_keys = ["A", "B", "C", "D", "E"]
    try:
        # NaN is the only value that is not equal to itself, so equality here
        # means "A" is already populated and nothing needs to be done.
        if series["A"] == series["A"]:
            return series

        # Dispatch on the type of "choices" (the container being read), not
        # on the type of "answer".
        choices = series["choices"]
        if isinstance(choices, dict):
            values = [choices[key] for key in choice_keys]
        elif isinstance(choices, (list, tuple)):
            values = [choices[position] for position in range(len(choice_keys))]
        else:
            return series
    except (KeyError, IndexError, TypeError, ValueError):
        # Missing column, too few options, or an unusable value: leave as is.
        return series

    # Assign only after all five options were read, so a malformed row is
    # never left partially filled.
    for key, value in zip(choice_keys, values):
        series[key] = value
    return series

# Timestamp that names this run's checkpoint CSV.
now_date = dt.now().strftime("%Y%m%d%H%M%S")

output_dir = "output_gpt3.5_generate"
output_path = f"{output_dir}/{now_date}.csv"
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)


def save_texts():
    """Write every record collected so far in `texts` to this run's CSV.

    Returns:
        The DataFrame that was written, or None when nothing has been
        collected yet (no file is written in that case).
    """
    if not texts:
        return None
    frame = pd.DataFrame(texts).apply(f, axis=1)
    frame.to_csv(output_path)
    return frame


first = True
try:
    for file in files:
        if os.path.basename(file) in ["all.parquet"]:
            print(f"pass: {file}")
            continue
        df_science = get_df(file)

        for i in tqdm.tqdm(range(len(df_science)), desc=file):
            # Reset per row so the except handler below never prints an
            # undefined name or the previous row's response.
            text = None
            try:
                series = df_science.iloc[i]
                prompt = make_prompt(series)
                text = query_prompt(prompt)
                texts_json = json.loads(text)
                if first:
                    # Show the first parsed reply once as a sanity check.
                    print(texts_json)
                    first = False
                # The model may answer with one question object or a list of them.
                records = [texts_json] if isinstance(texts_json, dict) else texts_json
                for record in records:
                    record["wiki_id"] = series["id"]
                    record["original_text"] = series["text"]
                    texts.append(record)
            except Exception as e:
                # Best effort: log the failure (API error, invalid JSON, ...)
                # together with the raw response and move on to the next row.
                print(e)
                traceback.print_exc()
                print(text)
            if i % 20 == 0:
                # Periodic checkpoint.
                df_texts = save_texts()
finally:
    # Final flush: also runs when the loop is interrupted (e.g. Ctrl-C), so
    # rows collected after the last checkpoint are not lost.
    df_texts = save_texts()

../data/wikipedia/a.parquet:   2%|█████                                                                                                                                                                                                             | 1/42 [00:04<03:07,  4.56s/it]

{'prompt': "Who is known as the 'father of Pakistan's atomic weapons program'?", 'A': 'Abdul Ghafoor', 'B': 'Zulekha', 'C': 'A. Q. Khan', 'D': 'Munir Ahmad Khan', 'E': 'Ghulam Ishaq Khan', 'answer': 'C'}


../data/wikipedia/a.parquet:  10%|████████████████████                                                                                                                                                                                              | 4/42 [00:17<02:44,  4.32s/it]

KeyboardInterrupt



In [26]:
# Rebuild the frame from everything collected in `texts` so far (one row per record).
df_texts = pd.DataFrame(texts)

In [31]:
# Peek at the second-to-last generated question statement.
df_texts["prompt"].iloc[-2]

"What is the purpose of ROPPA (Réseau des organisations paysannes et de producteurs de l'Afrique de l'Ouest)?"

In [28]:
# Character count (not word count) of every generated question statement.
df_texts["prompt"].map(len)

0     110
1      72
2      80
3     109
4      58
5      97
6      77
7      66
8      18
9     108
10     57
Name: prompt, dtype: int64

In [None]:
# Manual flush: rebuild the frame from `texts`, expand nested "choices" into
# the A..E columns via f, and write the CSV for this run.
df_texts = pd.DataFrame(texts)
df_texts = df_texts.apply(f, axis=1)

# to_csv does not create missing directories; make sure the target exists.
os.makedirs("output_gpt3.5_generate", exist_ok=True)
df_texts.to_csv(f"output_gpt3.5_generate/{now_date}.csv")

In [None]:
 # Writes the current df_texts to the same path as the previous cell's to_csv call.
 df_texts.to_csv(f"output_gpt3.5_generate/{now_date}.csv")

In [13]:

# Distribution of the "answer" column. The captured output shows a few values
# outside A-E (full answer strings), so those rows need cleaning before use.
df_texts["answer"].value_counts()

A                                    113
B                                     44
D                                     34
C                                     31
E                                     20
Aurora Max                             2
寶瓶座 (bǎo píng zuò)                     1
Voronezh University                    1
Stanley Autler and Charles Townes      1
0.4-1.5 million years                  1
Name: answer, dtype: int64