In [1]:
import openai
import pandas as pd
import tqdm
import glob

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the OpenAI API key from a local text file so the key itself is never written in the notebook.
with open("../apikey/apikey.txt", "r") as f:
    # strip() instead of replace("\n", ""): it also removes a "\r" left by CRLF line endings
    # and any surrounding spaces, which would otherwise become part of the key.
    openai.api_key = f.readline().strip()

In [3]:
def query_prompt(prompt, max_tokens=1000, model="gpt-3.5-turbo-16k", temperature=0):
    """Send a single-turn user message to the chat completion API and return the reply text.

    Args:
        prompt: Text sent as the content of the one and only "user" message.
        max_tokens: Upper bound on generated tokens, forwarded to the API unchanged.
        model: Name of the chat model to query. Defaults to the model this notebook
            was written against.
        temperature: Sampling temperature forwarded to the API. Defaults to 0.

    Returns:
        The "content" field of the first choice in the response.

    Raises:
        Whatever ``openai.ChatCompletion.create`` raises; nothing is caught here so the
        caller decides how to handle failed requests.
    """
    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        temperature=temperature,
    )
    # Only the first choice is used; no "n" argument is passed to request more.
    return response["choices"][0]["message"]["content"]

In [4]:
def get_df(fname, random_state=None):
    """Load a parquet file of articles and return a random third of the science-related rows.

    The file is read with ``pd.read_parquet`` and must provide the columns
    "categories", "title" and "text". A row is kept when any of its category strings
    contains "science", "physics" or "chemistry" (case-sensitive substring match).
    For the kept rows, "text" is rewritten as ``"title: <title>\\n<text>"`` with the
    "===" / "==" heading markers replaced by newlines.

    Args:
        fname: Path of the parquet file to read.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the selection
            can be reproduced. The default (None) keeps the sample random.

    Returns:
        A DataFrame holding ``len(matching_rows) // 3`` randomly sampled matching rows.
    """
    keywords = ("science", "physics", "chemistry")

    def is_science(categories):
        # A missing category list cannot match; without this guard iterating None raises.
        if categories is None:
            return False
        return any(keyword in cat for cat in categories for keyword in keywords)

    def text_preprocess(text):
        # "===" must be replaced before "==" so a triple marker does not leave a stray "=".
        return text.replace("===", "\n").replace("==", "\n")

    df = pd.read_parquet(fname)
    # Explicit copy: the filtered frame is modified below, and assigning into a slice of
    # `df` is the chained-assignment pattern pandas warns about.
    df_science = df[df["categories"].apply(is_science)].copy()
    df_science["text"] = "title: " + df_science["title"] + "\n" + df_science["text"].apply(text_preprocess)
    return df_science.sample(len(df_science) // 3, random_state=random_state)

In [5]:
# glob returns matches in arbitrary order; sorting keeps the processing order stable
# between runs, which matters as soon as the pattern matches more than one file.
files = sorted(glob.glob("../data/wikipedia/a.parquet"))

In [6]:
import time

In [7]:
import pickle

In [8]:
import json

In [9]:
from datetime import datetime as dt

In [None]:
import traceback 
# NOTE(review): batch_size is not referenced by any other visible cell — confirm it is still needed.
batch_size = 1

def make_prompt(series):
    """Build the question-generation prompt for one article.

    Args:
        series: Mapping-like row whose 'text' entry is inserted into the prompt.

    Returns:
        The full prompt string: fixed instructions, the article text under the
        "## text 1" heading, and a closing reminder to produce exactly one question.
    """
    article = series['text']
    # Each entry is one line of the prompt; the empty entries produce the leading
    # newline, the blank lines after the article, and the trailing newline.
    # The "# 依頼" heading (Japanese: "request") is part of the prompt text itself.
    prompt_lines = (
        "",
        "# 依頼",
        "You are a professor at a science university and are creating a test for your students.",
        "Using the given text, create a question in which you select one of the five options for the question text to select the most appropriate statement. Also, extract the portion of the text that provides the basis for your answer.",
        'The output should be an array in json format, with "prompt" as the question text, "A," "B," "C," "D," and "E" as the choices, "answer" as the answer choice, and "basis" as the rationale.',
        "Also, please create one question per text. In other words, the total number of questions created will be one.",
        "# text",
        "## text 1 ",
        f"{article}",
        "",
        "",
        "# attention",
        "Please create one question per text. So the total number of problems created will be 1.",
        "",
    )
    return "\n".join(prompt_lines)
import os  # only needed in this cell, to create the checkpoint directory

# One dict per successfully generated question, accumulated across all input files.
texts = []
# Timestamp used in the checkpoint file name, so each run writes to its own CSV.
now_date = dt.now().strftime("%Y%m%d%H%M%S")

# to_csv does not create missing directories; without this the first checkpoint
# raises outside the try block and aborts the whole run.
os.makedirs("output_gpt3.5_generate", exist_ok=True)
output_path = f"output_gpt3.5_generate/{now_date}.csv"

for f in files:
    df_science = get_df(f)

    for i in tqdm.tqdm(range(len(df_science)), desc=f):
        # Reset every iteration: if the request itself fails, the except branch must not
        # hit an undefined name (first iteration) or print the previous article's result.
        text = None
        try:
            series = df_science.iloc[i]
            prompt = make_prompt(series)
            text = query_prompt(prompt)
            text = json.loads(text)
            # The prompt asks for a JSON array; keep only its first element.
            if isinstance(text, list):
                text = text[0]
            text["wiki_id"] = series["id"]
            text["original_text"] = series["text"]
            texts.append(text)
        except Exception as e:
            # Best effort: one failed request or malformed response must not stop the run.
            # Log the error and whatever was received (None if the request never returned).
            print(e)
            traceback.print_exc()
            print(text)
        # Periodic checkpoint so a crash loses at most ~100 articles of work.
        if i % 100 == 0:
            pd.DataFrame(texts).to_csv(output_path)

    # Final flush for this file: rows produced after the last multiple of 100 would
    # otherwise never be written.
    pd.DataFrame(texts).to_csv(output_path)

../data/wikipedia/a.parquet:   1%|▊                                                                  | 11/925 [01:00<1:28:10,  5.79s/it]

In [52]:
# Tabulate the question dicts collected in `texts` so far (one row per dict).
pd.DataFrame(texts)

Unnamed: 0,prompt,choices,answer,basis,wiki_id,original_text,A,B,C,D,E
0,What is the Albert Einstein Professorship in S...,{'A': 'An endowed professorship in physics est...,A,The Albert Einstein Professorship in Science i...,70302156,title: Albert Einstein Professorship in Scienc...,,,,,
1,What was the mission of the Academy of Science...,,C,"In 1856, the Academy's mission was to promote ...",25865948,"title: Academy of Science, St. Louis\nThe Acad...",To promote science education in St. Louis,To explore the West and discover natural resou...,To advance scientific research in various fields,To establish a museum of science and natural h...,To publish scientific journals on the natural ...
2,What was the purpose of transmitting the song ...,{'A': 'To celebrate the 40th anniversary of th...,A,The transmission of the song 'Across the Unive...,24891124,title: Across the Universe (message)\nAcross t...,,,,,
3,Which of the following statements about Annett...,{'A': 'Annette Aiello is a professor of botany...,B,In 1975 she obtained an MA in biology from Har...,64378727,title: Annette Aiello\nAnnette A. Aiello (born...,,,,,
4,Which genre did Austin Hall primarily write in?,,C,Austin Hall primarily wrote in the genres of s...,16994153,title: Austin Hall (writer)\n{{Infobox writer ...,Mystery,Romance,Science fiction,Historical fiction,Biography
5,What is the total undergraduate population of ...,,A,Alfred University has a total undergraduate po...,345573,title: Alfred University\nAlfred University is...,"Approximately 1,600 students","Approximately 2,000 students","Approximately 2,500 students","Approximately 3,000 students","Approximately 3,500 students"
6,What is academese?,,A,Academese is a term referring to unnecessary j...,68617312,title: Academese\nAcademese is a term referrin...,Unnecessary jargon associated with academia,A form of power relations between academics,Complex but necessary terminology,A synonym for academic writing,A stereotype of academic writing
7,What was the percentage of the Moon in shadow ...,,D,The text states that during the April 1903 lun...,22068713,title: April 1903 lunar eclipse\nA partial lun...,50%,75%,90%,96.77%,100%
8,What is the main plot of the film August Eighth?,{'A': 'A young single mother reunites with her...,A,The film tells the story of a young single mot...,34760284,"title: August Eighth\nAugust Eighth (, transli...",,,,,
9,Which of the following statements is supported...,[Scientists have discovered a new species of f...,The World Meteorological Organization has anno...,"On June 23, the World Meteorological Organizat...",64647168,title: April–June 2020 in science\nThis articl...,,,,,


In [41]:
# NOTE(review): get_df() draws a new random sample on each call, so this rebinds
# df_science to a different subset than the one the generation loop iterated over.
df_science = get_df(files[0])
# Look up a single article by id. An empty result may only mean the id was not drawn
# into this sample. NOTE(review): the literal is a str — confirm the "id" column dtype matches.
df_science[df_science["id"] == '29637793']

Unnamed: 0,id,title,text,categories


In [42]:
# Display the current df_science sample returned by get_df().
df_science

Unnamed: 0,id,title,text,categories
368626,1908395,Artificial brain,title: Artificial brain\nAn artificial brain (...,"[Computational neuroscience, Robotics, Emergin..."
34268,10410698,Abdul Amir al-Jamri,title: Abdul Amir al-Jamri\nSheikh Abdul Amir ...,"[1938 births, 2006 deaths, Deaths from kidney ..."
233077,1958222,Amyloid beta,title: Amyloid beta\nAmyloid beta (Aβ or Abeta...,"[Peptides, Molecular neuroscience, Alzheimer's..."
9870,3621668,A Woman of the Iron People,title: A Woman of the Iron People\nA Woman of ...,"[1991 American novels, 1991 science fiction no..."
139719,38366604,Albert Spaier,title: Albert Spaier\nAlbert Spaier (9 July 18...,"[1883 births, 1934 deaths, Writers from Iași, ..."
...,...,...,...,...
62685,4474244,Actuarial reserves,"title: Actuarial reserves\nIn insurance, an ac...","[Actuarial science, Capital requirement de:Dec..."
357456,4260564,Arrival II,title: Arrival II\nArrival ll (alternatively t...,"[1998 films, 1998 science fiction films, Ameri..."
391514,32894329,Astrobiophysics,title: Astrobiophysics\nAstrobiophysics is a f...,"[Astrophysics, Biophysics .]"
10000,2824171,A World of Difference (novel),title: A World of Difference (novel)\nA World ...,"[1990 American novels, Novels set during the C..."
