In [1]:
import openai
import pandas as pd
import tqdm
import glob

import warnings
warnings.filterwarnings("ignore")

In [2]:
with open("../apikey/apikey.txt", "r") as f:
    openai.api_key = f.readline().replace("\n", "")

In [3]:
def query_prompt(prompt, max_tokens=5000):
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-16k",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        temperature=0,
    )
    return response["choices"][0]["message"]["content"]

In [4]:
def get_df(fname):
    def f(categories):
        for cat in categories:
            for word in [
                "geology",
                "science",
                "physics",
                "chemistry",
                "mathematical",
                "biology",
                "astronomy",
                "ecology",
                "genetics",
            ]
                if word.lower() in cat.lower():
                    return True
            
        return False
    
    def text_preprocess(text):
        return text.replace("===", "\n").replace("==", "\n")

    df = pd.read_parquet(fname)
    df_science = df[df["categories"].apply(f)]
    df_science["text"] = "title: " + df_science["title"] + "\n" + df_science["text"].apply(text_preprocess)
    return df_science.sample(len(df_science)//40)

In [5]:
files = glob.glob("../data/wikipedia/a.parquet")

In [6]:
import time

In [7]:
import pickle

In [8]:
import json

In [9]:
from datetime import datetime as dt
import os

In [10]:
texts = []

In [13]:
import traceback 
batch_size = 1

def make_prompt(series):
    prompt = f"""
# 依頼
You are a professor at a science university and are creating a test for your students.
Using the given text, create a question in which you select the most appropriate statement from the five options in the question text. Also, extract the evidence for your answer.
The output should be an array in json format, with "prompt" as the problem statement, "A," "B," "C," "D," and "E" as choices, "answer" as the answer choice (one of A through E), and "basis" as the rationale. Please make sure that the answer choices are not all the same, e.g., all five answers are A.

# text
## text 1 
title: {series['title']}

{series['text']}


# attention
Please create 7 question.
"""
    return prompt

def f(series):
    if series["A"] != series["A"]:
        if type(series["choices"]) == dict:
            for key in ["A", "B", "C", "D", "E"]:
                series[key] = series["choices"][key]
        elif type(series["choices"] == list):
            for i, key in enumerate(["A", "B", "C", "D", "E"]):
                series[key] = series["choices"][i]
    return series

now_date = dt.now().strftime("%Y%m%d%H%M%S")

for file in files:
    if os.path.basename(file) in ["all.parquet"]:
        print(f"pass: {file}")
        continue
    df_science = get_df(file)
    
    for i in tqdm.tqdm(range(len(df_science)), desc=file):
        try:
            series = df_science.iloc[i]
            prompt = make_prompt(series)
            text = query_prompt(prompt)
            texts_json = json.loads(text)
            for text_json in texts_json:
                text_json["wiki_id"] = series["id"]
                text_json["original_text"] = series["text"]
                texts.append(text_json)
        except Exception as e:
            print(e)
            traceback.print_exc()
            print(text)
        if i % 100 == 0:
            df_texts = pd.DataFrame(texts)
            df_texts = df_texts.apply(f, axis=1)

            df_texts.to_csv(f"output_gpt3.5_generate/{now_date}.csv")

../data/wikipedia/a.parquet:   4%|████████▏                                                                                                                                                                                                     | 22/555 [07:21<2:54:51, 19.68s/it]Traceback (most recent call last):
  File "/tmp/ipykernel_33076/113380407.py", line 46, in <module>
    text = query_prompt(prompt)
  File "/tmp/ipykernel_33076/4006552839.py", line 2, in query_prompt
    response = openai.ChatCompletion.create(
  File "/opt/conda/lib/python3.10/site-packages/openai/api_resources/chat_completion.py", line 25, in create
    return super().create(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/openai/api_resources/abstract/engine_api_resource.py", line 153, in create
    response, _, api_key = requestor.request(
  File "/opt/conda/lib/python3.10/site-packages/openai/api_requestor.py", line 298, in request
    resp, got_stream = self._interpret_response(result, stream)

This model's maximum context length is 16385 tokens. However, you requested 19870 tokens (14870 in the messages, 5000 in the completion). Please reduce the length of the messages or completion.
[
  {
    "prompt": "What was A. J. Ayer known for promoting?",
    "A": "Metaphysics",
    "B": "Ethics",
    "C": "Logical positivism",
    "D": "Existentialism",
    "E": "Pragmatism",
    "answer": "C",
    "basis": "The text states that A. J. Ayer was known for his promotion of logical positivism."
  },
  {
    "prompt": "Where did A. J. Ayer study the philosophy of logical positivism?",
    "A": "University of Oxford",
    "B": "University of Vienna",
    "C": "University College London",
    "D": "Christ Church, Oxford",
    "E": "Eton College",
    "answer": "B",
    "basis": "The text mentions that Ayer studied the philosophy of logical positivism at the University of Vienna."
  },
  {
    "prompt": "What position did A. J. Ayer hold at University College London?",
    "A": "Grote Profe

../data/wikipedia/a.parquet:  11%|█████████████████████▉                                                                                                                                                                                        | 59/555 [21:54<3:05:04, 22.39s/it]Traceback (most recent call last):
  File "/tmp/ipykernel_33076/113380407.py", line 49, in <module>
    text_json["wiki_id"] = series["id"]
TypeError: 'str' object does not support item assignment
../data/wikipedia/a.parquet:  11%|██████████████████████▎                                                                                                                                                                                       | 60/555 [22:13<2:54:32, 21.16s/it]

'str' object does not support item assignment
{
  "questions": [
    {
      "prompt": "Which series is written by Ann Aguirre?",
      "A": "The Leopard King",
      "B": "Nightfall",
      "C": "Enclave",
      "D": "Honor Among Thieves",
      "E": "Perdition",
      "answer": "C",
      "basis": "The series 'Enclave' is written by Ann Aguirre, as mentioned in the text."
    },
    {
      "prompt": "In which genre does Ann Aguirre primarily write?",
      "A": "Romantic suspense",
      "B": "Young adult fiction",
      "C": "Steampunk noir",
      "D": "Paranormal romance",
      "E": "Apocalyptic romance",
      "answer": "D",
      "basis": "Ann Aguirre primarily writes in the genre of paranormal romance, as mentioned in the text."
    },
    {
      "prompt": "Which book is part of the Dark Age Dawning series?",
      "A": "Enclave",
      "B": "Nightfall",
      "C": "Daybreak",
      "D": "Honor Among Thieves",
      "E": "Perdition",
      "answer": "C",
      "basis": "The 

../data/wikipedia/a.parquet:  19%|██████████████████████████████████████▊                                                                                                                                                                      | 105/555 [39:26<3:03:06, 24.41s/it]Traceback (most recent call last):
  File "/tmp/ipykernel_33076/113380407.py", line 49, in <module>
    text_json["wiki_id"] = series["id"]
TypeError: 'str' object does not support item assignment
../data/wikipedia/a.parquet:  19%|███████████████████████████████████████▏                                                                                                                                                                     | 106/555 [39:54<3:10:56, 25.52s/it]

'str' object does not support item assignment
{
  "questions": [
    {
      "prompt": "What is the title of Arkady Martine's first novel?",
      "A": "A Memory Called Empire",
      "B": "A Desolation Called Peace",
      "C": "Rose/House",
      "D": "Lace Downstairs",
      "E": "Nothing Must Be Wasted",
      "answer": "A",
      "basis": "The evidence for this answer can be found in the text: 'Martine's first novel, A Memory Called Empire, published in 2019, is the beginning of her Teixcalaan series.'"
    },
    {
      "prompt": "Where did Arkady Martine obtain her Ph.D. degree?",
      "A": "University of Chicago",
      "B": "University of Oxford",
      "C": "Rutgers University",
      "D": "St. Thomas University",
      "E": "Uppsala University",
      "answer": "C",
      "basis": "The evidence for this answer can be found in the text: 'Weller obtained a Bachelor of Arts in religious studies at the University of Chicago in 2007, a Master of Studies in classical Armenian st

../data/wikipedia/a.parquet:  20%|████████████████████████████████████████▎                                                                                                                                                                    | 109/555 [41:01<2:56:11, 23.70s/it]Traceback (most recent call last):
  File "/tmp/ipykernel_33076/113380407.py", line 49, in <module>
    text_json["wiki_id"] = series["id"]
TypeError: 'str' object does not support item assignment
../data/wikipedia/a.parquet:  20%|████████████████████████████████████████▋                                                                                                                                                                    | 110/555 [41:07<2:16:48, 18.45s/it]

'str' object does not support item assignment
{"prompt": "What is the Arruda–Boyce model?", "A": "A hyperelastic constitutive model used to describe the mechanical behavior of rubber and other polymeric substances", "B": "A statistical mechanics model used to describe the behavior of rubber and other polymeric substances", "C": "A model used to describe the behavior of incompressible materials", "D": "A model used to describe the behavior of compressible materials", "E": "A model used to describe the behavior of linear elastic materials", "answer": "A", "basis": "In continuum mechanics, an Arruda–Boyce model is a hyperelastic constitutive model used to describe the mechanical behavior of rubber and other polymeric substances."}


../data/wikipedia/a.parquet:  22%|█████████████████████████████████████████████                                                                                                                                                                | 122/555 [45:53<2:43:48, 22.70s/it]Traceback (most recent call last):
  File "/tmp/ipykernel_33076/113380407.py", line 49, in <module>
    text_json["wiki_id"] = series["id"]
TypeError: 'str' object does not support item assignment
../data/wikipedia/a.parquet:  22%|█████████████████████████████████████████████▍                                                                                                                                                               | 123/555 [45:57<2:02:51, 17.06s/it]

'str' object does not support item assignment
{
  "prompt": "Which novel inspired the creation of the film Aditya 369?",
  "A": "The Time Machine",
  "B": "Back to the Future",
  "C": "The Sun",
  "D": "The Third World War",
  "E": "The Time Traveler's Wife",
  "answer": "A",
  "basis": "The text states that Aditya 369 was inspired by the H. G. Wells novel The Time Machine."
}


../data/wikipedia/a.parquet:  25%|████████████████████████████████████████████████████                                                                                                                                                         | 141/555 [53:12<2:28:58, 21.59s/it]Traceback (most recent call last):
  File "/tmp/ipykernel_33076/113380407.py", line 49, in <module>
    text_json["wiki_id"] = series["id"]
TypeError: 'str' object does not support item assignment
../data/wikipedia/a.parquet:  26%|████████████████████████████████████████████████████▍                                                                                                                                                        | 142/555 [53:40<2:41:49, 23.51s/it]

'str' object does not support item assignment
{
  "questions": [
    {
      "prompt": "What is the advantage of analytical light scattering (ALS) over conventional steady-state light scattering methods?",
      "A": "ALS allows separation of molecules on a chromatography column prior to analysis.",
      "B": "ALS provides bulk or average measurements on a sample.",
      "C": "ALS determines hydrodynamic properties of a single monodisperse species.",
      "D": "ALS is implemented in an online or flow mode.",
      "E": "ALS uses static light scattering (SLS) and dynamic light scattering (DLS) techniques.",
      "answer": "A",
      "basis": "The text states that the advantage of ALS over conventional steady-state light scattering methods is that it allows separation of molecules/macromolecules on a chromatography column prior to analysis with light scattering detectors."
    },
    {
      "prompt": "What does an analytical light scattering (ALS) instrument consist of?",
      "A":

../data/wikipedia/a.parquet:  29%|██████████████████████████████████████████████████████████▌                                                                                                                                                | 160/555 [1:01:27<2:31:44, 23.05s/it]

KeyboardInterrupt



In [14]:
df_texts.to_csv(f"output_gpt3.5_generate/{now_date}.csv")

In [23]:
for i in range(10):
    series = df_texts.sample(1).iloc[0]
    print(f"{series['prompt']} \n text: \n {series['original_text'].split('==')[0]}")
    print("----------------")

What is the maximum sustainable yield (MSY) usually higher than? 
 text: 
 title: Maximum sustainable yield
In population ecology and economics, maximum sustainable yield (MSY) is theoretically, the largest yield (or catch) that can be taken from a species' stock over an indefinite period. Fundamental to the notion of sustainable harvest, the concept of MSY aims to maintain the population size at the point of maximum growth rate by harvesting the individuals that would normally be added to the population, allowing the population to continue to be productive indefinitely. Under the assumption of logistic growth, resource limitation does not constrain individuals' reproductive rates when populations are small, but because there are few individuals, the overall yield is small. At intermediate population densities, also represented by half the carrying capacity, individuals are able to breed to their maximum rate. At this point, called the maximum sustainable yield, there is a surplus of i

In [40]:
df_texts.to_csv(f"output_gpt3.5_generate/{now_date}.csv")

In [20]:
pd.DataFrame(texts)["A"].isnull().sum()

69

In [41]:
df_science = get_df(files[0])
df_science[df_science["id"] == '29637793']

Unnamed: 0,id,title,text,categories


In [42]:
df_science

Unnamed: 0,id,title,text,categories
368626,1908395,Artificial brain,title: Artificial brain\nAn artificial brain (...,"[Computational neuroscience, Robotics, Emergin..."
34268,10410698,Abdul Amir al-Jamri,title: Abdul Amir al-Jamri\nSheikh Abdul Amir ...,"[1938 births, 2006 deaths, Deaths from kidney ..."
233077,1958222,Amyloid beta,title: Amyloid beta\nAmyloid beta (Aβ or Abeta...,"[Peptides, Molecular neuroscience, Alzheimer's..."
9870,3621668,A Woman of the Iron People,title: A Woman of the Iron People\nA Woman of ...,"[1991 American novels, 1991 science fiction no..."
139719,38366604,Albert Spaier,title: Albert Spaier\nAlbert Spaier (9 July 18...,"[1883 births, 1934 deaths, Writers from Iași, ..."
...,...,...,...,...
62685,4474244,Actuarial reserves,"title: Actuarial reserves\nIn insurance, an ac...","[Actuarial science, Capital requirement de:Dec..."
357456,4260564,Arrival II,title: Arrival II\nArrival ll (alternatively t...,"[1998 films, 1998 science fiction films, Ameri..."
391514,32894329,Astrobiophysics,title: Astrobiophysics\nAstrobiophysics is a f...,"[Astrophysics, Biophysics .]"
10000,2824171,A World of Difference (novel),title: A World of Difference (novel)\nA World ...,"[1990 American novels, Novels set during the C..."
