In [1]:
import openai
import pandas as pd
import tqdm
import glob

import warnings
warnings.filterwarnings("ignore")

In [2]:
with open("../apikey/apikey.txt", "r") as f:
    openai.api_key = f.readline().replace("\n", "")

In [3]:
def query_prompt(prompt, max_tokens=4000):
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-16k",
        messages=[
            {"role": "system", "content": "You are a professor at a science university and creating a exam for your students."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens,
        temperature=0,
    )
    return response["choices"][0]["message"]["content"]

In [4]:
def get_df(fname):
    def f(categories):
        for cat in categories:
            for word in [
                "geology",
                "physics",
                "chemistry",
                "mathematical",
                "biology",
                "astronomy",
                "ecology",
                "genetics",
                "statistics",
                "theoretical"
            ]:
                if word.lower() in cat.lower():
                    return True
            
        return False
    
    def text_preprocess(text):
        return text.replace("===", "\n").replace("==", "\n")

    df = pd.read_parquet(fname)
    df_science = df[df["categories"].apply(f)]
    df_science["text"] = "title: " + df_science["title"] + "\n" + df_science["text"].apply(text_preprocess)
    return df_science.sample(len(df_science)//10)

In [5]:
files = glob.glob("../data/wikipedia/a.parquet")

In [6]:
import time

In [7]:
import pickle

In [8]:
import json

In [9]:
from datetime import datetime as dt
import os

In [10]:
texts = []

In [11]:
import traceback 
batch_size = 1

def make_prompt(series):
    prompt = f"""
You are an expert AI assistant who specializes in answering multiple-choice questions. You may use the context below if it helps you to answer the following multiple-choice question.
The output should be an array of json format, with "prompt" as the question statement, "A," "B," "C," "D," and "E" as choices, "answer" as the answer choice (one of A through E).

Context:
{series['text']}

"""
    return prompt

def f(series):
    if series["A"] != series["A"]:
        if type(series["choices"]) == dict:
            for key in ["A", "B", "C", "D", "E"]:
                series[key] = series["choices"][key]
        elif type(series["choices"] == list):
            for i, key in enumerate(["A", "B", "C", "D", "E"]):
                series[key] = series["choices"][i]
    return series

now_date = dt.now().strftime("%Y%m%d%H%M%S")

first = True
for file in files:
    if os.path.basename(file) in ["all.parquet"]:
        print(f"pass: {file}")
        continue
    df_science = get_df(file)
    
    for i in tqdm.tqdm(range(len(df_science)), desc=file):
        try:
            series = df_science.iloc[i]
            prompt = make_prompt(series)
            text = query_prompt(prompt)
            texts_json = json.loads(text)
            if first:
                print(texts_json)
                first = False
            if type(texts_json) == dict:
                text_json["wiki_id"] = series["id"]
                text_json["original_text"] = series["text"]
                texts.append(text_json)
            else:
                for text_json in texts_json:
                    text_json["wiki_id"] = series["id"]
                    text_json["original_text"] = series["text"]
                    texts.append(text_json)
        except Exception as e:
            print(e)
            traceback.print_exc()
            print(text)
        if i % 20 == 0:
            df_texts = pd.DataFrame(texts)
            df_texts = df_texts.apply(f, axis=1)

            df_texts.to_csv(f"output_gpt3.5_generate/{now_date}.csv")

../data/wikipedia/a.parquet:   0%|▍                                                                                                                     | 1/252 [00:03<14:22,  3.44s/it]

[{'prompt': 'Who is the theoretical physicist that expounded the thought experiment of Astrochicken?', 'A': 'John von Neumann', 'B': 'Rodney Brooks', 'C': 'Michio Kaku', 'D': 'Freeman Dyson', 'E': 'None of the above', 'answer': 'D'}]


Traceback (most recent call last):
  File "/tmp/ipykernel_54387/324541624.py", line 39, in <module>
    texts_json = json.loads(text)
  File "/opt/conda/lib/python3.10/json/__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "/opt/conda/lib/python3.10/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/opt/conda/lib/python3.10/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
../data/wikipedia/a.parquet:   1%|▉                                                                                                                     | 2/252 [00:06<13:24,  3.22s/it]

Expecting value: line 1 column 1 (char 0)
Which player holds the record for the most appearances for the Albania national football team? 

A) Erjon Bogdani
B) Lorik Cana
C) Etrit Berisha
D) Ansi Agolli
E) Altin Lala


../data/wikipedia/a.parquet:   1%|█▍                                                                                                                    | 3/252 [00:10<15:06,  3.64s/it]Traceback (most recent call last):
  File "/tmp/ipykernel_54387/324541624.py", line 39, in <module>
    texts_json = json.loads(text)
  File "/opt/conda/lib/python3.10/json/__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "/opt/conda/lib/python3.10/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/opt/conda/lib/python3.10/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
../data/wikipedia/a.parquet:   2%|█▊                                                                                                                    | 4/252 [00:13<13:12,  3.19s/it]

Expecting value: line 1 column 1 (char 0)
Which of the following is NOT a feature of an armillary sphere?

A) Representation of lines of celestial longitude and latitude
B) Mapping of constellations
C) Representation of the ecliptic
D) Representation of the equinoctial and solstitial colures
E) Measurement of the distance between celestial objects


../data/wikipedia/a.parquet:  31%|███████████████████████████████████▊                                                                                 | 77/252 [05:10<10:22,  3.56s/it]Traceback (most recent call last):
  File "/tmp/ipykernel_54387/324541624.py", line 39, in <module>
    texts_json = json.loads(text)
  File "/opt/conda/lib/python3.10/json/__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "/opt/conda/lib/python3.10/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/opt/conda/lib/python3.10/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
../data/wikipedia/a.parquet:  31%|████████████████████████████████████▏                                                                                | 78/252 [05:13<10:19,  3.56s/it]

Expecting value: line 1 column 1 (char 0)
Which of the following statements is true about abyssal channels?

A) They are formed by slow-flowing floods of clear water.
B) They are responsible for the accumulation of most sandstone deposits found on continental slopes.
C) They are the least understood sedimentary processes.
D) They do not have any significant impact on the transfer of carbon from the continental shelf to the deeper parts of the continental margins.
E) They are commonly referred to as channel levee systems.

Answer: B) They are responsible for the accumulation of most sandstone deposits found on continental slopes.


Traceback (most recent call last):
  File "/tmp/ipykernel_54387/324541624.py", line 39, in <module>
    texts_json = json.loads(text)
  File "/opt/conda/lib/python3.10/json/__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "/opt/conda/lib/python3.10/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/opt/conda/lib/python3.10/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
../data/wikipedia/a.parquet:  31%|████████████████████████████████████▋                                                                                | 79/252 [05:15<09:02,  3.13s/it]

Expecting value: line 1 column 1 (char 0)
Which of the following is NOT a department within the African Wildlife Defence Force (AWDF)?

A) Rangers
B) Advanced Force Rangers
C) Special Force Rangers
D) Special Operations Affiliate Ranger Group
E) Aviation


../data/wikipedia/a.parquet:  32%|█████████████████████████████████████▏                                                                               | 80/252 [05:18<08:52,  3.10s/it]Traceback (most recent call last):
  File "/tmp/ipykernel_54387/324541624.py", line 38, in <module>
    text = query_prompt(prompt)
  File "/tmp/ipykernel_54387/1456674105.py", line 2, in query_prompt
    response = openai.ChatCompletion.create(
  File "/opt/conda/lib/python3.10/site-packages/openai/api_resources/chat_completion.py", line 25, in create
    return super().create(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/openai/api_resources/abstract/engine_api_resource.py", line 153, in create
    response, _, api_key = requestor.request(
  File "/opt/conda/lib/python3.10/site-packages/openai/api_requestor.py", line 298, in request
    resp, got_stream = self._interpret_response(result, stream)
  File "/opt/conda/lib/python3.10/site-packages/openai/api_requestor.py", line 700, in _in

This model's maximum context length is 16385 tokens. However, you requested 17753 tokens (13753 in the messages, 4000 in the completion). Please reduce the length of the messages or completion.
[
  {
    "prompt": "What is the purpose of astronomical surveys?",
    "A": "To observe specific celestial objects",
    "B": "To catalog celestial objects and perform statistical analyses",
    "C": "To search for transient astronomical events",
    "D": "To detect potentially hazardous objects",
    "E": "All of the above",
    "answer": "E"
  }
]


../data/wikipedia/a.parquet:  38%|████████████████████████████████████████████                                                                         | 95/252 [06:08<10:04,  3.85s/it]Traceback (most recent call last):
  File "/tmp/ipykernel_54387/324541624.py", line 39, in <module>
    texts_json = json.loads(text)
  File "/opt/conda/lib/python3.10/json/__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "/opt/conda/lib/python3.10/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/opt/conda/lib/python3.10/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
../data/wikipedia/a.parquet:  38%|████████████████████████████████████████████▌                                                                        | 96/252 [06:13<10:21,  3.99s/it]

Expecting value: line 1 column 1 (char 0)
Which of the following statements about the Ancestral Thames is true?

A) The river originated from the emergence of Britain from a Cretaceous sea.
B) The river's course was modified by the Anglian glaciation.
C) The river's deposits have been extensively studied in the field of archaeology.
D) The river flowed from the south-east towards what later became southern England.
E) The river's course remained unchanged throughout the Pleistocene period.


../data/wikipedia/a.parquet:  46%|█████████████████████████████████████████████████████▍                                                              | 116/252 [07:32<09:55,  4.38s/it]Traceback (most recent call last):
  File "/tmp/ipykernel_54387/324541624.py", line 39, in <module>
    texts_json = json.loads(text)
  File "/opt/conda/lib/python3.10/json/__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "/opt/conda/lib/python3.10/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/opt/conda/lib/python3.10/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
../data/wikipedia/a.parquet:  46%|█████████████████████████████████████████████████████▊                                                              | 117/252 [07:36<09:45,  4.34s/it]

Expecting value: line 1 column 1 (char 0)
Which of the following statements about the ARGUS distribution is true?

A. The ARGUS distribution is named after a famous physicist.
B. The probability density function of the ARGUS distribution is given by f(x; χ, c).
C. The cumulative distribution function of the ARGUS distribution is given by F(x).
D. The parameter c in the ARGUS distribution is estimated using the maximum likelihood approach.
E. The generalized ARGUS distribution is used to describe a more peaking-like distribution.

Answer: C


../data/wikipedia/a.parquet:  55%|███████████████████████████████████████████████████████████████▉                                                    | 139/252 [09:04<06:13,  3.30s/it]Traceback (most recent call last):
  File "/tmp/ipykernel_54387/324541624.py", line 38, in <module>
    text = query_prompt(prompt)
  File "/tmp/ipykernel_54387/1456674105.py", line 2, in query_prompt
    response = openai.ChatCompletion.create(
  File "/opt/conda/lib/python3.10/site-packages/openai/api_resources/chat_completion.py", line 25, in create
    return super().create(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/openai/api_resources/abstract/engine_api_resource.py", line 153, in create
    response, _, api_key = requestor.request(
  File "/opt/conda/lib/python3.10/site-packages/openai/api_requestor.py", line 298, in request
    resp, got_stream = self._interpret_response(result, stream)
  File "/opt/conda/lib/python3.10/site-packages/openai/api_requestor.py", line 700, in _in

This model's maximum context length is 16385 tokens. However, you requested 16406 tokens (12406 in the messages, 4000 in the completion). Please reduce the length of the messages or completion.
[
  {
    "prompt": "Arnold Kosevich was known for his contributions to which fields?",
    "A": "Quantum mechanics and astrophysics",
    "B": "The electron theory of metals and the theory of crystals",
    "C": "Nuclear physics and particle physics",
    "D": "Thermodynamics and fluid dynamics",
    "E": "Optics and electromagnetism",
    "answer": "B"
  }
]


../data/wikipedia/a.parquet:  64%|██████████████████████████████████████████████████████████████████████████▌                                         | 162/252 [10:29<05:12,  3.48s/it]Traceback (most recent call last):
  File "/tmp/ipykernel_54387/324541624.py", line 39, in <module>
    texts_json = json.loads(text)
  File "/opt/conda/lib/python3.10/json/__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "/opt/conda/lib/python3.10/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/opt/conda/lib/python3.10/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
../data/wikipedia/a.parquet:  65%|███████████████████████████████████████████████████████████████████████████                                         | 163/252 [10:32<05:06,  3.44s/it]

Expecting value: line 1 column 1 (char 0)
Which of the following statements about Arthur Eddington is true?

A. He was a mathematician and philosopher of science.
B. He discovered the theory of general relativity.
C. He conducted an expedition to observe a solar eclipse in 1919.
D. He was the first to correctly speculate on the source of stellar energy.
E. He was born in Weston-super-Mare, England.


../data/wikipedia/a.parquet:  75%|███████████████████████████████████████████████████████████████████████████████████████▍                            | 190/252 [12:16<03:52,  3.75s/it]Traceback (most recent call last):
  File "/tmp/ipykernel_54387/324541624.py", line 39, in <module>
    texts_json = json.loads(text)
  File "/opt/conda/lib/python3.10/json/__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "/opt/conda/lib/python3.10/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/opt/conda/lib/python3.10/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
../data/wikipedia/a.parquet:  76%|███████████████████████████████████████████████████████████████████████████████████████▉                            | 191/252 [12:18<03:28,  3.42s/it]

Expecting value: line 1 column 1 (char 0)
Which airport is the third largest civilian airport by traffic in Pakistan?

A) Jinnah International Airport, Karachi
B) Islamabad International Airport
C) Allama Iqbal International Airport
D) Walton Airport
E) Lahore International Airport

Answer: C) Allama Iqbal International Airport


../data/wikipedia/a.parquet:  78%|██████████████████████████████████████████████████████████████████████████████████████████▋                         | 197/252 [12:40<03:10,  3.47s/it]Traceback (most recent call last):
  File "/tmp/ipykernel_54387/324541624.py", line 39, in <module>
    texts_json = json.loads(text)
  File "/opt/conda/lib/python3.10/json/__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "/opt/conda/lib/python3.10/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/opt/conda/lib/python3.10/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
../data/wikipedia/a.parquet:  79%|███████████████████████████████████████████████████████████████████████████████████████████▏                        | 198/252 [12:43<03:03,  3.40s/it]

Expecting value: line 1 column 1 (char 0)
Which of the following statements is true about the AIDA diabetes simulator?

A) It can be used for individual patient simulation and glycemic prediction.
B) It is only intended for insulin therapy planning.
C) It is primarily used for research purposes.
D) It is not suitable for teaching or self-learning.
E) It is available for download on the AIDA website.


../data/wikipedia/a.parquet:  79%|███████████████████████████████████████████████████████████████████████████████████████████▌                        | 199/252 [12:47<03:04,  3.49s/it]Traceback (most recent call last):
  File "/tmp/ipykernel_54387/324541624.py", line 39, in <module>
    texts_json = json.loads(text)
  File "/opt/conda/lib/python3.10/json/__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "/opt/conda/lib/python3.10/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/opt/conda/lib/python3.10/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
../data/wikipedia/a.parquet:  79%|████████████████████████████████████████████████████████████████████████████████████████████                        | 200/252 [12:49<02:48,  3.24s/it]

Expecting value: line 1 column 1 (char 0)
Which of the following is NOT a type of assay based on the nature of the assay process?

A) End point assay
B) Kinetic assay
C) High throughput assay
D) Multiplex assay
E) Ligand binding assay

Answer: E) Ligand binding assay


../data/wikipedia/a.parquet: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 252/252 [15:40<00:00,  3.73s/it]


In [12]:
 df_texts.to_csv(f"output_gpt3.5_generate/{now_date}.csv")

In [13]:

df_texts["answer"].value_counts()

A                                    113
B                                     44
D                                     34
C                                     31
E                                     20
Aurora Max                             2
寶瓶座 (bǎo píng zuò)                     1
Voronezh University                    1
Stanley Autler and Charles Townes      1
0.4-1.5 million years                  1
Name: answer, dtype: int64