First, we need to load some stuff, and create a so called Model Schema, that will allow the model to return structured output that can then easily be converted into a data frame.

In [5]:
from openai import OpenAI
import time
import pandas as pd
from contextlib import redirect_stdout
import os
from pydantic import BaseModel
from typing import List, Optional
import random
import json
import csv
from glob import glob

oai_keyfile = "oai_key.py"

openai_key = {}
with open(oai_keyfile) as f:
    exec(f.read(), openai_key)


client = OpenAI(api_key=openai_key['openai_key'])

class TestStatistic(BaseModel):
    statistic_type: str  # e.g., 't', 'F', 'z', etc.
    value: float
    degrees_of_freedom: Optional[str] # e.g., 'df=20'
    p_value_exact: Optional[bool] # should be true if p-value is reported with = sign or false if < or > 
    p_value: Optional[float]
    confidence_interval: Optional[str] # e.g., '95% CI [1.2, 3.4]'
    effect_measure: Optional[str] # e.g., 'r', 'OR', 'RR', d, 
    effect_size: Optional[str] # e.g., 'd=0.5'
    hypothesis_confirmed: Optional[bool] # should be true if p-value is less than alpha level and in line with hypothesized direction
    

    class Config:
        extra = "forbid"

class Hypothesis(BaseModel):
    hypothesis_text: str
    test_statistics: List[TestStatistic]

    class Config:
        extra = "forbid"

class Study(BaseModel):
    study_title: str
    hypotheses: List[Hypothesis]

    class Config:
        extra = "forbid"

class ResearchPaper(BaseModel):
    paper_title: str
    file_name: str
    studies: List[Study]

    class Config:
        extra = "forbid"

# print(ResearchPaper.model_json_schema())

Now we create an assistant, which is basically a persistant GPT instance (like a digital RA) that we pass our papers to and ask it to complete the task. Unlike an actual RA, it will always forget what it did in between papers.

In [2]:
assistant = client.beta.assistants.create(
    name="Research Paper Extractor",
    instructions="You are a skilled researcher tasked with accurately and completely extracting hypotheses and test results from a research paper and providing the output in structured JSON format.",
    model="gpt-4o",  # Ensure this model supports structured outputs
    temperature=0.1,
    tools=[{"type": "file_search"}],
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "ResearchPaper",
            "strict": True,
            "schema": ResearchPaper.model_json_schema()
        }
    }
)


# check if token usage file exists
if not os.path.exists("token_usage.txt"):
  f = open("token_usage.txt", "w")
  f.write("0\n")
  f.close()

Now we point to some directory (yes I know this is not how I should be doing folder management, I'll change that at some point), and take some pdfs in there for testing this approach. Usually we would want to use all that we are interested in, now we just sample a few.

In [3]:
filepath = "/home/julian/projects/auto_rep_dgps/downloads/Psychological_Science/"

# Get a list of all PDF files in the directory
pdf_files = [f for f in os.listdir(filepath) if f.endswith('.pdf')]

# Set a seed for reproducibility
random.seed(42)

# Select 10 random PDF files
selected_pdfs = random.sample(pdf_files, 10)

print("Selected PDF files:", selected_pdfs)

Selected PDF files: ['10117709567976241235932.pdf', '10117709567976231222836.pdf', '10117709567976241246561.pdf', '10117709567976241242105.pdf', '10117709567976241239932.pdf', '10117709567976241254312.pdf', '10117709567976231215298.pdf', '10117709567976231221789.pdf', '10117709567976241243370.pdf', '10117709567976231221990.pdf']


Give it the paper by uploading it to the assistant

In [4]:
def create_thread(file_path):
    # Upload the file
    with open(file_path, "rb") as f:
        message_file = client.files.create(file=f, purpose="assistants")
    
    # Create a thread and attach the file to the message
    thread = client.beta.threads.create(
        messages=[
            {
                "role": "user",
                "content": "Here is a research paper. You will be asked to extract multiple things from this paper. Do each task exactly as posed and very carefully. The correct completion of these tasks is VERY IMPORTANT. MAKE NO MISTAKES.",
                "attachments": [
                    {"file_id": message_file.id, "tools": [{"type": "file_search"}]}
                ],
            }
        ]
    )
    return thread


Give it the task definition and ask it to complete it.

In [5]:

def create_message(file_name, thread):
    task_message = {
        "role": "user",
        "content": f"""
        You just received a research paper. The file name of the paper is {file_name} Please complete the following tasks:

        **Main Objective:**
        Your main objective is to extract all relevant test statistics of focal hypothesis tests from the research paper.

        *Definition of 'relevant test statistics':*
        - A test statistic is relevant if it is used to test a main hypothesis of the research paper.
        - Relevant test statistics are typically reported in the results section of the paper.
        - Relevant test statistics are typically reported as t, F, z, b, gamma, beta, r, R², etc., potentially including degrees of freedom, and most importantly p-values and sometimes confidence intervals (CIs). Also they may potentially include effect sizes (e.g., Cohen's d).

        *Definition of 'focal hypothesis tests':*
        - The paper might have multiple main hypotheses and/or multiple studies.
        - A hypothesis test is focal if it is a main hypothesis of the paper, typically explicitly or implicitly stated in the introduction or methods section.
        - Ensure completeness but remain concise—do not include irrelevant test statistics or fabricate hypotheses.
        - A hypothesis can be tested in just one out of all studies, or in multiple studies. If a hypothesis is tested repeatedly, make sure you do not miss this in the extraction.
        - Ideally, but by far not always, papers distinguish between pre-registration and exploratory analyses. If so, focus on pre-registered hypotheses only. If no such distinction is made, focus on what you think are the tests of predicted hypotheses.

        **Tasks:**
        1. Identify the number of studies reported in the paper.
        2. Identify the main hypothesis tests reported in the paper.
        3. Extract the test statistics for each hypothesis test. Also determine if the hypothesis was confirmed or not based on the p-value and the hypothesized direction.

        ** Word of Caution:**
        - Be extremely careful with the extraction of test statistics. The correct extraction of test statistics is crucial for the interpretation of the results.
        - Sometimes identifying what is important means drawing from a large and extensive context. Make sure to read the paper carefully and fully understand it before starting on the task.

        ** Examples of a Test statistic Output:**
        - Example 1:
        * In the paper:*
            "The results showed a significant effect, F(1,20) = 3.45, p < 0.05, d = .20."
        * Expected output:*
            {{
                "statistic_type": "F",
                "value": 3.45,
                "degrees_of_freedom": "1,20",
                "p_value_exact": false,
                "p_value": 0.05,
                "confidence_interval": null,
                "effect_measure": d,
                "effect_size": "0.20"
                "hypothesis_confirmed": true
            }}
        - Example 2:
            * In the paper:*
                "The correlation was significant, b = 0.35, p = 0.01."
            * Expected output:*
                {{
                    "statistic_type": "b",
                    "value": 0.35,
                    "degrees_of_freedom": null,
                    "p_value_exact": true,
                    "p_value": 0.01,
                    "confidence_interval": null,
                    "effect_measure": null,
                    "effect_size": null
                    "hypothesis_confirmed": true
                }}

        Please provide the extracted information in the specified JSON format.
        """
    }

    # Add the task message to the thread
    client.beta.threads.messages.create(
        thread_id=thread.id,
        role=task_message["role"],
        content=task_message["content"]
    )




run the above in a loop for all the papers - done!

In [6]:

# run method in loop

log_file_path = "log.txt"

t_total = time.time()

failed_hyp_path = "failed_extractions.csv"

json_files_dir = "./json_files"

# create dir if it does not exist
if not os.path.exists(json_files_dir):
    os.makedirs(json_files_dir)

with open(log_file_path, "a") as log_file:
    with redirect_stdout(log_file):
        print(f"\n******************************Starting processing of {len(selected_pdfs)} documents at {t_total}******************************\n")
        for i, pdf in enumerate(selected_pdfs):
            t0 = time.time()
            print(f"\n******************************Starting processing of document {pdf} at {t0}******************************\n")
            thread = create_thread(filepath + pdf)
            print(f"Thread created for document {pdf}")
            create_message(pdf, thread)
            run = client.beta.threads.runs.create_and_poll(
                thread_id=thread.id, assistant_id=assistant.id, model="gpt-4o"
            )
            print(f"Run completed for document {pdf}")
            print(run)
            messages = list(client.beta.threads.messages.list(thread_id=thread.id, run_id=run.id))
            message_content = messages[0].content[0].text
            extracted_data = json.loads(message_content.value)
            # save to json file
            json_file_path = os.path.join(json_files_dir, pdf.replace(".pdf", ".json"))
            with open(json_file_path, "w") as f:
                json.dump(extracted_data, f, indent=2)
            t1 = time.time()
            print(f"\n******************************Finished processing of document {pdf} at {t1}******************************\n")
            print(f"Time taken to process document {pdf}: {t1 - t0} seconds\n")
        t_final = time.time()
        print(f"\n******************************Finished processing of {len(selected_pdfs)} documents at {t_final}******************************\n")
        print(f"Total time taken to process {len(selected_pdfs)} documents: {t_final - t_total} seconds\n")
        

        


oh yes, and convert to csv.

In [7]:
# Set the folder containing the JSON files and the output CSV file path
input_folder = "./json_files_4o"
output_csv = '/combined_data.csv'

rows = []
for filepath in glob(os.path.join(input_folder, '*.json')):
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    paper_title = data.get('paper_title', '')
    file_name   = data.get('file_name', '')
    
    for study in data.get('studies', []):
        study_title = study.get('study_title', '')
        for hypothesis in study.get('hypotheses', []):
            hypothesis_text = hypothesis.get('hypothesis_text', '')
            for stat in hypothesis.get('test_statistics', []):
                row = {
                    'paper_title': paper_title,
                    'file_name': file_name,
                    'study_title': study_title,
                    'hypothesis_text': hypothesis_text,
                    'statistic_type': stat.get('statistic_type', ''),
                    'value': stat.get('value', ''),
                    'degrees_of_freedom': stat.get('degrees_of_freedom', ''),
                    'p_value_exact': stat.get('p_value_exact', ''),
                    'p_value': stat.get('p_value', ''),
                    'confidence_interval': stat.get('confidence_interval', ''),
                    'effect_measure': stat.get('effect_measure', ''),
                    'effect_size': stat.get('effect_size', ''),
                    'hypothesis_confirmed': stat.get('hypothesis_confirmed', '')
                }
                rows.append(row)

fieldnames = [
    'paper_title', 'file_name', 'study_title', 'hypothesis_text',
    'statistic_type', 'value', 'degrees_of_freedom', 'p_value_exact',
    'p_value', 'confidence_interval', 'effect_measure', 'effect_size',
    'hypothesis_confirmed'
]

with open(input_folder+output_csv, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)
