In [4]:
# Load OPENAI_API_KEY from the .env file (via python-dotenv) before the
# OpenAI client is constructed in a later cell.
# The install line is left commented out; uncomment it if python-dotenv is missing.
# !pip -q install python-dotenv
from dotenv import load_dotenv

# Kept as the cell's last expression so its return value is displayed.
load_dotenv()

True

## Step 1: Create a new Assistant with File Search Enabled

In [5]:
from openai import OpenAI

# No API key is passed explicitly here -- presumably it is picked up from the
# environment prepared by the dotenv cell (TODO confirm).
client = OpenAI()

# Settings for the assistant; file_search is the only tool enabled.
assistant_settings = {
    "name": "Paper Data Extractor v1.1",
    "model": "gpt-4-turbo",
    "tools": [{"type": "file_search"}],
}
assistant = client.beta.assistants.create(**assistant_settings)

## Step 2: Read the PDF file paths

In [49]:
# Collect every PDF under "<current dir>/pdfs" (recursively) into file_paths.
import os

# os.walk yields entries in an arbitrary, filesystem-dependent order, and later
# cells pick file_paths[0], so the list is sorted to make that choice reproducible.
# The extension check is case-insensitive so files named "paper.PDF" are found too.
file_paths = sorted(
    os.path.join(root, file)
    for root, _dirs, files in os.walk("pdfs")
    for file in files
    if file.lower().endswith(".pdf")
)

file_paths

['pdfs/Tracking Real Time Layoffs with SEC Filings - A Preliminary Investigation.pdf']

## Step 3: Extract the relevant information from a PDF

In [9]:
# Upload a single PDF (the first path collected) for use with the assistant.
if not file_paths:
    # Fail with a clear message instead of an opaque IndexError on file_paths[0].
    raise FileNotFoundError('No PDF files were found under "pdfs"; nothing to upload.')

# The context manager closes the file handle as soon as the upload call returns;
# a bare open() would leave it open for the lifetime of the kernel.
with open(file_paths[0], "rb") as pdf_file:
    message_file = client.files.create(file=pdf_file, purpose="assistants")

ToolResourcesFileSearch(vector_store_ids=['vs_PpGJ2dbvfZ8Wv809tyi23gh0'])


In [39]:
# Prompt used to extract the structured data from the PDF.
# It lists the fields to pull out of the paper and then gives a JSON template the
# reply must follow; the "//" notes inside the template describe each field's type
# for the model. The template's keys (title_of_paper, research_questions, ...) are
# the keys the parsed response is expected to carry in later cells.
prompt = """Extract the following data from the provided paper: Title, the research questions, the types of data used for the study, the size of the data set, the history of the dataset (i.e. how many years does it cover), the source of the data, the methods used to answer the research questions, the various metrics used for measuring, and the outcomes the authors found. Return the extracted structured data as a JSON object. Only respond with the JSON object, and do not respond with anything else.

Return your response as a structured JSON object using the following format:
''' 
{
  "title_of_paper": "What is the title of the paper?", // string: the title of the paper
  "research_questions": ["What is the research question?"], // array of strings: the research questions. If there are multiple research questions, list them all as separate items in the array
  "data_types": ["What types of data were used?"], // array of strings: the types of data used for the study. If there are multiple types of data, list them all as separate items in the array
  "data_size": "What is the size of the dataset?", // string: the size of the data set, i.e. number of observations, samples, etc.
  "data_history": "How many years does the dataset cover?", // string: the history of the dataset
  "data_sources": ["What are the sources of the data?"], // array of string: the sources of the data. If there are multiple sources, list them all as separate items in the array
  "methods": ["What methods were used to answer the research questions?"], // array of string: the methods used to answer the research questions. If there are multiple methods, list them all as separate items in the array
  "metrics": ["What metrics were used for measuring?"], // array of string: the various metrics used for measuring. If there are multiple metrics, list them all as separate items in the array
  "outcomes": ["What outcomes did the authors find?"] // array of string: the outcomes the authors found. If there are multiple outcomes, list them all as separate items in the array
}
'''

Response:
"""

# Start a thread whose first user message carries the prompt together with the
# uploaded PDF, attached for use by the file_search tool.
pdf_attachment = {"file_id": message_file.id, "tools": [{"type": "file_search"}]}
user_message = {
    "role": "user",
    "content": prompt,
    "attachments": [pdf_attachment],
}
thread = client.beta.threads.create(messages=[user_message])

# Show the file_search tool resources (vector store ids) now recorded on the thread.
print(thread.tool_resources.file_search)

ToolResourcesFileSearch(vector_store_ids=['vs_gvmYTO1VpvkKVMumrei2Gsl4'])


In [40]:
# Create a run and poll its status until it reaches a terminal state
run = client.beta.threads.runs.create_and_poll(
    thread_id=thread.id, assistant_id=assistant.id
)

# Polling also stops for runs that failed, expired, were cancelled or ended
# incomplete. In those cases there is no assistant reply to read, so stop here
# with the reported reason instead of failing later on messages[0].
if run.status != "completed":
    raise RuntimeError(
        f"Run {run.id} ended with status {run.status!r}: {run.last_error}"
    )

messages = list(client.beta.threads.messages.list(thread_id=thread.id, run_id=run.id))
if not messages or not messages[0].content:
    raise RuntimeError(f"Run {run.id} completed but produced no message content.")

message_content = messages[0].content[0].text
# Remove annotations (citation markers) from the response text
for annotation in message_content.annotations:
    message_content.value = message_content.value.replace(annotation.text, "")

print(message_content.value)

```json
{
  "title_of_paper": "Tracking Real Time Layoffs with SEC Filings: A Preliminary Investigation",
  "research_questions": [
    "What are the alternative timely indicators of layoffs based on 8-K filings?"
  ],
  "data_types": [
    "8-K filings data",
    "WARN notices data",
    "firm-level data"
  ],
  "data_size": "285 linked layoffs between WARN notices and 8-K filings",
  "data_history": "Covers multiple years, includes two recessions",
  "data_sources": [
    "8-K filings",
    "WARN notices",
    "Compustat"
  ],
  "methods": [
    "Natural language processing",
    "Sentence embeddings from BERT",
    "Prompting generative large language model (Llama 2)",
    "Quantitative regression analysis"
  ],
  "metrics": [
    "Number of reported layoff events",
    "Number of affected workers"
  ],
  "outcomes": [
    "8-K filings are sometimes available before WARN notices",
    "The 8-K layoff series are highly correlated with the business cycle and other layoff indicators",


## Step 4: Convert GPT's output to JSON

In [41]:
import re
import json

# Remove the starting "```json" and ending "```" fence, plus line breaks, from GPT's response
json_string = (
    message_content.value.replace("```json\n", "").replace("```", "").replace("\n", "")
)
# Remove any leftover 【...】 citation markers from the JSON string.
# Each marker is matched on its own: with the line breaks already removed, a
# greedy "【.*】" would delete everything between the first "【" and the last "】",
# silently dropping real JSON content whenever more than one marker is present.
json_string = re.sub(r"【[^】]*】", "", json_string)
# Parse the cleaned text into a Python object
json_response = json.loads(json_string)

json_response

{'title_of_paper': 'Tracking Real Time Layoffs with SEC Filings: A Preliminary Investigation',
 'research_questions': ['What are the alternative timely indicators of layoffs based on 8-K filings?'],
 'data_types': ['8-K filings data', 'WARN notices data', 'firm-level data'],
 'data_size': '285 linked layoffs between WARN notices and 8-K filings',
 'data_history': 'Covers multiple years, includes two recessions',
 'data_sources': ['8-K filings', 'WARN notices', 'Compustat'],
 'methods': ['Natural language processing',
  'Sentence embeddings from BERT',
  'Prompting generative large language model (Llama 2)',
  'Quantitative regression analysis'],
 'metrics': ['Number of reported layoff events', 'Number of affected workers'],
 'outcomes': ['8-K filings are sometimes available before WARN notices',
  'The 8-K layoff series are highly correlated with the business cycle and other layoff indicators',
  'Preliminary evidence that the 8-K series are useful for forecasting important quantities 

In [44]:
!pip -q install pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [48]:
# Wrap the parsed response in a single-row pandas DataFrame (one column per JSON key)
import pandas as pd

records = [json_response]
json_response_df = pd.DataFrame(records)

json_response_df

Unnamed: 0,title_of_paper,research_questions,data_types,data_size,data_history,data_sources,methods,metrics,outcomes
0,Tracking Real Time Layoffs with SEC Filings: A...,[What are the alternative timely indicators of...,"[8-K filings data, WARN notices data, firm-lev...",285 linked layoffs between WARN notices and 8-...,"Covers multiple years, includes two recessions","[8-K filings, WARN notices, Compustat]","[Natural language processing, Sentence embeddi...","[Number of reported layoff events, Number of a...",[8-K filings are sometimes available before WA...


## Step 5: Store extracted data to Airtable

- Create a new table in Airtable
- Create a new automation using the "When webhook received" trigger and "Create record" action
- Map the JSON fields to the fields in the Airtable table inside the "Create record" tab. For example, map the `title_of_paper` field to the `Title` field in the Airtable table.

Alternative storage options: a pandas DataFrame saved to a local Parquet file, MongoDB, SQL databases (MySQL, PostgreSQL, etc.), DynamoDB, Google Sheets, etc.

In [32]:
!pip -q install requests


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [43]:
import os

import requests

# The webhook URL for the table in Airtable.
# NOTE(review): anyone holding this URL can trigger the automation, so it is
# better kept out of the notebook. AIRTABLE_WEBHOOK_URL (e.g. set in .env) takes
# precedence; the literal below is only a fallback so existing runs keep working.
webhook_url = os.environ.get(
    "AIRTABLE_WEBHOOK_URL",
    "https://hooks.airtable.com/workflows/v1/genericWebhook/apphgC5p4Jgz2VV5w/wflPQAUElCuobq4wO/wtrRmZljovvAMXQYJ",
)

headers = {"Content-Type": "application/json"}

# Make the POST request to insert the data.
# requests applies no timeout by default, so without one a stalled connection
# would block this cell indefinitely.
response = requests.post(webhook_url, json=json_response, headers=headers, timeout=30)

# Check the response
if response.status_code == 200:
    print("Data inserted successfully!")
else:
    print(f"Failed to insert data: {response.text}")

Data inserted successfully!
