In [1]:
import palimpzest as pz
import pandas as pd
import time
import os
import IPython

formatter = IPython.get_ipython().display_formatter.formatters['text/plain']
formatter.max_seq_length = 0

# set OPENAI_API_KEY environment variable based on OPENAI_API_KEY
# os.environ["OPENAI_API_KEY"] = "{{ OPENAI_API_KEY }}" 

sci_paper_cols = [
    {"name": "title", "type": str, "desc": "The title of the paper. This is a natural language title, not a number or letter."},
    {"name": "author", "type": str, "desc": "The name of the first author of the paper"},
    {"name": "abstract", "type": str, "desc": "A short description of the paper contributions and findings"},
]

reference_cols = [
    {"name": "index", "type": int, "desc": "The index of the reference in the paper"},
    {"name": "title", "type": str, "desc": "The title of the paper being cited"},
    {"name": "first_author", "type": str, "desc": "The author of the paper being cited"},
    {"name": "year", "type": int, "desc": "The year in which the cited paper was published"},
]

print("Setup complete")

Setup complete


What are the datasets available in the system?

In [2]:
import pandas as pd
from prettytable import PrettyTable

dataset_path = "/home/gerardo/bdf-pz/testdata"
ds = os.listdir(dataset_path)

# construct table for printing
records = [["Name", "Path", "N. Files"]]
for path in ds:
    abspath = os.path.join(dataset_path,path)
    records.append([path,abspath,len(os.listdir(abspath))])

# print table of registered datasets
t = PrettyTable(records[0])
t.add_rows(records[1:])
t

Name,Path,N. Files
bdf-usecase3-tiny,/home/gerardo/bdf-pz/testdata/bdf-usecase3-tiny,4
sigmod-demo,/home/gerardo/bdf-pz/testdata/sigmod-demo,11
bdf-demo,/home/gerardo/bdf-pz/testdata/bdf-demo,11


Show me the files available in bdf-demo

In [3]:
import os
files = os.listdir("/home/gerardo/bdf-pz/testdata/bdf-demo")

files

['bolderson2010.pdf',
 'bonfiglio2023.pdf',
 'marques2021.pdf',
 'brunyanszki2014.pdf',
 'liu2024.pdf',
 'li2021.pdf',
 'zethoven2022.pdf',
 'vasaikar2019.pdf',
 'ceccaldi2015.pdf',
 'munkhbaatar2020.pdf',
 'gupte2017.pdf']

Create a schema to extract authors.

In [4]:
# Define a class name
class_name = "Author"

# Create the class dynamically
attributes = {"__doc__": " Schema for extracting author information from documents."}

schema_dicts = []
for name, desc in zip(['name', 'affiliation', 'email'], ['The full name of the author.', 'The affiliation of the author.', 'The email address of the author.']):
    schema_dicts.append({"name":name, "type":str, "desc":desc})

schema_dicts

[{'name': 'name', 'type': str, 'desc': 'The full name of the author.'},
 {'name': 'affiliation',
  'type': str,
  'desc': 'The affiliation of the author.'},
 {'name': 'email', 'type': str, 'desc': 'The email address of the author.'}]

Consider the bdf-demo dataset. This is a dataset of scientific papers. Extract all authors of papers about brain cancer.

In [5]:
import palimpzest as pz
# try:
    # schema = existing_schemas["Author"]
# except KeyError:
    # raise ValueError(f"Schema 'Author' not found in existing schemas!")
dataset = pz.Dataset("testdata/bdf-demo")


In [6]:
condition = "The paper is about brain cancer."
dataset = dataset.sem_filter(condition)

In [7]:
convert_schema = schema_dicts
cardinality_str = "one_to_many"
cardinality = pz.Cardinality.ONE_TO_MANY if cardinality_str == "one_to_many" else pz.Cardinality.ONE_TO_ONE
dataset = dataset.sem_add_columns(convert_schema, cardinality=cardinality)

In [8]:
if "dataset" not in locals():
    output = "bdf-demo"
else:
    output = dataset

policy_method = "min_cost"

# optimization block
if policy_method == "min_cost":
    policy = pz.MinCost()
elif policy_method == "max_quality":
    policy = pz.MaxQuality()

config = pz.QueryProcessorConfig(
    policy=policy,
    nocache=True,
    verbose=False,
    processing_strategy="streaming",
    execution_strategy="sequential",
    optimizer_strategy="pareto",
    allow_token_reduction=False,
    allow_code_synth=False,
)

iterable = output.run(config)

In [9]:
statistics = []
results = []
for data_record_collection in iterable:
    records = data_record_collection.data_records
    stats = data_record_collection.plan_stats
    record_time = time.time()
    statistics.append(stats)
    results.extend([r.to_dict() for r in records])

results_df = pd.DataFrame(results)

Time for planning:  0.00623321533203125
Generated plan:
  0. MarshalAndScanDataOp -> PDFFile 

 1. PDFFile -> LLMFilter -> PDFFile
    (contents, filename, text_conte) -> (contents, filename, text_conte)
    Model: Model.GPT_4o_MINI
    Filter: The paper is about brain cancer.

 2. PDFFile -> LLMConvertBonded -> Schema[['affiliation', 'contents', 'email', 'filename', 'name', 'text_contents']]
    (contents, filename, text_conte) -> (affiliation, contents, email, )
    Model: Model.GPT_4o_MINI
    Prompt Strategy: PromptStrategy.COT_QA




In [10]:
results_df

Unnamed: 0,contents,filename,text_contents,affiliation,email,name
0,b'%PDF-1.3\n%\xc7\xec\x8f\xa2\n5 0 obj\n<</Len...,marques2021.pdf,*For correspondence:\nCorrespondence: maria.ca...,"Seve Ballesteros Foundation Brain Tumor Group,...",maria.carro@uniklinik-freiburg.de,Carolina Marques


In [15]:
print(stats.total_plan_time)

56.8012158870697
