In [9]:
import palimpzest as pz
import pandas as pd
import time
import os
import IPython

formatter = IPython.get_ipython().display_formatter.formatters['text/plain']
formatter.max_seq_length = 0

# set OPENAI_API_KEY environment variable based on OPENAI_API_KEY
os.environ["OPENAI_API_KEY"] = "{{ OPENAI_API_KEY }}" 

class ScientificPaper(pz.PDFFile):
   """Represents a scientific research paper, which in practice is usually from a PDF file"""
   paper_title = pz.Field(desc="The title of the paper. This is a natural language title, not a number or letter.", required=True)
   author = pz.Field(desc="The name of the first author of the paper", required=True)
   abstract = pz.Field(desc="A short description of the paper contributions and findings", required=False)

class Reference(pz.Schema):
    """ Represents a reference to another paper, which is cited in a scientific paper"""
    index = pz.Field(desc="The index of the reference in the paper", required=True)
    title = pz.Field(desc="The title of the paper being cited", required=True)
    first_author = pz.Field(desc="The author of the paper being cited", required=True)
    year = pz.Field(desc="The year in which the cited paper was published", required=True)

from palimpzest.corelib.schemas import File, Number, TextFile, RawJSONObject, PDFFile, ImageFile, EquationImage, PlotImage, URL, Download, WebPage, XLSFile, Table

existing_schemas = {
    "File": File,
    "Number": Number,
    "TextFile": TextFile,
    "RawJSONObject": RawJSONObject,
    "PDFFile": PDFFile,
    "ImageFile": ImageFile,
    "EquationImage": EquationImage,
    "PlotImage": PlotImage,
    "URL": URL,
    "Download": Download,
    "WebPage": WebPage,
    "XLSFile": XLSFile,
    "Table": Table,
    "ScientificPaper": ScientificPaper,
    "Reference": Reference
}

print("Setup complete")

Setup complete


What are the datasets available in the system?

In [5]:
import pandas as pd
from prettytable import PrettyTable

ds = pz.DataDirectory().list_registered_datasets()

# construct table for printing
table = [["Name", "Type", "Path"]]
for path, descriptor in ds:
    table.append([path, descriptor[0], descriptor[1]])

# print table of registered datasets
t = PrettyTable(table[0])
t.add_rows(table[1:])
t

Name,Type,Path
real-estate-eval-20,dir,/home/gerardo/palimpzest/testdata/real-estate-eval-20
biofabric-tiny-filtered,dir,/home/gerardo/palimpzest/testdata/biofabric-tiny-filtered
real-estate-eval-tiny,dir,/home/gerardo/palimpzest/testdata/real-estate-eval-tiny
groundtruth,dir,/home/gerardo/palimpzest/testdata/groundtruth
askem,dir,/home/gerardo/palimpzest/testdata/askem
enron-eval,dir,/home/gerardo/palimpzest/testdata/enron-eval
pdfs-tiny,dir,/home/gerardo/palimpzest/testdata/pdfs-tiny
enron-tiny,dir,/home/gerardo/palimpzest/testdata/enron-tiny
enron-small,dir,/home/gerardo/palimpzest/testdata/enron-small
bdf-usecase3-references-pdffull,dir,/home/gerardo/palimpzest/testdata/bdf-usecase3-references-pdffull


Show me the files available in bdf-demo

In [6]:
import os
files = os.listdir("/home/gerardo/bdf-pz/testdata/bdf-demo")

files

['bolderson2010.pdf',
 'bonfiglio2023.pdf',
 'marques2021.pdf',
 'brunyanszki2014.pdf',
 'liu2024.pdf',
 'li2021.pdf',
 'zethoven2022.pdf',
 'vasaikar2019.pdf',
 'ceccaldi2015.pdf',
 'munkhbaatar2020.pdf',
 'gupte2017.pdf']

Create a schema to extract authors.

In [10]:
# Define a class name
class_name = "Author"

# Create the class dynamically
attributes = {"__doc__": " Schema for extracting author information from documents."}

# Custom __repr__ to output detailed information about the class and its fields
def custom_repr(self):
    class_info = [f"{class_name}: {self.__doc__}"]
    for name, field in self.__class__.__dict__.items():
        if isinstance(field, pz.Field):
            class_info.append(f"{name}: description='{field.desc}', required={field.required}")
    return "\n".join(class_info)

# Add the custom __repr__ method to the class attributes
attributes["__repr__"] = custom_repr

for name, desc, required in zip(['name', 'affiliation', 'email'], ['The full name of the author.', 'The affiliation of the author.', 'The email address of the author.'], [True, False, False]):
    attributes[name] = pz.Field(desc=desc, required=required)

# Create the class dynamically using type()
new_class = type(class_name, (pz.Schema,), attributes)

# Assign the dynamically created class to a variable using globals()
globals()[class_name] = new_class
existing_schemas[class_name] = new_class
new_class

__main__.Author

Consider the bdf-demo dataset. This is a dataset of scientific papers. Extract all authors of papers about brain cancer.

In [12]:
try:
    schema = existing_schemas["Author"]
except KeyError:
    raise ValueError(f"Schema 'Author' not found in existing schemas!")
dataset = pz.Dataset(source="bdf-demo", schema=schema)


In [13]:
condition = "The paper is about brain cancer."
dataset = dataset.filter(condition)

In [14]:
convert_schema = Author
cardinality_str = "one_to_many"
cardinality = pz.Cardinality.ONE_TO_MANY if cardinality_str == "one_to_many" else pz.Cardinality.ONE_TO_ONE
dataset = dataset.convert(convert_schema, desc=Author.__doc__, cardinality=cardinality)

In [None]:
if "dataset" not in locals():
    output = "bdf-demo"
else:
    output = dataset

policy_method = "min_cost"

# optimization block
engine = pz.StreamingSequentialExecution
if policy_method == "min_cost":
    policy = pz.MinCost()
elif policy_method == "max_quality":
    policy = pz.MaxQuality()
iterable  =  pz.Execute(output,
                        policy = policy,
                        nocache=True,
                        allow_code_synth=False,
                        allow_token_reduction=False,
                        execution_engine=engine)

results = []
statistics = []

for idx, (extraction, plan, stats) in enumerate(iterable):
    
    record_time = time.time()
    statistics.append(stats)

    for ex in extraction:
        ex_obj = {}
        for name in output.schema.field_names():
            ex_obj[name] = ex.__getattr__(name)
        print(ex_obj)
        results.append(ex_obj)

results_df = pd.DataFrame(results)

KeyError: 'PDFFile.56364ce7be.contents'

In [20]:
results_df

NameError: name 'results_df' is not defined

In [None]:
if "dataset" not in locals():
    print("Setting dataset")
    dataset = pz.Dataset("bdf-demo", schema=ScientificPaper)
condition = "The paper is published from 2019 onwards"

dataset = dataset.filter(condition)

dataset
if "dataset" not in locals():
    dataset = pz.Dataset("bdf-demo", schema=ScientificPaper)
convert_schema = AuthorAffiliationSchema

caridinality_str = "one_to_many"
cardinality = pz.Cardinality.ONE_TO_MANY if caridinality_str == "one_to_many" else pz.Cardinality.ONE_TO_ONE
dataset = dataset.convert(convert_schema, desc=AuthorAffiliationSchema.__doc__, cardinality=cardinality)
dataset
# Error: KeyError 'PDFFile.b434331434.contents'

In [None]:

if "dataset" not in locals():
    output = "bdf-demo"
else:
    output = dataset

policy_method = "min_cost"

# optimization block
engine = pz.StreamingSequentialExecution
if policy_method == "min_cost":
    policy = pz.MinCost()
elif policy_method == "max_quality":
    policy = pz.MaxQuality()
iterable  =  pz.Execute(output,
                        policy = policy,
                        nocache=True,
                        allow_code_synth=False,
                        allow_token_reduction=False,
                        execution_engine=engine)

results = []
statistics = []

for idx, (extraction, plan, stats) in enumerate(iterable):

    record_time = time.time()
    statistics.append(stats)

    for ex in extraction:
        ex_obj = {}
        for name in output.schema.field_names():
            ex_obj[name] = ex.__getattr__(name)
        print(ex_obj)
        results.append(ex_obj)

results_df = pd.DataFrame(results)
print(results_df)
