In [1]:
import palimpzest as pz
import pandas as pd
import time
import os
import IPython


class ScientificPaper(pz.PDFFile):
    """Represents a scientific research paper, which in practice is usually from a PDF file"""

    # NOTE(review): the docstring and every `desc` string are kept verbatim;
    # they presumably feed the extraction prompts - confirm before rewording.

    # Human-readable title of the paper (mandatory).
    paper_title = pz.Field(
        desc="The title of the paper. This is a natural language title, not a number or letter.",
        required=True,
    )

    # First author only, not the full author list (mandatory).
    author = pz.Field(
        desc="The name of the first author of the paper",
        required=True,
    )

    # Summary of contributions and findings (optional).
    abstract = pz.Field(
        desc="A short description of the paper contributions and findings",
        required=False,
    )

class Reference(pz.Schema):
    """ Represents a reference to another paper, which is cited in a scientific paper"""

    # NOTE(review): docstring and `desc` strings are kept verbatim; they
    # presumably feed the extraction prompts - confirm before rewording.
    # All four fields are mandatory.

    # Position of the entry in the citing paper's reference list.
    index = pz.Field(
        desc="The index of the reference in the paper",
        required=True,
    )

    # Title of the cited work.
    title = pz.Field(
        desc="The title of the paper being cited",
        required=True,
    )

    # Author of the cited work.
    first_author = pz.Field(
        desc="The author of the paper being cited",
        required=True,
    )

    # Publication year of the cited work.
    year = pz.Field(
        desc="The year in which the cited paper was published",
        required=True,
    )


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Build the filter stage from a freshly created source dataset on every run.
# The earlier form only created the dataset when no `dataset` variable existed
# and then rebound `dataset` to its own filtered result, so each re-run of the
# cell stacked one more filter on whatever pipeline a previous execution had
# left in the kernel.
print("Setting dataset")
papers = pz.Dataset("bdf-demo", schema=ScientificPaper)
condition = "The paper is published after 2021"

# `dataset` stays the name that the following cells consume.
dataset = papers.filter(condition)
print(dataset)

Setting dataset
Dataset(schema=ScientificPaper(abstract, author, contents, filename, paper_title, text_contents), desc=Apply filter(s), filter=Filter(The paper is published after 2021), udf=None, aggFunc=None, limit=None, uid=d1855ec127)


In [4]:
# Name under which the generated schema class is created and later published.
class_name = "AuthorAffiliationSchema"

# Namespace for the generated class. It starts with only the docstring, which
# is also passed as the conversion description further down the notebook.
attributes = dict(
    __doc__=" Schema to extract authors and their affiliations from scientific papers.",
)

def custom_repr(self):
    """Return a multi-line description of a schema instance.

    The first line is ``<class name>: <docstring>``. Every following line
    describes one ``pz.Field`` stored directly in the instance's class
    ``__dict__``; fields inherited from base classes are not listed.

    The class name is taken from the instance itself rather than from the
    module-level ``class_name`` variable, so the text stays correct even if
    ``class_name`` is later rebound to build another schema.

    Returns:
        str: the description lines joined with newlines.
    """
    cls = type(self)
    lines = [f"{cls.__name__}: {self.__doc__}"]
    for name, field in vars(cls).items():
        # Skip methods, dunder entries and any other non-field attribute.
        if isinstance(field, pz.Field):
            lines.append(f"{name}: description='{field.desc}', required={field.required}")
    return "\n".join(lines)

# Install the detailed repr on the schema that is about to be generated.
attributes["__repr__"] = custom_repr

# One (field name, description, required) triple per attribute to extract.
field_specs = [
    ('author_name', 'Name of the author', True),
    ('affiliation', 'Affiliation of the author', True),
]
for name, desc, required in field_specs:
    attributes[name] = pz.Field(desc=desc, required=required)

# Build the pz.Schema subclass at runtime, then publish it in the notebook
# namespace under its own name so later cells can reference it directly.
new_class = type(class_name, (pz.Schema,), attributes)
globals()[class_name] = new_class

new_class

__main__.AuthorAffiliationSchema

In [5]:
# Use the raw paper collection when no earlier cell has built a pipeline yet.
if "dataset" not in locals():
    dataset = pz.Dataset("bdf-demo", schema=ScientificPaper)

# Target schema of the extraction step.
convert_schema = AuthorAffiliationSchema

# Variable name is misspelled (sic); kept unchanged so the notebook namespace
# stays the same.
caridinality_str = "one_to_many"
if caridinality_str == "one_to_many":
    cardinality = pz.Cardinality.ONE_TO_MANY
else:
    cardinality = pz.Cardinality.ONE_TO_ONE

# Append the conversion stage; the schema docstring doubles as its description.
dataset = dataset.convert(
    convert_schema,
    desc=AuthorAffiliationSchema.__doc__,
    cardinality=cardinality,
)

dataset
# Choose the pipeline to execute.
if "dataset" not in locals():
    # Fall back to the raw paper collection. The previous fallback bound the
    # plain string "bdf-demo", which has no `.schema` attribute and would
    # therefore fail in the result loop below.
    output = pz.Dataset("bdf-demo", schema=ScientificPaper)
else:
    output = dataset

policy_method = "min_cost"

# optimization block
engine = pz.StreamingSequentialExecution
if policy_method == "min_cost":
    policy = pz.MinCost()
elif policy_method == "max_quality":
    policy = pz.MaxQuality()
else:
    # Fail loudly instead of leaving `policy` unbound, or silently reusing a
    # value left over from an earlier run of this cell.
    raise ValueError(f"Unknown policy_method: {policy_method!r}")

iterable = pz.Execute(
    output,
    policy=policy,
    nocache=True,
    allow_code_synth=False,
    allow_token_reduction=False,
    execution_engine=engine,
)

results = []     # one plain dict per extracted record
statistics = []  # one stats object per item yielded by the executor

# The field names are read from `output.schema`, which is not modified inside
# the loop, so look them up once.
field_names = output.schema.fieldNames()

for extraction, plan, stats in iterable:
    statistics.append(stats)

    for ex in extraction:
        # Flatten the record into a dict keyed by schema field name.
        ex_obj = {name: getattr(ex, name) for name in field_names}
        print(ex_obj)
        results.append(ex_obj)

results_df = pd.DataFrame(results)
print(results_df)
# context: frozendict.frozendict({'type': 'tool', 'name': 'filter_data'})
# Unable to parse result.

Time for planning:  0.04557490348815918


 		You are using the client GPT3, which will be removed in DSPy 2.6.
 		Changing the client is straightforward and will let you use new features (Adapters) that improve the consistency of LM outputs, especially when using chat LMs. 

 		Learn more about the changes and how to migrate at
 		https://github.com/stanfordnlp/dspy/blob/main/examples/migration.ipynb


{'affiliation': 'CEINGE Biotecnologie Avanzate s.c.ar.l., Naples, Italy', 'author_name': 'Ferdinando Bon ﬁglio'}
{'affiliation': 'Department of Chemical, Materials and Production Engineering, University of Naples Federico II, Naples, Italy', 'author_name': 'Ferdinando Bon ﬁglio'}
{'affiliation': 'Department of Molecular Medicine and Medical Biotechnologies, University of Naples Federico II, Naples, Italy', 'author_name': 'Ferdinando Bon ﬁglio'}
{'affiliation': 'European School of Medical Medicine, University of Milan, Milan, Italy', 'author_name': 'Ferdinando Bon ﬁglio'}
{'affiliation': 'Laboratory of Molecular Biology, IRCCS Istituto Giannina Gaslini, Genova, Italy', 'author_name': 'Ferdinando Bon ﬁglio'}
{'affiliation': 'Department of Medical Biotechnologies, University of Siena, Siena, Italy', 'author_name': 'Ferdinando Bon ﬁglio'}
{'affiliation': 'Hopp-Children ’s Cancer Center at the NCT Heidelberg (KiTZ), Heidelberg, Germany', 'author_name': 'Ferdinando Bon ﬁglio'}
{'affiliation'

In [None]:
# Build this pipeline from a fresh source dataset instead of reusing whatever
# `dataset` earlier cells left in the kernel. At this point that variable
# already holds a filtered AND converted pipeline, and running this cell on top
# of it was recorded as failing with:
#     KeyError 'PDFFile.b434331434.contents'
# NOTE(review): the stale, already-converted `dataset` is the presumed cause of
# that error - confirm by re-running on a fresh kernel.
papers = pz.Dataset("bdf-demo", schema=ScientificPaper)

condition = "The paper is published from 2019 onwards"
papers_filtered = papers.filter(condition)

# Target schema of the extraction step.
convert_schema = AuthorAffiliationSchema

# Variable name is misspelled (sic); kept unchanged so the notebook namespace
# stays the same.
caridinality_str = "one_to_many"
cardinality = pz.Cardinality.ONE_TO_MANY if caridinality_str == "one_to_many" else pz.Cardinality.ONE_TO_ONE

# `dataset` stays the name that the execution cell below consumes.
dataset = papers_filtered.convert(
    convert_schema,
    desc=AuthorAffiliationSchema.__doc__,
    cardinality=cardinality,
)
dataset

In [None]:

# Choose the pipeline to execute.
if "dataset" not in locals():
    # Fall back to the raw paper collection. The previous fallback bound the
    # plain string "bdf-demo", which has no `.schema` attribute and would
    # therefore fail in the result loop below.
    output = pz.Dataset("bdf-demo", schema=ScientificPaper)
else:
    output = dataset

policy_method = "min_cost"

# optimization block
engine = pz.StreamingSequentialExecution
if policy_method == "min_cost":
    policy = pz.MinCost()
elif policy_method == "max_quality":
    policy = pz.MaxQuality()
else:
    # Fail loudly instead of leaving `policy` unbound, or silently reusing a
    # value left over from an earlier run of this cell.
    raise ValueError(f"Unknown policy_method: {policy_method!r}")

iterable = pz.Execute(
    output,
    policy=policy,
    nocache=True,
    allow_code_synth=False,
    allow_token_reduction=False,
    execution_engine=engine,
)

results = []     # one plain dict per extracted record
statistics = []  # one stats object per item yielded by the executor

# The field names are read from `output.schema`, which is not modified inside
# the loop, so look them up once.
field_names = output.schema.fieldNames()

for extraction, plan, stats in iterable:
    statistics.append(stats)

    for ex in extraction:
        # Flatten the record into a dict keyed by schema field name.
        ex_obj = {name: getattr(ex, name) for name in field_names}
        print(ex_obj)
        results.append(ex_obj)

results_df = pd.DataFrame(results)
print(results_df)
