In [1]:
# Chat Models
from huggingface_hub import hf_hub_download
from langchain_ollama import ChatOllama

# Chat structure
from langchain_core.prompts import PromptTemplate
from langgraph.graph import START, END, StateGraph

# Typing
from typing_extensions import TypedDict
from pydantic import BaseModel, Field

from systematic_review import *

%load_ext autoreload
%autoreload 2

In [4]:
from lxml import etree

# Parse a single GROBID TEI export; both the tree and its root element are kept
# as notebook variables for ad-hoc element lookups.
filepath = "../collection/examples/processed/ponds1.grobid.tei.xml"
tree = etree.parse(filepath)
root = tree.getroot()

# Prefix -> namespace URI mapping, passed to findall so paths can use short
# prefixes such as 'tei:'.
ns = dict(
    tei='http://www.tei-c.org/ns/1.0',
    xlink='http://www.w3.org/1999/xlink',
    xsi='http://www.w3.org/2001/XMLSchema-instance',
)

In [5]:
# Probe with a made-up tag name: findall yields an empty list (see the output)
# rather than raising when nothing matches.
root.findall('.//tei:blargus', ns)

[]

In [23]:
# Collect the file names found in the processed-examples directory.
# NOTE(review): the listing shown further down is bare file names in unsorted
# order — sort explicitly if a stable order matters.
s = get_filenames_in_directory('../collection/examples/processed/')

In [24]:
# Keep everything before the first '.grobid' in the first file name
# (e.g. 'ponds3.grobid.tei.xml' -> 'ponds3').
s[0].partition('.grobid')[0]

'ponds3'

In [25]:
s

['ponds3.grobid.tei.xml',
 'lakes3.grobid.tei.xml',
 'lakes2.grobid.tei.xml',
 'fake1.grobid.tei.xml',
 'ponds2.grobid.tei.xml',
 'fake3.grobid.tei.xml',
 'fake2.grobid.tei.xml',
 'lakes1.grobid.tei.xml',
 'ponds1.grobid.tei.xml',
 'definitions1.grobid.tei.xml',
 'definitions3.grobid.tei.xml',
 'definitions2.grobid.tei.xml']

### Load document (GROBID TEI XML)

In [12]:
# Load one processed TEI XML file into an XmlDocument.
# NOTE(review): '1234' looks like a placeholder DOI, and token_size presumably
# controls the size of each entry in doc.pages — confirm against XmlDocument.
doc = XmlDocument(doi = '1234')
doc.load('../collection/examples/processed/definitions2.grobid.tei.xml', token_size = 1000)

In [16]:
print(doc.title_abstract)

# The importance of small waterbodies for biodiversity and ecosystem services: implications for policy makers
## Abstract
Small waterbodies, including ponds and small lakes, low-order streams, ditches and springs, are the most numerous freshwater environments globally, are critical for freshwater biodiversity and are increasingly recognised for their role in ecosystem service delivery. Small waters often represent the best remaining examples of intact freshwater habitats and are the most likely to remain unpolluted, often being a refuge for species which have disappeared from larger, more damaged, waterbodies. Practically all waterrelated ecosystem services are initially mediated by small waters and some, such as carbon cycling, may be dominated by them. Small waters are exposed to all the threats affecting larger waters, and some experienced only by small waters. Despite this, small waters remain the least investigated part of the water environment and are largely excluded from water 

In [10]:
print(doc.pages[4])

## Springs
Hydrologically, springs are defined as strictly delimited places where the groundwater emerges at the surface (Cantonati et al., 2006). They can also be seen as points of natural, concentrated discharge of groundwater, at a rate high enough to maintain flow on the surface (van Everdingen, 1991). The German Institute for Norms (DIN) precisely defines a spring as a ''spatially restricted groundwater emergence, which at least temporally leads to a superficial discharge'' (DIN, 1994). This definition also includes anthropogenically modified springs such as wells. Geohydrological spring types, such as overflow springs or artesian springs, are distinguished depending on the geology and the characteristics of the aquifer (Martin et al., 2015). In general, the aquifer is the storage body of water gained by precipitation and lost by spring flow after a certain time lag (Glazier, 2014). Depending on the geology, storage times in the aquifer differ considerably from a few hours to over

In [11]:
len(doc.pages)

27

### Language Model

In [27]:
# Load the language model (served through Ollama)
llm = ChatOllama(
    model="gemma3:12b-it-qat",
    temperature=0,
    num_ctx = 25000 # Set to 50k on SCC?
)

# Prompt layout shared by every node: instructions, page context and query are
# each wrapped in their own `user` turn, followed by an open `model` turn.
# NOTE(review): the turn markers are written by hand here; ChatOllama may also
# apply the model's own chat template, which would wrap these markers a second
# time — confirm what the model actually receives.
prompt_template = PromptTemplate.from_template(
    "<start_of_turn>user\n{instructions}<end_of_turn>\n"
    "<start_of_turn>user\n{context}<end_of_turn>\n"
    "<start_of_turn>user\n{query}<end_of_turn>\n"
    "<start_of_turn>model\n"
)

class BooleanResponse(BaseModel):
    """
    Manages a structured, boolean response from a language model.
    """
    # Single boolean verdict. The description text tells the model how to map
    # Yes / No / Unknown onto True / False.
    # NOTE(review): pydantic exposes the class docstring as the schema
    # description, so editing the docstring may change what the model is
    # shown — confirm before rewording it.
    content : bool = Field(
        description = 
            "Respond with False if the answer is No or Unknown. "
            "Respond True only if the answer is Yes. "
    )

# State for one page (chunk) of a paper as it moves through the graph
class State(TypedDict):
    context : str           # page text supplied by the caller; read by every node
    definition_bool : bool  # written by screen_definition
    definition : str        # written by extract_definition (only runs when definition_bool is True)
    table_bool : bool       # written by screen_table

# Variant of `llm` that returns a BooleanResponse; the screening nodes read
# its `.content` field as the True/False verdict.
boolean_llm = llm.with_structured_output(schema = BooleanResponse)

In [28]:
def screen_definition(state: State):
    """
    Screen the current page for a scientific definition of ponds or lakes.

    Args:
        state (State): Current state of the chat; only ``context`` is read.
    Returns:
        dict: Partial state update of the form ``{"definition_bool": bool}``.
    """
    # Adjacent string literals are joined with no separator, so every fragment
    # that is followed by another sentence has to end with a space; otherwise
    # the model sees run-together text such as "whole.Your".
    instructions = (
        "You will be given contextual information from a page of a scientific research paper "
        "and asked to accurately answer questions about its contents. Please answer only "
        "for the information shown on the current page, and not the paper as a whole. "
        "Your answer should be a boolean value with a value of False if the "
        "answer is No or Unknown and a value of True only if the answer is Yes. "
    )
    context = state["context"]
    query = (
        "Does this page contain a definition for either ponds or lakes? "
        "A definition should specify distinguishing attributes or descriptive characteristics. "
        "The definition may be for either ponds or lakes, but not other types of waterbodies."
    )
    messages = prompt_template.invoke(
        {"instructions": instructions, "context": context, "query": query}
    )
    # boolean_llm returns a BooleanResponse; its `content` is the verdict.
    response = boolean_llm.invoke(messages)
    return {"definition_bool": response.content}


def definition_routing(state: State):
    """
    Routing key for the conditional edge that follows `screen_definition`.

    Args:
        state (State): Current state of the chat.
    Returns:
        The stored ``definition_bool`` value, unchanged.
    """
    found_definition = state["definition_bool"]
    return found_definition


def extract_definition(state: State):
    """
    Extract a scientific definition of ponds or lakes from the given page.

    Args:
        state (State): Current state of the chat; only ``context`` is read.
    Returns:
        dict: Partial state update of the form ``{"definition": <model reply>}``.
    """
    instructions = (
        "You will be given contextual information from a page of a scientific research paper "
        "and asked to accurately answer questions about its contents. Please answer only "
        "for the information shown on the current page, and not the paper as a whole."
    )
    context = state["context"]
    # Adjacent string literals are joined with no separator, so each sentence
    # that is followed by another one must end with a space; otherwise the
    # model sees run-together text such as "lakes?A definition".
    query = (
        "What definition does the context give for either ponds or lakes? "
        "A definition should specify distinguishing attributes or descriptive characteristics. "
        "The definition may be for either ponds or lakes, but not other types of waterbodies."
    )
    messages = prompt_template.invoke(
        {"instructions": instructions, "context": context, "query": query}
    )
    # Free-text answer, so the plain (unstructured) model is used here.
    response = llm.invoke(messages)
    return {"definition": response.content}


def screen_table(state: State):
    """
    Screen the current page for tabular data on individual ponds or lakes.

    Args:
        state (State): Current state of the chat; only ``context`` is read.
    Returns:
        dict: Partial state update of the form ``{"table_bool": bool}``.
    """
    # Adjacent string literals are joined with no separator, so every fragment
    # that is followed by another sentence has to end with a space; otherwise
    # the model sees run-together text such as "whole.Your".
    instructions = (
        "You will be given contextual information from a page of a scientific research paper "
        "and asked to accurately answer questions about its contents. Please answer only "
        "for the information shown on the current page, and not the paper as a whole. "
        "Your answer should be a boolean value with a value of False if the "
        "answer is No or Unknown and a value of True only if the answer is Yes. "
    )
    context = state["context"]
    query = (
        "Does this page include a table containing data related to "
        "physical, chemical, or biological attributes of individual ponds or lakes? "
        "Data must be reported in a table format, and should only be given for individually "
        "studied ponds or lakes, instead of aggregate statistics for groups of waterbodies. "
        "Examples include but are not limited to depth, surface area, temperature, or pH."
    )

    messages = prompt_template.invoke(
        {"instructions": instructions, "context": context, "query": query}
    )
    # boolean_llm returns a BooleanResponse; its `content` is the verdict.
    response = boolean_llm.invoke(messages)
    return {"table_bool": response.content}


def table_routing(state: State):
    """
    Routing key based on the result of `screen_table`.

    Args:
        state (State): Current state of the chat.
    Returns:
        The stored ``table_bool`` value, unchanged.
    """
    found_table = state["table_bool"]
    return found_table

In [29]:
# Per-page workflow:
#   START -> screen_definition
#         -> extract_definition   (only when definition_routing returns True)
#         -> screen_table -> END
# NOTE(review): table_routing is defined in this notebook but is not wired
# into the graph here.
graph_builder = StateGraph(State)
graph_builder.add_node("screen_definition", screen_definition)
graph_builder.add_node("extract_definition", extract_definition)
graph_builder.add_node("screen_table", screen_table)
graph_builder.add_edge(START, "screen_definition")
# Branch on the value returned by definition_routing.
graph_builder.add_conditional_edges(
    "screen_definition",
    definition_routing,
    {True : "extract_definition", False: "screen_table"}
)
# Both branches converge on screen_table, so every page is screened for tables.
graph_builder.add_edge("extract_definition", "screen_table")
graph_builder.add_edge("screen_table", END)
graph = graph_builder.compile()

In [12]:
# Check whether the compiled graph can be called like a function
# (it cannot — this evaluates to False).
callable(graph)

False

In [30]:
# Run the graph over two consecutive entries of doc.pages, labelling each call
# "Chunk N". `response` ends up holding the result of the last call.
# NOTE(review): ignore=['context'] presumably keeps the page text out of the
# stored history — confirm against ChatWithHistory.invoke.
chat = ChatWithHistory(llm = graph)
for chunk_number in (1, 2):
    response = chat.invoke(
        {"context" : doc.pages[chunk_number]},
        identifier = f"Chunk {chunk_number}",
        ignore = ['context']
    )

In [33]:
# `response` was overwritten by the second invoke, so this is the screening
# verdict for "Chunk 2".
response["definition_bool"]

False

In [26]:
# Save the results recorded by `chat` to the given CSV path; the table shown
# below has one row per identifier.
chat.save('../experiments/data/test_history.csv')

Unnamed: 0,definition_bool,definition,table_bool
Chunk 1,True,Ponds are described as small standing waters v...,False
Chunk 2,False,,False


In [34]:
import pandas as pd
# Dict of dicts: with orient='index' the outer keys become row labels and the
# inner keys become columns; a key missing from a row is left empty there.
D = {
    0: dict(dinner='chicken', lunch='salad', breakfast='eggs'),
    1: dict(dinner='fish', brunch='soup', breakfast='toast'),
}
pd.DataFrame.from_dict(D, orient='index')

Unnamed: 0,dinner,lunch,breakfast,brunch
0,chicken,salad,eggs,
1,fish,,toast,soup


In [36]:
# List of dicts: each dict becomes one row under a default integer index, and
# the columns are the union of all keys.
D = [
    dict(dinner='chicken', lunch='salad', breakfast='eggs'),
    dict(dinner='fish', brunch='soup', breakfast='toast'),
]
pd.DataFrame(D)

Unnamed: 0,dinner,lunch,breakfast,brunch
0,chicken,salad,eggs,
1,fish,,toast,soup


In [192]:
# List of plain strings: the result is a single column labelled 0 with one row
# per element (see the output below).
D = ["hello", "world"]
pd.DataFrame(D)

Unnamed: 0,0
0,hello
1,world
