In [2]:
# Chat Models
from huggingface_hub import hf_hub_download
from langchain_ollama import ChatOllama

# Chat structure
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langgraph.graph import START, END, StateGraph

# Typing
from typing_extensions import TypedDict
from pydantic import BaseModel, Field

from systematic_review import *

%load_ext autoreload
%autoreload 2

### Load PDF

In [91]:
doc = XmlDocument(doi = '1234')
doc.load('../collection/examples/processed/definitions2.grobid.tei.xml', token_size = 1000)

In [92]:
print(doc.title_abstract)

# The importance of small waterbodies for biodiversity and ecosystem services: implications for policy makers
## Abstract
Small waterbodies, including ponds and small lakes, low-order streams, ditches and springs, are the most numerous freshwater environments globally, are critical for freshwater biodiversity and are increasingly recognised for their role in ecosystem service delivery. Small waters often represent the best remaining examples of intact freshwater habitats and are the most likely to remain unpolluted, often being a refuge for species which have disappeared from larger, more damaged, waterbodies. Practically all waterrelated ecosystem services are initially mediated by small waters and some, such as carbon cycling, may be dominated by them. Small waters are exposed to all the threats affecting larger waters, and some experienced only by small waters. Despite this, small waters remain the least investigated part of the water environment and are largely excluded from water 

In [93]:
print(doc.pages[1])

## Definitions
'Small waterbodies' is an ambiguous term with, as yet, no universally accepted or legal definition. In this paper, we use the term to refer to ponds and small lakes, small streams including headwaters, ditches and springs. In the following section, the approaches which have been taken to derive these definitions are described.

## Ponds and small lakes
Ponds are small standing waters varying in size from 1 m 2 to about 2-5 ha in area and may be permanent or seasonal, man-made or naturally created (Pond Conservation Group, 1993;Collinson et al., 1995;Biggs et al., 2007;E.P.C.N., 2007;Cereghino et al., 2008). Although there is a long history, dating back to the nineteenth century, of attempts to define the difference between a pond and a lake (Biggs et al., 2005), large ponds and small lakes share many characteristics in terms of structure and function, and the transition zone between the two types of habitat is very gradual (Søndergaard et al., 2005;De Meester et al., 200

### Language Model

In [35]:
# Load the language model
llm = ChatOllama(
    model="gemma3:12b-it-qat",
    temperature=0,
    num_ctx = 25000 # Set to 50k on SCC?
)

# Specify prompt and response formats
prompt_template1 = PromptTemplate.from_template(
    "<start_of_turn>user\n{instructions}<end_of_turn>\n"
    "<start_of_turn>user\n{context}<end_of_turn>\n"
    "<start_of_turn>user\n{query}<end_of_turn>\n"
    "<start_of_turn>model\n"
)

prompt_template2 = ChatPromptTemplate([
    ("user", "{instructions}"),
    ("user", "{context}"),
    ("user", "{query}")
])

class BooleanResponse(BaseModel):
    """
    Manages a structured, boolean response from a language model.
    """
    content : bool = Field(
        description = 
            "Respond with False if the answer is No or Unknown. "
            "Respond True only if the answer is Yes. "
    )

# State for an individual paper
class State(TypedDict):
    context : str
    definition_bool : bool
    definition : str
    table_bool : bool

boolean_llm = llm.with_structured_output(schema = BooleanResponse)

In [111]:
from ollama import chat
from ollama import ChatResponse

#model = "gemma3:12b-it-qat"
model = "olmo2:13b"

class BooleanResponse(BaseModel):
    """
    Manages a structured, boolean response from a language model.
    """
    content : bool = Field(
        description = 
            "Respond with False if the answer is No or Unknown. "
            "Respond True only if the answer is Yes. "
    )

boolean_format = BooleanResponse.model_json_schema()
response_boolean_formatter = lambda response: BooleanResponse.model_validate_json(
    response.message.content
).content

# State for an individual paper
class State(TypedDict):
    context : str
    definition_bool : bool
    definition : str
    table_bool : bool


In [112]:
response: ChatResponse = chat(model=model, messages=[
  {
    'role': 'user',
    'content': 'The sky is orange',
  },
  {
    'role': 'user',
    'content': 'Is the sky blue?',
  },
],
#format = boolean_format
)
#print(response_boolean_formatter(response))
response.message.content

'The color of the sky can vary depending on multiple factors such as time of day, weather conditions, and atmospheric particulates. At sunrise and sunset, the sky often takes on shades of orange, red, and pink due to the way the Earth\'s atmosphere scatters shorter wavelengths of light (like blue) and allows longer wavelengths (like reds and oranges) to pass through more easily.\n\nUnder clear daylight conditions without such obstructions or when viewed from space, the sky appears blue because molecules in the Earth\'s atmosphere scatter blue light from the sun more than they scatter red light. This process is called Rayleigh scattering.\n\nTherefore, while it\'s accurate to say the sky is typically blue during the day under normal circumstances, the statement "The sky is blue" simplifies a complex natural phenomenon and isn\'t always true in every situation.\n\nIf you\'re observing an orange sky during sunrise or sunset, it\'s perfectly normal and expected. The context is crucial for 

In [113]:
class State(TypedDict):
    abstract : str
    text : str
    abstract_bool : bool
    definition_bool : bool
    definition : str
    table_bool : bool


def screen_abstract(state: State):
    """
    Screen the abstract of the current paper for relevance to ponds or lakes.

    Args:
        state (State): Current state of the chat.
    Returns:
        state (State): Updated state with generated response.
    """
    # Check that abstract has not already been screened
    if state.get("abstract_bool") is None:
        instructions = (
            "You will be given contextual information from the title and abstract of a "
            "scientific research paper and asked to accurately infer information about "
            "the paper's contents. Your answer should be a boolean value with a value "
            "of False if the answer is No or Unknown and a value of True only if the answer is Yes. "
        )
        context = state["abstract"]
        query = (
            "Does this paper study or discuss freshwater ponds or lakes in some capacity?"
        )
        messages = [
            {'role': 'system', 'content': instructions},
            {'role': 'user', 'content': context},
            {'role': 'user', 'content': query}
        ]
        response: ChatResponse = chat(model=model, messages=messages, format=boolean_format)
        return {"abstract_bool": response_boolean_formatter(response)}
    else:
        return state


def screen_definition(state: State):
    """
    Screen the current page for a scientific definition.

    Args:
        state (State): Current state of the chat.
    Returns:
        state (State): Updated state with generated response.
    """
    instructions = (
        "You will be given contextual information from a page of a scientific research paper "
        "and asked to accurately answer questions about its contents. Please answer only "
        "for the information shown on the current page, and not the paper as a whole."
        "Your answer should be a boolean value with a value of False if the "
        "answer is No or Unknown and a value of True only if the answer is Yes. "
    )
    context = state["text"]
    query = (
        "Does this page contain a definition for either ponds or lakes?"
        "A definition should specify distinguishing attributes or descriptive characteristics."
        "The definition may be for either ponds or lakes, but not other types of waterbodies."
    )
    messages = [
            {'role': 'system', 'content': instructions},
            {'role': 'user', 'content': context},
            {'role': 'user', 'content': query}
        ]
    response: ChatResponse = chat(model=model, messages=messages, format=boolean_format)
    return {"definition_bool": response_boolean_formatter(response)}


def definition_routing(state : State):
    return state['definition_bool']


def extract_definition(state: State):
    """
    Extract a scientific definition from the given page.

    Args:
        state (State): Current state of the chat.
    Returns:
        state (State): Updated state with generated response.
    """
    instructions = (
        "You will be given contextual information from a page of a scientific research paper "
        "and asked to accurately answer questions about its contents. Please answer only "
        "for the information shown on the current page, and not the paper as a whole."
    )
    context = state["text"]
    query = (
        "What definition does the context give for either ponds or lakes?"
        "A definition should specify distinguishing attributes or descriptive characteristics."
        "The definition may be for either ponds or lakes, but not other types of waterbodies."
    )
    messages = [
            {'role': 'system', 'content': instructions},
            {'role': 'user', 'content': context},
            {'role': 'user', 'content': query}
        ]
    response: ChatResponse = chat(model=model, messages=messages)
    return {"definition": response.message.content}


def screen_table(state: State):
    """
    Screen the current page for tabular data.

    Args:
        state (State): Current state of the chat.
    Returns:
        state (State): Updated state with generated response.
    """
    instructions = (
        "You will be given contextual information from a page of a scientific research paper "
        "and asked to accurately answer questions about its contents. Please answer only "
        "for the information shown on the current page, and not the paper as a whole."
        "Your answer should be a boolean value with a value of False if the "
        "answer is No or Unknown and a value of True only if the answer is Yes. "
    )
    context = state["text"]
    query = (
        "Does this page include a table containing data related to "
        "physical, chemical, or biological attributes of individual ponds or lakes?"
        "Data must be reported in a table format, and should only be given for individually "
        "studied ponds or lakes, instead of aggregate statistics for groups of waterbodies. "
        "Examples include but are not limited to depth, surface area, temperature, or pH."
    )

    messages = [
            {'role': 'system', 'content': instructions},
            {'role': 'user', 'content': context},
            {'role': 'user', 'content': query}
        ]
    response: ChatResponse = chat(model=model, messages=messages, format=boolean_format)
    return {"table_bool": response_boolean_formatter(response)}


def table_routing(state : State):
    return state['table_bool']

In [114]:
graph_builder = StateGraph(State)
graph_builder.add_node("screen_abstract", screen_abstract)
graph_builder.add_node("screen_definition", screen_definition)
graph_builder.add_node("extract_definition", extract_definition)
graph_builder.add_node("screen_table", screen_table)
graph_builder.add_edge(START, "screen_abstract")
graph_builder.add_conditional_edges(
    "screen_abstract",
    lambda state: state['abstract_bool'] if state.get('text') is not None else False,
    {True: "screen_definition", False: END}
)
graph_builder.add_conditional_edges(
    "screen_definition",
    lambda state: state['definition_bool'],
    {True : "extract_definition", False: "screen_table"}
)
graph_builder.add_edge("extract_definition", "screen_table")
graph_builder.add_edge("screen_table", END)
GRAPH = graph_builder.compile()

In [115]:
chat_history = ChatWithHistory(llm = GRAPH)

# Screen abstract:
response = chat_history.invoke(
    {'abstract' : doc.title_abstract},
    identifier = {'doi' : doc.doi, 'chunk' : -1}, # -1 indicates abstract
    ignore = ['abstract','text']
)

if response["abstract_bool"]:
    # Screen text:
    for i,page in enumerate(doc.pages[:3]):
        #print(f"Processing {doc.title[:25]}... Page {i+1}/{len(doc.pages)}")
        response = chat_history.invoke(
            {'text': page, 'abstract_bool': True},
            identifier = {'doi' : doc.doi, 'chunk' : i},
            ignore = ['abstract', 'text']
        )

In [116]:
chat_history.history

[{'doi': '1234', 'chunk': -1, 'abstract_bool': True},
 {'doi': '1234',
  'chunk': 0,
  'abstract_bool': True,
  'definition_bool': False,
  'table_bool': False},
 {'doi': '1234',
  'chunk': 1,
  'abstract_bool': True,
  'definition_bool': False,
  'table_bool': False},
 {'doi': '1234',
  'chunk': 2,
  'abstract_bool': True,
  'definition_bool': False,
  'table_bool': False}]

In [102]:
doc.pages[2]

"## Materials and methods\n## Study area\nTable 1 shows the location of the 104 permanent small water bodies sampled within the following four altitudinal vegetation belts in Switzerland: colline, montane, subalpine, and alpine. They vary in size from 5 m 2 to 10 ha (Table 2), with a mean depth comprising between 15 and 910 cm. We will further refer to these small water bodies as ''ponds'', since most of them correspond to the criteria of the definition of a pond presented by Oertli et al. (2005a). Only one third of these ponds are known to have a natural origin with an age exceeding 4,000 years (last glacial retreat). The others, with\nTable 1 Number of sampled ponds per altitudinal vegetation belt (colline (200-800 m), montane (600-1,400 m), subalpine (1,300-2,000 m), alpine ([1,800 m)) and trophic state (based on the concentration of total phosphorus (TP) and total nitrogen (TN) as described by OECD (1982) and Wetzel (1983) Colline Montane Subalpine Alpine n = total of ponds\nOligot

In [26]:
chat.save('../experiments/data/test_history.csv')

Unnamed: 0,definition_bool,definition,table_bool
Chunk 1,True,Ponds are described as small standing waters v...,False
Chunk 2,False,,False


In [34]:
import pandas as pd
D = {
     0 : {'dinner': 'chicken', 'lunch': 'salad', 'breakfast': 'eggs'},
     1 : {'dinner': 'fish', 'brunch': 'soup', 'breakfast': 'toast'}
}
pd.DataFrame.from_dict(D, orient='index')

Unnamed: 0,dinner,lunch,breakfast,brunch
0,chicken,salad,eggs,
1,fish,,toast,soup


In [36]:
D = [
    {'dinner': 'chicken', 'lunch': 'salad', 'breakfast': 'eggs'},
    {'dinner': 'fish', 'brunch': 'soup', 'breakfast': 'toast'}
]
pd.DataFrame(D)

Unnamed: 0,dinner,lunch,breakfast,brunch
0,chicken,salad,eggs,
1,fish,,toast,soup


In [192]:
D = ["hello", "world"]
pd.DataFrame(D)

Unnamed: 0,0
0,hello
1,world
