In [2]:
# Chat Models
from huggingface_hub import hf_hub_download
from langchain_ollama import ChatOllama

# Chat structure
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langgraph.graph import START, END, StateGraph

# Typing
from typing_extensions import TypedDict
from pydantic import BaseModel, Field

from systematic_review import *

%load_ext autoreload
%autoreload 2

### Load document (GROBID TEI XML)

In [29]:
# Build the document wrapper and parse the processed example file.
# NOTE(review): '1234' looks like a placeholder DOI for this example; confirm.
doc = XmlDocument(doi="1234")
doc.load(
    "../collection/examples/processed/definitions1.grobid.tei.xml",
    token_size=1000,
)

In [30]:
print(doc.full_text)

# Eutrophication: are mayflies (Ephemeroptera) good bioindicators for ponds?
## Abstract
Ephemeroptera larvae are recognized worldwide for their sensitivity to oxygen depletion in running waters, and are therefore commonly used as bioindicators in many monitoring programmes. Mayflies inhabiting lentic waters, like lakes and ponds, in contrary have been poorly prospected in biomonitoring. For this purpose, a better understanding of their distribution in lentic habitats and of the relations of species presence with environmental conditions are needed. Within this framework, 104 ponds were sampled in Switzerland. The Ephemeroptera are found to be an insect order particularly well represented in the ponds studied here (93% of the lowland ponds). Nevertheless, in terms of diversity, they are relatively poorly represented (mean species number = 1.9). Two species dominated: Cloeon dipterum (Baetidae) and Caenis horaria (Caenidae). The investigations contributed to the updating of the geograph

In [17]:
# Flatten the full text: every blank-line break ("\n\n") is replaced by a single space.
ftext = doc.full_text.replace('\n\n', ' ')

In [18]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configured through the tiktoken "cl100k_base" encoding, with no
# overlap between consecutive chunks.
# NOTE(review): `ftext` was produced by replacing every "\n\n" with a space,
# so the only separator listed here can no longer occur in the input.
# Confirm the splitter still chunks the text as intended.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base",
    chunk_size=1000,
    chunk_overlap=0,
    separators=["\n\n"],
)
pages = text_splitter.split_text(ftext)

In [21]:
print(pages[0])

# Eutrophication: are mayflies (Ephemeroptera) good bioindicators for ponds?
## Abstract
Ephemeroptera larvae are recognized worldwide for their sensitivity to oxygen depletion in running waters, and are therefore commonly used as bioindicators in many monitoring programmes. Mayflies inhabiting lentic waters, like lakes and ponds, in contrary have been poorly prospected in biomonitoring. For this purpose, a better understanding of their distribution in lentic habitats and of the relations of species presence with environmental conditions are needed. Within this framework, 104 ponds were sampled in Switzerland. The Ephemeroptera are found to be an insect order particularly well represented in the ponds studied here (93% of the lowland ponds). Nevertheless, in terms of diversity, they are relatively poorly represented (mean species number = 1.9). Two species dominated: Cloeon dipterum (Baetidae) and Caenis horaria (Caenidae). The investigations contributed to the updating of the geograph

In [4]:
print(doc.title_abstract)

# Eutrophication: are mayflies (Ephemeroptera) good bioindicators for ponds?
## Abstract
Ephemeroptera larvae are recognized worldwide for their sensitivity to oxygen depletion in running waters, and are therefore commonly used as bioindicators in many monitoring programmes. Mayflies inhabiting lentic waters, like lakes and ponds, in contrary have been poorly prospected in biomonitoring. For this purpose, a better understanding of their distribution in lentic habitats and of the relations of species presence with environmental conditions are needed. Within this framework, 104 ponds were sampled in Switzerland. The Ephemeroptera are found to be an insect order particularly well represented in the ponds studied here (93% of the lowland ponds). Nevertheless, in terms of diversity, they are relatively poorly represented (mean species number = 1.9). Two species dominated: Cloeon dipterum (Baetidae) and Caenis horaria (Caenidae). The investigations contributed to the updating of the geograph

In [11]:
print(doc.pages[3])

## Statistical analyses
Statistical analyses were performed exclusively on 71 out of the 104 ponds from the colline and montane vegetation belts. The remaining 33 ponds from the subalpine and alpine belts were excluded from this dataset because of the particularity of their mayfly assemblages: only 11 ponds contained Ephemeroptera (Table 1). In addition, Cloeon dipterum and Caenis horaria, the two most abundant species present in many lowland ponds, were much less common at these altitudes. Indeed, most of the mayflies that are present in the subalpine and alpine belts were rare species.
A between-class Principal Component Analysis (PCA) was performed to test if there was an overall difference between the ponds containing Caenidae + Baetidae (33 ponds) and those with Baetidae only (31 ponds) for 12 relevant selected environmental and physico-chemical variables. Three of these variables were log-transformed: area, mean depth and sinuosity of the shoreline; five were transformed in categ

In [40]:
print(doc.pages[2])

## Materials and methods
## Study area
Table 1 shows the location of the 104 permanent small water bodies sampled within the following four altitudinal vegetation belts in Switzerland: colline, montane, subalpine, and alpine. They vary in size from 5 m 2 to 10 ha (Table 2), with a mean depth comprising between 15 and 910 cm. We will further refer to these small water bodies as ''ponds'', since most of them correspond to the criteria of the definition of a pond presented by Oertli et al. (2005a). Only one third of these ponds are known to have a natural origin with an age exceeding 4,000 years (last glacial retreat). The others, with
Table 1 Number of sampled ponds per altitudinal vegetation belt (colline (200-800 m), montane (600-1,400 m), subalpine (1,300-2,000 m), alpine ([1,800 m)) and trophic state (based on the concentration of total phosphorus (TP) and total nitrogen (TN) as described by OECD (1982) and Wetzel (1983) Colline Montane Subalpine Alpine n = total of ponds
Oligotrophi

### Language Model

In [41]:
# Chat model served through Ollama, sampled at temperature 0.
llm = ChatOllama(
    model="gemma3:12b-it-qat",
    temperature=0,
    num_ctx=25000,  # Set to 50k on SCC?
)

# Plain-text prompt: three consecutive user turns (instructions, context,
# query) wrapped in <start_of_turn>/<end_of_turn> markers, then an opened
# model turn for the reply.
prompt_template1 = PromptTemplate.from_template(
    "<start_of_turn>user\n{instructions}<end_of_turn>\n"
    "<start_of_turn>user\n{context}<end_of_turn>\n"
    "<start_of_turn>user\n{query}<end_of_turn>\n"
    "<start_of_turn>model\n"
)

# Chat-message prompt with the same three slots, each sent as its own user message.
prompt_template2 = ChatPromptTemplate(
    [
        ("user", "{instructions}"),
        ("user", "{context}"),
        ("user", "{query}"),
    ]
)

# Schema passed to `with_structured_output` so the model's reply is parsed into one boolean field.
# NOTE(review): the class docstring and the Field description may be forwarded to the model as
# part of the schema, so treat their wording as prompt text rather than as free documentation.
# NOTE(review): an identical class is defined again in a later cell; whichever cell ran last wins.
class BooleanResponse(BaseModel):
    """
    Manages a structured, boolean response from a language model.
    """
    content : bool = Field(
        description = 
            "Respond with False if the answer is No or Unknown. "
            "Respond True only if the answer is Yes. "
    )

# State for an individual paper
# NOTE(review): a later cell redefines `State` with additional keys (abstract, text,
# abstract_bool) and no `context` key; the graph is built after that later definition.
class State(TypedDict):
    context : str  # presumably the text given to the model as context; confirm
    definition_bool : bool
    definition : str
    table_bool : bool

# Variant of `llm` whose replies are parsed into the BooleanResponse schema.
boolean_llm = llm.with_structured_output(schema=BooleanResponse)

In [103]:
# Chat model served through Ollama, sampled at temperature 0.
# Alternative model that was tried: "olmo2:13b".
LLM = ChatOllama(
    model="gemma3:12b-it-qat",
    temperature=0,
    num_ctx=128_000,  # Maximum context length for Gemma3
)

# Plain-text prompt: instructions, context and query as three user turns,
# followed by an opened model turn.
prompt_template = PromptTemplate.from_template(
    "<start_of_turn>user\n{instructions}<end_of_turn>\n"
    "<start_of_turn>user\n{context}<end_of_turn>\n"
    "<start_of_turn>user\n{query}<end_of_turn>\n"
    "<start_of_turn>model\n"
)

# Schema passed to `with_structured_output` so the model's reply is parsed into one boolean field.
# NOTE(review): this repeats the BooleanResponse class from the earlier "Language Model" cell;
# the wording of the docstring and Field description may reach the model as schema text.
class BooleanResponse(BaseModel):
    """
    Manages a structured, boolean response from a language model.
    """
    content : bool = Field(
        description = 
            "Respond with False if the answer is No or Unknown. "
            "Respond True only if the answer is Yes. "
    )

# Variant of `LLM` whose replies are parsed into the BooleanResponse schema.
boolean_llm = LLM.with_structured_output(schema=BooleanResponse)

In [104]:
# Chat-message prompt: instructions, context and query are each sent as a
# separate user message. This is the template the graph nodes invoke.
prompt_template2 = ChatPromptTemplate(
    [
        ("user", "{instructions}"),
        ("user", "{context}"),
        ("user", "{query}"),
    ]
)

# Plain-text prompt with <|user|>/<|assistant|> turn markers.
# NOTE(review): this rebinds `prompt_template`, which the previous cell built
# with <start_of_turn> markers; presumably it targets a different model.
# Confirm which format is intended.
prompt_template = PromptTemplate.from_template(
    "<|user|>\n{instructions}\n"
    "<|user|>\n{context}\n"
    "<|user|>\n{query}\n"
    "<|assistant|>\n"
)

In [105]:
class State(TypedDict, total=False):
    """
    Shared state passed between the screening nodes of the graph.

    Declared with ``total=False`` because no single invocation supplies every
    key: the graph is called once with only the abstract and then once per
    page with ``text`` and ``abstract_bool``, and each node returns just the
    key it fills in.
    """
    abstract : str          # title and abstract text read by screen_abstract
    text : str              # page text read by the definition and table nodes
    abstract_bool : bool    # written by screen_abstract
    definition_bool : bool  # written by screen_definition
    definition : str        # written by extract_definition
    table_bool : bool       # written by screen_table


def screen_abstract(state: State):
    """
    Decide from the title and abstract whether the paper concerns freshwater ponds or lakes.

    The model is queried only when the state carries no "abstract_bool"
    verdict yet; otherwise the incoming state is handed back as-is.

    Args:
        state (State): Current state; "abstract" is read when no verdict exists.
    Returns:
        dict: ``{"abstract_bool": <bool>}`` after a fresh screening, or the
        unchanged input state when a verdict was already present.
    """
    # Guard clause: a verdict already exists, so skip the model call.
    if state.get("abstract_bool") is not None:
        return state

    instructions = (
        "You will be given contextual information from the title and abstract of a scientific "
        "research paper and asked to accurately infer information about the paper's contents. "
        "Your answer should be a boolean value with a value of False if the answer is No or "
        "Unknown and a value of True only if the answer is Yes. "
    )
    query = "Does this paper study or discuss freshwater ponds or lakes in some capacity?"
    prompt_inputs = {
        "instructions": instructions,
        "context": state["abstract"],
        "query": query,
    }
    verdict = boolean_llm.invoke(prompt_template2.invoke(prompt_inputs))
    return {"abstract_bool": verdict.content}


def screen_definition(state: State):
    """
    Screen the current page for a scientific definition of ponds or lakes.

    Fills `prompt_template2` with the instructions, the page text and the
    question, then asks `boolean_llm` for a structured yes/no answer.

    Args:
        state (State): Current state; the page is read from "text".
    Returns:
        dict: Partial state update ``{"definition_bool": <bool>}``.
    """
    # Adjacent string literals are joined verbatim, so every fragment that is
    # followed by another one ends with an explicit space. Without it the
    # sentences run together in the prompt (e.g. "whole.Your").
    instructions = (
        "You will be given contextual information from a page of a scientific research paper "
        "and asked to accurately answer questions about its contents. Please answer only "
        "for the information shown on the current page, and not the paper as a whole. "
        "Your answer should be a boolean value with a value of False if the "
        "answer is No or Unknown and a value of True only if the answer is Yes. "
    )
    context = state["text"]
    query = (
        "Does this page contain a definition for either ponds or lakes? "
        "A definition should specify distinguishing attributes or descriptive characteristics. "
        "The definition may be for either ponds or lakes, but not other types of waterbodies."
    )
    messages = prompt_template2.invoke(
        {"instructions": instructions, "context": context, "query": query}
    )
    response = boolean_llm.invoke(messages)
    return {"definition_bool": response.content}


def definition_routing(state : State):
    """Return the routing key after definition screening: the stored "definition_bool" value."""
    found_definition = state['definition_bool']
    return found_definition


def extract_definition(state: State):
    """
    Extract the definition of ponds or lakes given on the current page.

    Fills `prompt_template2` with the instructions, the page text and the
    question, then sends it to the unstructured model `LLM`, so the answer is
    free text rather than a boolean.

    Args:
        state (State): Current state; the page is read from "text".
    Returns:
        dict: Partial state update ``{"definition": <model reply content>}``.
    """
    instructions = (
        "You will be given contextual information from a page of a scientific research paper "
        "and asked to accurately answer questions about its contents. Please answer only "
        "for the information shown on the current page, and not the paper as a whole."
    )
    context = state["text"]
    # Adjacent string literals are joined verbatim; the trailing spaces keep
    # the three sentences of the question from running together.
    query = (
        "What definition does the context give for either ponds or lakes? "
        "A definition should specify distinguishing attributes or descriptive characteristics. "
        "The definition may be for either ponds or lakes, but not other types of waterbodies."
    )
    messages = prompt_template2.invoke(
        {"instructions": instructions, "context": context, "query": query}
    )
    response = LLM.invoke(messages)
    return {"definition": response.content}


def screen_table(state: State):
    """
    Screen the current page for a table of per-waterbody data.

    Fills `prompt_template2` with the instructions, the page text and the
    question, then asks `boolean_llm` for a structured yes/no answer.

    Args:
        state (State): Current state; the page is read from "text".
    Returns:
        dict: Partial state update ``{"table_bool": <bool>}``.
    """
    # Adjacent string literals are joined verbatim, so every fragment that is
    # followed by another one ends with an explicit space. Without it the
    # sentences run together in the prompt (e.g. "whole.Your", "lakes?Data").
    instructions = (
        "You will be given contextual information from a page of a scientific research paper "
        "and asked to accurately answer questions about its contents. Please answer only "
        "for the information shown on the current page, and not the paper as a whole. "
        "Your answer should be a boolean value with a value of False if the "
        "answer is No or Unknown and a value of True only if the answer is Yes. "
    )
    context = state["text"]
    query = (
        "Does this page include a table containing data related to "
        "physical, chemical, or biological attributes of individual ponds or lakes? "
        "Data must be reported in a table format, and should only be given for individually "
        "studied ponds or lakes, instead of aggregate statistics for groups of waterbodies. "
        "Examples include but are not limited to depth, surface area, temperature, or pH."
    )

    messages = prompt_template2.invoke(
        {"instructions": instructions, "context": context, "query": query}
    )
    response = boolean_llm.invoke(messages)
    return {"table_bool": response.content}


def table_routing(state : State):
    """Return the routing key after table screening: the stored "table_bool" value."""
    has_table = state['table_bool']
    return has_table

In [106]:
# Screening pipeline for one invocation:
#   START -> screen_abstract -> screen_definition -> [extract_definition] -> screen_table -> END
graph_builder = StateGraph(State)

# Nodes: each is registered under the name of the function that implements it.
for node_fn in (screen_abstract, screen_definition, extract_definition, screen_table):
    graph_builder.add_node(node_fn.__name__, node_fn)

# Unconditional edges.
graph_builder.add_edge(START, "screen_abstract")
graph_builder.add_edge("extract_definition", "screen_table")
graph_builder.add_edge("screen_table", END)

# After the abstract check: continue only when page text was supplied and the
# abstract was judged relevant; an abstract-only invocation stops here.
graph_builder.add_conditional_edges(
    "screen_abstract",
    lambda state: state.get('text') is not None and state['abstract_bool'],
    {True: "screen_definition", False: END},
)

# After the definition check: extract the definition when one was found,
# otherwise go straight to the table check.
graph_builder.add_conditional_edges(
    "screen_definition",
    lambda state: state['definition_bool'],
    {True: "extract_definition", False: "screen_table"},
)

GRAPH = graph_builder.compile()

In [109]:
# Wrap the compiled graph; judging by `chat.history`, each invoke appends one record.
# NOTE(review): ChatWithHistory comes from the star-import of `systematic_review`,
# so its exact semantics are not visible in this notebook.
chat = ChatWithHistory(llm = GRAPH)

# Screen abstract:
# Only 'abstract' is supplied, so the graph stops after screen_abstract (no 'text' key).
response = chat.invoke(
    {'abstract' : doc.title_abstract},
    identifier = {'doi' : doc.doi, 'chunk' : -1}, # -1 indicates abstract
    ignore = ['abstract','text']  # presumably keeps these long inputs out of the history; confirm
)

if response["abstract_bool"]:
    # Screen text:
    # 'abstract_bool' is passed in as True so screen_abstract skips the model call.
    # NOTE(review): only the first three pages are screened; confirm whether the
    # slice is a leftover debugging limit.
    for i,page in enumerate(doc.pages[:3]):
        #print(f"Processing {doc.title[:25]}... Page {i+1}/{len(doc.pages)}")
        response = chat.invoke(
            {'text': page, 'abstract_bool': True},
            identifier = {'doi' : doc.doi, 'chunk' : i},
            ignore = ['abstract', 'text']
        )

In [111]:
chat.history

[{'doi': '1234', 'chunk': -1, 'abstract_bool': True},
 {'doi': '1234',
  'chunk': 0,
  'abstract_bool': True,
  'definition_bool': False,
  'table_bool': False},
 {'doi': '1234',
  'chunk': 1,
  'abstract_bool': True,
  'definition_bool': False,
  'table_bool': False},
 {'doi': '1234',
  'chunk': 2,
  'abstract_bool': True,
  'definition_bool': False,
  'table_bool': False}]

In [102]:
doc.pages[2]

"## Materials and methods\n## Study area\nTable 1 shows the location of the 104 permanent small water bodies sampled within the following four altitudinal vegetation belts in Switzerland: colline, montane, subalpine, and alpine. They vary in size from 5 m 2 to 10 ha (Table 2), with a mean depth comprising between 15 and 910 cm. We will further refer to these small water bodies as ''ponds'', since most of them correspond to the criteria of the definition of a pond presented by Oertli et al. (2005a). Only one third of these ponds are known to have a natural origin with an age exceeding 4,000 years (last glacial retreat). The others, with\nTable 1 Number of sampled ponds per altitudinal vegetation belt (colline (200-800 m), montane (600-1,400 m), subalpine (1,300-2,000 m), alpine ([1,800 m)) and trophic state (based on the concentration of total phosphorus (TP) and total nitrogen (TN) as described by OECD (1982) and Wetzel (1983) Colline Montane Subalpine Alpine n = total of ponds\nOligot

In [26]:
chat.save('../experiments/data/test_history.csv')

Unnamed: 0,definition_bool,definition,table_bool
Chunk 1,True,Ponds are described as small standing waters v...,False
Chunk 2,False,,False


In [34]:
import pandas as pd

# Scratch check: records keyed by row label, with partially overlapping keys.
# orient='index' turns the outer keys into rows; keys absent from a record
# show up as missing values.
D = dict(enumerate([
    {'dinner': 'chicken', 'lunch': 'salad', 'breakfast': 'eggs'},
    {'dinner': 'fish', 'brunch': 'soup', 'breakfast': 'toast'},
]))
pd.DataFrame.from_dict(D, orient='index')

Unnamed: 0,dinner,lunch,breakfast,brunch
0,chicken,salad,eggs,
1,fish,,toast,soup


In [36]:
# Scratch check: the same records as a plain list. The DataFrame constructor
# gives a default integer index and the union of all keys as columns.
D = [
    dict(dinner='chicken', lunch='salad', breakfast='eggs'),
    dict(dinner='fish', brunch='soup', breakfast='toast'),
]
pd.DataFrame(D)

Unnamed: 0,dinner,lunch,breakfast,brunch
0,chicken,salad,eggs,
1,fish,,toast,soup


In [192]:
# Scratch check: a flat list of strings becomes a single column labelled 0.
D = list(("hello", "world"))
pd.DataFrame(D)

Unnamed: 0,0
0,hello
1,world
