In [1]:
# Chat Models
from huggingface_hub import hf_hub_download
from langchain_ollama import ChatOllama

# Chat structure
from langchain_core.prompts import PromptTemplate
from langgraph.graph import START, END, StateGraph

# Typing
from typing_extensions import TypedDict
from pydantic import BaseModel, Field

from systematic_review import *

%load_ext autoreload
%autoreload 2

### Load Document (GROBID TEI XML)

In [8]:
# Read the GROBID TEI XML export of the paper into an XmlDocument.
# token_size is handed to XmlDocument.load as-is (presumably the size of each
# entry in doc.pages — TODO confirm against systematic_review).
doc = XmlDocument()
doc.load(
    '../papers/processed/definitions2.grobid.tei.xml',
    token_size=1000,
)

In [9]:
# Preview the entry at index 1 of doc.pages; the same entry is later sent
# through the graph under the identifier "Chunk 1".
print(doc.pages[1])

## Definitions
'Small waterbodies' is an ambiguous term with, as yet, no universally accepted or legal definition. In this paper, we use the term to refer to ponds and small lakes, small streams including headwaters, ditches and springs. In the following section, the approaches which have been taken to derive these definitions are described.

## Ponds and small lakes
Ponds are small standing waters varying in size from 1 m 2 to about 2-5 ha in area and may be permanent or seasonal, man-made or naturally created (Pond Conservation Group, 1993;Collinson et al., 1995;Biggs et al., 2007;E.P.C.N., 2007;Cereghino et al., 2008). Although there is a long history, dating back to the nineteenth century, of attempts to define the difference between a pond and a lake (Biggs et al., 2005), large ponds and small lakes share many characteristics in terms of structure and function, and the transition zone between the two types of habitat is very gradual (Søndergaard et al., 2005;De Meester et al., 200

### Language Model

In [3]:
# Chat model handle used by every node of the screening graph, either directly
# or through its structured-output variant.
llm = ChatOllama(
    model="gemma3:12b-it-qat",
    temperature=0,
    num_ctx=25000,  # TODO: Set to 50k on SCC?
)

# Prompt layout: three user turns (instructions, context, query) followed by
# an opened model turn, written with Gemma-style turn markers.
# NOTE(review): the chat model may apply the model's own chat template on top
# of this text — confirm the markers below are not being wrapped a second time.
GEMMA_PROMPT_LAYOUT = (
    "<start_of_turn>user\n{instructions}<end_of_turn>\n"
    "<start_of_turn>user\n{context}<end_of_turn>\n"
    "<start_of_turn>user\n{query}<end_of_turn>\n"
    "<start_of_turn>model\n"
)
prompt_template = PromptTemplate.from_template(GEMMA_PROMPT_LAYOUT)

# Schema passed to llm.with_structured_output so that screening answers come
# back as a single boolean field.
# NOTE(review): pydantic can place the class docstring and the Field
# description into the generated schema, so treat both texts as part of the
# prompt and edit them deliberately — confirm how ChatOllama forwards them.
class BooleanResponse(BaseModel):
    """
    Manages a structured, boolean response from a language model.
    """
    # The only field: True for "Yes", False for "No" or "Unknown".
    content : bool = Field(
        description = 
            "Respond with False if the answer is No or Unknown. "
            "Respond True only if the answer is Yes. "
    )

# Per-chunk state threaded through the screening graph
class State(TypedDict):
    """
    Values shared between graph nodes while one chunk of text is processed.
    """
    context: str            # chunk text inserted into every prompt
    definition_bool: bool   # written by screen_definition
    definition: str         # written by extract_definition (only reached when definition_bool is True)
    table_bool: bool        # written by screen_table

# Variant of llm whose replies are structured as BooleanResponse; used by the
# two screening nodes (screen_definition and screen_table).
boolean_llm = llm.with_structured_output(schema = BooleanResponse)

In [4]:
def screen_definition(state: State):
    """
    Screen the current page for a scientific definition of ponds or lakes.

    Args:
        state (State): Current state of the chat; only ``state["context"]`` is read.
    Returns:
        dict: Partial state update of the form ``{"definition_bool": bool}``.
    """
    # Adjacent string literals are concatenated verbatim, so every fragment
    # that is followed by another sentence must carry its own trailing space.
    instructions = (
        "You will be given contextual information from a page of a scientific research paper "
        "and asked to accurately answer questions about its contents. Please answer only "
        "for the information shown on the current page, and not the paper as a whole. "
        "Your answer should be a boolean value with a value of False if the "
        "answer is No or Unknown and a value of True only if the answer is Yes. "
    )
    context = state["context"]
    query = (
        "Does this page contain a definition for either ponds or lakes? "
        "A definition should specify distinguishing attributes or descriptive characteristics. "
        "The definition may be for either ponds or lakes, but not other types of waterbodies."
    )
    messages = prompt_template.invoke(
        {"instructions": instructions, "context": context, "query": query}
    )
    # boolean_llm returns a BooleanResponse; its single field is the verdict.
    response = boolean_llm.invoke(messages)
    return {"definition_bool": response.content}


def definition_routing(state: State) -> bool:
    """
    Routing key for the conditional edge that follows screen_definition.

    Args:
        state (State): Current state of the chat.
    Returns:
        bool: The stored ``definition_bool`` flag.
    """
    found_definition = state["definition_bool"]
    return found_definition


def extract_definition(state: State):
    """
    Extract a scientific definition of ponds or lakes from the given page.

    Args:
        state (State): Current state of the chat; only ``state["context"]`` is read.
    Returns:
        dict: Partial state update of the form ``{"definition": str}`` holding
            the model's free-text answer.
    """
    instructions = (
        "You will be given contextual information from a page of a scientific research paper "
        "and asked to accurately answer questions about its contents. Please answer only "
        "for the information shown on the current page, and not the paper as a whole."
    )
    context = state["context"]
    # Adjacent string literals are concatenated verbatim, so every fragment
    # that is followed by another sentence must carry its own trailing space.
    query = (
        "What definition does the context give for either ponds or lakes? "
        "A definition should specify distinguishing attributes or descriptive characteristics. "
        "The definition may be for either ponds or lakes, but not other types of waterbodies."
    )
    messages = prompt_template.invoke(
        {"instructions": instructions, "context": context, "query": query}
    )
    # Unstructured call: the plain llm is used so the answer comes back as text.
    response = llm.invoke(messages)
    return {"definition": response.content}


def screen_table(state: State):
    """
    Screen the current page for tabular data about individual ponds or lakes.

    Args:
        state (State): Current state of the chat; only ``state["context"]`` is read.
    Returns:
        dict: Partial state update of the form ``{"table_bool": bool}``.
    """
    # Adjacent string literals are concatenated verbatim, so every fragment
    # that is followed by another sentence must carry its own trailing space.
    instructions = (
        "You will be given contextual information from a page of a scientific research paper "
        "and asked to accurately answer questions about its contents. Please answer only "
        "for the information shown on the current page, and not the paper as a whole. "
        "Your answer should be a boolean value with a value of False if the "
        "answer is No or Unknown and a value of True only if the answer is Yes. "
    )
    context = state["context"]
    query = (
        "Does this page include a table containing data related to "
        "physical, chemical, or biological attributes of individual ponds or lakes? "
        "Data must be reported in a table format, and should only be given for individually "
        "studied ponds or lakes, instead of aggregate statistics for groups of waterbodies. "
        "Examples include but are not limited to depth, surface area, temperature, or pH."
    )

    messages = prompt_template.invoke(
        {"instructions": instructions, "context": context, "query": query}
    )
    # boolean_llm returns a BooleanResponse; its single field is the verdict.
    response = boolean_llm.invoke(messages)
    return {"table_bool": response.content}


def table_routing(state: State) -> bool:
    """
    Routing key based on the outcome of screen_table.

    Not attached to any edge in the graph construction visible in this
    notebook, where screen_table leads straight to END.

    Args:
        state (State): Current state of the chat.
    Returns:
        bool: The stored ``table_bool`` flag.
    """
    found_table = state["table_bool"]
    return found_table

In [5]:
# Per-chunk workflow:
#   START -> screen_definition -> [extract_definition if a definition was found] -> screen_table -> END
graph_builder = StateGraph(State)

# Nodes
graph_builder.add_node("screen_definition", screen_definition)
graph_builder.add_node("extract_definition", extract_definition)
graph_builder.add_node("screen_table", screen_table)

# Edges: definition_routing returns the boolean used as the key into the route map.
definition_routes = {True: "extract_definition", False: "screen_table"}
graph_builder.add_edge(START, "screen_definition")
graph_builder.add_conditional_edges(
    "screen_definition",
    definition_routing,
    path_map=definition_routes,
)
graph_builder.add_edge("extract_definition", "screen_table")
graph_builder.add_edge("screen_table", END)

graph = graph_builder.compile()

In [12]:
# Scratch check left from development: the recorded output is False, i.e. the
# compiled graph object is not directly callable.
callable(graph)

False

In [25]:
# Run the compiled graph on the entries at index 1 and 2 of doc.pages.
# The saved history further down is indexed by these identifiers and has no
# context column (presumably the effect of ignore=['context'] — TODO confirm).
chat = ChatWithHistory(llm=graph)
for page_index in (1, 2):
    response = chat.invoke(
        {"context": doc.pages[page_index]},
        identifier=f"Chunk {page_index}",
        ignore=['context'],
    )

In [26]:
# Write the collected per-chunk results to disk; the table rendered below this
# cell shows one row per identifier.
chat.save('../experiments/data/test_history.csv')

Unnamed: 0,definition_bool,definition,table_bool
Chunk 1,True,Ponds are described as small standing waters v...,False
Chunk 2,False,,False


In [196]:
# NOTE(review): scratch cell — this import belongs with the other imports at
# the top of the notebook if the cell is kept.
import pandas as pd

# Rows with partially overlapping keys: from_dict(orient='index') produces the
# union of keys as columns and leaves missing entries empty (see output below).
D = {
    0: dict(dinner='chicken', lunch='salad', breakfast='eggs'),
    1: dict(dinner='fish', brunch='soup', breakfast='toast'),
}
pd.DataFrame.from_dict(D, orient='index')

Unnamed: 0,dinner,lunch,breakfast,brunch
0,chicken,salad,eggs,
1,fish,,toast,soup


In [192]:
# Scratch cell: a flat list of strings becomes a one-column frame whose column
# label is 0 (see output below). Reuses the name D from the previous scratch
# cell for a different kind of object and relies on pd already being imported.
D = ["hello", "world"]
pd.DataFrame(D)

Unnamed: 0,0
0,hello
1,world
