In [1]:
import os
from sentence_transformers import SentenceTransformer
import faiss

from systematic_review import *

%load_ext autoreload
%autoreload 2

In [2]:
model = SentenceTransformer("allenai/scibert_scivocab_uncased")

No sentence-transformers model found with name allenai/scibert_scivocab_uncased. Creating a new one with mean pooling.


In [2]:
directory = "../collection/examples/processed/"
papers = get_filenames_in_directory(directory)

In [98]:
import re
paper = papers[0]
file_path = os.path.join(directory, paper)
doi = paper.partition(".grobid")[0]
doi = doi.replace("_", "/")
doc = XmlDocument(doi = doi)
doc.load(file_path, token_size = 512, separators = ["(.)\n\n(.)", r'([.?!]"?)\n(.)'])

In [88]:
print(doc.full_text)

# Drivers of carbon dioxide and methane supersaturation in small, temporary ponds
## Abstract
Inland waters are an important component of the global carbon cycle, but there is a poor understanding of carbon dynamics in very small ponds. In this study, I evaluated the concentrations and drivers of carbon dioxide (CO 2 ) and methane (CH 4 ) in six small (\1000 m 2 ), temporary, forested ponds in Connecticut, USA. The six ponds were on average 19-fold supersaturated in CO 2 and 504-fold supersaturated in CH 4 relative to the atmosphere. For both gases, this level of supersaturation is among the highest reported for lentic freshwaters. The physical, chemical, and biological parameters in the ponds differ from larger lakes, and may explain the supersaturation. Specifically, the ponds have high terrestrial carbon loadings, are shallow, and polymictic, meaning much of the water is in contact with the carbon-rich sediments. Pond CO 2 concentrations were best predicted by a negative relationshi

In [99]:
[p[-1] for p in doc.pages]

['.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 'O',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 '2',
 ' ',
 's',
 's']

In [96]:
doc.pages

['# Drivers of carbon dioxide and methane supersaturation in small, temporary ponds\n## Abstract\nInland waters are an important component of the global carbon cycle, but there is a poor understanding of carbon dynamics in very small ponds. In this study, I evaluated the concentrations and drivers of carbon dioxide (CO 2 ) and methane (CH 4 ) in six small (\\1000 m 2 ), temporary, forested ponds in Connecticut, USA. The six ponds were on average 19-fold supersaturated in CO 2 and 504-fold supersaturated in CH 4 relative to the atmosphere. For both gases, this level of supersaturation is among the highest reported for lentic freshwaters. The physical, chemical, and biological parameters in the ponds differ from larger lakes, and may explain the supersaturation. Specifically, the ponds have high terrestrial carbon loadings, are shallow, and polymictic, meaning much of the water is in contact with the carbon-rich sediments. Pond CO 2 concentrations were best predicted by a negative relati

In [79]:
directory = "../collection/examples/processed/"
papers = get_filenames_in_directory(directory)
token_size = 200

# Load and process documents as chunks with specified token size:
all_content = []
content_id = []
for paper in papers:
    file_path = os.path.join(directory, paper)
    doi = paper.partition(".grobid")[0]
    doi = doi.replace("_", "/")
    doc = XmlDocument(doi = doi)
    doc.load(file_path, token_size = token_size, separators = ["\n\n", "\n", ". "])

    for i,page in enumerate(doc.pages):
        if page == ".":
            continue
        else:
            text = page.strip()
            if text.startswith(". "):
                text = text[2:]
            all_content.append(text)
            content_id.append(f"{doi}_{i}")


In [80]:
all_content

['# Drivers of carbon dioxide and methane supersaturation in small, temporary ponds\n## Abstract',
 'Inland waters are an important component of the global carbon cycle, but there is a poor understanding of carbon dynamics in very small ponds. In this study, I evaluated the concentrations and drivers of carbon dioxide (CO 2 ) and methane (CH 4 ) in six small (\\1000 m 2 ), temporary, forested ponds in Connecticut, USA. The six ponds were on average 19-fold supersaturated in CO 2 and 504-fold supersaturated in CH 4 relative to the atmosphere. For both gases, this level of supersaturation is among the highest reported for lentic freshwaters. The physical, chemical, and biological parameters in the ponds differ from larger lakes, and may explain the supersaturation. Specifically, the ponds have high terrestrial carbon loadings, are shallow, and polymictic, meaning much of the water is in contact with the carbon-rich sediments',
 'Pond CO 2 concentrations were best predicted by a negative 

In [90]:
embeddings = model.encode(all_content)
query = "A pond is a small body of still water, usually less than 5 acres (2 hectares) in surface " \
"area and typically less than 6.6 feet (2 meters) deep. It can be natural or artificial and is " \
"often shallow enough for sunlight to reach the bottom, supporting plant and animal life throughout."

index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)
distances, indices = index.search(model.encode([query]), k=100)
for i, idx in enumerate(indices[0]):
    print(f"Result {i+1}:")
    print(f"Content ID: {content_id[idx]}")
    print(f"Distance: {distances[0][i]}")
    print(f"Content: {all_content[idx]}\n")

Result 1:
Content ID: ponds2_3
Distance: 41.23841857910156
Content: ## Introduction Ponds are small lentic ecosystems that permanently or temporarily hold water [1]. They are shallow and their size ranges from a few square metres to several hectares. They can be natural or man-made. Their number is much higher than that of large lakes, which constitute a small percentage of the total number of lakes [2]. Despite this, studies of lentic ecosystems have concentrated mainly on moderately large lakes [2].
Ponds differ functionally from larger lakes [3], since their littoral structure and its productivity dominates the ecosystem [2]. Despite their small size they contain a significant part of aquatic biodiversity on the landscape scale [4,5]. Humans have created millions of ponds for multiple purposes [2], but today they serve as refugia for a variety of freshwater biota [5] and are, as such, an irreplaceable type of habitat [6][7][8].

Result 2:
Content ID: definitions2_9
Distance: 41.3225

In [39]:
query_pairs = [(query, all_content[idx]) for idx in indices[0]]
query_pairs

[('A pond is a small body of still water, usually less than 5 acres (2 hectares) in surface area and typically less than 6.6 feet (2 meters) deep. It can be natural or artificial and is often shallow enough for sunlight to reach the bottom, supporting plant and animal life throughout.',
  '## IntroductionPonds are small lentic ecosystems that permanently or temporarily hold water [1]. They are shallow and their size ranges from a few square metres to several hectares. They can be natural or man-made'),
 ('A pond is a small body of still water, usually less than 5 acres (2 hectares) in surface area and typically less than 6.6 feet (2 meters) deep. It can be natural or artificial and is often shallow enough for sunlight to reach the bottom, supporting plant and animal life throughout.',
  '. In the UK landscape, the majority of ditches are 1-3 m wide, with only a small proportion narrower or wider than this (Brown et al., 2006). A survey of the ditch network by Shore et al'),
 ('A pond i

In [91]:
from sentence_transformers import CrossEncoder

model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L6-v2')
scores = model.predict(query_pairs)


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

In [93]:
import numpy as np
np.argmax(scores)

np.int64(36)

In [94]:
all_content[indices[0][np.argmax(scores)]]

"Although there is ample evidence of hydrologic, chemical and biological links between these 'upstream' and 'downstream' waterbodies, few studies specifically address the magnitude of these fluxes. Some effects of ponds and wetlands on downstream waters are due to their isolation, rather than their connectivity, particularly because of the wetland's ability to isolate material fluxes (US EPA, 2015). Finally, small-scale water supply is provided by ponds throughout the world but has attracted little scientific attention. In Eurasia, small-scale water storage using ponds has occurred for centuries and many ponds in Europe have been created for water supply reasons. In the Americas, as agricultural intensification continues, a large number of agricultural ponds have been created, probably having a substantial, but little investigated, impact on hydrology"

### Experiments with text splitting

In [85]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import MarkdownTextSplitter

text_splitter = MarkdownTextSplitter(
    chunk_size=20,  # Or your desired chunk size
    chunk_overlap=0, # Or your desired chunk overlap
    #separators=["\n\n", "\n"], # Include period as a separator
)

text = "This is the first sentence. This is the second sentence."
chunks = text_splitter.split_text(text)

In [86]:
chunks

['This is the first', 'sentence. This is', 'the second', 'sentence.']

In [87]:
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter

class MarkdownRecursiveSplitter:
    def __init__(self, chunk_size=1000, chunk_overlap=100):
        self.paragraph_splitter = RecursiveCharacterTextSplitter(
            separators=["\n\n", r"(?<=\.) "],
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    def split_markdown(self, text: str):
        sections = self._split_by_headers(text)
        final_chunks = []

        for section in sections:
            if self._is_header(section):
                final_chunks.append(section.strip())
            else:
                chunks = self.paragraph_splitter.split_text(section)
                final_chunks.extend(chunks)

        return final_chunks

    def _split_by_headers(self, text: str):
        """
        Splits text into sections, keeping headers as their own elements.
        """
        lines = text.splitlines()
        chunks = []
        buffer = []

        for line in lines:
            if re.match(r"^#{1,6} ", line):  # Markdown headers from # to ######
                if buffer:
                    chunks.append("\n".join(buffer).strip())
                    buffer = []
                chunks.append(line.strip())  # Keep header as its own chunk
            else:
                buffer.append(line)
        if buffer:
            chunks.append("\n".join(buffer).strip())

        return chunks

    def _is_header(self, text: str) -> bool:
        return bool(re.match(r"^#{1,6} ", text.strip()))


In [89]:
markdown_text = """
# Introduction

This paper explores the physics of black holes. It explains recent advancements in astrophysics. This paper explores the physics of black holes. It explains recent advancements in astrophysics. This paper explores the physics of black holes. It explains recent advancements in astrophysics. This paper explores the physics of black holes. It explains recent advancements in astrophysics. This paper explores the physics of black holes. It explains recent advancements in astrophysics. This paper explores the physics of black holes. It explains recent advancements in astrophysics. This paper explores the physics of black holes. It explains recent advancements in astrophysics. This paper explores the physics of black holes. It explains recent advancements in astrophysics. 

## Related Work

Previous studies have shown interesting phenomena.

### Subsection

More technical details are covered here.
"""

splitter = MarkdownRecursiveSplitter(chunk_size=300)
chunks = splitter.split_markdown(markdown_text)

for i, chunk in enumerate(chunks):
    print(f"[{i}] {repr(chunk)}")


[0] '# Introduction'
[1] 'This paper explores the physics of black holes. It explains recent advancements in astrophysics. This paper explores the physics of black holes. It explains recent advancements in astrophysics. This paper explores the physics of black holes. It explains recent advancements in astrophysics. This paper explores the physics of black holes. It explains recent advancements in astrophysics. This paper explores the physics of black holes. It explains recent advancements in astrophysics. This paper explores the physics of black holes. It explains recent advancements in astrophysics. This paper explores the physics of black holes. It explains recent advancements in astrophysics. This paper explores the physics of black holes. It explains recent advancements in astrophysics.'
[2] '## Related Work'
[3] 'Previous studies have shown interesting phenomena.'
[4] '### Subsection'
[5] 'More technical details are covered here.'
