In [2]:
import os
import pandas as pd
from systematic_review import *

### Gemma extraction

In [19]:
# Load the Gemma screening results; the first CSV column becomes the row index.
gemma_df = pd.read_csv(
    '../experiments/data/pond_screening3.csv',
    index_col=0,
)
# Distinct paper titles; value_counts orders them from most to least frequent.
titles = gemma_df['title'].value_counts().index

In [27]:
# Show the distinct titles (an Index built from value_counts on the title column).
titles

Index(['Definition 2: The importance of small waterbodies for biodiversity and ecosystem services: implications for policy makers',
       'Fake 2: Methane and Carbon Dioxide Fluxes in a Temperate Tidal Salt Marsh: Comparisons Between Plot and Ecosystem Measurements',
       'Lake 3: Patterns in the Species Composition and Richness of Fish Assemblages in Northern Wisconsin Lakes',
       'Lake 1: Lake metabolism scales with lake morphometry and catchment conditions',
       'Fake 1: Quantifying saltmarsh vegetation and its effect on wave height dissipation: Results from a UK East coast saltmarsh',
       'Pond 3: Drivers of carbon dioxide and methane supersaturation in small, temporary ponds',
       'Definition 3: Agricultural Freshwater Pond Supports Diverse and Dynamic Bacterial and Viral Populations',
       'Lake 2: Net Heterotrophy in Small Danish Lakes: A Widespread Feature  Over Gradients in Trophic Status and Land Cover',
       'Pond 1: Specificity of zooplankton distribution

In [88]:
# Select one paper by its position in `titles` and summarise its screening rows.
# Position 6 is the paper printed in the recorded output ("Definition 3: ...").
title = titles[6]
print(title)

# Rows of the screening table that belong to the selected paper.
is_selected = gemma_df['title'] == title
title_df = gemma_df[is_selected]

# Tallies of the two boolean screening flags for this paper.
definition_bool, table_bool = (
    title_df[column].value_counts()
    for column in ('definition_bool', 'table_bool')
)
# Distinct definition texts recorded for this paper.
definitions = title_df['definition'].value_counts().index

Definition 3: Agricultural Freshwater Pond Supports Diverse and Dynamic Bacterial and Viral Populations


### Elicit experiments

In [14]:
# Pond dataset (bibliographic columns plus per-pond measurements, per the
# displayed header).
# NOTE(review): 'unicode_escape' also interprets backslash sequences found in
# the file — confirm it is the intended encoding and not a workaround for a
# latin-1 / cp1252 file.
pond_df = pd.read_csv(
    '../../pond-data/pond_data.csv',
    encoding='unicode_escape',
)
# Elicit search export; its 'Title' column is compared against pond titles later.
elicit_df = pd.read_csv(
    '../../pond-data/elicit-data-search.csv',
)

In [15]:
# Display the pond dataset (pandas elides the middle rows and columns).
pond_df

Unnamed: 0,author,year,title,journal,citation,author_term,location,pondname,humanbuilt_manipulated,ponduse,...,macrophytespresence,macrophytes_percentcover,ph,turbidity_secchi_m,tss_mgpl,doc_mgpl,chla_ugpl,tp_ugpl,tn_ugpl,cond_uspcm
0,kuczynska-kippen; n.; basinska; a. m.; swidnic...,2013,specificity of zooplankton distribution in met...,knowledge and management of aquatic ecosystems,Kuczynska-Kippen N; Basinska A; Swidnicki K. ...,meteorite crater ponds,western poland,meteor1,n,,...,,,6.730,,,,28.827,1383.3,,173.0
1,kuczynska-kippen; n.; basinska; a. m.; swidnic...,2013,specificity of zooplankton distribution in met...,knowledge and management of aquatic ecosystems,Kuczynska-Kippen N; Basinska A; Swidnicki K. ...,meteorite crater ponds,western poland,meteor2,n,,...,,,6.950,,,,15.640,1223.3,,761.0
2,kuczynska-kippen; n.; basinska; a. m.; swidnic...,2013,specificity of zooplankton distribution in met...,knowledge and management of aquatic ecosystems,Kuczynska-Kippen N; Basinska A; Swidnicki K. ...,meteorite crater ponds,western poland,meteor3,n,,...,,,6.755,,,,8.150,695.0,,338.0
3,kuczynska-kippen; n.; basinska; a. m.; swidnic...,2013,specificity of zooplankton distribution in met...,knowledge and management of aquatic ecosystems,Kuczynska-Kippen N; Basinska A; Swidnicki K. ...,meteorite crater ponds,western poland,meteor4,n,,...,,,6.555,,,,29.940,1130.0,,445.0
4,hanrahan,2008,multivariate chemometrical classification and ...,ecological informatics,Hanrahan Grady; Siraj Gibani; Kent Miller. 200...,aquatic environment,mojave desert; california; united states,lake tuendae,y,fisheries,...,,,9.400,,,,,90.0,980.0,45.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1322,eskinazi-sant'anna; eneida maria; pace; michae...,2018,the potential of the zooplankton resting-stage...,journal of plankton research,Eskinazi-SantAnna Eneida Maria; Michael L Pace...,temporary lake,brazil,meio,,,...,,,,,,,,,,
1323,eskinazi-sant'anna; eneida maria; pace; michae...,2018,the potential of the zooplankton resting-stage...,journal of plankton research,Eskinazi-SantAnna Eneida Maria; Michael L Pace...,temporary lake,brazil,seca,,,...,,,5.400,,,,8.200,11.3,276.2,
1324,sepulveda-jauregui; armando; martinez-cruz; ka...,2018,assessment of methane and carbon dioxide emiss...,freshwater biology,Sepulveda-Jauregui A; Martinez-Cruz K; Lau M; ...,small acidic bog lake,mecklenburg-brandenburg lake district,grosse fuchskuhle,n,other,...,,,4.700,,,2.7,8.000,,,
1325,sepulveda-jauregui; armando; martinez-cruz; ka...,2018,assessment of methane and carbon dioxide emiss...,freshwater biology,Sepulveda-Jauregui A; Martinez-Cruz K; Lau M; ...,small acidic bog lake,mecklenburg-brandenburg lake district,grosse fuchskuhle ne,y,other,...,,,6.500,,,16.4,32.300,27.0,1700.0,44.8


In [16]:
import re
import unicodedata

def extract_title(citation):
    """Pull the article title out of a formatted citation string.

    The title is taken to be the text that follows a four-digit year and its
    period (``"... 2008. "``) and runs up to the first ``.``, ``?`` or ``!``
    that is followed by whitespace and an ASCII uppercase letter.

    Args:
        citation: Citation text such as
            ``"Smith J; Doe A. 2008. Some title. Journal Name"``.

    Returns:
        The title including its closing punctuation, stripped of surrounding
        whitespace, or ``None`` when ``citation`` is not a string (for
        example a NaN from an empty CSV cell) or no title-like span is found.
    """
    # Empty CSV cells arrive as float NaN; re.search would raise TypeError.
    if not isinstance(citation, str):
        return None
    match = re.search(r'\d{4}\.\s+(.*?[.?!])(?=\s+[A-Z])', citation)
    if match:
        return match.group(1).strip()
    return None

def normalize_text(text):
    """Reduce a title to a canonical form for fuzzy comparison.

    Steps, in order: lowercase; NFKD-decompose and drop every non-ASCII code
    point (so an accented letter keeps its base letter); delete anything that
    is not ``a-z``, ``0-9`` or whitespace; collapse whitespace runs to single
    spaces and trim both ends.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string (possibly empty).
    """
    ascii_only = (
        unicodedata.normalize('NFKD', text.lower())
        .encode('ASCII', 'ignore')
        .decode()
    )
    # Punctuation is removed outright, not replaced by a space.
    kept = re.sub(r'[^a-z0-9\s]', '', ascii_only)
    # split() with no argument discards leading/trailing and repeated whitespace.
    return ' '.join(kept.split())

from rapidfuzz import fuzz, process

def find_best_match(title, other_titles, threshold=75):
    """Find the closest entry to ``title`` among ``other_titles``.

    ``title`` is passed through ``normalize_text`` before comparison;
    ``other_titles`` is compared exactly as given, so callers should
    normalise it themselves. Similarity is scored with ``fuzz.ratio``.

    Args:
        title: The title to look up.
        other_titles: Candidate strings to compare against.
        threshold: Minimum score a candidate must reach to be accepted.

    Returns:
        Whatever ``process.extractOne`` returned for the best candidate
        (element 0 is the matched string and element 1 its score), or
        ``None`` when there is no candidate or its score is below
        ``threshold``.
    """
    candidate = process.extractOne(
        normalize_text(title),
        other_titles,
        scorer=fuzz.ratio,
    )
    # Reject both "nothing returned" and "best score too low".
    if not candidate or candidate[1] < threshold:
        return None
    return candidate

In [18]:
# Unique pond titles, normalised for comparison (value_counts omits NaN).
# extract_title() is not applied here: it parses citation-style strings,
# whereas the `title` column appears to hold bare titles already.
pond_list = [normalize_text(title) for title in pond_df['title'].value_counts().index]
# Drop missing Elicit titles first; normalize_text only accepts strings.
elicit_list = [normalize_text(title) for title in elicit_df['Title'].dropna()]

# Look up every pond title among the Elicit titles and log each outcome.
matches = 0
for t in pond_list:
    result = find_best_match(t, elicit_list)
    if result:
        matches += 1
        print(f"Match found: {t} => {result[0]} (Score: {result[1]})")
    else:
        print("No match")

# Report the tally the loop accumulates so the hit rate is visible at a glance.
print(f"{matches} of {len(pond_list)} pond titles matched an Elicit title")

No match
No match
No match
No match
No match
Match found: physical and chemical limnological characteristics of 38 lakes and ponds on bathurst island nunavut canadian high arctic => physical and chemical characteristics of 1300 lakes and ponds across the canadian arctic (Score: 77.88461538461539)
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
Match found: the diversity of macroinvertebrate and macrophyte communities in ponds => the diversity of macro invertebrate and macrophyte communities in ponds (Score: 99.29078014184397)
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No 

### Screening Experiments

In [3]:
# Coastal screening results; the first CSV column becomes the row index.
fname = "../extraction/data/coastal/screening3.csv"
df = pd.read_csv(filepath_or_buffer=fname, index_col=0)

In [4]:
# Display the coastal screening table (pandas elides the middle rows).
df

Unnamed: 0,doi,chunk,abstract_bool,definition_bool,table_bool,definition
0,10.1002/lno.12769,-1,False,,,
1,10.1002/lno.70071,-1,False,,,
2,10.1002/lno.12802,-1,True,,,
3,10.1002/lno.12802,0,True,False,False,
4,10.1002/lno.12812,-1,False,,,
...,...,...,...,...,...,...
1955,10.1002/lno.70072,18,True,False,False,
1956,10.1002/lno.70072,19,True,False,False,
1957,10.1002/lno.70072,20,True,False,False,
1958,10.1002/lno.70072,21,True,False,False,


In [5]:
# Tally of rows flagged True vs False in abstract_bool.
df.abstract_bool.value_counts()

abstract_bool
True     1902
False      58
Name: count, dtype: int64

In [34]:
# Screening prompt: asks whether a page holds a per-ecosystem data table.
# Adjacent string literals are concatenated as-is, so every fragment except
# the last must end with a space.
# NOTE(review): results produced with an earlier wording of this prompt may
# need regenerating — confirm.
query = (
    "Does this page include a table containing data related to "
    "physical, chemical, or biological attributes of coastal ecosystems? "
    "Data must be reported in a table format, and should only be given for individually "
    "studied ecosystems, instead of aggregate statistics for groups of ecosystems. "
    "Examples include but are not limited to water depth, temperature, or pH. "
    "Coastal ecosystems may include but are not limited to intertidal zones, estuaries, "
    "lagoons, reefs, mangroves, marshes, seagrass meadows, kelp forests, and coastal wetlands."
)
print(query)

Does this page include a table containing data related to physical, chemical, or biological attributes of coastal ecosystems? Data must be reported in a table format, and should only be given for individually studied ecosystems, instead of aggregate statistics for groups of ecosystems. Examples include but are not limited to water depth, temperature, or pH.Coastal ecosystems may include but are not limited to intertidal zones, estuaries, lagoons, reefs, magroves, marshes, segagrass meadows, kelp forests, and coastal wetlands.


In [10]:
# List the column names of the screening table.
df.columns

Index(['doi', 'chunk', 'abstract_bool', 'definition_bool', 'table_bool',
       'definition'],
      dtype='object')

In [12]:
# True/False tally for definition_bool; value_counts leaves NaN rows out by default.
df.definition_bool.value_counts()

definition_bool
False    1809
True       17
Name: count, dtype: int64

In [7]:
# True/False tally for table_bool; value_counts leaves NaN rows out by default.
df.table_bool.value_counts()

table_bool
False    1743
True       83
Name: count, dtype: int64

In [9]:
# Raw `definition` text of the first row flagged definition_bool == True.
# The explicit `== True` comparison keeps NaN rows out of the mask.
df.loc[df.definition_bool == True].iloc[0].definition

'```json\n{\n  "ecosystems": [\n    {\n      "name": "Mangroves",\n      "quantitative_attributes": [\n        "Store and sequester 0.5-0.8% of anthropogenic carbon dioxide emissions (141-466 tons of carbon yr À1 )"\n      ]\n    },\n    {\n      "name": "Salt Marshes",\n      "quantitative_attributes": [\n        "Store and sequester 0.5-0.8% of anthropogenic carbon dioxide emissions (141-466 tons of carbon yr À1 )"\n      ]\n    },\n    {\n      "name": "Seagrass Meadows",\n      "quantitative_attributes": [\n        "Store and sequester 0.5-0.8% of anthropogenic carbon dioxide emissions (141-466 tons of carbon yr À1 )"\n      ]\n    },\n    {\n      "name": "Macroalgal forests",\n      "quantitative_attributes": [\n        "First-order estimates of particulate macroalgal organic carbon sequestration suggest a contribution of 56 (10-170) tons of carbon yr À1 globally"\n      ]\n    }\n  ]\n}\n```</end_of_turn>'

In [79]:
# Print (rather than echo) the fifth flagged definition so its newlines render.
print(df.loc[df.definition_bool == True].iloc[4].definition)

**Seagrass:** Defined by seagrass cover, decreasing from 60% in seagrass plots to 0% in tidal flat plots. The transition from seagrass to tidal flat plots occurs across an aboveground transition, with the inflection point located at -0.04 m on the transect.

**Mangrove:** Defined by mangrove cover, decreasing from approximately 100% in inner mangrove sampling plots to 0% in surrounding tidal flat. The transition occurs across an aboveground transition with inflection points at 0.84 m (Sungei Buloh) and 0.66 m (Sungei Puaka).

**Tidal Flats:** Defined by a lack of seagrass and mangrove cover (0% cover). They occur adjacent to both seagrass and mangrove ecosystems.

</end_of_turn>


In [15]:
# Number of rows flagged definition_bool == True for each DOI.
df.loc[df.definition_bool == True].doi.value_counts()

doi
10.1002/lno.12811    2
10.1002/lno.12795    2
10.1002/lno.70056    2
10.1002/lno.12739    2
10.1002/lno.70006    1
10.1002/lno.70080    1
10.1002/lno.70089    1
10.1002/lno.12742    1
10.1002/lno.70062    1
10.1002/lno.70060    1
10.1002/lno.12772    1
10.1002/lno.12724    1
10.1002/lno.70046    1
Name: count, dtype: int64

In [50]:
# All screening rows recorded for a single DOI.
df.loc[df.doi == '10.1002/lno.12730']

Unnamed: 0,doi,chunk,abstract_bool,definition_bool,table_bool,definition
69,10.1002/lno.12730,-1,True,,,
70,10.1002/lno.12730,0,True,False,False,
71,10.1002/lno.12730,1,True,False,False,
72,10.1002/lno.12730,2,True,False,False,
73,10.1002/lno.12730,3,True,False,False,
74,10.1002/lno.12730,4,True,False,False,
75,10.1002/lno.12730,5,True,False,True,
76,10.1002/lno.12730,6,True,False,True,
77,10.1002/lno.12730,7,True,False,True,
78,10.1002/lno.12730,8,True,False,True,


In [54]:
# Load one processed GROBID TEI file into an XmlDocument.
directory = "../collection/processed/"
paper = '10.1002_lno.12730.grobid.tei.xml'
token_size = 1000  # presumably the per-page token budget used by load() — TODO confirm
file_path = os.path.join(directory, paper)

# Rebuild the DOI from the file name: keep the part before ".grobid" and turn
# underscores back into slashes.
# NOTE(review): every "_" becomes "/", so a DOI containing a literal
# underscore would be altered — confirm the file-naming scheme.
doi = paper.partition(".grobid")[0].replace("_", "/")

doc = XmlDocument(doi=doi)
doc.load(file_path, token_size=token_size)


In [75]:
# Print the entry at index 5 of the loaded document's `pages`.
print(doc.pages[5])

## Satellite data
The level-3 sea surface Chl a and SST with 4-km spatial resolution were derived from the Visible and Infrared Imager/ Radiometer Suite and Moderate Resolution Imaging Spectroradiometer, respectively, and downloaded from Ocean Color Web (http://oceancolor.gsfc.nasa.gov). For comparison, both daily sea surface Chl a and SST were averaged across the coverage of Lat. ( N) Â Long. ( E) = (121. 3 Â 26.8,123 Â 26.2,122.7 Â 25.2,and 121 Â 25.8;Fig. 1a).

## Data analyses
For comparison, the depth-integrated values of the chemical and biological variables for the euphotic zone were estimated using the trapezoidal method for each station. The depth-integrated values and their mean values were further compared and analyzed. Statistical analyses included t-tests for comparison, multiple linear regression analyses, and ANOVA, all conducted using SigmaStat (version 3.5, Systat Software, Inc.). Additionally, to asses linear correlation between two variables, we employed type II regr