In [2]:
import pandas as pd

### Gemma extraction

In [19]:
# Read the Gemma screening results; the first CSV column becomes the row index.
gemma_df = pd.read_csv(
    '../experiments/data/pond_screening3.csv',
    index_col=0,
)
# Distinct titles, most frequent first (value_counts sorts by count, descending).
titles = gemma_df['title'].value_counts().index

In [27]:
titles

Index(['Definition 2: The importance of small waterbodies for biodiversity and ecosystem services: implications for policy makers',
       'Fake 2: Methane and Carbon Dioxide Fluxes in a Temperate Tidal Salt Marsh: Comparisons Between Plot and Ecosystem Measurements',
       'Lake 3: Patterns in the Species Composition and Richness of Fish Assemblages in Northern Wisconsin Lakes',
       'Lake 1: Lake metabolism scales with lake morphometry and catchment conditions',
       'Fake 1: Quantifying saltmarsh vegetation and its effect on wave height dissipation: Results from a UK East coast saltmarsh',
       'Pond 3: Drivers of carbon dioxide and methane supersaturation in small, temporary ponds',
       'Definition 3: Agricultural Freshwater Pond Supports Diverse and Dynamic Bacterial and Viral Populations',
       'Lake 2: Net Heterotrophy in Small Danish Lakes: A Widespread Feature  Over Gradients in Trophic Status and Land Cover',
       'Pond 1: Specificity of zooplankton distribution

In [63]:
titles[0]

'Definition 2: The importance of small waterbodies for biodiversity and ecosystem services: implications for policy makers'

In [88]:
# Pick one paper by its position in `titles`.
# NOTE(review): position 6 depends on the value_counts ordering of `titles`;
# confirm it still points at the intended paper if the CSV changes.
title = titles[6]
print(title)

# Rows of the screening table that carry the selected title.
title_df = gemma_df[gemma_df['title'] == title]

# Tallies of the two per-row flags, plus the distinct definition strings.
definition_bool = title_df['definition_bool'].value_counts()
table_bool = title_df['table_bool'].value_counts()
definitions = title_df['definition'].value_counts().index

Definition 3: Agricultural Freshwater Pond Supports Diverse and Dynamic Bacterial and Viral Populations


In [89]:
definition_bool

definition_bool
False    12
True      1
Name: count, dtype: int64

In [90]:
table_bool

table_bool
False    12
True      1
Name: count, dtype: int64

In [91]:
definitions

Index(['Ponds are generally defined as small (1 m² to ∼50,000 m²) shallow, standing water bodies that can either permanently or temporarily collect freshwater. \n\nLakes are not specifically defined in this text, but ponds are distinguished from them as being *smaller* in size.\n'], dtype='object', name='definition')

### Elicit experiments

In [14]:
# Pond data table.
# NOTE(review): 'unicode_escape' is an unusual codec for a CSV -- presumably a
# workaround for non-UTF-8 bytes; confirm the file's real encoding.
pond_df = pd.read_csv(
    '../../pond-data/pond_data.csv',
    encoding='unicode_escape',
)
# Elicit search results (going by the file name).
elicit_df = pd.read_csv('../../pond-data/elicit-data-search.csv')

In [15]:
pond_df

Unnamed: 0,author,year,title,journal,citation,author_term,location,pondname,humanbuilt_manipulated,ponduse,...,macrophytespresence,macrophytes_percentcover,ph,turbidity_secchi_m,tss_mgpl,doc_mgpl,chla_ugpl,tp_ugpl,tn_ugpl,cond_uspcm
0,kuczynska-kippen; n.; basinska; a. m.; swidnic...,2013,specificity of zooplankton distribution in met...,knowledge and management of aquatic ecosystems,Kuczynska-Kippen N; Basinska A; Swidnicki K. ...,meteorite crater ponds,western poland,meteor1,n,,...,,,6.730,,,,28.827,1383.3,,173.0
1,kuczynska-kippen; n.; basinska; a. m.; swidnic...,2013,specificity of zooplankton distribution in met...,knowledge and management of aquatic ecosystems,Kuczynska-Kippen N; Basinska A; Swidnicki K. ...,meteorite crater ponds,western poland,meteor2,n,,...,,,6.950,,,,15.640,1223.3,,761.0
2,kuczynska-kippen; n.; basinska; a. m.; swidnic...,2013,specificity of zooplankton distribution in met...,knowledge and management of aquatic ecosystems,Kuczynska-Kippen N; Basinska A; Swidnicki K. ...,meteorite crater ponds,western poland,meteor3,n,,...,,,6.755,,,,8.150,695.0,,338.0
3,kuczynska-kippen; n.; basinska; a. m.; swidnic...,2013,specificity of zooplankton distribution in met...,knowledge and management of aquatic ecosystems,Kuczynska-Kippen N; Basinska A; Swidnicki K. ...,meteorite crater ponds,western poland,meteor4,n,,...,,,6.555,,,,29.940,1130.0,,445.0
4,hanrahan,2008,multivariate chemometrical classification and ...,ecological informatics,Hanrahan Grady; Siraj Gibani; Kent Miller. 200...,aquatic environment,mojave desert; california; united states,lake tuendae,y,fisheries,...,,,9.400,,,,,90.0,980.0,45.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1322,eskinazi-sant'anna; eneida maria; pace; michae...,2018,the potential of the zooplankton resting-stage...,journal of plankton research,Eskinazi-SantAnna Eneida Maria; Michael L Pace...,temporary lake,brazil,meio,,,...,,,,,,,,,,
1323,eskinazi-sant'anna; eneida maria; pace; michae...,2018,the potential of the zooplankton resting-stage...,journal of plankton research,Eskinazi-SantAnna Eneida Maria; Michael L Pace...,temporary lake,brazil,seca,,,...,,,5.400,,,,8.200,11.3,276.2,
1324,sepulveda-jauregui; armando; martinez-cruz; ka...,2018,assessment of methane and carbon dioxide emiss...,freshwater biology,Sepulveda-Jauregui A; Martinez-Cruz K; Lau M; ...,small acidic bog lake,mecklenburg-brandenburg lake district,grosse fuchskuhle,n,other,...,,,4.700,,,2.7,8.000,,,
1325,sepulveda-jauregui; armando; martinez-cruz; ka...,2018,assessment of methane and carbon dioxide emiss...,freshwater biology,Sepulveda-Jauregui A; Martinez-Cruz K; Lau M; ...,small acidic bog lake,mecklenburg-brandenburg lake district,grosse fuchskuhle ne,y,other,...,,,6.500,,,16.4,32.300,27.0,1700.0,44.8


In [16]:
import re
import unicodedata

def extract_title(citation):
    """Pull the title sentence out of a citation string.

    The title is taken to be the text that follows a four-digit year and a
    period (e.g. ``"2013. "``), up to and including the first ``.``, ``?`` or
    ``!`` that is followed by whitespace and an uppercase ASCII letter.

    Args:
        citation: Citation text. Non-string values (``None``, or the float
            NaN pandas uses for missing cells) are accepted and yield ``None``.

    Returns:
        The stripped title including its closing punctuation, or ``None``
        when the input is not a string or the pattern does not match.
    """
    # Missing cells arrive as None/NaN; re.search would raise TypeError on them.
    if not isinstance(citation, str):
        return None
    match = re.search(r'\d{4}\.\s+(.*?[.?!])(?=\s+[A-Z])', citation)
    if match:
        return match.group(1).strip()
    return None

def normalize_text(text):
    """Reduce a title to a canonical form for fuzzy comparison.

    Steps, in order: lowercase; decompose with NFKD and drop every non-ASCII
    code point (so accented letters lose their accents); delete every
    character that is not ``a-z``, ``0-9`` or whitespace; collapse runs of
    whitespace to one space and strip the ends. Punctuation is deleted rather
    than replaced by a space, so ``"macro-invertebrate"`` becomes
    ``"macroinvertebrate"``.

    Args:
        text: Text to normalize. Non-string values (``None``, or the float
            NaN pandas uses for missing cells) are accepted.

    Returns:
        The normalized string; ``''`` when ``text`` is not a string.
    """
    # Missing cells arrive as None/NaN and have no .lower(); treat them as empty.
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

from rapidfuzz import fuzz, process

def find_best_match(title, other_titles, threshold=75):
    """Find the entry of ``other_titles`` most similar to ``title``.

    ``title`` is passed through ``normalize_text`` before comparison;
    ``other_titles`` is compared as given, so callers should pass it already
    normalized. Similarity is ``rapidfuzz.fuzz.ratio``.

    Args:
        title: Title to look up.
        other_titles: Candidate strings to compare against.
        threshold: Minimum score a candidate must reach to be returned.

    Returns:
        The result of ``process.extractOne`` (callers read the matched string
        at index 0 and the score at index 1), or ``None`` when no candidate
        scores at least ``threshold``.
    """
    normalized_title = normalize_text(title)
    # score_cutoff has rapidfuzz drop candidates scoring below the threshold
    # itself, returning None when nothing qualifies, so no manual check is needed.
    return process.extractOne(
        normalized_title,
        other_titles,
        scorer=fuzz.ratio,
        score_cutoff=threshold,
    )

In [18]:
# Distinct pond-dataset titles (value_counts leaves out missing values),
# normalized so they can be compared with the Elicit titles.
pond_list = [normalize_text(title) for title in pond_df['title'].value_counts().index]
# Elicit titles; dropna() keeps a blank Title cell from reaching normalize_text.
elicit_list = [normalize_text(title) for title in elicit_df['Title'].dropna()]

# Count the pond titles that have an Elicit title scoring above the default threshold.
matches = 0
for t in pond_list:
    result = find_best_match(t, elicit_list)
    if result:
        matches += 1
        print(f"Match found: {t} => {result[0]} (Score: {result[1]})")
    else:
        print("No match")

No match
No match
No match
No match
No match
Match found: physical and chemical limnological characteristics of 38 lakes and ponds on bathurst island nunavut canadian high arctic => physical and chemical characteristics of 1300 lakes and ponds across the canadian arctic (Score: 77.88461538461539)
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
Match found: the diversity of macroinvertebrate and macrophyte communities in ponds => the diversity of macro invertebrate and macrophyte communities in ponds (Score: 99.29078014184397)
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No match
No 

In [17]:
matches

2

### Screening experiments

In [11]:
fname = "../extraction/data/pond_screening4.csv"
df = pd.read_csv(
    fname,
    index_col=0,  # the first CSV column becomes the row index
)

In [12]:
df

Unnamed: 0,doi,chunk,abstract_bool,definition_bool,table_bool
0,10.1002/lno.12769,-1,True,,
1,10.1002/lno.12769,0,True,False,False
2,10.1002/lno.12769,1,True,False,False
3,10.1002/lno.12769,2,True,False,False
4,10.1002/lno.12769,3,True,False,False
...,...,...,...,...,...
467,10.1002/lno.70044,0,True,False,False
468,10.1002/lno.12754,-1,True,,
469,10.1002/lno.12754,0,True,False,False
470,10.1002/lno.70005,-1,False,,


In [15]:
df.abstract_bool.value_counts()

abstract_bool
True     377
False     95
Name: count, dtype: int64

In [13]:
df.definition_bool.value_counts()

definition_bool
False    338
Name: count, dtype: int64

In [14]:
df.table_bool.value_counts()

table_bool
False    321
True      17
Name: count, dtype: int64

In [17]:
# DOIs with at least one row flagged table_bool == True, most frequent first.
# .eq(True) rather than using the column directly as a mask: rows where the
# flag is missing compare as False instead of making the indexing fail.
df.loc[df['table_bool'].eq(True), 'doi'].value_counts().index

Index(['10.1002/lno.70020', '10.1002/lno.12767', '10.1002/lno.70095',
       '10.1002/lno.12788', '10.1002/lno.70053', '10.1002/lno.70064'],
      dtype='object', name='doi')