In [3]:
import os
import numpy as np
import pandas as pd
from IPython.display import Markdown, display
import markdown2
from systematic_review import *

### Pond Screening

In [2]:
# Read the pond screening CSV; its first column is used as the row index.
fname = "../extraction/data/pond/pond_screening3.csv"
df = pd.read_csv(filepath_or_buffer=fname, index_col=0)

In [None]:
# Summary counts of distinct papers (DOIs) per screening flag.
# Each flag is compared against True explicitly, so a missing flag (NaN)
# counts as "not flagged" in every mask instead of depending on how `|`
# treats NaN in the raw columns.
has_definition = df['definition_bool'].eq(True)
has_table = df['table_bool'].eq(True)
has_measurement = df['measurement_bool'].eq(True)

# At least one of the relevant columns is True
relevant = df.loc[has_definition | has_table | has_measurement]
print(f"Number of relevant papers: {relevant.doi.nunique()}")

# Papers with a definition
definitions = df.loc[has_definition]
print(f"Number of papers with a definition: {definitions.doi.nunique()}")

# Papers with a table
tables = df.loc[has_table]
print(f"Number of papers with a table: {tables.doi.nunique()}")

In [None]:
# Print every chunk flagged as containing a definition: DOI, the extracted
# definition, then the chunk text, with blank lines separating entries.
for row_label, sample in definitions.iterrows():
    print("\n".join([
        f"DOI: {sample.doi}:",
        f"Definition: {sample.definition}",
        "",
        f"Text: {sample.text}",
        "",
        "",
    ]))

In [None]:
# Print every chunk flagged as containing a table: DOI followed by the
# chunk text, with blank lines separating entries.
for row_label, sample in tables.iterrows():
    print("\n".join([
        f"DOI: {sample.doi}:",
        "",
        f"Text: {sample.text}",
        "",
        "",
    ]))

### Coastal Screening

In [4]:
# Read the coastal screening CSV as UTF-8; its first column is the row index.
fname = "../extraction/data/coastal/screening_12k.csv"
df = pd.read_csv(filepath_or_buffer=fname, encoding="utf-8", index_col=0)

# Maps each typographic (curly) quote character to its straight ASCII form.
_QUOTE_TABLE = str.maketrans({
    "“": '"',
    "”": '"',
    "‘": "'",
    "’": "'",
})


def normalize_quotes(text):
    """Replace curly double/single quotes in ``text`` with straight quotes.

    Values that are not strings (e.g., NaN) are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    return text.translate(_QUOTE_TABLE)

# Apply quote normalisation element-wise to every object-dtype column;
# non-string cells are passed through untouched by normalize_quotes.
text_columns = df.select_dtypes(include='object').columns

for col in text_columns:
    df[col] = df[col].map(normalize_quotes)

In [5]:
# Number of rows (chunks) per DOI, most frequent first.
df["doi"].value_counts()

doi
10.1002/lno.12199    104
10.1002/lno.11759     82
10.1002/lno.12254     73
10.1002/lno.11734     65
10.1002/lno.12260     63
                    ... 
10.1002/lno.12141      1
10.1002/lno.12125      1
10.1002/lno.12792      1
10.1002/lno.11503      1
10.1002/lno.12753      1
Name: count, Length: 1947, dtype: int64

In [6]:
# Display the current screening frame for a quick visual check (pandas truncates the rendered rows).
df

Unnamed: 0,doi,chunk,text,ecosystem_bool,ecosystem,definition_bool,measurement_bool,table_bool,measurement,definition
0,10.1002/lno.12682,0,# Aerobic anoxygenic phototrophic bacteria cor...,False,,,,,,
1,10.1002/lno.12682,1,"## Untitled Section\nThe epipelagic zone, wher...",False,,,,,,
2,10.1002/lno.12682,2,Aerobic anoxygenic phototrophic bacteria harve...,True,"According to the excerpt, the following types ...",False,False,False,,
3,10.1002/lno.12682,3,"Additionally, and in contrast to the growing k...",False,,,,,,
4,10.1002/lno.12682,4,## Materials and methods\n## Sampling\nThe POS...,False,,,,,,
...,...,...,...,...,...,...,...,...,...,...
54225,10.1002/lno.11145,16,Mortality rates of Daphnia were modified by th...,False,,,,,,
54226,10.1002/lno.11145,17,For a parthenogenetic invertebrate such as Dap...,False,,,,,,
54227,10.1002/lno.11145,18,We have shown how predator cues and food C : P...,False,,,,,,
54228,10.1002/lno.11145,19,"### Fig. 1 .**Caption:** Fig. 1. (a) MSGR, (b)...",False,,,,,,


In [13]:
# Summary counts of distinct papers (DOIs) per screening flag.
# The definition/table/measurement flags contain missing values (NaN) for
# some chunks, so each one is compared against True explicitly: a missing
# flag counts as "not flagged" in every mask, including the combined ones.
has_definition = df['definition_bool'].eq(True)
has_table = df['table_bool'].eq(True)
has_measurement = df['measurement_bool'].eq(True)

# Chunks whose ecosystem flag is set
relevant = df.loc[df['ecosystem_bool']]
print(f"Number of relevant papers: {relevant.doi.nunique()}")

# Chunks whose ecosystem flag is explicitly False
negatives = df.loc[df['ecosystem_bool'] == False]

# Papers with a definition
definitions = df.loc[has_definition]
print(f"Number of papers with a definition: {definitions.doi.nunique()}")

# Papers with a table
tables = df.loc[has_table]
print(f"Number of papers with a table: {tables.doi.nunique()}")

# Papers with a measurement
measurements = df.loc[has_measurement]
print(f"Number of papers with a measurement: {measurements.doi.nunique()}")

# Papers with a measurement or a definition
definitions_or_measurements = df.loc[has_definition | has_measurement]
print(f"Number of papers with a definition or a measurement: {definitions_or_measurements.doi.nunique()}")

# Chunks carrying both a definition and a measurement
definitions_and_measurements = df.loc[has_definition & has_measurement]

Number of relevant papers: 1352
Number of papers with a definition: 316
Number of papers with a table: 779
Number of papers with a measurement: 137
Number of papers with a definition or a measurement: 379


In [43]:
# Build a standalone HTML page from a reproducible sample of table-flagged
# chunks, showing each chunk's text, its screening flags and the LLM outputs,
# and write it to disk for manual review.
html_parts = [
    "<html><head><style>",
    """
    body {
      font-family: Arial, sans-serif;
      padding: 20px;
      font-size: 16px; /* Base font size */
    }
    h1 {
      font-size: 24px;  /* Headings like 'DOI', 'Text' */
      color: #222;
      margin-bottom: 10px;
    }
    p {
      font-size: 16px;
      line-height: 1.6;
    }
    mark {
      background-color: #ffff99;
    }
    div.entry {
      margin-bottom: 50px;
      padding: 20px;
      border: 2px solid #333;
      border-radius: 8px;
      background-color: #f9f9f9;
      box-shadow: 2px 2px 8px rgba(0, 0, 0, 0.1);
    }
    """,
    "</style></head><body>"
]


def _heading(label, inline=None):
    """Return an underlined, bold <h1> heading with correctly nested tags.

    When ``inline`` is given it is placed inside the heading after the label.
    """
    suffix = "" if inline is None else f" {inline}"
    return f"<h1><u><strong>{label}:</strong></u>{suffix}</h1>"


def _is_set(flag):
    """Return True only for a present, truthy flag; NaN counts as not set."""
    return bool(pd.notna(flag) and flag)


# Fixed seed keeps the sample reproducible; cap n so a short frame does not raise.
category = tables.sample(n=min(10, len(tables)), random_state=567)

for i, row in category.iterrows():
    html_parts.append('<div class="entry">')
    # `i` is the row's index label in the source frame, not a running counter.
    html_parts.append(_heading("Entry", i + 1))
    html_parts.append(_heading("DOI", row['doi']))
    html_parts.append(_heading("Text"))
    html_parts.append(markdown2.markdown(row['text']))

    if _is_set(row['ecosystem_bool']):
        html_parts.append(f"{_heading('Ecosystem')} {markdown2.markdown(row['ecosystem'])}")
        html_parts.append(f"{_heading('Definition')} {row['definition_bool']}")
        html_parts.append(f"{_heading('Table')} {row['table_bool']}")
        html_parts.append(f"{_heading('Measurement')} {row['measurement_bool']}")

        # A plain truthiness test would treat NaN as True; _is_set does not.
        if _is_set(row['definition_bool']):
            html_parts.append(_heading("LLM Definition"))
            html_parts.append(markdown2.markdown(row['definition']))

        if _is_set(row['measurement_bool']):
            html_parts.append(_heading("LLM Measurement"))
            html_parts.append(markdown2.markdown(row['measurement']))

    html_parts.append("</div>")

html_parts.append("</body></html>")

with open("../../notes/results/tables.html", "w", encoding="utf-8") as f:
    f.write("\n".join(html_parts))

In [34]:
# Inspect the ecosystem column (NaN where no value is present).
df["ecosystem"]

0                                                      NaN
1                                                      NaN
2        According to the excerpt, the following types ...
3                                                      NaN
4                                                      NaN
                               ...                        
54225                                                  NaN
54226                                                  NaN
54227                                                  NaN
54228                                                  NaN
54229                                                  NaN
Name: ecosystem, Length: 54230, dtype: object