In [38]:
import os
import numpy as np
import pandas as pd
from IPython.display import Markdown, display
import markdown2
from systematic_review import *

### Pond Screening

In [33]:
# Pond screening results; the first CSV column is used as the row index.
fname = "../extraction/data/pond/pond_screening3.csv"
df = pd.read_csv(filepath_or_buffer=fname, index_col=0)

In [None]:
# Row subsets for the individual screening flags
definitions = df.loc[df['definition_bool'].eq(True)]
tables = df.loc[df['table_bool'].eq(True)]

# Rows where at least one of the relevant flags is True
relevant = df.loc[df['definition_bool'] | df['table_bool'] | df['measurement_bool']]

# Report distinct DOIs rather than row counts, since a DOI can appear on
# several rows of the screening file.
print(f"Number of relevant papers: {relevant['doi'].nunique()}")
print(f"Number of papers with a definition: {definitions['doi'].nunique()}")
print(f"Number of papers with a table: {tables['doi'].nunique()}")

In [None]:
# Dump every row flagged as containing a definition: DOI, the extracted
# definition, then the source text, followed by two blank separator lines.
for _, sample in definitions.iterrows():
    print(
        f"DOI: {sample.doi}:",
        f"Definition: {sample.definition}",
        "",
        f"Text: {sample.text}",
        "",
        "",
        sep="\n",
    )

In [None]:
# Dump every row flagged as containing a table: DOI, a blank line, the
# source text, then two blank separator lines.
for _, sample in tables.iterrows():
    print(
        f"DOI: {sample.doi}:",
        "",
        f"Text: {sample.text}",
        "",
        "",
        sep="\n",
    )

### Coastal Screening

In [54]:
# Coastal screening results, read as UTF-8; the first CSV column is used as
# the row index.
fname = "../extraction/data/coastal/screening_100_3.csv"
df = pd.read_csv(filepath_or_buffer=fname, index_col=0, encoding="utf-8")

def normalize_quotes(text):
    """Return ``text`` with curly quotation marks replaced by straight ones.

    Left and right double quotes become ``"``; left and right single quotes
    become ``'``. Any value that is not a ``str`` (e.g. NaN) is returned
    unchanged.
    """
    if not isinstance(text, str):
        return text  # nothing to normalize on non-string values

    straight_for = {"“": '"', "”": '"', "‘": "'", "’": "'"}
    return text.translate(str.maketrans(straight_for))

# Only object-dtype columns are cleaned; normalize_quotes itself passes
# non-string cells through untouched.
text_columns = df.select_dtypes(include='object').columns

for col in text_columns:
    df[col] = df[col].map(normalize_quotes)

In [55]:
# Number of rows per DOI, most frequent first
df["doi"].value_counts()

doi
10.1002/lno.11734    66
10.1002/lno.11678    49
10.1002/lno.12161    46
10.1002/lno.11731    45
10.1002/lno.11773    44
                     ..
10.1002/lno.11878     1
10.1002/lno.11271     1
10.1002/lno.12758     1
10.1002/lno.12179     1
10.1002/lno.12780     1
Name: count, Length: 100, dtype: int64

In [56]:
# Row subsets for the individual screening flags
definitions = df.loc[df['definition_bool'].eq(True)]
tables = df.loc[df['table_bool'].eq(True)]
measurements = df.loc[df['measurement_bool'].eq(True)]

# Combined subsets: any flag set, definition OR measurement, definition AND measurement
relevant = df.loc[df['definition_bool'] | df['table_bool'] | df['measurement_bool']]
definitions_or_measurements = df.loc[df['definition_bool'] | df['measurement_bool']]
definitions_and_measurements = df.loc[df['definition_bool'] & df['measurement_bool']]

# Report distinct DOIs rather than row counts, since a DOI can appear on
# many rows of the screening file.
print(f"Number of relevant papers: {relevant['doi'].nunique()}")
print(f"Number of papers with a definition: {definitions['doi'].nunique()}")
print(f"Number of papers with a table: {tables['doi'].nunique()}")
print(f"Number of papers with a measurement: {measurements['doi'].nunique()}")
print(f"Number of papers with a definition or a measurement: {definitions_or_measurements['doi'].nunique()}")

Number of relevant papers: 27
Number of papers with a definition: 13
Number of papers with a table: 26
Number of papers with a measurement: 6
Number of papers with a definition or a measurement: 15


In [57]:
# Document preamble: inline stylesheet for the standalone review page.
# (An earlier one-line-per-rule version of this list was overwritten
# immediately and has been removed as dead code.)
html_parts = [
    "<html><head><style>",
    """
    body {
      font-family: Arial, sans-serif;
      padding: 20px;
      font-size: 16px; /* Base font size */
    }
    h1 {
      font-size: 24px;  /* Headings like 'DOI', 'Text' */
      color: #222;
      margin-bottom: 10px;
    }
    p {
      font-size: 16px;
      line-height: 1.6;
    }
    mark {
      background-color: #ffff99;
    }
    div.entry {
      margin-bottom: 50px;
      padding: 20px;
      border: 2px solid #333;
      border-radius: 8px;
      background-color: #f9f9f9;
      box-shadow: 2px 2px 8px rgba(0, 0, 0, 0.1);
    }
    """,
    "</style></head><body>"
]

# Subset of rows to render into the report.
category = measurements

# One bordered "entry" card per row. `i` is the DataFrame index label of the
# row (not its position within `category`), shown 1-based.
for i, row in category.iterrows():
    html_parts.append('<div class="entry">')
    html_parts.append(f"<h1><u><strong>Entry:</strong></u> {i + 1}</h1>")
    html_parts.append(f"<h1><u><strong>DOI:</strong></u> {row['doi']}</h1>")
    # Closing tags are emitted innermost-first (</u> before </h1>) so the
    # markup is well-formed; flag values stay after the heading as before.
    html_parts.append("<h1><u><strong>Text:</strong></u></h1>")
    html_parts.append(markdown2.markdown(row['text']))
    html_parts.append(f"<h1><u><strong>Definition:</strong></u></h1> {row['definition_bool']}")
    html_parts.append(f"<h1><u><strong>Table:</strong></u></h1> {row['table_bool']}")
    html_parts.append(f"<h1><u><strong>Measurement:</strong></u></h1> {row['measurement_bool']}")
    if row['definition_bool']:
        html_parts.append("<h1><u><strong>LLM Definition:</strong></u></h1>")
        # The definition cell can be missing (NaN) even when the flag is set;
        # only hand real strings to the markdown renderer.
        if isinstance(row['definition'], str):
            html_parts.append(markdown2.markdown(row['definition']))
    html_parts.append("</div>")

html_parts.append("</body></html>")

# Write the assembled page next to the notebook.
with open("output.html", "w", encoding="utf-8") as f:
    f.write("\n".join(html_parts))

In [58]:
# Display the rows whose table_bool flag is True (bare last expression,
# so the notebook renders the DataFrame).
tables

Unnamed: 0,doi,chunk,abstract_bool,text,definition_bool,table_bool,measurement_bool,definition
14,10.1002/lno.12365,6,True,## In situ data\nAksnes and Ohman (2009) assem...,False,True,False,
38,10.1002/lno.11678,7,True,## Radon and radium data\nMost of the radon an...,False,True,False,
39,10.1002/lno.11678,8,True,## Initial estimation of mass balance fluxes\n...,False,True,False,
40,10.1002/lno.11678,9,True,To estimate the weighted average 222 Rn and 22...,False,True,False,
43,10.1002/lno.11678,12,True,Sensitivity to average tracer concentrations i...,False,True,False,
...,...,...,...,...,...,...,...,...
1046,10.1002/lno.12241,19,True,Area Site name Latitude ( N) Longitude ( E) Ag...,False,True,True,
1052,10.1002/lno.12241,25,True,To address local-scale variation in mangrove s...,False,True,True,
1076,10.1002/lno.12471,13,True,Due to no consensus of the best k-wind speed p...,False,True,False,
1078,10.1002/lno.12471,15,True,## Discussion\n## Spatial-temporal variability...,False,True,False,
