In [1]:
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import MarkdownHeaderTextSplitter
from IPython.display import Markdown
import os

ModuleNotFoundError: No module named 'langchain_chroma'

In [2]:
# from langchain_chroma import Chroma
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

from langchain.retrievers import ParentDocumentRetriever 
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_community.document_loaders import PDFPlumberLoader
from langchain_community.document_transformers import MarkdownifyTransformer
from langchain_community.document_loaders import AsyncHtmlLoader

from langchain.storage import InMemoryStore

# Load OPENAI_API_KEY (and any other settings) from a local .env file.
load_dotenv()
# NOTE(review): api_key is not passed anywhere below; the OpenAI clients
# presumably read OPENAI_API_KEY from the environment themselves - confirm.
api_key = os.getenv('OPENAI_API_KEY')
client = ChatOpenAI(model='gpt-4o')

# The report texts. The encoding is given explicitly so that accented
# place names decode the same way on every platform instead of depending
# on the locale default.
with open('report_results.md', 'r', encoding='utf-8') as file:
    new_file = file.read()

with open('vaud_report_results.md', 'r', encoding='utf-8') as file:
    vaud_file = file.read()

with open('iqaasl.md', 'r', encoding='utf-8') as file:
    iqaasl_file = file.read()

# The reports are split on their first two heading levels.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
]

context_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
iq_header_splits = context_splitter.split_text(iqaasl_file)
md_header_splits = context_splitter.split_text(new_file)
vd_header_splits = context_splitter.split_text(vaud_file)

embeddings = OpenAIEmbeddings()

# Vector store holding only the main survey report.
vectorstore = Chroma(collection_name="my_collection", embedding_function=embeddings)
vectorstore.add_documents(md_header_splits)
# vectorstore.add_documents(iq_header_splits)
# vectorstore.add_documents(vd_header_splits)

# Background references on the statistical methods used in the report.
urls = [
    "https://www.bayesrulesbook.com/chapter-6",
    "https://allendowney.github.io/ThinkBayes2/hospital.html",
    "https://en.wikipedia.org/wiki/Dirichlet-multinomial_distribution",
    "https://en.wikipedia.org/wiki/Conjugate_prior",
    "https://en.wikipedia.org/wiki/Theil%E2%80%93Sen_estimator",
    "https://en.wikipedia.org/wiki/Random_forest",
    "https://en.wikipedia.org/wiki/Linear_regression",
    "https://en.wikipedia.org/wiki/Gradient_boosting",
    "https://en.wikipedia.org/wiki/Bootstrap_aggregating",
    "https://en.wikipedia.org/wiki/Cluster_analysis",
]
loader = AsyncHtmlLoader(urls)
docs = loader.load()

# Convert the fetched HTML to markdown so it can be split on headings too.
md = MarkdownifyTransformer()
converted_docs = md.transform_documents(docs)

# The web references are split one heading level deeper than the reports.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
# t: one list of header-split documents per fetched page.
t = [markdown_splitter.split_text(x.page_content) for x in converted_docs]




USER_AGENT environment variable not set, consider setting it to identify your requests.
Fetching pages: 100%|###########################| 10/10 [00:02<00:00,  4.03it/s]


In [3]:
client = ChatOpenAI(model='gpt-4o', temperature=.5)

# default_retriever_args = {'search_kwargs':{"k": 4, "score_threshold": 0.5}} 
# retriever_d = vectorstore.as_retriever(
#     search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.5, "k":4}
# )
# # "score_threshold": 0.5
# # , search_kwargs={"score_threshold": 0.5}

# Adjacent string literals are concatenated verbatim, so every fragment
# that continues a sentence must end with a space.
system_prompt = (
    "You are an environmental scientist. You have been assigned the task of "
    "responding to clients' questions about the beach-litter survey data in their region. "
    "Your assistants have assembled the relevant context for you. Please use the context to "
    "answer the client's question in a professional manner. The customer may have specific instructions. "
    "Follow them. If you don't know the answer, say that you don't know."
    "\n\n"
    "{context}"
)

promptx = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(client, promptx)
# data_chain = create_retrieval_chain(retriever_d, question_answer_chain)

# A data chain with one survey report plus the background references.
# This store gets its own collection name: in-memory Chroma instances that
# share a collection name share the underlying collection, which would
# index the survey splits a second time and return duplicate chunks.
vectorstoret = Chroma(collection_name="survey_and_references", embedding_function=embeddings)
vectorstoret.add_documents(md_header_splits)

# add the resources
for reference_splits in t:
    # A page that produced no splits has nothing to index.
    if reference_splits:
        vectorstoret.add_documents(reference_splits)

rdt = vectorstoret.as_retriever(
    search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.5, "k":4}
)
data_chait = create_retrieval_chain(rdt, question_answer_chain)

# from langchain.retrievers.document_compressors import LLMChainFilter
# from langchain.retrievers import ContextualCompressionRetriever
# _filter = LLMChainFilter.from_llm(client)
# compression_retriever = ContextualCompressionRetriever(
#     base_compressor=_filter, base_retriever=retriever_d
# )
# data_chainx = create_retrieval_chain(compression_retriever, question_answer_chain)

In [20]:
# parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
# child_splitter = RecursiveCharacterTextSplitter(chunk_size=250)
# # The vectorstore to use to index the child chunks
# text_vectorstore_i = Chroma(
#     collection_name="split_parents", embedding_function=OpenAIEmbeddings()
# )

# text_store_i = InMemoryStore()

# text_retriever_i = ParentDocumentRetriever(
#     vectorstore=text_vectorstore_i,
#     docstore=text_store_i,
#     child_splitter=child_splitter,
#     parent_splitter=parent_splitter,
# )

# land_use_loader = PDFPlumberLoader('resources/revealing_the_role_of_landuse.pdf')
# land_use_def = land_use_loader.load()

# the_guide_loader = PDFPlumberLoader('resources/eu-guide-marine-litter-2023.pdf')
# mlw_guide = the_guide_loader.load()

# history_marine_litter = PDFPlumberLoader('resources/brief_history_marine_litter.pdf')
# history_marine_def = history_marine_litter.load()

# leakage_loader = PDFPlumberLoader('resources/identifying_leakage_un_2020.pdf')
# leakage_def = leakage_loader.load()

# water_shed = PDFPlumberLoader('resources/anthropogenic-litter-accumulation-watershed.pdf')
# water_sh = water_shed.load()

# eu_baselines = PDFPlumberLoader('resources/eu_marine_litter_baselines.pdf')
# eu_base = eu_baselines.load()

# land_use_malaysia = PDFPlumberLoader('resources/land-use-marine-litter-malaysia.pdf')
# land_um = land_use_malaysia.load()

# road_side_l = PDFPlumberLoader('resources/road-side-litter-accumulation-ny.pdf')
# road_side = road_side_l.load()

# fore_cast = PDFPlumberLoader('resources/Walvoort-ea-2021-Modelling-Forecasting-Beach-Litter-Assessment-Values-1.pdf')
# fore_cv = fore_cast.load()

# thold_val = PDFPlumberLoader('resources/coastline_litter_threshold_value_report_14_9_2020_final.pdf')
# thold_v = thold_val.load()

# text_retriever_i.add_documents(land_use_def, ids=None)
# text_retriever_i.add_documents(history_marine_def, ids=None)
# text_retriever_i.add_documents(leakage_def, ids=None)
# text_retriever_i.add_documents(water_sh, ids=None)
# text_retriever_i.add_documents(eu_base, ids=None)
# text_retriever_i.add_documents(land_um, ids=None)
# text_retriever_i.add_documents(road_side, ids=None)
# text_retriever_i.add_documents(fore_cv, ids=None)
# text_retriever_i.add_documents(thold_v, ids=None)
# text_retriever_i.add_documents(mlw_guide, ids=None)

In [21]:
# new_retriever = text_retriever_i.vectorstore.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.5, "k":4})

# qtext_answer_chain = create_stuff_documents_chain(client, promptx)
# dtext_chain = create_retrieval_chain(new_retriever, qtext_answer_chain)

In [22]:
import session_config
from session_config import  collect_survey_data, feature_variables, agg_groups
from reports import make_report_objects, reports_and_forecast
from reports import admin_report, features_present, histograms_standard
from reports import ecdf_plots_standard, scatter_plot_standard
from reports import labels_for_display, make_standard_report, make_report_objects
import pandas as pd

import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from IPython.display import display, Markdown
from myst_nb import glue
from shapely.geometry import Point

# Survey locations; lat_lon keeps only the coordinates, indexed by slug.
beaches = pd.read_csv('data/end_process/beaches.csv')
lat_lon = beaches.set_index('slug')[['latitude', 'longitude']]
# Survey records as returned by the project helper.
datax = collect_survey_data()
# Object-code reference table, indexed by code.
codes = pd.read_csv('data/end_process/codes.csv', index_col='code')

def map_markers(df, lat_lon: pd.DataFrame = lat_lon):
    """Summarise survey results per location and attach map coordinates.

    Parameters
    ----------
    df : DataFrame with at least the columns 'location', 'sample_id',
        'quantity', 'pcs/m' and 'date'.
    lat_lon : DataFrame indexed by location, with 'latitude' and
        'longitude' columns.

    Returns
    -------
    tuple
        A GeoDataFrame (EPSG:4326) with one point per location, and a dict
        with the keys 'max_lat', 'min_lat', 'max_lon' and 'min_lon'.
    """
    by_location = df.groupby('location', observed=True)
    per_location = pd.concat(
        [
            by_location['sample_id'].nunique(),
            by_location['quantity'].sum(),
            by_location['pcs/m'].mean().round(2),
            by_location['date'].max(),
        ],
        axis=1,
    )

    # Inner join on the index: locations without coordinates are dropped.
    located = per_location.merge(lat_lon, left_index=True, right_index=True)
    located['location'] = located.index

    extent = {
        'max_lat': located['latitude'].max(),
        'min_lat': located['latitude'].min(),
        'max_lon': located['longitude'].max(),
        'min_lon': located['longitude'].min(),
    }

    records = located.to_dict(orient='records')
    points = [Point(rec['longitude'], rec['latitude']) for rec in records]
    markers = gpd.GeoDataFrame(records, geometry=points, crs="EPSG:4326")

    return markers, extent

def scatter_plot_display(report_results, standard_report, region=None):
    """Build the standard scatter plot of all samples and a title for it.

    Parameters
    ----------
    report_results : mapping with 'prior_report' and 'this_report'. Each is
        either a report object exposing ``sample_results()`` or the sentinel
        string 'No prior' / 'No likelihood'.
    standard_report : mapping with 'prior-labels' and 'likelihood-labels'.
        The first four characters of the prior label and the last four of
        the likelihood label are used in the title (presumably years -
        TODO confirm).
    region : optional name used in the title. Defaults to the notebook-level
        ``canton`` so that existing two-argument calls behave as before.

    Returns
    -------
    tuple
        The result of ``scatter_plot_standard(points)`` and the title string.
    """
    if region is None:
        # Backward compatible: fall back to the notebook global.
        region = canton

    points = []
    date_parts = []

    if report_results['prior_report'] != 'No prior':
        prior_label = standard_report['prior-labels']
        points.append(
            (report_results['prior_report'].sample_results(), prior_label, session_config.palette['prior'])
        )
        date_parts.append(prior_label[:4])

    if report_results['this_report'] != 'No likelihood':
        likelihood_label = standard_report['likelihood-labels']
        points.append(
            (report_results['this_report'].sample_results(), likelihood_label, session_config.palette['likelihood'])
        )
        date_parts.append(likelihood_label[-4:])

    # Joining avoids a dangling ' - ' when only the prior period is present.
    dates = ' - '.join(date_parts)
    title = f'All samples {region}: {dates}'

    return scatter_plot_standard(points), title




# from use_cases example
ooi = ['G10',  'G30', 'G31', 'G33', 'G34', 'G35', 'G8', 'G7', 'G6', 'G5', 'G4', 'G37', 'G2', 'G27', 'G25', 'G26', 'G11']
# more refined search
tobo_snacks = ['G27', 'G30', 'G35']
# unidentified, plastic, different uses
# udi = ['Gfrags', 'Gfoams']
# industrial
indus = ['G89', 'G67', 'G112', 'G93' , 'G66','G74', 'G72', 'G87', 'G65', 'G69', 'G68', 'G43', 'G41', 'G38', 'G36', 'G19', 'G17', 'Gfrags']

# Date ranges for the likelihood (o_dates) and the prior (prior_dates).
o_dates = {'start':'2020-01-01', 'end':'2021-12-31'}
prior_dates = {'start':'2015-11-15', 'end':'2019-12-31'}

# Report parameters. this_feature_type is used by every filter below so
# that changing it here is enough to retarget the whole report.
canton = 'Bern'
this_feature_type = 'l'
region_type = 'canton'

d = datax.reset_index(drop=True)
d = d[d.canton.isin(['Genève', 'Valais', 'Vaud', 'Zürich', 'Bern'])]

# make complete report
params_l = {'canton':canton, 'date_range':o_dates, 'feature_type': this_feature_type}
params_p = {'canton':canton, 'date_range':prior_dates, 'feature_type':this_feature_type}

# Filters shared by the prior catalogue and the data of interest.
is_feature_type = d.feature_type == this_feature_type
is_selected_code = d.code.isin(tobo_snacks)

# set the parameters for the weighted prior
# exclude records in the likelihood, set date range and feature type
lu_catalogue = d[
    (d.canton != canton) & (d['date'] <= o_dates['end']) & is_feature_type & is_selected_code
].copy()
lu_catalogue = lu_catalogue.reset_index(drop=True)
catalog_surveys, catalog_features = make_report_objects(lu_catalogue)

# this is the prior data: all data collected from
# the same feature type. Lakes, rivers or parks
# NOTE(review): this assigns a column on catalog_features.df_cat itself,
# not on a copy - confirm that mutating the report object is intended.
prior_feature = catalog_features.df_cat
prior_feature['feature_type'] = this_feature_type

# the prior and likelihood data from the region of interest
all_data_of_interest = d[
    (d['date'] >= prior_dates['start']) & (d['date'] <= o_dates['end']) & is_feature_type & is_selected_code
].copy()
all_data_of_interest = all_data_of_interest[all_data_of_interest.canton == canton].copy()

all_data_of_interest['use'] = 'pers'
all_data_of_interest = all_data_of_interest.reset_index(drop=True)

# Land-cover features plus the informational columns carried through.
land_covers = ['buildings', 'forest', 'undefined', 'public-services', 'streets', 'orchards', 'use', 'canton', 'city', 'feature_name']

all_report, all_land_use = make_report_objects(all_data_of_interest, info_columns = ['use', 'canton', 'city', 'feature_name'])

args = {
    'likelihood': {'canton':canton, 'date_range':o_dates},
    'prior' : {'canton':canton, 'date_range':prior_dates},
    'data' : all_data_of_interest.copy(),
    'land-use-inventory' : prior_feature.copy()
}

combined_results = reports_and_forecast(args['likelihood'], args['prior'], ldata=args['data'])
standard_combined = make_standard_report(combined_results, args)


lake_report = combined_results['this_report']
lake_prior_report = combined_results['prior_report']
lake_land_use = combined_results['this_land_use']

In [23]:
# Build the scatter plot of all samples and store it under a glue key so
# the markdown cells can embed it.
scatter_outputs = scatter_plot_display(combined_results, standard_combined)
fig, title = scatter_outputs
glue('scatter-plot-summary', fig, display=False)

In [24]:
# Histogram of the observed values. Glue the histogram itself: `fig` still
# refers to the scatter plot created in the previous cell.
lake_histogram_of_results = histograms_standard(standard_combined['observed-values'])
glue('histogram-of_results', lake_histogram_of_results, display=False)

In [25]:
# ECDF plots of the forecasted values.
# The key 'forecasts' is used further down for the written forecast summary
# (embedded with `{glue:md} forecasts`), so the figure gets its own key
# instead of competing for the same one.
lake_forecasted_distributions = ecdf_plots_standard(standard_combined['forecasted-values'])
glue('forecasted-distributions', lake_forecasted_distributions, display=False)

In [26]:
# Boundary of the selected canton, reprojected to EPSG:4326.
dbc = gpd.read_file('data/ignorethis/shapes/kantons.shp')
dbc = dbc.to_crs(epsg=4326)
dbc = dbc[dbc.NAME == canton].copy()
dbckey = dbc[['NAME', 'KANTONSNUM']].set_index('NAME')
dbckey = dbckey.drop_duplicates()
thiscanton = dbckey.loc[canton, 'KANTONSNUM']

# Municipalities that belong to the canton, and the ones that were surveyed.
db = gpd.read_file('data/ignorethis/shapes/municipalities.shp')
db = db.to_crs(epsg=4326)
thesecities = db[db.KANTONSNUM == thiscanton]
surveyedcities = all_data_of_interest.city.unique()

bounds = dbc.total_bounds
minx, miny, maxx, maxy = bounds

# Background layers, clipped to the canton boundary.
rivers = gpd.read_file('data/ignorethis/shapes/rivers.shp')
rivers = rivers.to_crs(epsg=4326)
rivers_within_bounds = rivers.clip(dbc, keep_geom_type=True) # cx[minx:maxx, miny:maxy]

# Lake shapes get their own name; `lakes` below holds the survey samples.
lake_shapes = gpd.read_file('data/ignorethis/shapes/lakes.shp')
lake_shapes = lake_shapes.to_crs(epsg=4326)
lakes_within_bounds = lake_shapes.clip(dbc, keep_geom_type=True) # [minx:maxx, miny:maxy]

# Define the plot
fig, ax = plt.subplots(figsize=(12,10))

citymap = thesecities.plot(ax=ax, edgecolor='black', facecolor='None', linewidth=.1)

surveyed = thesecities[thesecities.NAME.isin(surveyedcities)].plot(ax=ax, color='salmon', alpha=0.6)

dbc.plot(ax=ax, edgecolor='black', facecolor='None', linewidth=.4)
rivers_within_bounds.plot(ax=ax, edgecolor='dodgerblue', alpha=.4)
lakes_within_bounds.plot(ax=ax, edgecolor='dodgerblue', color='dodgerblue', linewidth=.2, alpha=.4)

# Survey locations: likelihood period as filled markers, prior period as crosses.
lakes = lake_report.sample_results()
prior_samples = lake_prior_report.sample_results()

lakes_locations,  map_boundsa = map_markers(lakes)
lakes_locations.plot(ax=ax, color= session_config.palette['likelihood'], markersize=80, zorder=5)

prior_locations,  map_boundsb = map_markers(prior_samples)
prior_locations.plot(ax=ax, color= session_config.palette['prior'], marker='x', markersize=80, zorder=5)

ax.set_title(f'Survey locations {canton}')
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_axis_off()

# Proxy handles for the legend. linestyle='None' keeps only the marker;
# otherwise each handle is drawn with a default-coloured line through it.
legend_elements = [
    Line2D([0], [0], linestyle='None', marker='o', label='After 2020', markersize=10, markeredgecolor='w', markerfacecolor= session_config.palette['likelihood']),
    Line2D([0], [0], linestyle='None', marker='x', label='Before 2020', markersize=10, markeredgecolor= session_config.palette['prior'])
]

ax.legend(handles=legend_elements, loc='upper right')

glue('map-of-survey-locations', fig, display=False)
# Close this figure explicitly so it is not rendered a second time inline.
plt.close(fig)

# Canton Bern Lakes

## Vital Statistics

In [27]:
# Ask the retrieval chain for a summary of the administrative boundaries
# section and store the answer for the markdown cell below.
requesting = 'Please summarize the following sections:'
sections_to_summarize = 'Administrative boundaries Bern 2015-11-15 2021-12-31'
instructions = 'Please answer in paragraph form and label your response with "### Political boundaries"'

# Request, an "Instructions" marker and the instructions, separated by blank lines.
admin_query = '\n\n'.join([f'{requesting} {sections_to_summarize}', 'Instructions', instructions])
admin = data_chait.invoke({'input': admin_query})
glue('political-boundaries', Markdown(admin['answer']), display=False)

```{glue:md} political-boundaries
:format: myst
```

In [28]:
# Ask the retrieval chain for a summary of the named features section
# and store the answer for the markdown cell below.
requesting = 'Please summarize the following sections:'
sections_to_summarize = 'Named features Bern 2015-11-15 2021-12-31 : The lakes, rivers and parks'
instructions = 'Please answer in paragraph form and label your response with "### Features of interest"'

features_query = '{} {}\n\nInstructions\n\n{}'.format(requesting, sections_to_summarize, instructions)
surveyed_features = data_chait.invoke({'input': features_query})
glue('surveyed-features', Markdown(surveyed_features['answer']), display=False)

```{glue:md} surveyed-features
:format: myst
```

```{glue} map-of-survey-locations
```

In [29]:
# Ask the retrieval chain for the descriptive statistics of the survey
# results and store the answer for the markdown cell below.
requesting = 'Please summarize the following sections:'
sections_to_summarize = 'Summary statistics Bern 2015-11-15 2021-12-31: The descriptive statistics of the survey results'
instructions = 'Identify the average pcs/m, the 90% interval, answer in paragraph form and label your response: "### Summary of survey results"'

summary_results = data_chait.invoke({'input': f'{requesting} {sections_to_summarize}\n\nInstructions\n\n{instructions}'})
glue('summary-results', Markdown(summary_results['answer']), display=False)

```{glue:md} summary-results
:format: myst
```

```{glue} scatter-plot-summary
```

In [30]:
# Ask the retrieval chain about the inventory of objects found and store
# the answer for the markdown cell below.
requesting = 'Please summarize the following sections:'
# NOTE(review): the section string is left exactly as written; it presumably
# mirrors a heading in the report, so correcting its spelling here could
# change what the retriever matches - confirm against the report.
sections_to_summarize = 'Inventory items Bern 2015-11-15 2021-12-31 : The complete list of the objects found and indentified included in this report.'
instructions = 'What was the quantity, average density, % of total and fail rate per object category of the top two items? Answer in paragraph format and provide a table of values. How many were found? Label your response "### Inventory and most common".'

inventory = data_chait.invoke({'input': f'{requesting} {sections_to_summarize}\n\nInstructions\n\n{instructions}'})
glue('inventory', Markdown(inventory['answer']), display=False)

```{glue:md} inventory
:format: myst
```

In [31]:
# Worked example of how to read a sampling-stratification table.
# Rows 1-5 of the table correspond to the buffer-zone bands 0-20%, 20-40%,
# 40-60%, 60-80% and 80-100% described in the example paragraph.
# A plain string is enough: there are no placeholders to interpolate.
sample_read = """
\n\n
__Example of how to interpret sampling-stratification table:__\n\n
#
__Sample table:__\n\n

|   Proportion of buffer zone |   ('Proportion of samples collected', 'buildings') |   ('Proportion of samples collected', 'wetlands') |   ('Proportion of samples collected', 'forest') |   ('Proportion of samples collected', 'public-services') |   ('Proportion of samples collected', 'recreation') |   ('Proportion of samples collected', 'undefined') |   ('Proportion of samples collected', 'streets') |   ('Proportion of samples collected', 'vineyards') |   ('Proportion of samples collected', 'orchards') |
|----------------------------:|---------------------------------------------------:|--------------------------------------------------:|------------------------------------------------:|---------------------------------------------------------:|----------------------------------------------------:|---------------------------------------------------:|-------------------------------------------------:|---------------------------------------------------:|--------------------------------------------------:|
|                           1 |                                          0.0588235 |                                                 1 |                                       0.976471  |                                                0.776471  |                                                   1 |                                          0.870588  |                                                0 |                                          0.976471  |                                                 1 |
|                           2 |                                          0.0588235 |                                                 0 |                                       0.0235294 |                                                0.2       |                                                   0 |                                          0.0470588 |                                                0 |                                          0         |                                                 0 |
|                           3 |                                          0.0117647 |                                                 0 |                                       0         |                                                0.0235294 |                                                   0 |                                          0.0823529 |                                                0 |                                          0.0235294 |                                                 0 |
|                           4 |                                          0.376471  |                                                 0 |                                       0         |                                                0         |                                                   0 |                                          0         |                                                0 |                                          0         |                                                 0 |
|                           5 |                                          0.494118  |                                                 0 |                                       0         |                                                0         |                                                   0 |                                          0         |                                                0 |                                          0         |                                                 0 |


__Example paragraph__

The sampling-stratification of the surveys was as follows: 49% of the surveys were conducted at locations where 80-100% of the buffer was dedicated to buildings. 38% of the surveys were taken where 60-80% of the buffer
was dedicated to buildings. 1% of the surveys were taken where 40-60% of the buffer was dedicated to buildings. 6% of the samples were taken from locations where 20-40% of the buffer was dedicated to buildings.
6% of the samples were taken from locations where 0-20% of the buffer was dedicated to buildings. All of the samples were taken in locations where 0-20% of the buffer was dedicated to wetlands. 98% of the samples were
taken from locations where 0-20% of the buffer was dedicated to forest. 2% of surveys were taken where 20-40% of the buffer was dedicated to forest. 78% of the samples were taken from locations that had
0-20% of the buffer dedicated to public-services. 20% of the surveys were taken from locations that had 20-40% of the buffer dedicated to public services. 2% of surveys were taken from locations that had
40-60% of the buffer dedicated to public services.
"""


In [32]:
# Ask the retrieval chain to explain and summarise the sampling
# stratification table and store the answer for the markdown cell below.
requesting = 'Please summarize the following sections:'
sections_to_summarize = 'Sampling stratification Bern 2015-11-15 2021-12-31: The environmental features surrounding the survey location'
# The fragments are joined without a separator, so each one carries its own
# trailing space and every item is separated by an explicit comma.
instruction_parts = [
    "The sampling stratification quantifies what proportion of the samples were conducted according to the proportion of the buffer zone that is dedicated to a particular land use feature. ",
    "Each survey location is surrounded by a buffer zone of radius = 1 500 meters. The buffer zone is comprised of land-use features, each land use feature occupies a proportion of the buffer zone (0 - 100%). ",
    "The sampling stratification is the proportions of samples taken for each land use feature and the proportion of buffer zone that the feature occupies. Each location has the same size buffer zone. ",
    "What changes is how the land use features are distributed within the buffer zone. You are given a table of values. Here is an example of how to read it: ",
    "Suppose the proportion of buffer is 0 - 20% and the value for buildings is .3, that means that 30% of the samples took place at locations where 0-20% of the buffer is dedicated to buildings. ",
    "Another example: Suppose on the same table for proportion of buffer 40-60% forests = .24, that means that 24% of samples were taken at locations where 40-60% of the buffer was dedicated to forest. ",
    "What is a buffer zone? What is sample stratification? What role does land use play? Summarize the table of values using the examples provided above",
    "\n\nPlease label your response '### Sampling stratification'",
]
instructions = "".join(instruction_parts)
sampling_stratification = data_chait.invoke({'input': f'{requesting} {sections_to_summarize}\n\nInstructions\n\n{instructions}'})
glue('sampling-stratification', Markdown(sampling_stratification['answer']), display=False)

In [33]:
 # Preview of the generated answer, rendered as markdown.
 Markdown(sampling_stratification['answer'])

### Sampling stratification

The sampling stratification quantifies the proportion of samples collected based on the proportion of the buffer zone dedicated to specific land use features. Each survey location is surrounded by a buffer zone with a radius of 1,500 meters, and this zone includes various land-use features, each occupying a certain proportion (0-100%) of the buffer zone. The stratification helps understand how the distribution of land-use features within the buffer zone influences the sampling process.

**Buffer Zone Proportions and Sampling Distribution:**

- **0-20% Buffer Zone:**
  - Buildings: 31% of samples were collected where 0-20% of the buffer zone is dedicated to buildings.
  - Wetlands: All samples (100%) were collected from locations with wetlands within this buffer zone.
  - Forest: 24% of samples were from locations where 0-20% of the buffer had forest presence.
  - Public Services, Recreation, Undefined Features: All samples noted these features at this level.
  - Streets: 45% of the samples were associated with streets.
  - Vineyards and Orchards: All samples (100%) were located within this buffer zone.

- **20-40% Buffer Zone:**
  - Buildings: 38% of samples were collected from locations with buildings.
  - Wetlands: No samples were taken from wetlands.
  - Forest: 66% of the samples represented forested areas.
  - Public Services and Recreation: No samples were collected.
  - Undefined Features: 8% of samples were associated with undefined features.
  - Streets: 8% of samples were related to streets.

- **40-60% Buffer Zone:**
  - Buildings: 22% of samples were dedicated to buildings.
  - Wetlands: No samples were taken from wetlands.
  - Forest: Only 9% of samples represented forest.
  - Public Services: No samples were recorded.
  - Undefined Features: 57% of samples fell into this category.
  - Streets: 19% of samples indicated the presence of streets.

- **60-80% Buffer Zone:**
  - Buildings: 7% of samples were linked to buildings.
  - Wetlands: No samples were taken from wetlands.
  - Forest: No samples were noted.
  - Undefined Features: 0% of samples were associated with undefined features.
  - Streets: 20% of samples were related to streets.

- **80-100% Buffer Zone:**
  - Buildings: Only 3% of samples were attributed to buildings.
  - Wetlands and Forest: No samples were taken.
  - Public Services, Recreation, Undefined Features, Streets, Vineyards, and Orchards: No contributions in this buffer zone.

This stratification allows for grouping locations according to the topographical features present within the buffer zone and helps in understanding how the observed litter density changes based on these land use features.

```{glue:md} sampling-stratification
:format: myst
```

In [34]:
# Ask the retrieval chain how litter density changes with land use and
# store the answer for the markdown cell below.
requesting = 'Please summarize the following sections:'
sections_to_summarize = 'Topography and trash density Bern 2015-11-15 2021-12-31: The changes in the observed litter density and the changes in land use'
instructions = "Provide a narrative paragraph, and the table of values. Note where the pcs/m is highest and lowest. Please label your response '### Topography and trash density'"
# The answer gets its own name so it does not overwrite the
# sampling-stratification result produced earlier in the notebook.
topography_density = data_chait.invoke({'input': f'{requesting} {sections_to_summarize}\n\nInstructions\n\n{instructions}'})
glue('stratification-pcs-m', Markdown(topography_density['answer']), display=False)

```{glue:md} stratification-pcs-m
:format: myst
```

In [35]:
# Ask the retrieval chain to describe the clusters, in the requested reply
# language, and store the answer for the markdown cell below.
requesting = 'Please summarize the following sections:'
sections_to_summarize = 'Cluster analysis Bern 2015-11-15 2021-12-31'
instructions = 'Discuss the notable differences in composition of the clusters and the differences in the sample average per cluster in paragraph form. Provide a table of values. Please label your response "### Cluster Analysis".'
language = 'english'
reply_language = f"\n\nPlease reply in the following language: {language}"
cluster_analysis = data_chait.invoke({'input': f'{requesting} {sections_to_summarize}\n\nInstructions\n\n{instructions}{reply_language}'})
glue('cluster-analysis', Markdown(cluster_analysis['answer']), display=False)

```{glue:md} cluster-analysis
:format: myst
```

### Cluster analysis and land-use

1. Does the average density per cluster support any conclusions that could be drawn given the land-use profile?
2. Are there any possible correlations?

In [42]:
# Ask the retrieval chain whether cluster densities relate to the land-use
# profile and store the answer for the markdown cell below.
requesting = 'Please summarize the following sections:'
sections_to_summarize = 'Cluster analysis Bern 2015-11-15 2021-12-31'
instructions = 'Does the average density per cluster support any conclusions that could be drawn given the land-use profile? Are there any possible correlations? Please answer in paragraph form.'

cluster_analysis_lu = data_chait.invoke({'input': f'{requesting} {sections_to_summarize}\n\nInstructions\n\n{instructions}'})
glue('cluster-analysis-lu', Markdown(cluster_analysis_lu['answer']), display=False)

```{glue:md} cluster-analysis-lu
:format: myst
``` 

## Regression Analysis

In [39]:
# Ask the retrieval chain to assess the regression models and store the
# answer for the markdown cell below.
requesting = 'Please summarize the following sections:'
sections_to_summarize = 'Summary of regression methods Bern 2015-11-15 2021-12-31: The different linear models the data were tested on'
instructions = 'Given the r² values of the regression models how accurate would the predictions be? How accurate would the best model be? How much faith can we put in the predictions? Answer in paragraph form. Label your response "### Summary of regression methods".'

regression_methods = data_chait.invoke({'input': f'{requesting} {sections_to_summarize}\n\nInstructions\n\n{instructions}'})
glue('regression-methods', Markdown(regression_methods['answer']), display=False)

```{glue:md} regression-methods
:format: myst
``` 

In [40]:
# Ask the retrieval chain to explain the forecast methods and store the
# answer for the markdown cell below.
requesting = 'Please summarize the following sections:'
sections_to_summarize = 'Forecasts grid approximation Bern 2015-11-15 2021-12-31'
instructions = 'Explain what a grid forecast is. Explain each forecast method. Given the results from the forecast methods estimate a probable range of values. Label your response "### Forecasts for 2025".'

forecasts = data_chait.invoke({'input': f'{requesting} {sections_to_summarize}\n\nInstructions\n\n{instructions}'})
glue('forecasts', Markdown(forecasts['answer']), display=False)

```{glue:md} forecasts
:format: myst
``` 