In [1]:
import session_config
import reports
import userdisplay
import geospatial
import gridforecast as gfcast

import logging

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.colors
from matplotlib.colors import LinearSegmentedColormap, ListedColormap
import seaborn as sns
import datetime as dt

from myst_nb import glue
from IPython.display import display, Markdown

def display_forecast(fcast_summary):
    """Render a forecast summary as a markdown bullet list.

    Parameters
    ----------
    fcast_summary : mapping
        Must provide 'average' (a number), 'hdi' (indexable; items 0 and 1
        are read) and 'range' (indexable; the first and last items are read).

    Returns
    -------
    str
        Three bullet lines (average, HDI 95%, 90% range), each preceded by a
        newline, with every value rounded to two decimals.
    """
    mean_value = fcast_summary['average']
    hdi_low, hdi_high = fcast_summary['hdi'][0], fcast_summary['hdi'][1]
    range_low, range_high = fcast_summary['range'][0], fcast_summary['range'][-1]

    bullets = [
        f'* Average: {round(mean_value, 2)}',
        f'* HDI 95%: {round(hdi_low, 2)} - {round(hdi_high, 2)}',
        f'* 90% Range: {round(range_low, 2)} - {round(range_high, 2)}',
    ]
    # the leading newline lets callers append the list directly to a heading
    return '\n' + '\n'.join(bullets)

# Root logging goes to app.log at DEBUG level and above.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
    filename='app.log',
)
logger = logging.getLogger(__name__)

# Colours for the two data roles: named colours for the plots and
# CSS declarations for styled text.
# hex values kept from the original notes: daa520, deb887
palette = dict(prior='goldenrod', likelihood='dodgerblue')
color_style = dict(prior='color: #daa520', likelihood='color: #1e90ff')

In [2]:
# Set-up for the "all data" section: builds the prior/observed reports, the
# posteriors and the material table that later cells plot and glue.
# Statement order matters: several calls below draw samples, and the names
# t, g, w, c, y, u are reassigned by later cells.
data = session_config.collect_survey_data()
# observed period and prior period, as date strings
o_dates = {'start':'2020-01-01', 'end':'2021-12-31'}
prior_dates = {'start':'2015-11-15', 'end':'2019-12-31'}

canton = 'Bern'
d= data.reset_index(drop=True)

# prior data does not include locations in canton
# NOTE(review): only the upper date bound is applied; prior_dates['start'] is not used in this filter
o_prior = d[(d.canton != canton)&(d['date'] <= prior_dates['end'])].copy()
o_report, o_land_use = gfcast.make_report_objects(o_prior)
# results is read below through the keys 'prior_report', 'this_report',
# 'posterior_no_limit', 'posterior_99' and 'this_land_use'
results = gfcast.reports_and_forecast({'canton':canton, 'date_range':o_dates}, {'canton':canton, 'date_range':prior_dates}, ldata=d.copy(), logger=logger, other_data=o_land_use.df_cat)

# collect the results from the prior and the likelihood
# (pcs/m summed per sample_id)
prr_all= results['prior_report'].sample_results.groupby('sample_id')['pcs/m'].sum()
lkl_all  = results['this_report'].sample_results.groupby('sample_id')['pcs/m'].sum()

# consider all values
# NOTE: xii is reassigned in the lake set-up cell
xii = results['posterior_no_limit'].sample_posterior()

# limit to the 99th percentile
sample_values, posterior, summary_simple = gfcast.dirichlet_posterior(results['posterior_99'])

# all surveys
c_all = d[d.canton == canton].reset_index(drop=True)
call_surveys, call_land = gfcast.make_report_objects(c_all)

# material proportions all data
t = call_surveys.inventory()
# look up the material of each code in the inventory index
t['material'] = t.index.map(lambda x: userdisplay.code_material.loc[x, 'material'])
material_report = t.groupby(['material']).quantity.sum()
mr = material_report/sum(material_report)
# astype(int) truncates, so the shares are whole percents rounded down
mr = (mr*100).astype(int)
# after truncation, "> 1" keeps only materials with a share of at least 2%
mr = pd.DataFrame(mr[mr > 1])
mr['% of total'] = mr.quantity.apply(lambda x: f'{x}%')
mr = mr[['% of total']]
mr = mr.style.set_table_styles(userdisplay.table_css_styles)

# forecast weighted prior
weights = gfcast.land_use_weights(results['this_land_use'], session_config.feature_variables)
g,w  =gfcast.select_prior_data_by_feature_weight(o_land_use.df_cat, weights, session_config.feature_variables)
posterior_by_weight, c  = gfcast.posterior_dirichlet_counts(lkl_all, g['pcs/m'].values)
# t is rebound here: it no longer holds the inventory but the first value
# returned by dirichlet_posterior, plotted as 'weighted prior' in the ECDF cell
t, y, u= gfcast.dirichlet_posterior(posterior_by_weight)


In [3]:
# Histogram of pcs/m for the observed period against the prior, all data.
o_surveys = results['this_report'].sample_results
p_surveys = results['prior_report'].sample_results

fig, ax = plt.subplots()

# observed is drawn first, then the prior on top of it
histogram_layers = (
    (o_surveys, 'observed', palette['likelihood']),
    (p_surveys, 'prior', palette['prior']),
)
for layer_data, layer_label, layer_color in histogram_layers:
    sns.histplot(data=layer_data, x='pcs/m', stat='probability', label=layer_label, ax=ax, color=layer_color)

ax.legend()
plt.tight_layout()
# store the figure for the markdown cells; close it so it is not shown inline
glue('prior-likelihood', fig, display=False)
plt.close()

In [4]:
# Cumulative distributions for all data: prior and observed samples against
# the forecasts (99th-percentile limit, observed max, weighted prior).
# Relies on the cells above having run in order, so that xii and t still
# hold the all-data values.
fig, ax = plt.subplots()

sns.ecdfplot(prr_all, label='prior', ls='-', ax=ax, c=palette['prior'], zorder=1)
sns.ecdfplot(lkl_all, label='observed', ls='-', ax=ax, c=palette['likelihood'], zorder=1)
# ax is passed explicitly on every call so the curves cannot land on another
# figure through pyplot's implicit "current axes"
sns.ecdfplot(sample_values, label='expected 99%', ls=':', ax=ax, zorder=2)
sns.ecdfplot(xii, label='expected max', ls='-.', ax=ax, zorder=2)
sns.ecdfplot(t, label='weighted prior', c='black', ls='--', lw=2, ax=ax, zorder=5)
ax.set_xlim(-.1, 10)
ax.legend()
plt.tight_layout()
# store the figure for the markdown cells; close it so it is not shown inline
glue('cumumlative-dist-forecast-prior', fig, display=False)
plt.close()

In [5]:
# Summaries, forecast lists and inventories for all canton data, stored with
# glue so the markdown cells can reference them by key.

# NOTE(review): `os` is also the name of a standard-library module; the name
# is kept here because other cells may refer to it.
os = results['this_report'].object_summary()
os.reset_index(drop=False, inplace=True)

most_common_objects, mc_codes, proportions = userdisplay.most_common(os)
most_common_objects = most_common_objects.set_caption("")

# forecast summaries rendered as markdown bullet lists
new_list = display_forecast(summary_simple)
new_list2 = display_forecast(results['posterior_no_limit'].get_descriptive_statistics())
new_list1 = Markdown('__Given the 99th percentile of all observed samples__' + new_list)
new_list2 = Markdown('__Given the observed max__' + new_list2)

# copy before popping so the object returned by the report is not modified,
# as the lake and river cells do
feature_inv = call_surveys.feature_inventory().copy()
feature_inv.pop('p')
feature_inventory = Markdown(userdisplay.feature_inventory(feature_inv, session_language='en'))

aboundaries = call_surveys.administrative_boundaries().copy()
aboundaries.pop('canton')
aboundaries.pop('parent_boundary')
administrative_boundaries = Markdown(userdisplay.boundaries(aboundaries, session_language='en'))

# one call supplies both the header (item 0) and the body (item 1)
all_data_summary = userdisplay.sampling_result_summary(call_surveys.sampling_results_summary, session_language='en')
header, info = all_data_summary[0], all_data_summary[1]
samp_sum = Markdown(f'{header}\n{info}')

# the three blocks below replace the default header with a coloured year
# range; 'start' and 'end' are popped from a copy of each summary first
all_summary = call_surveys.sampling_results_summary.copy()
all_header = f"<font color=#daa520>{all_summary.pop('start')[:4]} - {all_summary.pop('end')[:4]}</font>"
all_info = userdisplay.sampling_result_summary(all_summary, session_language='en')[1]
all_samp_sum = Markdown(f'{all_header}\n{all_info}')

p_summary = results['prior_report'].sampling_results_summary.copy()
p_header = f"<font color=#daa520>{dt.datetime.strftime(p_summary.pop('start'), format=session_config.date_format)[:4]} - {dt.datetime.strftime(p_summary.pop('end'), format=session_config.date_format)[:4]}</font>"
p_info = userdisplay.sampling_result_summary(p_summary, session_language='en')[1]
p_samp_sum = Markdown(f'{p_header}\n{p_info}')

l_summary = results['this_report'].sampling_results_summary.copy()
l_header = f"<font color=#1e90ff>{dt.datetime.strftime(l_summary.pop('start'), format=session_config.date_format)[:4]} - {dt.datetime.strftime(l_summary.pop('end'), format=session_config.date_format)[:4]} </font>"
l_info = userdisplay.sampling_result_summary(l_summary, session_language='en')[1]
l_samp_sum = Markdown(f'{l_header}\n{l_info}')

ratio_most_common = Markdown(f'The most common objects account for {int(proportions*100)}% of all objects')

# one_list = Markdown(f'{feature_inventory}\n{administrative_boundaries}')
glue('material-report', mr, display=False)
# NOTE(review): 'forecast-weighted-prior' receives the "observed max" list and
# 'forecast-all-prior' the "99th percentile" list; the keys are left unchanged
# because markdown elsewhere may reference them - confirm the intended pairing
glue('forecast-weighted-prior', new_list2, display=False)
glue('forecast-all-prior', new_list1, display=False)
glue('ratio-most-common', ratio_most_common, display=False)
glue('most_common_objects', most_common_objects, display=False)
glue('l-sampling-summary', l_samp_sum, display=False)
glue('prior-sampling-summary', p_samp_sum, display=False)
glue('sampling-summary', all_samp_sum, display=False)
glue('feature-inventory', feature_inventory, display=False)
glue('administrative-boundaries', administrative_boundaries, display=False)


In [6]:
# lakes
# Same set-up as the all-data cell, restricted to feature_type == 'l'.
# Statement order matters: the posterior calls below draw samples.
lake_params = {'canton':canton, 'date_range':o_dates, 'feature_type': 'l'}
lake_params_p = {'canton':canton, 'date_range':prior_dates, 'feature_type':'l'}
# records limited to the most common codes of the all-data report
d_codes = d[d.code.isin(mc_codes)].reset_index(drop=True)

# prior for lakes: other cantons, lake records, dated up to the end of the prior period
o_prior_l = d[(d.canton != canton)&(d['date'] <= prior_dates['end'])&(d.feature_type == 'l')].copy()
o_report_l, o_land_use_l = gfcast.make_report_objects(o_prior_l)

lake_results = gfcast.reports_and_forecast(lake_params,lake_params_p , ldata=d.copy(), logger=logger, other_data=o_land_use_l.df_cat)

# most common objects for the observed lake samples
los = lake_results['this_report'].object_summary()
los.reset_index(drop=False, inplace=True)
l_most_common_objects, lmc_codes , lproportions = userdisplay.most_common(los)
l_most_common_objects = l_most_common_objects.set_caption("")

# collect the results from the prior and the likelihood
# (pcs/m summed per sample_id)
prr_l = lake_results['prior_report'].sample_results.groupby('sample_id')['pcs/m'].sum()
lkl_l = lake_results['this_report'].sample_results.groupby('sample_id')['pcs/m'].sum()

# every lake record of the canton, regardless of date
c_all_l = d[(d.canton == canton)&(d.feature_type == 'l')].reset_index(drop=True)

call_l_surveys, call__l_land = gfcast.make_report_objects(c_all_l)

# consider all values
# NOTE: this overwrites the all-data xii defined earlier; from here on xii
# holds the lake posterior sample
xii = lake_results['posterior_no_limit'].sample_posterior()

# limit to the 99th percentile
lake_sample_values, lake_posterior, lake_summary_simple = gfcast.dirichlet_posterior(lake_results['posterior_99'])

In [7]:
# Material composition of the lake inventory as a styled one-column table.
t = call_l_surveys.inventory()
# look up the material of every code in the inventory index
t['material'] = t.index.map(lambda code: userdisplay.code_material.loc[code, 'material'])
material_report = t.groupby(['material']).quantity.sum()

# whole-percent shares (truncated); only shares above 1 are kept
mrl = ((material_report / sum(material_report)) * 100).astype(int)
mrl = pd.DataFrame(mrl[mrl > 1])
mrl['% of total'] = mrl.quantity.apply(lambda pct: f'{pct}%')
mrl = mrl[['% of total']].style.set_table_styles(userdisplay.table_css_styles)

In [8]:
# Lakes: histogram, cumulative distributions, summaries and glue keys.
# Expects the lake set-up cell to have run (lake_results, lkl_l, prr_l,
# lake_sample_values, xii, call_l_surveys, ...).

# --- prior vs observed histogram ---
fig, ax = plt.subplots()

o_surveys = lake_results['this_report'].sample_results
p_surveys = lake_results['prior_report'].sample_results

sns.histplot(data=o_surveys, x='pcs/m', stat='probability', label='observed', ax=ax, color=palette['likelihood'])
sns.histplot(data=p_surveys, x='pcs/m', stat='probability', label='prior', ax=ax, color=palette['prior'])
ax.legend()
plt.tight_layout()
glue('lake-prior-likelihood', fig, display=False)
plt.close()

# --- forecast from the land-use weighted prior ---
weights = gfcast.land_use_weights(lake_results['this_land_use'], session_config.feature_variables)
g, w = gfcast.select_prior_data_by_feature_weight(o_land_use_l.df_cat, weights, session_config.feature_variables)
posterior_by_weight, c = gfcast.posterior_dirichlet_counts(lkl_l, g['pcs/m'].values)
t, y, u = gfcast.dirichlet_posterior(posterior_by_weight)

# --- cumulative distributions ---
fig, ax = plt.subplots()

sns.ecdfplot(prr_l, label='prior', ls='-', ax=ax, c=palette['prior'], zorder=1)
sns.ecdfplot(lkl_l, label='observed', ls='-', ax=ax, c=palette['likelihood'], zorder=1)
# plot the lake forecast (lake_sample_values), not sample_values, which holds
# the all-data forecast
sns.ecdfplot(lake_sample_values, label='expected 99%', ls=':', ax=ax, zorder=2)
# xii holds the lake posterior sample once the lake set-up cell has run
sns.ecdfplot(xii, label='expected max', ls='-.', ax=ax, zorder=2)
sns.ecdfplot(t, label='weighted prior', c='black', ls='--', lw=2, ax=ax, zorder=5)
ax.set_xlim(-.1, 10)
ax.legend()
plt.tight_layout()
glue('lake-cumumlative-dist-forecast-prior', fig, display=False)
plt.close()

# --- inventories and boundaries ---
l_feature_inv = call_l_surveys.feature_inventory().copy()
l_feature_inv.pop('p')
l_feature_inv.pop('r')
l_feature_inv = Markdown(userdisplay.feature_inventory(l_feature_inv, session_language='en'))

l_admin_bounds = call_l_surveys.administrative_boundaries().copy()

l_admin_bounds.pop('canton')
l_admin_bounds.pop('parent_boundary')
l_admin_b = Markdown(userdisplay.boundaries(l_admin_bounds, session_language='en'))

# --- sampling summaries ---
# one call supplies both the header (item 0) and the body (item 1)
lake_all_summary = userdisplay.sampling_result_summary(call_l_surveys.sampling_results_summary, session_language='en')
header, info = lake_all_summary[0], lake_all_summary[1]
l_samp_sum_all = Markdown(f'{header}\n{info}')

# 'start' and 'end' are popped from a copy and shown as a coloured header
p_summary_l = lake_results['prior_report'].sampling_results_summary.copy()
p_header_l = f"<font color=#daa520>{dt.datetime.strftime(p_summary_l.pop('start'), format=session_config.date_format)} - {dt.datetime.strftime(p_summary_l.pop('end'), format=session_config.date_format)}</font>"
p_info_l = userdisplay.sampling_result_summary(p_summary_l, session_language='en')[1]
p_samp_sum_l = Markdown(f'{p_header_l}\n{p_info_l}')

l_summary_l = lake_results['this_report'].sampling_results_summary.copy()
l_header_l = f"<font color=#1e90ff>{dt.datetime.strftime(l_summary_l.pop('start'), format=session_config.date_format)} - {dt.datetime.strftime(l_summary_l.pop('end'), format=session_config.date_format)} </font>"
l_info_l = userdisplay.sampling_result_summary(l_summary_l, session_language='en')[1]
l_samp_sum_l = Markdown(f'{l_header_l}\n{l_info_l}')

ratio_most_common_l = Markdown(f'The most common objects account for {int(lproportions*100)}% of all objects')

# join the markdown source (.data) of both objects; formatting the Markdown
# objects themselves would insert their repr instead of their text
one_list_l = Markdown(f'{l_feature_inv.data}\n{l_admin_b.data}')

# --- forecast lists ---
new_list_l = display_forecast(lake_summary_simple)
new_list2_l = display_forecast(lake_results['posterior_no_limit'].get_descriptive_statistics())

new_list1_l = Markdown('__Given the 99th percentile of all observed samples__' + new_list_l)
new_list2_l = Markdown('__Given the observed max__' + new_list2_l)

# --- store everything for the markdown cells ---
glue('lake-material-report', mrl, display=False)
glue('lake-forecast-99-list', new_list1_l, display=False)
glue('lake-forecast-max-list', new_list2_l, display=False)
glue('lake-ratio-most-common', ratio_most_common_l, display=False)
glue('lake-most_common_objects', l_most_common_objects, display=False)
glue('lake-prior-sampling-summary', p_samp_sum_l, display=False)
glue('lake-observed-sampling-summary', l_samp_sum_l, display=False)
glue('lake-sampling-summary', l_samp_sum_all, display=False)
glue('lake-feature-inventory', l_feature_inv, display=False)
glue('lake-administrative-boundaries', l_admin_b, display=False)

In [9]:
# Rivers: set-up, tables, plots and glue keys, restricted to feature_type == 'r'.
river_params = {'canton':canton, 'date_range':o_dates, 'feature_type': 'r'}
river_params_p = {'canton':canton, 'date_range':prior_dates, 'feature_type':'r'}

# prior for rivers: other cantons, river records, dated up to the end of the
# prior period - the same date bound as the all-data and lake priors, so that
# observed-period records do not enter the prior
o_prior_r = d[(d.canton != canton)&(d['date'] <= prior_dates['end'])&(d.feature_type == 'r')].copy()
o_report_r, o_land_use_r = gfcast.make_report_objects(o_prior_r)

river_results = gfcast.reports_and_forecast(river_params, river_params_p, ldata=d.copy(), logger=logger, other_data=o_land_use_r.df_cat)

# collect the results from the prior and the likelihood (pcs/m summed per sample_id)
prr_r = river_results['prior_report'].sample_results.groupby('sample_id')['pcs/m'].sum()
lkl_r = river_results['this_report'].sample_results.groupby('sample_id')['pcs/m'].sum()

# consider all values
r_xii = river_results['posterior_no_limit'].sample_posterior()

# limit to the 99th percentile
r_sample_values, r_posterior, r_summary_simple = gfcast.dirichlet_posterior(river_results['posterior_99'])

# most common objects for the observed river samples
ros = river_results['this_report'].object_summary()
ros.reset_index(drop=False, inplace=True)

r_most_common_objects, rmc_codes, rproportions = userdisplay.most_common(ros)
r_most_common_objects = r_most_common_objects.set_caption("")

ratio_most_common_r = Markdown(f'The most common objects account for {int(rproportions*100)}% of all objects')

# every river record of the canton, regardless of date
c_all_r = d[(d.canton == canton)&(d.feature_type == 'r')].reset_index(drop=True)

call_r_surveys, call_r_land = gfcast.make_report_objects(c_all_r)

# --- material composition ---
t = call_r_surveys.inventory()
t['material'] = t.index.map(lambda x: userdisplay.code_material.loc[x, 'material'])
material_report = t.groupby(['material']).quantity.sum()
mrr = material_report/sum(material_report)
# astype(int) truncates; "> 1" then keeps shares of at least 2%
mrr = (mrr*100).astype(int)
mrr = pd.DataFrame(mrr[mrr > 1])
mrr['% of total'] = mrr.quantity.apply(lambda x: f'{x}%')
mrr = mrr[['% of total']]
mrr = mrr.style.set_table_styles(userdisplay.table_css_styles)

# --- inventories and boundaries ---
r_feature_inv = call_r_surveys.feature_inventory().copy()
r_feature_inv.pop('p')
r_feature_inv.pop('l')
r_feature_inv = Markdown(userdisplay.feature_inventory(r_feature_inv, session_language='en'))

r_admin_bounds = call_r_surveys.administrative_boundaries().copy()
r_admin_bounds.pop('canton')
r_admin_bounds.pop('parent_boundary')
r_admin_b = Markdown(userdisplay.boundaries(r_admin_bounds, session_language='en'))

# --- sampling summaries ---
# one call supplies both the header (item 0) and the body (item 1)
river_all_summary = userdisplay.sampling_result_summary(call_r_surveys.sampling_results_summary, session_language='en')
headerr, infor = river_all_summary[0], river_all_summary[1]
r_samp_sum_all = Markdown(f'{headerr}\n{infor}')

# 'start' and 'end' are popped from a copy and shown as a coloured header
p_summary_r = river_results['prior_report'].sampling_results_summary.copy()
p_header_r = f"<font color=#daa520>{dt.datetime.strftime(p_summary_r.pop('start'), format=session_config.date_format)} - {dt.datetime.strftime(p_summary_r.pop('end'), format=session_config.date_format)}</font>"
p_info_r = userdisplay.sampling_result_summary(p_summary_r, session_language='en')[1]
p_samp_sum_r = Markdown(f'{p_header_r}\n{p_info_r}')

l_summary_r = river_results['this_report'].sampling_results_summary.copy()
l_header_r = f"<font color=#1e90ff>{dt.datetime.strftime(l_summary_r.pop('start'), format=session_config.date_format)} - {dt.datetime.strftime(l_summary_r.pop('end'), format=session_config.date_format)} </font>"
l_info_r = userdisplay.sampling_result_summary(l_summary_r, session_language='en')[1]
l_samp_sum_r = Markdown(f'{l_header_r}\n{l_info_r}')

# --- forecast lists ---
new_list_r = display_forecast(r_summary_simple)
new_list2_r = display_forecast(river_results['posterior_no_limit'].get_descriptive_statistics())
new_list1_r = Markdown('__Given the 99th percentile of all observed samples__' + new_list_r)
new_list2_r = Markdown('__Given the observed max__' + new_list2_r)

# --- prior vs observed histogram ---
fig, ax = plt.subplots()

o_surveys = river_results['this_report'].sample_results
p_surveys = river_results['prior_report'].sample_results

sns.histplot(data=o_surveys, x='pcs/m', stat='probability', label='observed', ax=ax, color=palette['likelihood'])
sns.histplot(data=p_surveys, x='pcs/m', stat='probability', label='prior', ax=ax, color=palette['prior'])
ax.legend()
plt.tight_layout()
glue('river-prior-likelihood', fig, display=False)
plt.close()

# --- forecast from the land-use weighted prior ---
weights = gfcast.land_use_weights(river_results['this_land_use'], session_config.feature_variables)
g, w = gfcast.select_prior_data_by_feature_weight(o_land_use_r.df_cat, weights, session_config.feature_variables)
posterior_by_weight, c = gfcast.posterior_dirichlet_counts(lkl_r, g['pcs/m'].values)
t, y, u = gfcast.dirichlet_posterior(posterior_by_weight)

# --- cumulative distributions ---
fig, ax = plt.subplots()

sns.ecdfplot(prr_r, label='prior', ls='-', ax=ax, c=palette['prior'], zorder=1)
sns.ecdfplot(lkl_r, label='observed', ls='-', ax=ax, c=palette['likelihood'], zorder=1)
# plot the river forecasts computed above (r_sample_values, r_xii); the names
# sample_values and xii belong to the all-data and lake sections
sns.ecdfplot(r_sample_values, label='expected 99%', ls=':', ax=ax, zorder=2)
sns.ecdfplot(r_xii, label='expected max', ls='-.', ax=ax, zorder=2)
sns.ecdfplot(t, label='weighted prior', c='black', ls='--', lw=2, ax=ax, zorder=5)
ax.set_xlim(-.1, 10)
ax.legend()
plt.tight_layout()
glue('river-cumumlative-dist-forecast-prior', fig, display=False)
plt.close()

# --- store everything for the markdown cells ---
glue('river-forecast-99-list', new_list1_r, display=False)
glue('river-forecast-max-list', new_list2_r, display=False)
glue('river-material-report', mrr, display=False)
glue('river-ratio-most-common', ratio_most_common_r, display=False)
glue('river-prior-sampling-summary', p_samp_sum_r, display=False)
glue('river-observed-sampling-summary', l_samp_sum_r, display=False)
glue('river-most_common_objects', r_most_common_objects, display=False)
glue('river-sampling-summary', r_samp_sum_all, display=False)
glue('river-feature-inventory', r_feature_inv, display=False)
glue('river-administrative-boundaries', r_admin_b, display=False)

# Canton Bern

This is a sample cantonal report. The structure and the format are based off of the federal report, [IQAASL](https://hammerdirt-analyst.github.io/IQAASL-End-0f-Sampling-2021/). This version is intended for
use as a decision support tool. Thus, the user is expected to be familiar with the results in the federal report and the methods described in the _Guide for Monitoring Marine Litter on European Seas_ [The guide](https://mcc.jrc.ec.europa.eu/main/dev.py?N=41&O=439&titre_chap=TG%20Litter&titre_page=Guidance%20for%20the%20Monitoring%20of%20Marine%20Litter).

The report will automatically generate a summary of the most common objects found. Therefore, if the user does not know what items 
or objects require a particular focus (if any) the report will provide a list of likely suspects.

__For stakeholders, the assessment begins with identifying whether the attributed resources are in line with what is being seen or experienced, given the observations.__

The survey records are considered according to land use and separated into thematic groups, lakes and rivers. Finally, the results
for each municipality are presented in a table, summarizing the survey results and land use for each.



:::{dropdown} Where does the data come from ?

The data is a combination of observations from a variety of groups since 2015. The observations were recorded using an interpretation of the _Guide for Monitoring Marine Litter on European Seas_ [The guide](https://mcc.jrc.ec.europa.eu/main/dev.py?N=41&O=439&titre_chap=TG%20Litter&titre_page=Guidance%20for%20the%20Monitoring%20of%20Marine%20Litter).

The guide and the monitoring of beach litter are part of decades of research, here is the brief history [A Brief History of Marine Litter Research](https://link.springer.com/chapter/10.1007/978-3-319-16510-3_1).

__Common sense guidance:__

1. The data should be considered as a reasonable estimate of the minimum amount of trash on the ground at the time of the survey.
2. There are many sources of variance. We have considered the following:
   * litter density between sampling groups.
   * litter density with respect to topographical features.
3. There are differences in detect-ability and appearance for items of the same code that are due to the effects of decomposition.
4. Many surveyors are volunteers and have different levels of experience or physical constraints that limit what will actually be collected and counted.
5. Comparing like items will yield better results and more accurate predictions
   * Grouping items by category of use
   * Selecting specific items
   * Using the _most common items_
  

__Application__

This data can be used to measure the efficiency of current measures and identify priorities for the future. This we can do because of the basic assumptions of the data model:

1. The more there is on the ground, the more will be picked up or the more that will be seen.

From this follows a consideration of the amount of resources that are dedicated to preventing and removing the insult.

2. The amount on the ground is what remains after resources have been attributed

For lakes and rivers

3. There is an exchange between the beach and the water



:::

## Vital statistics

::::::::::{tab-set}

:::::::::{tab-item} All data

::::::::{grid} 2 2 2 2
:gutter: 1

:::::::{grid-item}
:columns: 12 4 4 4

```{glue} feature-inventory
```
```{glue} administrative-boundaries
```
__Material composition__

```{glue} material-report
```
:::::::

:::::::{grid-item}
:columns: 12 8 8 8

```{glue} prior-likelihood
```
:::::::
::::::::

::::::::{grid} 3 3 3 3 

:::::::{grid-item}
__All data__

```{glue} sampling-summary
```
:::::::

:::::::{grid-item} 
<font color=#1e90ff> Most recent </font>

```{glue} l-sampling-summary
```
:::::::

:::::::{grid-item} 
<font color=#daa520> All prior </font>

```{glue} prior-sampling-summary
```
:::::::

::::::::

:::::::::



:::::::::{tab-item} Lakes

::::::::{grid} 2 2 2 2
:gutter: 1

:::::::{grid-item}
:columns: 12 4 4 4
```{glue} lake-feature-inventory
```
```{glue} lake-administrative-boundaries
```
```{glue} lake-sampling-summary
```
__Material composition__

```{glue} lake-material-report
```
:::::::

:::::::{grid-item}
:columns: 12 8 8 8

::::::{grid} 1 1 1 1

:::::{grid-item}
```{glue} lake-prior-likelihood
```

:::::

:::::{grid-item}
::::{grid} 2 2 2 2 

:::{grid-item} <font color=#daa520> Prior </font>
```{glue} lake-prior-sampling-summary
```
:::
:::{grid-item} <font color=#1e90ff> Observed </font>
```{glue} lake-observed-sampling-summary
```
:::

::::

:::::

:::::::

::::::::
:::::::::

:::::::::{tab-item} Rivers

::::::::{grid} 2 2 2 2
:gutter: 1

:::::::{grid-item}
:columns: 12 4 4 4
```{glue} river-feature-inventory
```
```{glue} river-administrative-boundaries
```
```{glue} river-sampling-summary
```
__Material composition__

```{glue} river-material-report
```
:::::::

:::::::{grid-item}
:columns: 12 8 8 8

::::::{grid} 1 1 1 1

:::::{grid-item}
```{glue} river-prior-likelihood
```

:::::

:::::{grid-item}
::::{grid} 2 2 2 2 

:::{grid-item} <font color=#daa520> Prior </font>
```{glue} river-prior-sampling-summary
```
:::
:::{grid-item} <font color=#1e90ff> Observed </font>
```{glue} river-observed-sampling-summary
```
:::

::::

:::::

:::::::

::::::::

:::::::::

::::::::::

:::{dropdown} How to make a report

__Survey and Land use__

A report is the implementation of a `SurveyReport`, `LandUseReport` or a  `GridForecaster`. The `SurveyReport` is the basic 
element and does the initial aggregating and descriptive statistics for a query.

The land-use-report accepts `SurveyReport.sample_results` and assigns the land-use attributes to the record. The 
land-use-report provides the baseline assessment of litter density with reference to the surrounding environment. 
The assessment accepts as variables the proportion of available space that a topographical feature occupies in a 
circle of area $\pi r^2$, where $r$ = 1 500 meters, and the center of that circle is the survey location.
These proportions are compared to the `average pieces per meter` for an object or group of objects.


__Create a report__

A report can be initiated by providing the name of the canton. If your canton does not appear, this is because we have no data. The prior dates will be calculated automatically, by taking all data prior to the start date of the query.

```{code} python

import reports
import geospatial
import gridforecast

# suppose you have defined your data into df
observed_dates = {'start':'2020-01-01', 'end':'2021-12-31'}

# everything that was seen before
prior_dates = {'start':'2015-11-15', 'end':'2019-12-31'}

# name the canton
canton = 'Bern'

# define the dates of interest
data_of_interest = {'canton':canton, 'date_range':observed_dates}
sampling_history = {'canton':canton, 'date_range':prior_dates}

# filter the data
filtered_data, locations = gridforecast.filter_data(df, data_of_interest)

# make a survey report
this_report = reports.SurveyReport(dfc=filtered_data)

# generate the parameters for the landuse report
target_df = this_report.sample_results
features = geospatial.collect_topo_data(locations=target_df.location.unique())

# make a landuse report
this_land_use = geospatial.LandUseReport(target_df, features)
```

Each report and the inference method are documented: [SurveyReport](surveyreporter), [LandUseReport](landusereporter), [GridForecaster](gridforecaster)
:::


## Most common objects

::::::::::{tab-set}

:::::::::{tab-item} All data
::::{grid} 2 2 2 2 
:::{grid-item}
:columns: 4

The most common objects from the selected data. The most common objects are a combination of the top ten most abundant objects and those objects that are found in more than 50% of the samples. Some objects are found frequently but at low quantities. Other objects are found in fewer samples but at higher quantities.

```{glue} ratio-most-common
```
:::

:::{grid-item-card}
:columns: 8
:shadow: none

```{glue} most_common_objects
```
:::
::::
:::::::::

:::::::::{tab-item} Lakes
::::{grid} 2 2 2 2 
:::{grid-item}
:columns: 4

The most common objects from the selected data. The most common objects are a combination of the top ten most abundant objects and those objects that are found in more than 50% of the samples. Some objects are found frequently but at low quantities. Other objects are found in fewer samples but at higher quantities.

```{glue} lake-ratio-most-common
```
:::

:::{grid-item-card}
:columns: 8
:shadow: none

```{glue} lake-most_common_objects
```
:::
::::
:::::::::

:::::::::{tab-item} rivers
::::{grid} 2 2 2 2 
:::{grid-item}
:columns: 4

The most common objects from the selected data. The most common objects are a combination of the top ten most abundant objects and those objects that are found in more than 50% of the samples. Some objects are found frequently but at low quantities. Other objects are found in fewer samples but at higher quantities.

```{glue} river-ratio-most-common
```
:::

:::{grid-item-card}
:columns: 8
:shadow: none

```{glue} river-most_common_objects
```
:::
::::
:::::::::

::::::::::

:::{dropdown} Defining the most common objects

The default method for defining _the most common objects_ is based on the number of items collected and the number of times that at least one of an object was found with respect to the number of surveys in the query, the _fail rate_.

Adjusting the fail rate will increase or decrease the number of the most common objects. The fail rate is included with the object inventory. 

```{code} python

# the most common objects are accessible in the survey report
# the report.object_summary method aggregates the data to code
# and attaches the fail rate and % of total
inventory = this_report.object_summary()

# userdisplay.most_common, takes the 10 most abundant and filters
# the data for fail rate >= 0.5. The method returns a formatted table,
# a list of the codes and the ratio of the quantity of the most common to the whole 
mostcommon, codes, ratio = userdisplay.most_common(inventory)

```


:::

## Land use profile


In [10]:
# Land-use profile for all data. The last entry of
# session_config.feature_variables is left out of both tables.

# number of samples per land-use feature
g = results['this_land_use'].n_samples_per_feature().copy()[session_config.feature_variables[:-1]]
g = userdisplay.landuse_profile(g, nsamples=len(lkl_all)).set_caption("")

# litter rate per land-use feature
gt = results['this_land_use'].rate_per_feature().copy().loc[session_config.feature_variables[:-1]]
gt = userdisplay.litter_rates_per_feature(gt).set_caption("")

for glue_key, glue_obj in (('rate-per-feature', gt), ('sampling-profile', g)):
    glue(glue_key, glue_obj, display=False)

In [11]:
# Street profile for all data: only the last entry of
# session_config.feature_variables is used here.
caption = ""

# number of samples for the street feature (transposed for street_profile)
streets = (
    results['this_land_use']
    .n_samples_per_feature()
    .copy()[[session_config.feature_variables[-1]]]
    .copy()
)
streets = userdisplay.street_profile(streets.T, nsamples=len(lkl_all)).set_caption(caption)

# litter rate for the street feature
streets_r = (
    results['this_land_use']
    .rate_per_feature()
    .copy()
    .loc[[session_config.feature_variables[-1]]]
    .copy()
)
streets_r = userdisplay.street_profile(streets_r, nsamples=len(lkl_all), caption='rate').set_caption(caption)

for glue_key, glue_obj in (('street-profile', streets), ('street-rates-feature', streets_r)):
    glue(glue_key, glue_obj, display=False)


In [12]:
# Land-use profile for lakes. The last entry of
# session_config.feature_variables is left out of both tables.

# number of samples per land-use feature
gl = lake_results['this_land_use'].n_samples_per_feature().copy()[session_config.feature_variables[:-1]]
gl = userdisplay.landuse_profile(gl, nsamples=len(lkl_l)).set_caption("")

# litter rate per land-use feature
gtl = lake_results['this_land_use'].rate_per_feature().copy().loc[session_config.feature_variables[:-1]]
gtl = userdisplay.litter_rates_per_feature(gtl).set_caption("")

for glue_key, glue_obj in (('lake-rate-per-feature', gtl), ('lake-sampling-profile', gl)):
    glue(glue_key, glue_obj, display=False)

In [13]:
# Street profile for lakes: only the last entry of
# session_config.feature_variables is used here.
caption = ""

# number of samples for the street feature (transposed for street_profile)
streets_p = (
    lake_results['this_land_use']
    .n_samples_per_feature()
    .copy()[[session_config.feature_variables[-1]]]
    .copy()
)
streets_p = userdisplay.street_profile(streets_p.T, nsamples=len(lkl_l)).set_caption(caption)

# litter rate for the street feature
streets_r_l = (
    lake_results['this_land_use']
    .rate_per_feature()
    .copy()
    .loc[[session_config.feature_variables[-1]]]
    .copy()
)
streets_r_l = userdisplay.street_profile(streets_r_l, nsamples=len(lkl_l), caption='rate').set_caption(caption)

for glue_key, glue_obj in (('lake-street-profile', streets_p), ('lake-street-rates-feature', streets_r_l)):
    glue(glue_key, glue_obj, display=False)

In [14]:
# Land-use profile for rivers. The last entry of
# session_config.feature_variables is left out of both tables.

# number of samples per land-use feature
gr = river_results['this_land_use'].n_samples_per_feature().copy()[session_config.feature_variables[:-1]]
gr = userdisplay.landuse_profile(gr, nsamples=len(lkl_r)).set_caption("")

# litter rate per land-use feature
gtlr = river_results['this_land_use'].rate_per_feature().copy().loc[session_config.feature_variables[:-1]]
gtlr = userdisplay.litter_rates_per_feature(gtlr).set_caption("")

for glue_key, glue_obj in (('river-rate-per-feature', gtlr), ('river-sampling-profile', gr)):
    glue(glue_key, glue_obj, display=False)

In [15]:
# Street profile for rivers: only the last entry of
# session_config.feature_variables is used here.
caption = ""

# number of samples for the street feature (transposed for street_profile)
streets_p_r = (
    river_results['this_land_use']
    .n_samples_per_feature()
    .copy()[[session_config.feature_variables[-1]]]
    .copy()
)
streets_p_r = userdisplay.street_profile(streets_p_r.T, nsamples=len(lkl_r)).set_caption(caption)

# litter rate for the street feature
streets_r_r = (
    river_results['this_land_use']
    .rate_per_feature()
    .copy()
    .loc[[session_config.feature_variables[-1]]]
    .copy()
)
streets_r_r = userdisplay.street_profile(streets_r_r, nsamples=len(lkl_r), caption='rate').set_caption(caption)

for glue_key, glue_obj in (('river-street-profile', streets_p_r), ('river-street-rates-feature', streets_r_r)):
    glue(glue_key, glue_obj, display=False)

::::{tab-set}
:::{tab-item} All data pcs/m and land use

__Land use__

The magnitude of the land-use variable is the portion of the total dry surface area covered by the labeled land-use attribute in [swissTLM3d](https://www.swisstopo.admin.ch/fr/modele-du-territoire-swisstlm3d#dokumente) within the circle with r = 1 500 m and area = $\pi r^2$, with the survey location in the middle. Thus, in the table below, locations that are urban environments would have a building rating of 80% - 100%.


```{glue} rate-per-feature
```
<b></b><b></b>

__Streets__

The streets are measured as the length of the road network in the circle with r = 1 500 m and area $\pi r^2$, with the survey location in the middle. The lengths for each location are normalized from 0 - 1. Thus, in the table below, the locations that have the shortest road network will be in category 1, and those with a denser network will be in a higher category.
<b></b>

```{glue} street-rates-feature
``` 
:::

:::{tab-item} Lakes pcs/m and land use

__Land use__

The magnitude of the land-use variable is the portion of the total dry surface area covered by the labeled land-use attribute in [swissTLM3d](https://www.swisstopo.admin.ch/fr/modele-du-territoire-swisstlm3d#dokumente) within the circle with r = 1 500 m and area = $\pi r^2$, with the survey location in the middle. Thus, in the table below, locations that are urban environments would have a building rating of 80% - 100%.


```{glue} lake-rate-per-feature
```
<b></b><b></b>

__Streets__

The streets are measured as the length of the road network in the circle with r = 1 500 m and area $\pi r^2$, with the survey location in the middle. The lengths for each location are normalized from 0 - 1. Thus, in the table below, the locations that have the shortest road network will be in category 1, and those with a denser network will be in a higher category.
<b></b>

```{glue} lake-street-rates-feature
```
:::

:::{tab-item} Rivers pcs/m and land use

__Land use__

The magnitude of the land-use variable is the portion of the total dry surface area covered by the labeled land-use attribute in [swissTLM3d](https://www.swisstopo.admin.ch/fr/modele-du-territoire-swisstlm3d#dokumente) within the circle with r = 1 500 m and area = $\pi r^2$, with the survey location in the middle. Thus, in the table below, locations that are urban environments would have a building rating of 80% - 100%.


```{glue} river-rate-per-feature
```
<b></b><b></b>

__Streets__

The streets are measured as the length of the road network in the circle with r = 1 500 m and area $\pi r^2$, with the survey location in the middle. The lengths for each location are normalized from 0 - 1. Thus, in the table below, the locations that have the shortest road network will be in category 1, and those with a denser network will be in a higher category.
<b></b>

```{glue} river-street-rates-feature
``` 
:::

::::

:::{dropdown} Defining land use

__Land cover__

These measured land-use attributes are the labeled polygons from the map layer Landcover defined here [swissTLMRegio product information](https://www.swisstopo.admin.ch/fr/modele-du-territoire-swisstlm3d#dokumente), they are extracted using vector overlay techniques in 
[QGIS](https://qgis.org/en/site/). The overlay is a hexagon-grid, each hex is 3000m, circumscribed by a circle r=1500m. The survey location is located at the center of the hex. The magnitude of the land-use variable is the portion of the total dry surface area for any particular land-use attribute. Areas of the hex that are not defined with a land-use attribute in this map layer are labeled undefined and processed like any other land-use attribute. The land-cover variables of interest are:

* Buildings: built up, urbanized
* Woods: not a park, harvesting of trees may be active
* Vineyards: does not include any other type of agriculture
* Orchards: not vineyards
* Undefined: areas of the map with no predefined label


```{code}

# the land use is summarized using a LandUseReport object
# the average pieces per meter by land use category
rate_per_feature = this_land_use.rate_per_feature()

# the sampling distribution
samples_per_feature = this_land_use.n_samples_per_feature()

# the variety of locations per feature
locations_per_feature = this_land_use.locations_per_feature()

# format for display .html
styled_rate_per_feature = userdisplay.litter_rates_per_feature(rate_per_feature)
```

__Land-use - public services__

Land-use variables are the labeled polygons from the Freizeitareal and Nutzungsareal map layers, defined in [swissTLMRegio product information](https://www.swisstopo.admin.ch/fr/modele-du-territoire-swisstlm3d#dokumente). Both layers represent areas used for specific activities. Freizeitareal identifies areas used for recreational purposes and Nutzungsareal represents areas such as hospitals, cemeteries, historical sites or incineration plants. As a ratio of the available dry-land in a hex, these features are relatively small (less than 10%) of the total dry-land. For identified features within a bounding hex the magnitude in meters² of these variables is scaled between 0 and 1, thus the scaled value represents the size of the feature in relation to all other measured values for that feature from all other hexagons.

* Recreation: parks, sports fields, attractions
* Infrastructure: Schools, Hospitals, cemeteries, powerplants

__Streets and roads__

Streets and roads are the labeled polylines from the TLM Strasse map layer defined in [swissTLMRegio product information](https://www.swisstopo.admin.ch/fr/modele-du-territoire-swisstlm3d#dokumente). All polylines from the map layer within a bounding hex are merged (dissolved in QGIS commands) and the combined length of the polylines, in meters, is the magnitude of the variable for the bounding hex.
:::

## Forecast

::::::::::{tab-set}

:::::::::{tab-item} All data
::::{grid} 1 1 2 2

:::{grid-item-card}
:columns: 12 5 5 5 

Minimum expected survey results 2025
^^^


```{glue} forecast-all-prior
```
```{glue} forecast-weighted-prior
```

:::

:::{grid-item-card}
:columns: 12 7 7 7 
:shadow: none
```{glue} cumumlative-dist-forecast-prior
```
+++
Cumulative distribution of observed, sampling history and forecasts using two different priors.
:::
::::
:::::::::

:::::::::{tab-item} Lakes
::::{grid} 1 1 2 2

:::{grid-item-card}
:columns: 12 5 5 5 

Minimum expected survey results 2025
^^^


```{glue} lake-forecast-99-list
```

```{glue} lake-forecast-max-list
```


:::

:::{grid-item-card}
:columns: 12 7 7 7 
:shadow: none
```{glue} lake-cumumlative-dist-forecast-prior
```
+++
Cumulative distribution of observed, sampling history and forecasts using two different priors.
:::
::::
:::::::::

:::::::::{tab-item} Rivers
::::{grid} 1 1 2 2

:::{grid-item-card}
:columns: 12 5 5 5 

Minimum expected survey results 2025
^^^


```{glue} river-forecast-99-list
```

```{glue} river-forecast-max-list
```


:::

:::{grid-item-card}
:columns: 12 7 7 7 
:shadow: none
```{glue} river-cumumlative-dist-forecast-prior
```
+++
Cumulative distribution of observed, sampling history and forecasts using two different priors.
:::
::::
:::::::::

::::::::::

:::{dropdown} Forecast methods

__Model assumptions__

1. Locations with similar land use attributes will have similar litter density rates
2. The data is a best estimate of what was present on the day of the survey
3. There are regional differences with respect to the density of specific objects
4. The locations surveyed are maintained by a public administration

The applied method would best be classified as Empirical Bayes, in the sense that the prior is derived from the data: [Bayesian Filtering and Smoothing](https://users.aalto.fi/~ssarkka/pub/cup_book_online_20131111.pdf) or [Empirical Bayes methods in classical and Bayesian inference](https://hannig.cloudapps.unc.edu/STOR757Bayes/handouts/PetroneEtAl2014.pdf). However, we share the concerns of Davidson-Pilon [Bayesian methods for hackers](https://dataorigami.net/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers/#contents) about double counting and eliminate it as part of the formulation of the prior. One of our model assumptions is that the litter density will be similar for locations that have similar characteristics. This allows us to remove any instances of the likelihood data from the prior data and replace it with locations that have similar characteristics.

The choice of land use features was a natural choice but was further explored in [Near or Far](https://hammerdirt-analyst.github.io/landuse/titlepage.html).

Our parameter estimates are thus derived from the data and they remain testable and quantifiable according to [Prior Probabilities, E T Jaynes](https://bayes.wustl.edu/etj/articles/prior.pdf). This makes our calculation very repetitive but also very well known. It can be defined in a few lines of code for any set of survey results.

```{code} python

# standard libraries
import numpy as np
from scipy.stats import dirichlet, multinomial

# collect the data of interest
h = array of survey values

# count the number of survey values that fall in each interval (x, x + .1] of the grid
counts = np.array([np.sum((h > x) & (h <= x + .1)) for x in grid_range])

# use the dirichlet dist to estimate p(Y >= x) for each x on the grid
# and sample from the estimation
adist = dirichlet(counts)
this_dist = adist.rvs(1)[0]

# draw samples from the conjugate
posterior_samples = multinomial.rvs(nsamples, p=this_dist)

```

__Make a grid forecaster__

:::

## Municipal Results

The average pieces per meter of the most common objects for each city.

In [16]:
# Municipal summary of the lake samples: one row per city.
dxl = lake_results['this_report'].df
# Work on a copy so that the 'city' column added below does not modify the
# DataFrame owned by the land-use report object.
dxf = lake_results['this_land_use'].df_cont.copy()

# lookup table location -> city, one row per location
dxlc = dxl[['location', 'city', 'feature_type']].drop_duplicates('location')
dxlc = dxlc.set_index(['location'], drop=True)
dxf['city'] = dxf.location.apply(lambda x : dxlc.loc[x, 'city'])

# every feature variable is summed per city in the second aggregation below
sumlu = {x:'sum' for x in session_config.feature_variables}

# first aggregate to one row per sample (the feature variables are part of the group key),
# using the aggregation defined in session_config.unit_agg
dxf = dxf.groupby(['sample_id', 'city', *session_config.feature_variables], as_index=False).agg(session_config.unit_agg)

# then aggregate the samples of each city: total quantity, mean pcs/m,
# number of distinct samples and the sum of each feature variable
dxf = dxf.groupby(['city']).agg({'quantity':'sum', 'pcs/m':'mean', 'sample_id':'nunique', **sumlu})
# css applied to the cells selected by highlight_max
highlight_props = 'background-color:#FAE8E8'

def highlight_max(s, arg, props: str = highlight_props):
    """Return a css string for each element of s that is above a threshold.

    An element is highlighted when it is greater than ``arg`` and different
    from zero. All other elements get an empty string (no styling).

    :param s: the values to test, array-like that supports element-wise comparison
    :param arg: the threshold the values are compared to
    :param props: the css applied to the highlighted elements
    :return: numpy array of css strings, one per element of s
    """
    above_threshold = s > arg
    not_zero = s != 0
    return np.where(above_threshold & not_zero, props, '')

# The feature variables were summed per city: divide by the number of samples
# to get the average value per sample.
for alabel in session_config.feature_variables:
    dxf[alabel] = dxf[alabel].div(dxf['sample_id'])

# row-wise total of every feature variable except the last one,
# removed again before the table is styled
land_use_columns = session_config.feature_variables[:-1]
dxf['check'] = dxf[land_use_columns].sum(axis=1)

dxfc = geospatial.categorize_features(dxf, feature_columns=session_config.feature_variables)
dxfc.rename(columns={'sample_id':'samples'}, inplace=True)
dxfc.drop('check', axis=1, inplace=True)

# highlight the cities whose average pcs/m is above the average of all lake samples
lake_average = lake_results['this_report'].sampling_results_summary['average']
pcs_m_cells = pd.IndexSlice[:, ['pcs/m']]
dxfc = (
    dxfc.style
    .set_table_styles(userdisplay.table_css_styles)
    .apply(highlight_max, arg=lake_average, subset=pcs_m_cells)
    .format(userdisplay.format_kwargs, precision=2)
)

glue('lake-municipal-results', dxfc , display=False)

In [17]:
# Municipal summary of the river samples: one row per city.
dxl = river_results['this_report'].df
# Work on a copy so that the 'city' column added below does not modify the
# DataFrame owned by the land-use report object.
dxf = river_results['this_land_use'].df_cont.copy()

# lookup table location -> city, one row per location
dxlc = dxl[['location', 'city', 'feature_type']].drop_duplicates('location')
dxlc = dxlc.set_index(['location'], drop=True)
dxf['city'] = dxf.location.apply(lambda x : dxlc.loc[x, 'city'])

# every feature variable is summed per city in the second aggregation below
sumlu = {x:'sum' for x in session_config.feature_variables}

# first aggregate to one row per sample (the feature variables are part of the group key),
# using the aggregation defined in session_config.unit_agg
dxf = dxf.groupby(['sample_id', 'city', *session_config.feature_variables], as_index=False).agg(session_config.unit_agg)

# then aggregate the samples of each city: total quantity, mean pcs/m,
# number of distinct samples and the sum of each feature variable
dxf = dxf.groupby(['city']).agg({'quantity':'sum', 'pcs/m':'mean', 'sample_id':'nunique', **sumlu})

# the feature variables were summed per city: divide by the number of samples
# to get the average value per sample
for alabel in session_config.feature_variables:
    dxf[alabel] = dxf[alabel]/dxf.sample_id

# row-wise total of every feature variable except the last one,
# removed again before the table is styled
dxf['check'] = dxf[session_config.feature_variables[:-1]].sum(axis=1)
dxfcr = geospatial.categorize_features(dxf, feature_columns=session_config.feature_variables)
dxfcr = dxfcr.rename(columns={'sample_id':'samples'}).drop('check', axis=1)

# Highlight the cities whose average pcs/m is above the average of the river samples.
# The threshold is taken from the river report so that the river table is compared
# to river data (the lake table uses the lake report in the same way).
river_average = river_results['this_report'].sampling_results_summary['average']
dxfcr = dxfcr.style.set_table_styles(userdisplay.table_css_styles)
dxfcr = dxfcr.apply(highlight_max, arg=river_average, subset=pd.IndexSlice[:, ['pcs/m']])
dxfcr = dxfcr.format(userdisplay.format_kwargs, precision=2)

glue('river-municipal-results', dxfcr, display=False)


::::::::::{tab-set}

:::::::::{tab-item} Lakes
```{glue} lake-municipal-results
```
:::::::::

:::::::::{tab-item} Rivers
```{glue} river-municipal-results
``` 
:::::::::

::::::::::