In [1]:
%load_ext watermark
import pandas as pd
import numpy as np
from scipy.stats import dirichlet
import logging

import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.colors
from matplotlib.colors import LinearSegmentedColormap, ListedColormap
import seaborn as sns


from myst_nb import glue
from IPython.display import display, Markdown

from scipy.stats import halfnorm, multinomial
import gridforecast as gfcast

# available data
# Column names of the survey records.
# NOTE(review): `columns` is not referenced in the visible part of this
# notebook — confirm it is still needed.
columns =  [
    'sample_id',
    'code',
    'quantity',
    'pcs/m',
    'feature_name',
    'location',
    'parent_boundary',
    'city', 
    'canton',
    'feature_type',
    'date'
]


import logging

# Send log records of level DEBUG and above to the file app.log; each record
# is written as "time - logger name - level - message".
logging.basicConfig(
    filename='app.log', 
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

# Module-level logger; it is passed to the gridforecast calls further down.
logger = logging.getLogger(__name__)

def create_jeffreys_prior_matrix(index_range, categories, epsilon=0.01):
    """Build a matrix whose rows repeat ``1 / (x + epsilon)`` for each index value.

    Args:
        index_range: Sized iterable of numeric index values; one row is
            produced per value, in iteration order.
        categories: Sized collection; only its length is used (one column
            per category).
        epsilon: Offset added to every index value before taking the
            reciprocal, so that an index of zero does not divide by zero.

    Returns:
        A float ``numpy.ndarray`` of shape
        ``(len(index_range), len(categories))`` in which every entry of row
        ``i`` equals ``1 / (x_i + epsilon)``, ``x_i`` being the i-th value of
        ``index_range``.
    """
    n_rows, n_cols = len(index_range), len(categories)

    # One reciprocal per index value, computed element by element.
    row_priors = np.fromiter(
        (1 / (value + epsilon) for value in index_range),
        dtype=float,
        count=n_rows,
    )

    # Spread each row's value across all category columns via broadcasting.
    prior_matrix = np.empty((n_rows, n_cols))
    prior_matrix[:, :] = row_priors.reshape(n_rows, 1)
    return prior_matrix

# Grid forecast class

The _grid forecaster_ refers to the methods defined in `gridforecast.py`. The main purpose of the _grid forecaster_ is to estimate the probability that a survey result _y_ from a collection of survey results _Y_ will exceed a value _x_ on the grid _X_ from 0 to max(_X_), for every _x_ spaced 0.1 apart, where max(_X_) is defined by _Y_. This is called a grid approximation. In this case we use a Bayesian framework and the _multinomial-Dirichlet_ conjugate pair to estimate the probabilities at each point of the grid. The complete method is defined in [grid approximation](#).

The grid forecast for any two arrays can be initiated by calling `gridforecast.MulitnomialDirichlet` and providing two `pd.Series` of float values (the prior and the likelihood), together with a label and a logger as shown in the example below. However, for reporting we use the grid forecast to supplement the [SurveyReport](surveyreporter) and the [LandUseReport](landusereporter).

```{note}
The grid forecast allows us to estimate the probability of a set of survey results given another set of survey results. Therefore, to interpret the results of a grid forecast the relationship between the two arrays must be well understood. Our focus has been on the structural and geographic similarities of the survey locations.
```

__Example creating reports and forecasts__

```python
# collecting the default data
data = session_config.collect_survey_data()

# the likelihood: the dates of the most recent samples
recent_dates = {'start':'2020-01-01', 'end':'2021-12-31'}
# the prior: the dates prior to the most recent samples
prior_dates = {'start':'2015-11-15', 'end':'2019-12-31'}
# the region of interest
canton = 'Vaud'

# the search parameters for the prior and likelihood
likelihood_params = {'canton':canton, 'date_range':recent_dates}
prior_params = {'canton':canton, 'date_range':prior_dates}

# verify the parameters exist in the data
# checking the parameters will verify that the requested data
# exists. If the query is possible it is executed and the value of
# comments='ok', if not empty arrays are returned with the message
# 'no survey results found'. The method returns the query data, a list
# of the sample locations and the comment.
likelihood_data, likelihood_locations, likelihood_comments = check_params(likelihood_params, data, logger)
prior_data, prior_locations, prior_comments = check_params(prior_params, data, logger)

# if there is data for both the likelihood and the prior
# make a survey report and a land use report for both sets of data
likelihood_report, likelihood_land_use = make_report_objects(likelihood_data)
prior_report, prior_land_use = make_report_objects(prior_data)

# make forecast from all the available likelihood data
forecast_object = MulitnomialDirichlet('comb', prior_report.sample_results['pcs/m'], likelihood_report.sample_results['pcs/m'], logger)

# make forecast limiting the likelihood to the 99th percentile
posterior_counts, comments = posterior_dirichlet_counts(likelihood_report.sample_results['pcs/m'], prior_report.sample_results['pcs/m'], max_range=0.99)

# forecasts from all the data
forecasted_samples = forecast_object.sample_posterior()
forecasted_summary = forecast_object.get_descriptive_statistics()

# forecasts limited to the 99th percentile
sample_values_99, posterior_99, summary_99 = gfcast.dirichlet_posterior(posterior_counts)
```

__Using a weighted prior__

To predict density given similar locations, use the land-use report from a set of survey results that does not contain any of the survey locations from the likelihood. The default method is to also select only values that have the same use case, i.e. parks, lakes or rivers.

```python
# determine the proportion of each land-use feature in the likelihood
weights = land_use_weights(likelihood_land_use, session_config.feature_variables)

# from the pool of available data select records that are not included in the likelihood
# in this case we eliminate the canton of interest, limit the date to the end date of the prior
# and create a survey report and land use report for *the other prior data*
other_data = data[(data.canton != canton)&(data['date'] <= prior_dates['end'])].copy()
other_prior_report, other_prior_land_use = gfcast.make_report_objects(other_data)

# using the weights from the likelihood and the other_prior_land_use
other_prior_data, prior_weights = select_prior_data_by_feature_weight(other_prior_land_use.df_cat, weights, session_config.feature_variables)
posterior_by_weight, weighted_comments = posterior_dirichlet_counts(likelihood_report.sample_results['pcs/m'].values, other_prior_data['pcs/m'].values)
posterior_sample_values, weighted_dist, weighted_summary = dirichlet_posterior(posterior_by_weight)

```


In [2]:
import session_config
import reports
import geospatial
import userdisplay as disp
import gridforecast as gfcast

# collecting the default data
data = session_config.collect_survey_data()
# move the index back into ordinary columns (pandas reset_index);
# assumes collect_survey_data returns a DataFrame — TODO confirm
data = data.reset_index()

# the likelihood: the dates of the most recent samples
recent_dates = {'start':'2020-01-01', 'end':'2021-12-31'}
# the prior: the dates prior to the most recent samples
prior_dates = {'start':'2015-11-15', 'end':'2019-12-31'}
# the region of interest
canton = 'Vaud'

# the search parameters for the prior and likelihood:
# same canton, different date ranges
likelihood_params = {'canton':canton, 'date_range':recent_dates}
prior_params = {'canton':canton, 'date_range':prior_dates}

# verify the parameters exist in the data
# checking the parameters will verify that the requested data
# exists. If the query is possible it is executed and the value of
# comments='ok', if not empty arrays are returned with the message
# 'no survey results found'. The method returns the query data, a list
# of the sample locations and the comment.
likelihood_data, likelihood_locations, likelihood_comments = gfcast.check_params(likelihood_params, data, logger)
prior_data, prior_locations, prior_comments = gfcast.check_params(prior_params, data, logger)

# if there is data for both the likelihood and the prior
# make a survey report and a land use report for both sets of data
likelihood_report, likelihood_land_use = gfcast.make_report_objects(likelihood_data)
prior_report, prior_land_use = gfcast.make_report_objects(prior_data)

# make forecast from all the available likelihood data
# argument order here: label, prior values, likelihood values, logger
forecast_object = gfcast.MulitnomialDirichlet('comb', prior_report.sample_results['pcs/m'], likelihood_report.sample_results['pcs/m'], logger)

# make forecast limiting the likelihood to the 99th percentile
# note the argument order: likelihood first, prior second
# (the class call above takes the prior first)
posterior_counts, comments = gfcast.posterior_dirichlet_counts(likelihood_report.sample_results['pcs/m'], prior_report.sample_results['pcs/m'], max_range=0.99)

# forecasts from all the data
forecasted_samples = forecast_object.sample_posterior()
forecasted_summary = forecast_object.get_descriptive_statistics()

# forecasts limited to the 99th percentile
sample_values_99, posterior_99, summary_99 = gfcast.dirichlet_posterior(posterior_counts)

## Grid forecaster methods

`gridforecast.MulitnomialDirichlet` is a class in `gridforecast.py`. Its built-in methods are designed to generate forecasts under a variety of scenarios and to provide the basic elements needed to evaluate those forecasts. The examples below use the `forecast_object` created in the previous example.


__list of methods__

1. MultinomialDirichlet
   * compute_grid
   * compute_counts
   * compute_posterior_params
   * sample_posterior
   * compute_percentiles
   * compute_hdi
   * compute_expected_average
   * probability_of_x
   * get_descriptive_statistics
2. select_prior_data_by_feature_weight
3. posterior_dirichlet_counts
4. dirichlet_posterior





### The grid size

The grid size for each combination is based on the maximum value of either the likelihood or the prior. 

```python
forecast_object.compute_grid()
``` 

In [3]:
# Display the grid of candidate values the forecast is evaluated on.
forecast_object.compute_grid()

array([0.000e+00, 1.000e-02, 2.000e-02, ..., 7.707e+01, 7.708e+01,
       7.709e+01])

### The counts

The number of times that a survey result was equal to each value on the grid (including zero) can be accessed with `forecast_object.prior` or `forecast_object.compute_counts(forecast_object.prior_data)`

```python
forecast_object.compute_counts(forecast_object.prior_data)
``` 

In [4]:
# Display the per-grid-point counts computed from the prior data.
forecast_object.compute_counts(forecast_object.prior_data)

array([0, 0, 0, ..., 0, 0, 1])

### The posterior parameters

The parameters for the Dirichlet posterior

```python
forecast_object.compute_posterior_params()
``` 

In [5]:
# Display the parameters of the Dirichlet posterior.
forecast_object.compute_posterior_params()

array([0.01, 0.01, 0.01, ..., 0.01, 0.01, 1.  ])

### Sample the posterior distribution

Sample the posterior distribution

```python
forecast_object.sample_posterior()
``` 

In [6]:
# Draw and display a sample of values from the posterior distribution.
forecast_object.sample_posterior()

array([ 0.44,  0.55,  0.71,  0.81,  0.85,  1.26,  1.26,  1.41,  1.43,
        1.5 ,  1.59,  1.59,  1.65,  1.75,  1.76,  1.76,  1.86,  1.89,
        1.89,  2.03,  2.1 ,  2.15,  2.21,  2.4 ,  2.45,  2.56,  2.57,
        2.57,  2.57,  2.57,  2.64,  2.69,  2.74,  3.41,  3.46,  3.73,
        4.01,  4.76,  4.76,  5.39,  5.43,  5.59,  6.29,  6.29,  6.43,
        6.55,  6.85,  6.94,  6.94,  6.95,  6.97,  7.26,  7.4 ,  7.68,
        8.4 ,  8.92,  9.57,  9.69, 10.07, 10.56, 10.64, 10.67, 10.67,
       10.67, 11.62, 12.37, 12.37, 12.61, 13.6 , 15.74, 18.36, 18.61,
       18.9 , 19.26, 22.38, 22.38, 23.73, 27.33, 28.13, 31.53, 31.98,
       33.55, 34.73, 35.57, 37.77, 39.36, 39.54, 40.69, 41.74, 44.1 ,
       46.96, 47.76, 50.51, 51.84, 51.92, 53.96, 59.09, 70.24, 70.24,
       75.28])

### The 90% interval of the predictions

The 90% interval of the predictions

```python
forecast_object.compute_percentiles()
``` 

In [7]:
# Display the percentiles that summarise the 90% interval of the predictions.
forecast_object.compute_percentiles()

array([ 0.74  ,  2.8   ,  7.065 , 22.18  , 58.5515])

### The 90% HDI

The 90% highest density interval

```python
forecast_object.compute_hdi()
``` 

In [8]:
# Display the 90% highest density interval as a (lower, upper) pair.
forecast_object.compute_hdi()

(0.18, 58.83)

### The expected mean

The expected average of the posterior: one value for each point on the grid

```python
forecast_object.compute_expected_average()
``` 

In [9]:
# Display the expected average; the output holds one value per grid point —
# presumably the posterior mean probability of each point; confirm in
# gridforecast.py.
forecast_object.compute_expected_average()

array([3.26850793e-05, 3.26850793e-05, 3.26850793e-05, ...,
       3.26850793e-05, 3.26850793e-05, 3.26850793e-03])

### The probability of x

The chance that a result will exceed a given value

```python
# in this case we are asking what is the chance of finding
# at least one piece per meter
a, b, c = forecast_object.probability_of_x(1)
sum(a[b[0]:])
``` 

In [10]:
# Chance of finding at least one piece per meter.
# probability_of_x returns three values; only `a` and `b` are used here.
# Presumably `a` holds the probability of each grid point and `b[0]` is the
# grid index of x — confirm in gridforecast.py.
a, b, c = forecast_object.probability_of_x(1)
# add up everything from index b[0] to the end of `a`
sum(a[b[0]:])

0.9288737193351585

### The descriptive statistics

The average, hdi and the 90% range of the expected distribution

```python
forecast_object.get_descriptive_statistics()
``` 

In [11]:
# Display the summary dictionary of the forecast
# (code, average, hdi, range and max_observed).
forecast_object.get_descriptive_statistics()

{'code': 'comb',
 'average': 17.717200000000002,
 'hdi': (0.18, 65.66),
 'range': array([ 0.64  ,  2.86  ,  6.9   , 19.17  , 59.2875]),
 'max_observed': 77.1}

### Select prior data by feature weight


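Select survey results from other locations in proportion to the land-use weights of the observations of interest. The selected results can then be used as a weighted prior.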

```python
# get the land use weights from the observations of interest
weights = land_use_weights(likelihood_land_use, feature_variables)

# prior data does not include locations in canton
# the surveys are limited to the prior date as defined
other_data = data[(data.canton != canton)&(data['date'] <= prior_dates['end'])].copy()
other_report, landuse_from_other = gfcast.make_report_objects(other_data)

# use the land use object from the other data
# and the weights from the likelihood to draw random
# samples from the other data
the_random_samples, w = select_prior_data_by_feature_weight(landuse_from_other.df_cat, weights, feature_variables)
``` 

In [12]:
# get the land use weights from the observations of interest
weights = gfcast.land_use_weights(likelihood_land_use, session_config.feature_variables)

# prior data does not include locations in canton
# the surveys are limited to the prior date as defined
# (.copy() so that later changes do not touch `data`)
other_data = data[(data.canton != canton)&(data['date'] <= prior_dates['end'])].copy()
other_report, landuse_from_other = gfcast.make_report_objects(other_data)

# use the land use object from the other data
# and the weights from the likelihood to draw random
# samples from the other data
# NOTE(review): the `df_cat` attribute of the land use object is what gets
# passed here, not the object itself — confirm its contents in the
# land-use report code.
the_random_samples, new_weights = gfcast.select_prior_data_by_feature_weight(landuse_from_other.df_cat, weights, session_config.feature_variables)

In [13]:
# Preview the first rows of the samples selected from the other data.
the_random_samples.head()

Unnamed: 0,sample_id,location,date,quantity,pcs/m,public services,streets,orchards,vineyards,buildings,forest,undefined,buildings_public services
0,"('aare_kehrsatz_stolten', '2017-07-03')",aare_kehrsatz_stolten,2017-07-03,5,0.1,1,3,1,1,2,1,3,1
1,"('birs_basel_laderachs', '2018-03-24')",birs_basel_laderachs,2018-03-24,31,1.12,1,3,1,1,4,1,1,1
2,"('limmat_zurich_mortensena_meiera', '2017-08-07')",limmat_zurich_mortensena_meiera,2017-08-07,39,1.32,1,3,1,1,5,1,1,1
3,"('birs_reinach_dinuccin', '2017-05-29')",birs_reinach_dinuccin,2017-05-29,138,3.86,1,3,1,1,4,2,1,1
4,"('vierwaldstattersee_weggis_schoberls_1', '201...",vierwaldstattersee_weggis_schoberls_1,2018-01-27,5,0.21,1,1,1,1,2,1,4,1


### Posterior Dirichlet counts

The posterior distribution from the likelihood and the weighted prior.

```python
# the likelihood and the weighted prior as arrays of pcs/m values
likelihood = likelihood_report.sample_results['pcs/m'].values
prior = the_random_samples['pcs/m'].values
posterior_by_weight, comments = posterior_dirichlet_counts(likelihood, prior)
sample_values, adist, summary = dirichlet_posterior(posterior_by_weight)
``` 

In [14]:
# the observed pcs/m values of the likelihood, as a plain array
likelihood = likelihood_report.sample_results['pcs/m'].values
# the pcs/m values of the samples selected by land-use weight: the prior
prior = the_random_samples['pcs/m'].values
# combine likelihood and prior (likelihood first, prior second)
posterior_by_weight, comments = gfcast.posterior_dirichlet_counts(likelihood, prior)
# dirichlet_posterior returns three values; only the summary is displayed
sample_values, adist, summary = gfcast.dirichlet_posterior(posterior_by_weight)
summary

{'range': array([0.3  , 0.675, 2.3  , 4.7  , 9.97 ]),
 'nsamples': 100,
 'average': 3.6220000000000003,
 'hdi': (0.1, 15.100000000000001)}