In [1]:
%load_ext watermark
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.colors
from matplotlib.colors import LinearSegmentedColormap, ListedColormap
import seaborn as sns

import session_config
import reports
import geospatial
import display as disp
from myst_nb import glue
from IPython.display import display, Markdown

# available data
surveys = session_config.collect_survey_data()

# boundaries / search parameters: the column to filter on and the value to keep
feature_type = 'feature_name'
feature_name = 'lac-leman'

# Keep the rows of `surveys` whose `feature_type` column equals `feature_name`;
# .copy() hands the report its own frame instead of a slice of `surveys`.
# NOTE(review): the report is named `vaud_report` but is filtered on
# 'lac-leman' — confirm the name still describes the selection.
df = surveys[surveys[feature_type] == feature_name].copy()
vaud_report = reports.SurveyReport(dfc=df)

# the parameters for the landuse report: the report's sample results and the
# topo data collected for the unique locations in those results
target_df = vaud_report.sample_results
features = geospatial.collect_topo_data(locations=target_df.location.unique())
land_use_report = geospatial.LandUseReport(target_df, features)

# creates an array of tuples of the correlated pairs
correlated_pairs = land_use_report.correlated_pairs()

# pass the correlated pairs to combine features method
# this will categorize the features and combine the correlated pairs
# into new columns
# (the return value is not used here; presumably the report is updated in
# place — confirm against geospatial.LandUseReport)
land_use_report.combine_features(correlated_pairs)

# Display reports

Cumulative reports display the test statistic of sample results between elements of a geographic or administrative region. Cumulative reports are visualized with heat maps. The starting point for cumulative reports is a valid `ReportClass` object. The granularity of the results is, at a minimum, the municipal level: the lowest recognized administrative unit.

## A top level description

A short and detailed summary of the report can be created by synthesising or displaying four tables from the `SurveyReport`. The sampling result summary returns the components of the quantiles and the descriptive statistics.

```python
sampling_summary = vaud_report.sampling_results_summary
a, b, c, d = disp.sampling_result_summary(sampling_summary, 'en')

vaud_boundaries = vaud_report.administrative_boundaries()
disp.boundaries(vaud_boundaries, 'en')

vaud_features = vaud_report.feature_inventory()
disp.feature_inventory(vaud_features, 'en')
``` 

In [2]:
# The summary helper returns four pieces; they are rendered as two Markdown
# blocks (a+b and c+d), each joined by a newline.
sampling_summary = vaud_report.sampling_results_summary
a, b, c, d = disp.sampling_result_summary(sampling_summary, 'en')

# Each glue() call stores its object under the key that the matching
# `{glue:}` directive in the grid below refers to.
# NOTE(review): display=False presumably keeps the object out of this cell's
# own output — confirm against the myst_nb documentation.
glue('summary', Markdown(f'{a}\n{b}'), display=False)

glue('dist', Markdown(f'{c}\n{d}'), display=False)

vaud_boundaries = vaud_report.administrative_boundaries()
glue('boundaries', Markdown(disp.boundaries(vaud_boundaries, 'en')), display=False)

vaud_features = vaud_report.feature_inventory()
glue('features', Markdown(disp.feature_inventory(vaud_features, 'en')), display=False)

::::{grid} 1 2 2 2

:::{grid-item}

_Summary of samples_

```{glue:} summary
```
:::

:::{grid-item}

_Distribution of sample totals_

```{glue:} dist
```
:::

:::{grid-item}

_The administrative boundaries_

```{glue:} boundaries
```
:::

:::{grid-item}

_The rivers, lakes and parks_

```{glue:} features
```
:::

::::

__Recall:__ the place names can always be printed.


## The most common objects

The most common objects are a combination of the top ten most abundant objects and those objects that are found in more than 50% of the samples. 

```python

vaud_most_common = vaud_report.object_summary().reset_index()
df, weight = disp.most_common(vaud_most_common)

```

In [3]:
# Object summary of the report, with its index moved back into regular columns.
vaud_most_common = vaud_report.object_summary().reset_index()
# NOTE(review): this rebinds `df`, which the first cell assigned to the
# filtered survey frame; any later cell that reads `df` now gets this table.
df, weight = disp.most_common(vaud_most_common)
df

## Correlation matrix

```python
amatrix = land_use_report.correlation_matrix()
disp.correlation_matrix(amatrix)
```

In [4]:
# Correlation matrix from the land-use report, rendered by the display helper.
amatrix = land_use_report.correlation_matrix()
disp.correlation_matrix(amatrix)

## Land use profile

In [5]:
# Number of samples per feature; `lprofile` is reused by the streets profile cell.
lprofile = land_use_report.n_samples_per_feature()
# Selects the columns named by every entry of session_config.feature_variables
# except the last one.
# NOTE(review): presumably the omitted entry is 'streets', shown separately
# further down — confirm against session_config.
disp.landuse_profile(lprofile[session_config.feature_variables[:-1]], nsamples=vaud_report.number_of_samples)

## Litter rate per feature

In [6]:
# Rate per feature; `rpf` is reused by the streets rate cell.
rpf = land_use_report.rate_per_feature()
# Selects the rows labelled by every entry of session_config.feature_variables
# except the last one.
disp.litter_rates_per_feature(rpf.loc[session_config.feature_variables[:-1]])

## Streets profile

In [7]:
# The 'streets' column of the sample-count table, transposed so it becomes a
# single row before display.
disp.street_profile(lprofile[['streets']].T, session_language='en', nsamples=vaud_report.number_of_samples)

In [8]:
# The 'streets' row of the rate table; no transpose is needed because it is
# selected by row label.
disp.street_profile(rpf.loc[['streets']], session_language='en', caption='rate')

In [9]:
# Working frame for the exceedance calculations below, which read its
# 'buildings' and 'pcs/m' columns.
df_l = land_use_report.df_cat

In [10]:
# Evenly spaced grid starting at 0 with step 0.2 and stopping before 50, used
# as the index of an otherwise empty frame.
# NOTE(review): the_grid, my_grids and codes are not read by any later cell
# shown in this notebook — confirm they are still needed.
the_grid = np.arange(0, 50, step=.2)
my_grids = pd.DataFrame(index=the_grid)
# `code` values of the ten rows with the largest `quantity`.
codes = vaud_most_common.sort_values('quantity', ascending=False)[:10].code

In [12]:
def calculate_exceedance_matrix(pcs_values, index_range, categories):
    """Return, per category, the share of samples that exceed each grid value.

    Parameters
    ----------
    pcs_values : iterable of array-like
        One 1-D collection of sample values per category, in column order.
        Lists, tuples and NumPy arrays are all accepted.
    index_range : array-like
        Grid of threshold values; the result has one row per grid value.
    categories : sized
        Only its length is used: it fixes the number of result columns.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(index_range), len(categories))``. Entry
        ``[r, c]`` is the fraction of ``pcs_values[c]`` that is strictly
        greater than ``index_range[r]``. Columns whose category has no
        samples (or no entry in ``pcs_values``) are left at 0.
    """
    thresholds = np.asarray(index_range, dtype=float)
    result_array = np.zeros((len(thresholds), len(categories)))

    for column, values in enumerate(pcs_values):
        # Coerce first: plain lists do not support the [:, np.newaxis]
        # indexing used for the broadcast comparison below.
        samples = np.asarray(values, dtype=float)
        total_samples = samples.size
        if total_samples == 0:
            # No samples: leave the column at 0 and avoid dividing by zero.
            continue
        # (n_samples, 1) > (n_thresholds,) broadcasts to one row per sample.
        exceeds = samples[:, np.newaxis] > thresholds
        result_array[:, column] = exceeds.sum(axis=0) / total_samples

    return result_array

# Define the index range (grid of pcs/m thresholds) and the category labels
index_range = np.arange(0, 50.1, 0.2)
categories = range(1, 6)

# One array of pcs/m values per 'buildings' category. Iterating `categories`
# (rather than repeating the literal range) keeps the filter and the number of
# matrix columns in step.
category_pcs_values = [df_l[df_l['buildings'] == category]['pcs/m'].values for category in categories]

# Rows follow index_range, columns follow categories
buildings_matrix = calculate_exceedance_matrix(category_pcs_values, index_range, categories)
# buildings_matrix[:5, :]  # Show the first 5 rows


# # To demonstrate, let's manually prepare an array of pcs/m values for each category of 'buildings'
# category_pcs_values = [df_l[df_l['buildings'] == i]['pcs/m'].values for i in range(1, 6)]

# buildings_matrix = feature_rate_matrix(category_pcs_values)
# # buildings_matrix[:5, :]  # Show the first 5 rows


In [13]:
def create_prior_matrix(index_range, categories, threshold=10):
    """Build a two-regime exponential-decay prior, repeated for every category.

    Parameters
    ----------
    index_range : array-like
        Grid values ``x``; the result has one row per value.
    categories : sized
        Only its length is used: it fixes the number of (identical) columns.
    threshold : number, default 10
        Grid value at which the decay regime switches.

    Returns
    -------
    numpy.ndarray
        Shape ``(len(index_range), len(categories))``. A row holds
        ``exp(-0.05 * x)`` when ``x <= threshold`` and
        ``exp(-0.5 * (x - threshold))`` when ``x > threshold``.

    NOTE(review): the second regime restarts from exp(0), so the value jumps
    back up towards 1 just past ``threshold`` instead of continuing from
    ``exp(-0.05 * threshold)`` — confirm this discontinuity is intended.
    """
    slow_rate = 0.05  # applied up to and including the threshold
    fast_rate = 0.5   # applied beyond the threshold

    grid = np.asarray(index_range)
    beyond = grid > threshold

    # Per-row decay rate and the point the decay is measured from.
    rate = np.where(beyond, fast_rate, slow_rate)
    origin = np.where(beyond, threshold, 0)
    decay = np.exp(-rate * (grid - origin))

    # Broadcast the single column of values across all categories.
    prior_matrix = np.empty((len(grid), len(categories)))
    prior_matrix[:] = decay[:, np.newaxis]
    return prior_matrix

# Example usage with the index_range and categories defined earlier
prior_matrix = create_prior_matrix(index_range, categories)
prior_matrix[15:25, :]  # Show rows 15 through 24 (ten rows), not the first five


In [14]:
def create_jeffreys_prior_matrix(index_range, categories, epsilon=0.01):
    """Build a ``1 / (x + epsilon)`` prior, repeated for every category.

    Parameters
    ----------
    index_range : sized iterable of numbers
        Grid values ``x``; the result has one row per value.
    categories : sized
        Only its length is used: it fixes the number of (identical) columns.
    epsilon : float, default 0.01
        Offset added to ``x`` so that ``x == 0`` does not divide by zero.

    Returns
    -------
    numpy.ndarray
        Shape ``(len(index_range), len(categories))``; every column holds
        ``1 / (x + epsilon)`` for the corresponding grid value.
    """
    jeffreys_prior_matrix = np.zeros((len(index_range), len(categories)))

    # One weight per grid value, computed element by element.
    weights = np.asarray([1 / (value + epsilon) for value in index_range], dtype=float)

    # Broadcast the column of weights across all categories.
    jeffreys_prior_matrix[:] = weights[:, np.newaxis]
    return jeffreys_prior_matrix

# Prior evaluated on the current `index_range`, wrapped in a frame with one
# column per category and the grid values as the index.
prior = create_jeffreys_prior_matrix(index_range, categories)
f = pd.DataFrame(prior, columns=categories, index=index_range)

In [15]:
from scipy.stats import weibull_min
from scipy.optimize import root_scalar

# Given percentiles and their values
p75 = 8.92
p95 = 25
scale_max = 50  # The range to model


def estimate_weibull_params(p75, p95):
    """Return the Weibull ``(shape, scale)`` whose 75th/95th percentiles match.

    The Weibull CDF is ``F(x) = 1 - exp(-(x / scale) ** k)``, so the two
    percentile conditions are::

        (p75 / scale) ** k = ln 4      # -ln(1 - 0.75)
        (p95 / scale) ** k = ln 20     # -ln(1 - 0.95)

    Dividing one by the other removes ``scale`` and gives ``k`` in closed
    form; ``scale`` then follows from the first condition. No root finding is
    needed, and both percentiles are honoured. (Solving only the first
    condition with ``scale`` pinned to ``p75 / ln 4`` always yields k = 1 and
    ignores ``p95``.)

    Parameters
    ----------
    p75 : float
        Value of the 75th percentile; must be positive.
    p95 : float
        Value of the 95th percentile; must be greater than ``p75``.

    Returns
    -------
    tuple of float
        ``(k, scale)`` for use as ``weibull_min(k, scale=scale)``.

    Raises
    ------
    ValueError
        If the percentiles do not satisfy ``0 < p75 < p95``.
    """
    if not 0 < p75 < p95:
        raise ValueError("percentiles must satisfy 0 < p75 < p95")

    log_q75 = np.log(4)
    log_q95 = np.log(20)

    k = np.log(log_q95 / log_q75) / np.log(p95 / p75)
    scale = p75 / log_q75 ** (1 / k)

    return float(k), float(scale)


# Calculate parameters
k, scale = estimate_weibull_params(p75, p95)

# Create the prior grid (same values as the grid used for the exceedance matrix)
index_range = np.arange(0, scale_max + 0.1, 0.2)
# A shape parameter below 1 makes the density unbounded at x = 0; silence the
# resulting divide warning and handle the non-finite value explicitly below.
with np.errstate(divide='ignore', invalid='ignore'):
    prior_pdf = weibull_min.pdf(index_range, k, scale=scale)

# Normalize the prior grid so its largest finite value is 1. Non-finite
# entries (the pole at x = 0 when k < 1) are capped at that peak; dividing by
# an infinite maximum would otherwise turn the whole grid into 0 / NaN.
finite_mask = np.isfinite(prior_pdf)
finite_peak = prior_pdf[finite_mask].max()
prior_pdf = np.where(finite_mask, prior_pdf, finite_peak) / finite_peak

prior_pdf.shape


In [16]:
# First column of the exceedance matrix, i.e. the first 'buildings' category.
# NOTE(review): this prints the whole column (one value per grid point);
# consider slicing it if only a preview is wanted.
buildings_matrix[:, 0]

In [17]:
# Compare the peak-normalised Weibull prior with the observed exceedance
# shares of the first 'buildings' category on the same pcs/m grid.
fig, ax = plt.subplots()

sns.scatterplot(x=index_range, y=prior_pdf, ax=ax, label='Weibull prior (scaled to peak = 1)')
sns.scatterplot(x=index_range, y=buildings_matrix[:, 0], ax=ax, label='Share of samples above x (buildings = 1)')

# Labels, title and legend so the two series can be told apart and the
# figure reads on its own.
ax.set(
    xlabel='pcs/m',
    ylabel='relative density / share of samples',
    title='Prior vs. observed exceedance, buildings category 1',
)
ax.legend()

plt.show()