In [1]:
%load_ext watermark
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.colors
from matplotlib.colors import LinearSegmentedColormap, ListedColormap
import seaborn as sns

import session_config
import reports
import geospatial
import display as disp
from myst_nb import glue
from IPython.display import display, Markdown

# available data
surveys = session_config.collect_survey_data()

# boundaries / search parameters
feature_type = 'feature_name'
feature_name = 'lac-leman'

df = surveys[surveys[feature_type] == feature_name].copy()
vaud_report = reports.SurveyReport(dfc=df)

# the parameters for the landuse report
target_df = vaud_report.sample_results
features = geospatial.collect_topo_data(locations=target_df.location.unique())
land_use_report = geospatial.LandUseReport(target_df, features)

# creates an array of tuples of the correlated pairs
correlated_pairs = land_use_report.correlated_pairs()

# pass the correlated pairs to combine features method
# this will categorize the features and combine the correlated pairs
# into new columns
land_use_report.combine_features(correlated_pairs)

In [2]:
df_l = land_use_report.df_cat
vaud_most_common = vaud_report.object_summary().reset_index()

In [3]:
the_grid = np.arange(0, 30, step=.2)
my_grids = pd.DataFrame(index=the_grid)
codes = vaud_most_common.sort_values('quantity', ascending=False)[:10].code

In [4]:
def calculate_exceedance_matrix(pcs_values, index_range, categories):
    # Initialize an array to store results for the categories
    result_array = np.zeros((len(index_range), len(categories)))
    
    for i, category_pcs_values in enumerate(pcs_values):
        # Total samples for the current category
        total_samples = len(category_pcs_values)
        if total_samples == 0:
            # Handle case with no samples to avoid division by zero
            continue
        # Use broadcasting to compare pcs_values against all index values at once
        exceedance_matrix = category_pcs_values[:, np.newaxis] > index_range
        # Sum over rows to count the exceedances for each index value and normalize
        exceedance_counts = exceedance_matrix.sum(axis=0) / total_samples
        # Store in the result array
        result_array[:, i] = exceedance_counts
    
    return result_array

# Define the index range and categories
index_range = np.arange(0, 30.1, 0.2)
categories = range(1, 6)

# Prepare an array of pcs/m values for each category of 'buildings'
category_pcs_values = [df_l[df_l['buildings'] == i]['pcs/m'].values for i in range(1, 6)]

# Example usage with the new function signature
buildings_matrix = calculate_exceedance_matrix(category_pcs_values, index_range, categories)
# buildings_matrix[:5, :]  # Show the first 5 rows


# # To demonstrate, let's manually prepare an array of pcs/m values for each category of 'buildings'
# category_pcs_values = [df_l[df_l['buildings'] == i]['pcs/m'].values for i in range(1, 6)]

# buildings_matrix = feature_rate_matrix(category_pcs_values)
# # buildings_matrix[:5, :]  # Show the first 5 rows


In [17]:
import numpy as np
from scipy.stats import halfnorm

# Define parameters
sigma = np.std(category_pcs_values[4])  # Scale parameter from your data's standard deviation
range_max = 30  # Max range
# index_range = np.arange(0, range_max + 0.1, 0.2)

# Calculate the Half-Normal PDF
pdf_values = halfnorm.cdf(index_range, scale=sigma)

pdf_values = [1-x for x in pdf_values]

# Optional: Scale PDF by a factor (e.g., 100) if direct probabilities are too low
# pdf_values_scaled = pdf_values * 100  # Scale factor can be adjusted as needed

# Use the scaled PDF values directly as your prior without normalizing them to sum to 1
# This retains the natural shape and relative probabilities
# prior_matrix = np.tile(pdf_values_scaled, (4, 1)).T  # Assuming 4 categories

# print(prior_matrix[:10, :])  # Print the first 10 rows to check the probabilities
# pdf_values[-20:]

In [20]:
t = buildings_matrix[:, 4] * pdf_values
q = buildings_matrix[:, 4] * beta_p['beta_p'].values

In [21]:
newp = t/sum(t)
from scipy.stats import multinomial

rv = multinomial(1, newp)
newy = rv.rvs(100)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [148]:
def collect_observed_by_category(df, feature: str = None, value: str = 'pcs/m', categories: np.array = range(1, 6)):
    observed = [df_l[df_l[feature] == i][value].values for i in categories]
    return observed

def calculate_prior(sigma, index_range):
    # sigma = np.std(prior_observed)  # Scale parameter from your data's standard deviation
    # range_max = 30  # Max range
    # index_range = np.arange(0, range_max + 0.1, 0.2)

    # Calculate the Half-Normal PDF
    pdf_values = halfnorm.cdf(index_range, scale=sigma)

    pdf_values = [1-x for x in pdf_values]
    return pdf_values

def make_posterior(observed, prior):

    unnormalized = observed * prior
    normalized = unnormalized/sum(unnormalized)
    posterior = multinomial(1, normalized)
    return posterior
    

def draw_and_collect_sample_results(posterior, index_vals):

    results = []
    for element in samples:
        anindex = np.nonzero(element)
        res = index_vals[anindex[0]]
        results.extend(res)

    return results

observed_vals = collect_observed_by_category(df_l, feature='forest')

observed_value_matrix = calculate_exceedance_matrix(observed_vals, index_range, categories)
sigma_historical = 110
# prior = calculate_prior(sigma, index_range)


posteriors = []

for category in [0, 1, 2, 3, 4]:
    

    d = observed_value_matrix[:, category]
    if sum(d) == 0:
        data = observed_vals[:, 0:category].flatten()
        
    sigma = np.std(observed_vals[category])
    print(sigma)
    if sigma < 0.01:
        sigma = sigma_historical
    prior = calculate_prior(sigma, index_range)
        
    posterior = make_posterior(d, prior)
    samples = posterior.rvs(100)
    samples = [index_range[np.argmax(x)] for x in samples]
    posteriors.append(samples)

for element in posteriors:
    print(np.quantile(element, session_config.report_quantiles))    

10.105044108098282
6.8232567173897944
16.337210874919794
nan


ValueError: operands could not be broadcast together with shapes (453,) (151,) 

In [145]:
observed_value_matrix

array([[1.        , 1.        , 1.        , 0.        , 0.        ],
       [1.        , 0.95454545, 1.        , 0.        , 0.        ],
       [0.99537037, 0.86363636, 1.        , 0.        , 0.        ],
       [0.98611111, 0.81818182, 1.        , 0.        , 0.        ],
       [0.97685185, 0.72727273, 1.        , 0.        , 0.        ],
       [0.9537037 , 0.59090909, 1.        , 0.        , 0.        ],
       [0.93981481, 0.59090909, 1.        , 0.        , 0.        ],
       [0.93055556, 0.54545455, 1.        , 0.        , 0.        ],
       [0.91203704, 0.45454545, 1.        , 0.        , 0.        ],
       [0.89814815, 0.45454545, 1.        , 0.        , 0.        ],
       [0.87037037, 0.45454545, 1.        , 0.        , 0.        ],
       [0.85185185, 0.45454545, 1.        , 0.        , 0.        ],
       [0.81481481, 0.45454545, 1.        , 0.        , 0.        ],
       [0.79166667, 0.45454545, 0.92307692, 0.        , 0.        ],
       [0.74537037, 0.40909091, 0.

In [146]:
observed_vals

[array([13.6 ,  5.87, 50.06, 15.74, 12.99,  3.81,  4.94,  2.14,  2.56,
        17.88,  2.69,  2.4 ,  0.88,  1.5 ,  1.89,  1.13,  0.49,  0.58,
         1.29,  0.22, 34.97, 17.54, 18.97,  9.69,  1.65,  0.64,  3.33,
         2.74,  1.85,  8.38,  2.03,  3.18,  7.26,  4.74,  0.85,  4.21,
         1.6 ,  6.98,  9.65,  2.56,  2.39,  1.59,  2.87,  3.73,  2.13,
         4.93,  4.88, 10.67,  4.35,  5.36, 14.53,  6.98,  2.57,  5.75,
         8.4 ,  5.07,  4.22,  5.89,  6.37,  1.26,  1.9 ,  3.11,  3.59,
         5.43, 12.37,  3.26,  4.22,  5.81,  8.65,  2.21,  2.64,  3.3 ,
         1.76,  3.89,  6.14, 15.42,  5.76,  4.85,  5.63,  5.6 ,  7.16,
         8.92, 11.77, 18.61, 13.75,  8.55, 10.07,  6.25,  4.02,  2.9 ,
         3.35,  6.5 ,  7.68,  9.23, 11.36, 26.82, 14.83, 18.36,  2.68,
        33.55, 12.25, 11.72,  4.43, 15.45,  2.23,  3.37,  2.86,  0.92,
         1.58,  2.32,  2.3 ,  1.99,  1.08,  1.75, 22.38, 24.61,  9.05,
        14.39,  6.15,  5.31, 10.08,  9.1 , 39.56,  9.57, 28.13,  8.92,
      

In [128]:
def hdi(samples, cred_mass=.75):
    # Sort the samples
    sorted_samples = np.sort(samples)
    
    # Calculate the number of included samples in the interval
    n_samples = len(sorted_samples)
    n_cred_samples = int(np.floor(cred_mass * n_samples))
    
    # Compute the width of intervals that include the desired number of samples
    interval_widths = sorted_samples[n_cred_samples:] - sorted_samples[:n_samples - n_cred_samples]
    
    # Find the shortest interval
    min_idx = np.argmin(interval_widths)
    
    # Return the HDI
    hdi_min = sorted_samples[min_idx]
    hdi_max = sorted_samples[min_idx + n_cred_samples]
    
    return hdi_min, hdi_max

for element in posteriors:
    print(hdi(element))


(0.0, 8.200000000000001)
(0.2, 15.8)
(0.2, 3.8000000000000003)
(0.0, 6.800000000000001)
(0.0, 11.200000000000001)


In [114]:
[np.median(observed_vals[x]) for x in [0, 1, 2, 3, 4]]

[2.69, 15.879999999999999, 5.35, 4.285, 6.2]

In [129]:
prior

[1.0,
 0.992746603901876,
 0.9854938072290768,
 0.9782422092583203,
 0.9709924089691729]

In [None]:
fig, ax = plt.subplots()

# sns.scatterplot(x=index_range, y = prior_pdf, ax=ax)
sns.scatterplot(x=index_range, y=buildings_matrix[:, 4], color='blue', ax=ax)
sns.scatterplot(x=index_range, y=pdf_values, color='black', ax=ax)
sns.scatterplot(x=index_range, y=t, color='magenta', ax=ax)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import halfnorm

# Define parameters
sigma = 6.4  # Scale parameter
range_max = 50  # Maximum range for the x-values
index_range = np.arange(0, range_max + 0.1, 0.2)  # Values from 0 to 50

# Calculate the Half-Normal CDF
cdf_values = halfnorm.cdf(index_range, scale=sigma)

# Transform to 'exceedance' probabilities
exceedance_probabilities = 1 - cdf_values

# Visualization
plt.figure(figsize=(8, 4))
plt.plot(index_range, exceedance_probabilities, label='1 - CDF (Half-Normal)')
plt.title('Exceedance Probability for Half-Normal Distribution')
plt.xlabel('Value')
plt.ylabel('Probability of Exceeding Value')
plt.grid(True)
plt.legend()
plt.show()


In [None]:
import pymc