In [1]:
import pandas as pd
import numpy as np
# import matplotlib as mpl
import matplotlib.pyplot as plt

from userdisplay import highlight_max, style_negative

import seaborn as sns

import matplotlib.patches as mpatches
import session_config
from session_config import  collect_survey_data, feature_variables
from reports import make_report_objects, reports_and_forecast
from reports import admin_report, features_present, histograms_standard
from reports import ecdf_plots_standard, scatter_plot_standard
from reports import labels_for_display, make_standard_report, make_report_objects
# import userdisplay
# import geospatial
import gridforecast as gfcast
import datetime as dt
from IPython.display import Markdown

from featureevaluator import FeatureEvaluation
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression, LassoCV, TheilSenRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, VotingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.inspection import permutation_importance
from sklearn.decomposition import PCA
from scipy.spatial import ConvexHull
from sklearn.exceptions import ConvergenceWarning
import warnings

import openai
from dotenv import load_dotenv
import os
from myst_nb import glue

import bs4
from langchain import hub
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder
from langchain_chroma import Chroma
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_core.messages import AIMessage, HumanMessage



load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

datax = collect_survey_data()
codes = pd.read_csv('data/end_process/codes.csv').set_index('code')

# from use_cases example
ooi = ['G10',  'G30', 'G31', 'G33', 'G34', 'G35', 'G8', 'G7', 'G6', 'G5', 'G4', 'G37', 'G2', 'G27', 'G25', 'G26', 'G11']
# more refined search
tobo_snacks = ['G27', 'G30', 'G35']
# unidentified, plastic, different uses
# udi = ['Gfrags', 'Gfoams']
# industrial
indus = ['G89', 'G67', 'G112', 'G93' , 'G66','G74', 'G72', 'G87', 'G65', 'G69', 'G68', 'G43', 'G41', 'G38', 'G36', 'G19', 'G17', 'Gfrags']

# features
land_covers = ['buildings', 'forest', 'undefined', 'public-services', 'recreation', 'streets']

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
def evaluate_feature_importance(best_model, model_name, X_test, y_test, X_train, y_train):

    # the permuation importance of the variables
    if model_name in ['Random Forest Regression', 'Linear Regression']:
        perm_importance = permutation_importance(best_model, X_test, y_test, n_repeats=30, random_state=42)
        perm_importance_df = pd.DataFrame({
            'Feature': X_test.columns,
            'Importance': perm_importance.importances_mean
            }).sort_values(by='Importance', ascending=False)

    try:
    # model feature importance
        feature_importances_rf = best_model.feature_importances_
        feature_importance_df = pd.DataFrame({
            'Feature': X_test.columns,
            'Importance': feature_importances_rf
        }).sort_values(by='Importance', ascending=False)
        return feature_importance_df, perm_importance_df
    except AttributeError:
    # if feature importance not avaialable try the coefficients
        try:
            params = best_model.coef_
            feature_importances_rf = params
            feature_importance_df = pd.DataFrame({'feature':X_test.columns, 'Coeficient':feature_importances_rf})
            return feature_importance_df, perm_importance_df
        except AttributeError:
            #return an empty DataFrame
            return pd.DataFrame(), perm_importance_df

def find_elbow_point(sse):
    n_points = len(sse)
    all_coords = np.vstack((range(n_points), sse)).T
    first_point = all_coords[0]
    last_point = all_coords[-1]

    line_vec = last_point - first_point
    line_vec_norm = line_vec / np.sqrt(np.sum(line_vec**2))

    vec_from_first = all_coords - first_point
    scalar_product = np.sum(vec_from_first * line_vec_norm, axis=1)
    vec_from_first_parallel = np.outer(scalar_product, line_vec_norm)
    vec_to_line = vec_from_first - vec_from_first_parallel

    dist_to_line = np.sqrt(np.sum(vec_to_line**2, axis=1))
    elbow_point = np.argmax(dist_to_line)
    
    return elbow_point + 1

def filter_features(data, threshold: float = 0.2, terms: [] = None ):

    filtered_columns = [col for col in terms if (data[col] > 0).mean() >= threshold]
    return data[['pcs/m', 'canton', 'use', *filtered_columns]], filtered_columns
    

def determine_optimal_clusters(d):

  
    sse = []
    k_range = range(1, 11)
    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(d)
        sse.append(kmeans.inertia_)
    
    optimal_k = find_elbow_point(sse)
    return optimal_k, sse

def kmeans_clustering(n_clusters, w_interactions: bool = False):
    
    kmeans = kmeans_plusplus(n_clusters=n_clusters, random_state=42)
    
        
    d['clusters'] = kmeans.fit_predict(d)
    some_features = [x for x in d.columns if x not in ['pcs/m','clusters', 'streets']]
    
    means = d.groupby(['clusters'])['pcs/m'].mean()
    means_unscaled = self.unscale_target(means)
    
    counts = d.groupby(['clusters'])['pcs/m'].count()
    
    cluster_summary = d.groupby('clusters').agg({x:'mean' for x in some_features}).reset_index()
    cluster_summary = self.unscale_values(cluster_summary, columns=some_features, w_interactions=w_interactions)
    cluster_summary['pcs/m'] = means_unscaled
    cluster_summary['samples'] = counts.values
    cluster_summary = cluster_summary[['samples', 'pcs/m', *cluster_summary.columns[:-2]]]
           
    return cluster_summary, kmeans, d

def unscale_target(means, ascaler):
    means = means.values
    means_shape = means.shape
    if means.ndim == 1:
        means = means.reshape(1, -1)

    means_unscaled = ascaler.inverse_transform(means)
        
    means_unscaled.reshape(means_shape)
    return means_unscaled[0]

def perform_regression_analysis(d, features: [] = None, target_var: str = 'pcs/m'):
    params = {
        "n_estimators": 100,
        "max_depth": 4,
        "min_samples_split": 5,
        "learning_rate": 0.01,
        "loss": "huber",
        "alpha": .9
       
        }
    these_models = {
        'Linear Regression': LinearRegression(),
        'Random Forest Regression': RandomForestRegressor(n_estimators=100, random_state=42),
        'Gradient Boosting Regression': GradientBoostingRegressor(**params),
        'Theil-Sen Regressor': TheilSenRegressor(random_state=42)
        }
      
    
    X = d[features]
    y = d[target_var].values
       
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    regression_results = []
    best_model = None
    best_r2 = -np.inf
    the_name = None
    
    # sklearn - linear models        
    for model_name, model in these_models.items():
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", ConvergenceWarning)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            r2 = r2_score(y_test, y_pred)
            mse = mean_squared_error(y_test, y_pred)
            regression_results.append({'Model': model_name, 'R²': r2, 'MSE': mse})
            
            if r2 > best_r2:
                best_r2 = r2
                best_model = model
                the_name = model_name
    # bagging
    bag_estimator = these_models[the_name]
    bag = BaggingRegressor(estimator=bag_estimator)
    bag.fit(X_train, y_train)
    y_pred = bag.predict(X_test)
    predictions = {
        the_name: best_model.predict(X_test),
        'Bagging': y_pred
    }

    regression_results.append({'Model': f'Bagging:{the_name}', 'R²': bag.score(X_test, y_test), 'MSE':mean_squared_error(y_test, y_pred)})
    # voting

    lnr = these_models['Linear Regression']
    rf = these_models['Random Forest Regression']
    gbr = these_models['Gradient Boosting Regression']
    voting = VotingRegressor([('lnr', lnr), ('rf', rf), ('gbr', gbr)])
    voting.fit(X_train, y_train)
    y_pred = voting.predict(X_test)
    predictions.update({'voting': y_pred})
    
    regression_results.append({'Model': 'Voting', 'R²': voting.score(X_test, y_test), 'MSE':mean_squared_error(y_test, y_pred)})    
    
    return regression_results, best_model, the_name, predictions, X_test, y_test, X_train, y_train

def create_interaction_terms(data, interaction_terms=None, target='pcs/m'):
    if interaction_terms is None:
        interaction_terms = ['streets', 'public-services', 'recreation']
    
    
    d_cols = [x for x in data.columns if x not in [target, 'use']]
    interaction_data = {}
    interaction_columns = []
    # print(interaction_columns)
    for col in d_cols:
        if col not in interaction_terms:
            feature_value = data[col].values
            interaction_name = f'{col}'
            for term in interaction_terms:
                feature_value += data[col].values * data[term].values
                interaction_name += f'_inter_{term}'
                
            interaction_data[interaction_name] = feature_value
            interaction_columns.append(interaction_name)
    
    interaction_data = pd.DataFrame(interaction_data)
    interaction_data[target] = data[target]
    interaction_data['use'] = data['use']
    return interaction_data, interaction_columns



def clusters_by_use_case(cluster_data, use: str = 'pro', scaled_cols: [] = None, columns_to_cluster: [] = None, interaction_terms: bool = False):

    if interaction_terms:
        print(scaled_cols)
        cluster_p = cluster_data[cluster_data.use == use].copy()
        nclusters = determine_optimal_clusters(cluster_p[columns_to_cluster])
        kmeans = KMeans(n_clusters=nclusters[0], random_state=42).fit(cluster_p[columns_to_cluster])
        cluster_p['cluster'] = kmeans.labels_
        scaler = int_minmax.fit(cluster_p[scaled_cols])
        cluster_p.loc[:, scaled_cols] = scaler.transform(cluster_p[scaled_cols])
        cluster_p['pcs/m'] = interaction_target.inverse_transform(cluster_p['pcs/m'].values.reshape(-1,1))
        df = cluster_p.drop_duplicates('cluster').sort_values('cluster').set_index('cluster', drop=True)
        pcs_m = cluster_p.groupby(['use', 'cluster'], as_index=False).agg({'pcs/m': 'mean'}).set_index('cluster', drop=True)
        samps = cluster_p.groupby(['use', 'cluster'], as_index=False).agg({'pcs/m': 'count'}).rename(columns={'pcs/m':'nsamples'}).set_index('cluster', drop=True)
        pcs_m['nsamps'] =samps.nsamples.values
        df = pcs_m.merge(df[columns_to_cluster], left_index=True, right_index=True)
    else:    

        cluster_p = cluster_data[cluster_data.use == use].copy()
        nclusters = determine_optimal_clusters(cluster_p[columns_to_cluster])
        kmeans = KMeans(n_clusters=nclusters[0], random_state=42).fit(cluster_p[columns_to_cluster])
        cluster_p['cluster'] = kmeans.labels_
        cluster_p[scaled_cols] = feature_scaler.inverse_transform(cluster_p[scaled_cols])
        cluster_p['pcs/m'] = target_scaler.inverse_transform(cluster_p['pcs/m'].values.reshape(-1,1))
        cluster_p['streets'] = street_scaler.inverse_transform(cluster_p['streets'].values.reshape(-1,1))
        scaler = MinMaxScaler().fit(cluster_p['streets'].values.reshape(-1,1))
        cluster_p['streets'] = scaler.transform(cluster_p['streets'].values.reshape(-1,1))
        df = cluster_p.drop_duplicates('cluster').sort_values('cluster').set_index('cluster', drop=True)
        pcs_m = cluster_p.groupby(['use', 'cluster'], as_index=False).agg({'pcs/m': 'mean'}).set_index('cluster', drop=True)
        samps = cluster_p.groupby(['use', 'cluster'], as_index=False).agg({'pcs/m': 'count'}).rename(columns={'pcs/m':'nsamples'}).set_index('cluster', drop=True)
        pcs_m['nsamps'] =samps.nsamples.values
        df = pcs_m.merge(df[columns_to_cluster], left_index=True, right_index=True)

    return cluster_p, df



def append_to_markdown(filename, content):
    with open(filename, 'a') as f:
        f.write(content)

import openai

def generate_narrative_from_table(table):
    openai.api_key = os.getenv('OPENAI_API_KEY')
    
    example_narrative = """
    Cluster analysis identified three distinct clusters with varying levels of litter density (pcs/m) and distinct land use characteristics. 
    Cluster 0, which includes 7 samples, has an average pcs/m of 0.235 and is primarily associated with high forest coverage (0.553) and lower building presence (0.162). 
    In contrast, Cluster 1, containing the majority of the samples (70), has a higher average pcs/m of 0.377. This cluster is characterized by a balance between undefined areas (0.536) and buildings (0.248). 
    Cluster 2, with 21 samples, exhibits the highest litter density at 1.405 pcs/m. This cluster is dominated by significant building coverage (0.586) and lower presence of undefined areas (0.119). 
    The public services feature remains low across all clusters, highlighting its minimal impact on litter density in this analysis.
    """
    
    prompt = (
        f"Generate a narrative summary for based on the following table. Use the example narrative as a style guide:\n\n"
        f"{table}\n\n"
        f"Example Narrative:\n{example_narrative}\n\n"
    )
    
    response = openai.Completion.create(
        engine='text-davinci-003',
        prompt=prompt,
        max_tokens=1000
    )
    return response.choices[0].text.strip()

def generate_narrative_from_table(client, messages, model):
   
    
    response = client.chat.completions.create(
        model=model,
        
        messages=messages,
        max_tokens=1100
    )
     
    return response.choices

In [54]:
o_dates = {'start':'2020-01-01', 'end':'2021-12-31'}
prior_dates = {'start':'2015-11-15', 'end':'2019-12-31'}

# 'Neuchâtel', 'Zürich', 


canton = 'Bern'
this_feature_type = 'l'

d = datax.reset_index(drop=True)


# make complete report
params_l = {'canton':canton, 'date_range':o_dates, 'feature_type': this_feature_type}
params_p = {'canton':canton, 'date_range':prior_dates, 'feature_type':this_feature_type}

# set the parameters for the weighted prior
# exclude records in the likelihood, set date range and feature type
# make the land-use-inventory, exclude any likelihood values
lu_catalogue = d[(d.canton != canton)&(d['date'] <= o_dates['end'])&(d.feature_type == 'l')].copy()
catalog_surveys, catalog_features = make_report_objects(lu_catalogue)
prior_feature = catalog_features.df_cat
prior_feature['feature_type'] = 'l'


# the prior and likelihood data from the region of interest
all_data_of_interest = d[(d['date'] >= prior_dates['start']) & (d['date'] <= o_dates['end'])&(d.feature_type == 'l')].copy()
all_data_of_interest = all_data_of_interest[all_data_of_interest.canton == 'Bern'].copy()

# create a variable for different code group totals
all_data_of_interest = all_data_of_interest[all_data_of_interest.code.isin([*indus, *tobo_snacks])].copy()

all_data_of_interest_i = all_data_of_interest[all_data_of_interest.code.isin(indus)].copy()
all_data_of_interest_i['use'] = 'pro'

all_data_of_interest_p = all_data_of_interest[all_data_of_interest.code.isin(tobo_snacks)].copy()
all_data_of_interest_p['use'] = 'pers'

# all_data_of_interest_o = all_data_of_interest[~all_data_of_interest.code.isin([*tobo_snacks, *indus])].copy()
# print(all_data_of_interest_o.quantity.sum())
# all_data_of_interest_o['use'] = 'other'

all_data_of_interest = pd.concat([all_data_of_interest_i, all_data_of_interest_p])

all_data_of_interest.reset_index(inplace=True, drop=True)

land_covers = ['buildings', 'forest', 'undefined', 'public-services', 'streets', 'orchards', 'use', 'canton', 'city', 'feature_name']

all_report, all_land_use = make_report_objects(all_data_of_interest, info_columns = ['use', 'canton', 'city', 'feature_name'])


args = {
    'likelihood': {'canton':canton, 'date_range':o_dates},
    'prior' : {'canton':canton, 'date_range':prior_dates},
    'data' : all_data_of_interest.copy(),
    'land-use-inventory' : prior_feature.copy()
}


combined_results = reports_and_forecast(args['likelihood'], args['prior'], ldata=args['data'])
standard_combined = make_standard_report(combined_results, args)


lake_report = combined_results['this_report']
lake_prior_report = combined_results['prior_report']
lake_land_use = combined_results['this_land_use']

scaled_cols = ['public-services', 'buildings', 'forest', 'undefined', 'vineyards', 'orchards', 'streets', 'recreation']

d = all_land_use.df_cont.copy()

cluster_d, filtered_columns = filter_features(d.copy(), terms=scaled_cols)


target_scaler = StandardScaler()
feature_scaler = StandardScaler()
street_scaler = StandardScaler()
interaction_target = StandardScaler()
interaction_scaler = StandardScaler()
int_minmax = MinMaxScaler()

cluster_i, i_columns = create_interaction_terms(cluster_d[['pcs/m', 'use', *filtered_columns]].copy())
cluster_i[i_columns] = interaction_scaler.fit_transform(cluster_i[i_columns])
cluster_i['pcs/m'] = interaction_target.fit_transform(cluster_i['pcs/m'].values.reshape(-1,1))

# cluster_d['pcs/m'] = target_scaler.fit_transform(cluster_d[['pcs/m']])
cluster_d['pcs/m'] = target_scaler.fit_transform(cluster_d[['pcs/m']])
cluster_d['streets'] = street_scaler.fit_transform(cluster_d[['streets']])
# these_cols = [x for x in filtered_columns if x != 'streets']
cluster_d[filtered_columns] = feature_scaler.fit_transform(cluster_d[filtered_columns])

In [55]:
system_prompt1 = (
    "You are a research assistant assigned the task of summarizing"
    "table data for the results section of a manuscript. You are to"
    "Transcribe the values from tables and put them in paragraph form."
    "Being carefull that each value in the table is accounted for in the"
    "paragraph. You are to do this in a narrative form. answeres must be concise"   
    "\n\n"
    "{context}"
)

In [56]:
lake_profile = all_land_use.n_samples_per_feature()/all_report.number_of_samples

lake_rates = all_land_use.rate_per_feature()
lake_rate = "### Objects per meter of shoreline by magnitude of feature\n" + lake_rates.to_markdown() + '\n\n\n' + "### Proportion of samples by magnitude of feature\n" + lake_profile.to_markdown()

indus_code_defs = codes.loc[indus, 'en']
pro_codes = (', ').join(indus_code_defs.values)

rec_code_defs = codes.loc[tobo_snacks, 'en']
rec_codes = (', ').join(rec_code_defs.values)


di = all_report.sample_results(info_columns=['use'])
di = di.groupby(['use', 'sample_id']).agg(session_config.unit_agg)
di = di.groupby(['use'])['pcs/m'].describe()
di.rename(columns={'count': 'nsamples'}, inplace=True)


# summary of sample results
lake_combined_summary = all_report.sampling_results_summary.T

# summary by use
prof_summary = di.loc[['pro']].copy()
pers_summary = di.loc[['pers']].copy()
prof_summary


Unnamed: 0_level_0,nsamples,mean,std,min,25%,50%,75%,max
use,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
pro,98.0,1.028776,1.124252,0.0,0.1925,0.575,1.65,5.32


## Tasks:

### Summarize the survey results

The survey report results are defined by <report_objects> they can be made with the following commands:

```python

# define the features variables
land_covers = ['buildings', 'forest', 'undefined', 'public-services', 'streets', 'orchards', 'use', 'canton', 'city', 'feature_name']

# make the report objects
all_report, all_land_use = make_report_objects(all_data_of_interest, info_columns = ['use', 'canton', 'city', 'feature_name'])

# make reports and forecasta
args = {
    'likelihood': {'canton':canton, 'date_range':o_dates},
    'prior' : {'canton':canton, 'date_range':prior_dates},
    'data' : all_data_of_interest.copy(),
    'land-use-inventory' : prior_feature.copy()
}


combined_results = reports_and_forecast(args['likelihood'], args['prior'], ldata=args['data'])
standard_combined = make_standard_report(combined_results, args)
```

Frist we use the llm to to summarize these tables into a narrative paragraph. Ensuring to include all values. In the second phase the llm summarizes each paragraph by indentifying siginifcant values. 

In [57]:
prof_summary

Unnamed: 0_level_0,nsamples,mean,std,min,25%,50%,75%,max
use,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
pro,98.0,1.028776,1.124252,0.0,0.1925,0.575,1.65,5.32


In [58]:
pers_summary

Unnamed: 0_level_0,nsamples,mean,std,min,25%,50%,75%,max
use,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
pers,98.0,0.587449,0.836707,0.0,0.12,0.25,0.675,4.04


In [59]:
lake_combined_summary

Unnamed: 0,total,nsamples,average,5th,25th,50th,75th,95th,std,max,start,end
result,5678,98,1.616224,0.1085,0.3525,0.98,2.5625,4.764,1.647057,7.75,2017-04-16,2021-04-08


### Summarize the land-use tables

In [60]:
lake_profile

Unnamed: 0,buildings,wetlands,forest,public-services,recreation,undefined,streets,vineyards,orchards
2,1.040816,0.0,1.265306,0.0,0.0,0.142857,0.0,0.0,0.0
1,0.489796,2.0,0.591837,2.0,2.0,0.530612,0.0,2.0,2.0
3,0.326531,0.0,0.142857,0.0,0.0,1.326531,0.0,0.0,0.0
4,0.102041,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.040816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [61]:
lake_rates

Unnamed: 0,buildings,wetlands,forest,public-services,recreation,undefined,streets,vineyards,orchards
1,0.774792,0.808112,1.004483,0.808112,0.808112,1.041154,0,0.808112,0.808112
2,0.620588,0.0,0.770323,0.0,0.0,1.532857,0,0.0,0.0
3,1.620937,0.0,0.329286,0.0,0.0,0.636846,0,0.0,0.0
4,0.439,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
5,0.41,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0


In [62]:
dit = di.to_markdown()

In [63]:
example_narrative = """
    Cluster analysis identified three distinct clusters with varying levels of litter density (pcs/m) and distinct land use characteristics. 
    Cluster 0, which includes 7 samples, has an average pcs/m of 0.235 and is primarily associated with high forest coverage (0.553) and lower building presence (0.162). 
    In contrast, Cluster 1, containing the majority of the samples (70), has a higher average pcs/m of 0.377. This cluster is characterized by a balance between undefined areas (0.536) and buildings (0.248). 
    Cluster 2, with 21 samples, exhibits the highest litter density at 1.405 pcs/m. This cluster is dominated by significant building coverage (0.586) and lower presence of undefined areas (0.119). 
    The public services feature remains low across all clusters, highlighting its minimal impact on litter density in this analysis.
    """

### Summarize the cluster analysis

In [67]:
cluster_pro, summary_pro = clusters_by_use_case(cluster_d, use='pro',scaled_cols=filtered_columns, columns_to_cluster=filtered_columns)
cluster_rec, summary_rec = clusters_by_use_case(cluster_d, use='pers', scaled_cols=filtered_columns, columns_to_cluster=filtered_columns)

In [68]:
summary_pro['personal'] = summary_rec['pcs/m']
summary_pro.rename(columns={'pcs/m':'professional'}, inplace=True)
summary_pro.drop(['use', 'nsamps'], inplace=True, axis=1)
cols = [x for x in summary_pro.columns if x not in ['professional','personal']]
cluster_features = summary_pro[cols].copy()
cluster_results = summary_pro[['professional', 'personal']].copy()

cf = cluster_features.to_markdown()
cr = cluster_results.to_markdown()

In [69]:
cluster_features

Unnamed: 0_level_0,public-services,buildings,forest,undefined,vineyards,streets,recreation
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.038,0.167,0.559,0.133,0.0,0.096574,0.002
1,0.003,0.3,0.213,0.486,0.0,0.062862,0.0
2,0.061,0.309,0.107,0.541,0.02,0.49708,0.025
3,0.044,0.682,0.153,0.149,0.015,0.251846,0.005
4,0.003,0.047,0.271,0.55,0.0,0.0,0.155
5,0.188,0.14,0.308,0.297,0.192,0.273278,0.014


In [70]:
cluster_results

Unnamed: 0_level_0,professional,personal
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.422857,0.235714
1,0.1225,0.11625
2,1.864483,0.281034
3,1.125217,1.392174
4,0.540833,0.695833
5,2.826667,1.536667


### Summarize the regression results for each use group


In [50]:
regression_results, best_model, the_name, predictions, X_test, y_test, X_train, y_train = perform_regression_analysis(cluster_rec.copy(), features=filtered_columns)

#### Personal

In [52]:
regression_results = pd.DataFrame(regression_results)

Unnamed: 0,Model,R²,MSE
0,Linear Regression,0.572063,0.446874
1,Random Forest Regression,0.63406,0.382133
2,Gradient Boosting Regression,0.404186,0.622179
3,Theil-Sen Regressor,0.275514,0.756545
4,Bagging:Random Forest Regression,0.660807,0.354203
5,Voting,0.558787,0.460737


#### Professional

In [None]:
regression_results, best_model, the_name, predictions, X_test, y_test, X_train, y_train = perform_regression_analysis(cluster_pro.copy(), features=filtered_columns)

In [52]:
regression_results = pd.DataFrame(regression_results)

Unnamed: 0,Model,R²,MSE
0,Linear Regression,0.572063,0.446874
1,Random Forest Regression,0.63406,0.382133
2,Gradient Boosting Regression,0.404186,0.622179
3,Theil-Sen Regressor,0.275514,0.756545
4,Bagging:Random Forest Regression,0.660807,0.354203
5,Voting,0.558787,0.460737


#### Summarize the model feature importance

In [None]:
feature_importance_df, perm_importance_df = evaluate_feature_importance(best_model, model_name, X_test, y_test, X_train, y_train)
feature_importance_df, perm_importance_df = evaluate_feature_importance(best_model, model_name, X_test, y_test, X_train, y_train)

In [8]:
report_file_name = 'results_file.md'
with open(report_file_name, "w") as file:
    file.write("# Response:\n\n")

append_to_markdown(report_file_name, "\n\n" + "hello" + "\n\n" + dit)

In [9]:
context_r = f"""

## Litter surveys in Switzerland 2020-2021 - IQAASL

Identification, quantification and analysis of anthropogenic Swiss litter (IQAASL) is a project commissioned by the Swiss 
Federal Office for the Environment to collect data concerning visible pollutants along Swiss lakes and rivers. All 
discarded materials were collected and identified using litter survey techniques. in total there were 406 samples from 163 
locations in 95 municipalities.

This report is a summary and analysis of the litter surveys conducted and the methods employed in Switzerland from March
2020 through August 2021. This sampling phase overlaps with the Swiss Litter Report (SLR) survey period, which ran
from April 2017 to March 2018. The SLR was the first project on a national level to use the standard protocol described in 
the Guide to monitoring beach litter or any other comparable method. This overlap allows the results of the 
present study to be compared with those of the SLR.

## Lakes and rivers

The lakes and rivers were sampled from 2020-03 through 2021-05, a total of 54,744 objects were removed and classified over 
the course of 386 surveys. The survey locations were divided into survey areas for regional analysis and defined by the Aare, 
Rhône, Ticino and Linth/Limmat rivers. Surveys were conducted at 143 different locations, representing 77 municipalities. 
The total linear distance surveyed was 20 km with a surface area of 9 hectares and a total municipal population of 1.7 million.

Most surveys were along lake shorelines (331 samples) as lakes offer more consistent and safe year-round access with 
respect to rivers. Additionally, lakes are large areas of reduced flow that receive input from multiple rivers, streams 
and drainage systems providing ideal locations to assess the variety of objects in and around the water bodies.

In total 316 samples came from seven principal lakes in 3 major river basins. Twenty locations were selected to sample 
monthly for a twelve-month period with the exception of Lago Maggiore, which was sampled every three months. 
Surveys were also conducted on Lago di Lugano, Lac des Quatre cantons, Brienzersee and Zugersee. In addition, there 
were 55 surveys on 16 rivers.

### The sampling locations - type and description

The land use is reported as the percent of total area attributed to each land use category within a 1500m radius of the 
survey location. The ratio of the number of samples completed at the different land use profiles is an indicator of the 
environmental and economic conditions around the survey locations.

The land use around the survey locations had a higher attribution to buildings as opposed to agriculture and woods. For 
example, half of all the surveys had at least 37% of land use devoted to buildings as opposed to 19% for agriculture or 
13% to woods. Land use devoted to recreation was at least 6% for half of all samples.

The length of the road network within the buffer zone differentiates between locations that have other wise similar land 
use characteristics. The length of road per buffer ranges from 13km to 212km, 50% of the surveys had less than 67km of road network.

The number of intersections ranges from zero to 23, 50% of the surveys had 3 or fewer intersections within 1500m of the 
survey location. The size of the intersecting river or canal was not taken into consideration. Survey locations on rivers 
have zero intersections.

The population (not shown) is taken from statpop 2018 and represents the population of the municipality surrounding the 
survey location. The smallest population was 442 and the maximum was 415,367, 50% of the surveys come from 
municipalities with a population of at least 12,812.

Overall, surveys at locations with more buildings and more recreation sites were more likely to facilitate the accumulation 
of trash on the shoreline. When the most common objects are considered, only four of the twelve were found at higher rates 
in the presence of more buildings. All of those objects are likely related to food or tobacco consumption near the location. 
Suggesting that there are still gains to be made in prevention and attenuation efforts in areas of high traffic near the water.

However, six of the twelve objects have no positive association to land use attributed to buildings but were found in at 
least 50% of all the surveys. These objects are generally associated with professional use or in the case of cotton swabs 
personal hygiene:

* plastic construction waste
* fragmented plastics
* industrial sheeting
* expanded polystyrene
* cotton bud/swabs
* insulation, includes spray foams

Furthermore, compared to products related to tobacco or food consumption these objects have fewer positive associations in 
general. Indicating that the appropriate land use feature is not currently accounted for and/or these objects are found 
at similar quantities indifferent of the land use features. Suggesting that these objects are ubiquitous in the environment.

Finally, two of the twelve most common objects were found in less than 50% of the surveys and have few positive associations:

* industrial pellets
* expanded foams < 5mm

These objects are found in large quantities sporadically at specific locations. They have been found in all survey areas 
and in all lakes. Industrial pellets have a very specific use and client base making it possible to find partners based 
on the density of the pellets found and the location of the nearest consumer or producer of pellets, see Shared responsibility.

### Median survey total

The results are in units of pieces of litter per 100 meters (p/100m). The median survey result of all data was approximately
189 p/100m. The maximum recorded value was 6,617 p/100m (Rhône survey area) and the minimum recorded was 2p/100m (Aare survey area).
The Rhône survey area had the highest median survey total of 442p/100m, this can in part be explained by the high number
of urban survey locations with respect to the other survey areas and the deposition of fragmented plastics and foamed 
plastics at the Rhône River out flow in the upper lake region.

A reference value was calculated excluding the results from samples that were less than 10m and objects less than 2.5cm. 
This method, described in EU Marine Beach Litter Baselines was used to calculate the reference and threshold 
values for all European beaches in 2015 and 2016 resulting in a median value of 131 p/100m. The results from the European 
baseline value lie outside the 95% confidence interval (CI) of 147 - 213p/100m established using the data from IQAASL.

Surveys in Switzerland were on average, smaller scale than in marine environments and in locations that would be 
considered urban under most circumstances. To date monitoring of lakes and rivers upstream of coastal regions has 
not generalized on the European continent. However, there is a concerted effort by a group of associations in 
Switzerland and France to establish a common monitoring and data exchange protocol for the Rhône basin. Additionally, 
the Wageningen University & Research has begun analyzing data collected in the Meusse - Rhine delta using 
protocols like those in IQAASL.

### The most common objects

The most common objects are defined as those objects identified in at least 50% of all surveys and/or are among the ten 
most abundant by quantity. As a group the most common objects represent 68% of all objects identified in the sampling period. 
Of the most common items 27% are food, drink and tobacco related and 24% are infrastructure and agriculture related.

Objects related to food, drink and tobacco are identified at higher rates at survey locations with a greater percentage 
of land attributed to buildings or fixed infrastructure, the inverse is true of the locations with a higher percentage 
of land attributed to woods or agriculture. However, infrastructure material and fragmented plastics, are found at similar 
rates throughout all survey areas indifferent of land use surrounding the survey locations.

The most common objects identified in the surveys were:

* cigarette ends: total 8'485, % of all objects 15.5%, fail-rate 87%, p/100m 20
* fragmented plastics: total 7'400, 13% of all objects, fail-rate 86%, p/100m 18
* expanded polystyrene: total 5'563, 10% of all objects, fail-rate 68%, p/100m ,
* snack wrappers: total 3'325, 6% of all objects, fail-rate 85%, p/100m 9
* industrial sheeting: total 2'534, 4% of all objects, fail-rate 69%, p/100m 5
* glass drink bottles, pieces: total 2'136, 3% of all objects, fail-rate 65%, p/100m 3
* industrial pellets: total 1'968, 3% of all objects, fail-rate 30%, p/100m 4
* insulation, includes spray foams: total 1'702, 3% of all objects, fail-rate 53%, p/100m 1
* cotton bud/swabs: total 1'406, 2% of all objects, fail-rate 50%, p/100m 1
* expanded foams < 5mm: total 1'209, 2% of all objects, fail-rate 25%, p/100m 0
* plastic construction waste: total 992, 1% of all objects, fail-rate 52%, p/100m 1
* metal bottle caps: total 700, 1% of all objects, fail-rate 52%, p/100m 1



Industrial pellets and expanded foams < 5mm both occurred in significant quantities but identified in less than 50% of 
the surveys (median of 0), indicating high counts at specific locations. While both are micro plastics, their use, 
origin and rate of occurrence are different depending on the survey area region. Industrial pellets are raw materials 
used in injection molding processes whereas foamed plastic beads are the result of fragmentation of expanded polystyrene.

### Conclusions

At the national level, the IQAASL results are stable compared to the surveys that were carried out in 2017 as part of the 
SLR study. However, there was a general decrease in the quantity of food, drink and tobacco objects. Infrastructure 
objects and fragmented plastics and foams did not decline and some locations may have experienced sharp increases. 
Pandemic restrictions limiting large outdoor gatherings may have had a beneficial effect on the reduction of food, drink 
and tobacco items. The greatest increases in infrastructure related objects were in Valais, Vaud and Brienz, which are 
locations near the Rhône and Aare rivers discharge points.

The land use around a survey location has a measurable effect on the deposition of certain objects. The more buildings 
and fixed infrastructure there are the more tobacco and food products are found. Objects like fragmented plastics and 
industrial sheeting do not have the same association and are identified at approximately equal rates indifferent of the 
land use with increases near river/canal discharge points.

Currently three of the four survey areas in the IQAASL are actively monitored by research and governmental agencies 
downstream of Switzerland using similar methods presented in this report. Additionally, regional associations in 
Switzerland are actively pursuing a standardization of reporting and protocols with partner organizations in the EU.

The IQAASL is a citizen-science project that only uses open-source tools and shares data on GNU public license, 
enabling collaboration with stakeholders. At the end of the mandate, December 31, 2021, Hammerdirt will assume the 
responsibility of maintaining the code and data repository which is hosted publicly on Github.

The associations that participated in the IQAASL are actively seeking ways to incorporate the data collection process 
and/or the results into their own business model. However, there is a shortage of data scientists within many regional 
associations which may lengthen the process of integration and stifle the rate of innovation at the level where it is needed most.
"""

glue('context-r', context_r, display=False)

## Context: Litter surveys in Switzerland 2020-2021 - IQAASL

We provide one document for the context here. It is indexed with 

Identification, quantification and analysis of anthropogenic Swiss litter (IQAASL) is a project commissioned by the Swiss 
Federal Office for the Environment to collect data concerning visible pollutants along Swiss lakes and rivers. All 
discarded materials were collected and identified using litter survey techniques. in total there were 406 samples from 163 
locations in 95 municipalities.

This report is a summary and analysis of the litter surveys conducted and the methods employed in Switzerland from March
2020 through August 2021. This sampling phase overlaps with the Swiss Litter Report (SLR) survey period, which ran
from April 2017 to March 2018. The SLR was the first project on a national level to use the standard protocol described in 
the Guide to monitoring beach litter or any other comparable method. This overlap allows the results of the 
present study to be compared with those of the SLR.

:::{dropdown} See the rest of the context document

```{glue:md} contex-r
:format: myst
```

## System prompts

This is a retireval augmented generation application (RAG) for this analysis there is only one document. Their is a system prompt and a prompt to consider the question in the relation to recent chat history.

````{dropdown} System and chat prompts

### Basic langchain recipe

```python
system_prompt = (
    "You are a research assistant assigned the results section of a manuscript. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use paragraphs please"
    "answer concise."
    "\n\n"
    "{context}"
)
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(client, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)


contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    client, retriever, contextualize_q_prompt
)


qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)


question_answer_chain = create_stuff_documents_chain(client, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

```

````

In [10]:
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')


client = ChatOpenAI(model="gpt-3.5-turbo-0125")
headers_to_split = [('##', 'Lakes and rivers'), ('##', 'Land use profile lakes and rivers'),  ('##', 'Median survey total'), ('##', 'The most common objects')]
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split
)
md_header_splits = markdown_splitter.split_text(context_r)
vectorstore = Chroma.from_documents(documents=md_header_splits, embedding=OpenAIEmbeddings())
retriever = vectorstore.as_retriever()


with open(report_file_name, "w") as file:
    file.write(context_r + "\n\n" + "Response:\n\n")


system_prompt1 = (
    "You are a research assistant assigned the task of summarizing"
    "table data for the results section of a manuscript. You are to"
    "Transcribe the values from tables and put them in paragraph form."
    "Being carefull that each value in the table is accounted for in the"
    "paragraph. You are to do this in a narrative form. answeres must be concise"   
    "\n\n"
    "{context}"
)



system_prompt = (
    "You are a research assistant assigned the task of creating the"
    "results section of the next manuscript."
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use paragraphs please"
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt1),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(client, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    client, retriever, contextualize_q_prompt
)


qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)


question_answer_chain = create_stuff_documents_chain(client, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

## Questions

### What is IQAASL ? What was the median survey total in IQAASL?  What were the most common objects ?

answer:

In [11]:
chat_history = []

question = "What is IQAASL ? What was the median survey total in IQAASL?  What were the most common objects ?"
# context = get_context_for_question(question)  # Retrieve context
ai_msg_1 = rag_chain.invoke({"input": question, "chat_history": chat_history})
append_to_markdown(report_file_name,ai_msg_1["answer"] + "\n\n")
chat_history.extend(
    [
        HumanMessage(content=question),
        AIMessage(content=ai_msg_1["answer"]),
    ]
)

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


In [12]:
Markdown(ai_msg_1["answer"])

IQAASL stands for Identification, quantification and analysis of anthropogenic Swiss litter. The median survey total in IQAASL was approximately 189 pieces of litter per 100 meters. The most common objects identified in the surveys were cigarette ends, fragmented plastics, expanded polystyrene, snack wrappers, industrial sheeting, glass drink bottles, industrial pellets, insulation (including spray foams), cotton bud/swabs, expanded foams < 5mm, plastic construction waste, and metal bottle caps.

### What types of locations were sampled ? What was the land-use of the locations surveyed ? Was it mostly buildings or forest?

#### Answer:

In [13]:
second_question = "What types of locations were sampled ? What was the land-use of the locations surveyed ? Was it mostly buildings or forest?"
# context = get_context_for_question(second_question)  # Retrieve context for the second question
ai_msg_2 = rag_chain.invoke({"input": second_question, "chat_history": chat_history})
append_to_markdown(report_file_name, ai_msg_1["answer"] + "\n\n")
chat_history.extend(
    [
        HumanMessage(content=question),
        AIMessage(content=ai_msg_2["answer"]),
    ]
)


Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


In [14]:
Markdown(ai_msg_2["answer"])

The types of locations sampled included lakes, rivers, and their shorelines. The land use of the locations surveyed had a higher attribution to buildings compared to agriculture and woods. Half of all the surveys had at least 37% of land use devoted to buildings, while only 19% was attributed to agriculture and 13% to woods. Land use devoted to recreation was at least 6% for half of all samples.

In [15]:
example_narrative = """
    Cluster analysis identified three distinct clusters with varying levels of litter density (pcs/m) and distinct land use characteristics. 
    Cluster 0, which includes 7 samples, has an average pcs/m of 0.235 and is primarily associated with high forest coverage (0.553) and lower building presence (0.162). 
    In contrast, Cluster 1, containing the majority of the samples (70), has a higher average pcs/m of 0.377. This cluster is characterized by a balance between undefined areas (0.536) and buildings (0.248). 
    Cluster 2, with 21 samples, exhibits the highest litter density at 1.405 pcs/m. This cluster is dominated by significant building coverage (0.586) and lower presence of undefined areas (0.119). 
    The public services feature remains low across all clusters, highlighting its minimal impact on litter density in this analysis.
    """
    
# prompt = (
#     f"Generate a narrative summary based on the following table. Use the example narrative as a style guide:\n\n"
#     f"{table}\n\n"
#     f"Example Narrative:\n{example_narrative}\n\n"
# )
observed_results = f"""
1. Objects of professional origin, objects not directly associated with consumption on location. This includes the following items:

{pro_codes}

2. Objects of personal consumption , objects that are most likely consumed on location. This includes the following items:

{rec_codes}

These are observed results of the personal and professional groups combined for the canton of Bern. The groups are defined by the members as above.

In a separate paragraph: Generate a narrative summary based on the following table.\n\n

{lake_combined_summary}

\n\n




\n\n

The following table separates the combined in to the two use groups. The units are pieces per meter of trash (pcs/m). 
Recall that pers = personal and pro = profesional please refer to them by their proper label. Note that 'count' = the number of samples, 
which is the same for both groups you do not need to mention that.

In a separate paragraph: Generate a narrative summary based on the following table.\n\n

{di}
\n\n
"""
ai_msg_3 = rag_chain.invoke({"input": observed_results , "chat_history": chat_history})
append_to_markdown(report_file_name,ai_msg_3["answer"] + "\n\n")
chat_history.extend(
    [
        HumanMessage(content=question),
        AIMessage(content=ai_msg_3["answer"])]
)


glue('prompt-3', Markdown(observed_results), display=False)

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


### Can you summarise the survey results?

````{dropdown} Prompt

```{glue:md} prompt-3
:format: myst
``` 
````
#### answer

In [16]:
Markdown(ai_msg_3['answer'])


In the canton of Bern, a total of 5678 objects were observed over 98 samples, with an average of 1.62 pieces per meter of trash. The distribution of results ranged from a minimum of 0.1085 to a maximum of 7.75 pieces per meter, with the median (50th percentile) at 0.98 pieces per meter.

When separating the results into the personal (pers) and professional (pro) use groups, both groups had the same number of samples. The mean number of pieces per meter for the personal group was 0.587, while the professional group had a mean of 1.03 pieces per meter. The standard deviation for the personal group was 0.837, and for the professional group was 1.12. The range of results for the personal group was from 0 to 4.04 pieces per meter, and for the professional group was from 0 to 5.32 pieces per meter. The 50th percentile for the personal group was 0.25 pieces per meter, and for the professional group was 0.575 pieces per meter.

In [17]:
lake_combined_summary

Unnamed: 0,result
total,5678
nsamples,98
average,1.616224
5th,0.1085
25th,0.3525
50th,0.98
75th,2.5625
95th,4.764
std,1.647057
max,7.75


In [18]:
di

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
use,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
pers,98.0,0.587449,0.836707,0.0,0.12,0.25,0.675,4.04
pro,98.0,1.028776,1.124252,0.0,0.1925,0.575,1.65,5.32


In [19]:
introduction = f"""
### Land use 

Land use refers to the measurable topographic features within a cirlce of r = 1 500 m and area = $\pi r²$ with the survey location in the middle (the buffer). 
The features, measured in meters squared, are given as a ratio <area of feature>/<area of buffer>. Thus a location with high percentage of buildings (an index of 4 or 5),
will have 60 - 100% of the land in the buffer dedicated to buildings.

The land use is further divided in to two groups: 1.cover , 2.use. Cover refers to those topographical features that do not overlap on a map. That is cover features are mutually 
exclusive, a given area of the buffer is either one or the other of the cover features but never both. The cover features are:

* Buildings, orchards, forest, undefined, vineyards

On the other hand use refers to the activities or features that are present in the buffer and overlap the cover features. For example public services can be located within buildings 
(hospitals, schools) or in a forest (parks, nature areas). These features represent activites or, the features are:

* Pubilc-services, streets, recreation

#### Streets

The streets are measured as the length of the road network in the buffer. The lengths for each location are normalized from 0 - 1. Thus in the table below, the locations 
that have the shortest road net work will be in category 1, the those with a more dense network will be higher.

#### Sampling profile

The sampling profile details the proportion of samples conducted for each land use and magnitude. For example if the column is Forest and the index is 1 and the value is .3, that means
that 30% of the samples were taken from locations that had between 0 and 20% of the buffer dedicated to forest. 

The sample results profile is the average pcs/m for each land use and magnitude. For example if For example if the column is Forest and the index is 1 and the value is .3, that means
that 30% of the samples were taken from locations that had between 0 and 20% of the buffer dedicated to forest. 

There are two tables below. The first table is the sample results profile and the second is the sampling profile.The index is the magnitude of the feature on a scale of 1-5.
In the first table the average pcs/trash per meter of the combined results is given for each land-use (columns) and magnitude (index, 1-5). 

In a separate paragraph for each table: Generate a narrative summary based on the following tables.\n\n
{lake_rate}
\n\n


"""

ai_msg_4 = rag_chain.invoke({"input": introduction, "chat_history": chat_history})
append_to_markdown(report_file_name,ai_msg_4["answer"] + "\n\n")
chat_history.extend(
    [
        HumanMessage(content=question),
        AIMessage(content=ai_msg_4["answer"]),
    ]
)

glue('prompt-4', Markdown(introduction), display=False)


Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


## land use


### What kind of locations were surveyed in the canton of Bern ? How does that compare to the national results?

````{dropdown} Prompt

```{glue:md} prompt-4
:format: myst
```
````
#### Answer

In [20]:
Markdown(ai_msg_4["answer"])

The first table provides the average number of pieces of trash per meter of shoreline based on the magnitude of the land feature. It shows that areas with buildings, wetlands, and recreation features had relatively higher amounts of trash per meter, especially in the lower magnitude categories. Forest areas also had higher trash amounts in the lower magnitude categories but decreased in higher magnitudes. Streets and vineyards had lower to no trash amounts across all magnitudes.

The second table displays the proportion of samples taken based on the magnitude of the land feature. It indicates that samples were more frequently taken from areas with buildings, forests, and public-services, especially in the lower magnitude categories. Undefined areas were also sampled relatively often. Streets, vineyards, and orchards had fewer samples taken, with no samples in the higher magnitude categories.

In [21]:
lake_rates

Unnamed: 0,buildings,wetlands,forest,public-services,recreation,undefined,streets,vineyards,orchards
1,0.774792,0.808112,1.004483,0.808112,0.808112,1.041154,0,0.808112,0.808112
2,0.620588,0.0,0.770323,0.0,0.0,1.532857,0,0.0,0.0
3,1.620937,0.0,0.329286,0.0,0.0,0.636846,0,0.0,0.0
4,0.439,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
5,0.41,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0


In [22]:
lake_profile

Unnamed: 0,buildings,wetlands,forest,public-services,recreation,undefined,streets,vineyards,orchards
2,1.040816,0.0,1.265306,0.0,0.0,0.142857,0.0,0.0,0.0
1,0.489796,2.0,0.591837,2.0,2.0,0.530612,0.0,2.0,2.0
3,0.326531,0.0,0.142857,0.0,0.0,1.326531,0.0,0.0,0.0
4,0.102041,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.040816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Cluster analysis



In [23]:
# cluster_d['pcs/m'] = target_scaler.fit_transform(cluster_d[['pcs/m']])
cluster_d['pcs/m'] = target_scaler.fit_transform(cluster_d[['pcs/m']])
cluster_d['streets'] = street_scaler.fit_transform(cluster_d[['streets']])
# these_cols = [x for x in filtered_columns if x != 'streets']
cluster_d[filtered_columns] = feature_scaler.fit_transform(cluster_d[filtered_columns])

cluster_pro, summary_pro = clusters_by_use_case(cluster_d, use='pro',scaled_cols=filtered_columns, columns_to_cluster=filtered_columns)
cluster_rec, summary_rec = clusters_by_use_case(cluster_d, use='pers', scaled_cols=filtered_columns, columns_to_cluster=filtered_columns)

In [24]:
summary_pro['pers'] = summary_rec['pcs/m']
summary_pro.rename(columns={'pcs/m':'pro'}, inplace=True)
summary_pro.drop(['use', 'nsamps'], inplace=True, axis=1)
cols = [x for x in summary_pro.columns if x not in ['pro','pers']]
cluster_features = summary_pro[cols].copy()
cluster_results = summary_pro[['pro', 'pers']].copy()

cf = cluster_features.to_markdown()
cr = cluster_results.to_markdown()

In [25]:
prompt_5 = f"""
#### Cluster Analysis

method : [kmeans scikit learn](kmeans https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html)

The following are the results of the cluster analysis. The columns are the features that were used to make the clusters. The optimal number of clusters was
determined using the elbow method (you can check the docs for this: https://hammerdirt-analyst.github.io/feb_2024/titlepage.html). 

Table has the following format:

1. the columns are the measured land use features
2. the index is the cluster number
3. the value is the proportion that is attributed to that column. For example if buildings in cluster 1 = .17 it means that the locations in that cluster had
on average 17% of the buffer attributed to buildings

can you please summarize kmeans clustering, with reference to scikit learn (provide link) and explain the elbow method in paragraph form ? 

In a separate paragraph please summarize the componsition of each cluster in paragraph form. Be specific and use the values from the table. Be sure to account for
at least 50% of each cluster in your summary.



{cf}


"""


ai_msg_5 = rag_chain.invoke({"input": prompt_5, "chat_history": chat_history})
append_to_markdown(report_file_name,ai_msg_5["answer"] + "\n\n")
chat_history.extend(
    [
        HumanMessage(content=question),
        AIMessage(content=ai_msg_5["answer"]),
    ]
)

glue('prompt-5', Markdown(prompt_5), display=False)


Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


### What are results of the cluster analysis ?

````{dropdown} Prompt

```{glue:md} prompt-5
:format: myst
```
````
#### Answer

In [26]:
Markdown(ai_msg_5["answer"])

KMeans clustering is a popular machine learning algorithm used for clustering tasks in data analysis. It is implemented in Python's scikit-learn library, and you can find more information about it [here](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html). The algorithm works by partitioning the data into a predefined number of clusters based on the features provided. The elbow method is a technique used to determine the optimal number of clusters for KMeans clustering. It involves plotting the within-cluster sum of squares (inertia) against the number of clusters and identifying the "elbow" point where the rate of decrease in inertia slows down. The number of clusters at the elbow point is considered the optimal choice.

In the cluster analysis results provided, there are six distinct clusters identified based on the measured land use features. Cluster 0 is characterized by a significant proportion of forest land use (55.9%) with moderate proportions of buildings (16.7%) and streets (9.6%). Cluster 1 shows a high proportion of undefined land use (48.6%) and forest (21.3%), with a substantial amount of buildings (30%). Cluster 2 has a dominant proportion of forest (54.1%) and buildings (30.9%), with streets (49.7%) also playing a significant role. Cluster 3 stands out with the highest proportion of buildings (68.2%) and a balanced mix of forest (15.3%) and streets (25.2%). Cluster 4 is characterized by a predominant proportion of undefined land use (55%) and forest (27.1%), with a notable presence of recreation areas (15.5%). Cluster 5 shows a diverse profile with relatively high proportions of public-services (18.8%), forest (30.8%), and streets (27.3%), along with vineyards (19.2%).

In [27]:
cluster_features

Unnamed: 0_level_0,public-services,buildings,forest,undefined,vineyards,streets,recreation
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.038,0.167,0.559,0.133,0.0,0.096574,0.002
1,0.003,0.3,0.213,0.486,0.0,0.062862,0.0
2,0.061,0.309,0.107,0.541,0.02,0.49708,0.025
3,0.044,0.682,0.153,0.149,0.015,0.251846,0.005
4,0.003,0.047,0.271,0.55,0.0,0.0,0.155
5,0.188,0.14,0.308,0.297,0.192,0.273278,0.014


In [28]:
prompt_6 = f"""
#### Cluster Analysis

The following table is the average pcs/m per cluster, where the clusters were defined in the previous table. In this case we consider
the survey results for different objects groups in the same cluster. For example if the column = 'pers' and cluster = 2 and the value is
1.23 that means that for survey locations in cluster 2 the average pcs/m for these locations is 1.23. Cluster 2 is defined in the previous table.

Table has the following format:

1. the columns are the use group
2. the index is the cluster number
3. the value is the average pcs/m found in that cluster for that use group


{cr}
\n\n

Can you please summarize the results ? Be sure to note where each use group has the highest and lowest pcs/m ? Identify where there is
signiifgant difference between personal and profesional use. Make sure to summarize the dominant components of each cluster in reference
to this table that defines the composition of each cluster:

{cf}
"""


ai_msg_6 = rag_chain.invoke({"input": prompt_6, "chat_history": chat_history})
append_to_markdown(report_file_name,ai_msg_6["answer"] + "\n\n")
chat_history.extend(
    [
        HumanMessage(content=question),
        AIMessage(content=ai_msg_6["answer"]),
    ]
)

glue('prompt-6', Markdown(prompt_6), display=False)


Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


### What are the differences between the two use groups?

````{dropdown} Prompt

```{glue:md} prompt-6
:format: myst
```
````
#### Answer

In [29]:
Markdown(ai_msg_6["answer"])

Cluster 0 has an average of 0.422 pcs/m for professional use and 0.236 pcs/m for personal use. Cluster 1 shows 0.123 pcs/m for professional use and 0.116 pcs/m for personal use. In Cluster 2, the average pcs/m is 1.864 for professional use and 0.281 for personal use. Cluster 3 has 1.125 pcs/m for professional use and 1.392 pcs/m for personal use. Cluster 4 exhibits 0.541 pcs/m for professional use and 0.696 pcs/m for personal use. Finally, Cluster 5 shows 2.827 pcs/m for professional use and 1.537 pcs/m for personal use.

In summary, Cluster 5 has the highest average pcs/m for both professional and personal use, while Cluster 1 has the lowest for both categories. The most significant difference between personal and professional use is observed in Cluster 3, with much higher pcs/m for personal use compared to professional use.

Analyzing the dominant components of each cluster based on the composition table provided:
- Cluster 0 is characterized by a substantial proportion of forest land, with moderate amounts of buildings and streets.
- Cluster 1 has a high proportion of undefined land use and forest, with a notable presence of buildings.
- Cluster 2 shows a dominant proportion of forest and buildings, with a significant amount of streets.
- Cluster 3 stands out with the highest proportion of buildings and a balanced mix of forest and streets.
- Cluster 4 has a predominant proportion of undefined land use and forest, with a notable presence of recreation areas.
- Cluster 5 displays a diverse profile with relatively high proportions of public-services, forest, streets, and vineyards.

These results indicate varying levels of litter accumulation across different land use compositions, with some clusters showing higher average pcs/m for either professional or personal use, emphasizing the importance of considering the local environment when assessing litter pollution.

In [30]:
cluster_results

Unnamed: 0_level_0,pro,pers
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.422857,0.235714
1,0.1225,0.11625
2,1.864483,0.281034
3,1.125217,1.392174
4,0.540833,0.695833
5,2.826667,1.536667
