In [1]:
import pandas as pd
import numpy as np


import matplotlib.patches as mpatches
import session_config
from session_config import  collect_survey_data, feature_variables
from reports import make_report_objects, reports_and_forecast
from reports import admin_report, features_present, histograms_standard
from reports import ecdf_plots_standard, scatter_plot_standard
from reports import labels_for_display, make_standard_report, make_report_objects
# import userdisplay
# import geospatial
import gridforecast as gfcast
# import datetime as dt
from IPython.display import Markdown

from featureevaluator import FeatureEvaluation
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression, LassoCV, TheilSenRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, VotingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.inspection import permutation_importance
# from sklearn.decomposition import PCA
# from scipy.spatial import ConvexHull
from sklearn.exceptions import ConvergenceWarning
import warnings

import openai
from dotenv import load_dotenv
import os
from myst_nb import glue

# import bs4
# from langchain import hub
# from langchain.chains import create_retrieval_chain
# from langchain.chains.combine_documents import create_stuff_documents_chain
# from langchain_openai import ChatOpenAI
# from langchain_community.document_loaders import WebBaseLoader
# from langchain_core.prompts import ChatPromptTemplate
# from langchain_openai import OpenAIEmbeddings
# from langchain_text_splitters import RecursiveCharacterTextSplitter
# from langchain.chains import create_history_aware_retriever
# from langchain_core.prompts import MessagesPlaceholder
# from langchain_chroma import Chroma
# from langchain_text_splitters import MarkdownHeaderTextSplitter
# from langchain_core.messages import AIMessage, HumanMessage
# from langchain_openai import OpenAI

# Load environment variables (python-dotenv) and read the OpenAI key from the
# environment so that no credential is hard-coded in the notebook.
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

# Survey records returned by the project helper, and the object-code lookup
# table indexed by its 'code' column.
datax = collect_survey_data()
codes = pd.read_csv('data/end_process/codes.csv').set_index('code')

# Object codes of interest, taken from the use_cases example.
ooi = ['G10',  'G30', 'G31', 'G33', 'G34', 'G35', 'G8', 'G7', 'G6', 'G5', 'G4', 'G37', 'G2', 'G27', 'G25', 'G26', 'G11']
# A narrower selection; later labelled as the 'pers' (personal) use group.
tobo_snacks = ['G27', 'G30', 'G35']
# unidentified, plastic, different uses
# udi = ['Gfrags', 'Gfoams']
# Industrial codes; later labelled as the 'pro' (professional) use group.
indus = ['G89', 'G67', 'G112', 'G93' , 'G66','G74', 'G72', 'G87', 'G65', 'G69', 'G68', 'G43', 'G41', 'G38', 'G36', 'G19', 'G17', 'Gfrags']

# Land-cover feature names. NOTE(review): this list is reassigned with a
# different set of names in a later cell.
land_covers = ['buildings', 'forest', 'undefined', 'public-services', 'recreation', 'streets']

In [2]:
def evaluate_feature_importance(best_model, model_name, X_test, y_test, X_train, y_train):
    """Report the built-in and the permutation feature importance of a fitted model.

    Args:
        best_model: a fitted estimator. ``feature_importances_`` is used when
            present, otherwise ``coef_``.
        model_name: display name of the model. Permutation importance is only
            computed for the names listed in ``permutation_models`` below.
        X_test: held-out features; must expose ``.columns``.
        y_test: held-out target values.
        X_train, y_train: accepted for interface compatibility, not used.

    Returns:
        ``(model_importance_df, perm_importance_df)``. The first frame has the
        columns ``Feature``/``Importance`` (sorted descending) when the model
        exposes ``feature_importances_``, the columns ``feature``/``Coeficient``
        when it only exposes ``coef_``, and is empty otherwise. The second frame
        is empty when ``model_name`` is not one of the supported names.
    """
    permutation_models = [
        'Random Forest Regression',
        'Linear Regression',
        'Gradient Boosting Regression',
        'Theil-Sen Regressor',
    ]

    # Default to an empty frame: without it every return below raised
    # UnboundLocalError for a model name outside ``permutation_models``.
    perm_importance_df = pd.DataFrame()

    # the permutation importance of the variables
    if model_name in permutation_models:
        perm_importance = permutation_importance(best_model, X_test, y_test, n_repeats=30, random_state=42)
        perm_importance_df = pd.DataFrame({
            'Feature': X_test.columns,
            'Importance': perm_importance.importances_mean
        }).sort_values(by='Importance', ascending=False)

    try:
        # model feature importance
        feature_importance_df = pd.DataFrame({
            'Feature': X_test.columns,
            'Importance': best_model.feature_importances_
        }).sort_values(by='Importance', ascending=False)
        return feature_importance_df, perm_importance_df
    except AttributeError:
        # if feature importance is not available try the coefficients
        try:
            feature_importance_df = pd.DataFrame({'feature': X_test.columns, 'Coeficient': best_model.coef_})
            return feature_importance_df, perm_importance_df
        except AttributeError:
            # neither attribute exists: return an empty DataFrame
            return pd.DataFrame(), perm_importance_df

def find_elbow_point(sse):
    """Return the 1-based position of the elbow of an SSE curve.

    Each value in ``sse`` is paired with its 0-based position. The elbow is the
    point with the largest perpendicular distance to the straight line joining
    the first and last points. Positions and SSE values are used as given (no
    rescaling of either axis).

    Args:
        sse: sequence of sum-of-squared-error values, one per candidate k.

    Returns:
        The index of the farthest point plus one.
    """
    curve = np.column_stack((np.arange(len(sse)), sse))
    origin, terminus = curve[0], curve[-1]

    # unit vector along the chord from the first to the last point
    chord = terminus - origin
    unit_chord = chord / np.sqrt((chord ** 2).sum())

    # split each offset into its component along the chord and the remainder
    offsets = curve - origin
    along_chord = (offsets * unit_chord).sum(axis=1)
    perpendicular = offsets - along_chord[:, np.newaxis] * unit_chord

    distances = np.sqrt((perpendicular ** 2).sum(axis=1))
    return np.argmax(distances) + 1

def filter_features(data, threshold: float = 0.2, terms: list = None):
    """Keep the features that are non-zero in at least ``threshold`` of the rows.

    Args:
        data: DataFrame holding a ``'pcs/m'`` column plus the candidate features.
        threshold: minimum share of rows (0-1) in which a feature must be > 0.
        terms: candidate column names. When omitted, every numeric column other
            than ``'pcs/m'`` is considered (the declared ``None`` default used to
            raise ``TypeError`` because it was iterated directly).

    Returns:
        ``(filtered_frame, filtered_columns)`` where ``filtered_frame`` contains
        ``'pcs/m'`` followed by the retained columns.
    """
    if terms is None:
        terms = [col for col in data.select_dtypes(include='number').columns if col != 'pcs/m']

    filtered_columns = [col for col in terms if (data[col] > 0).mean() >= threshold]
    return data[['pcs/m', *filtered_columns]], filtered_columns
    

def determine_optimal_clusters(d, max_k: int = 10):
    """Scan k = 1..max_k with KMeans and pick k at the elbow of the SSE curve.

    Args:
        d: the samples to cluster (anything ``KMeans.fit`` accepts and ``len``
            can measure).
        max_k: largest number of clusters to try. The scan is capped at the
            number of samples, because KMeans cannot fit more clusters than
            samples (the fixed 1..10 scan failed on small inputs).

    Returns:
        ``(optimal_k, sse)``: the elbow position reported by
        ``find_elbow_point`` and the list of inertia values, one per k tried.

    Raises:
        ValueError: if ``d`` is empty or ``max_k`` is smaller than 1.
    """
    n_samples = len(d)
    if n_samples == 0:
        raise ValueError('determine_optimal_clusters needs at least one sample')
    if max_k < 1:
        raise ValueError('max_k must be at least 1')

    upper_k = min(max_k, n_samples)

    sse = []
    for k in range(1, upper_k + 1):
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(d)
        sse.append(kmeans.inertia_)

    optimal_k = find_elbow_point(sse)
    return optimal_k, sse

def kmeans_clustering(n_clusters, w_interactions: bool = False):
    """Cluster the frame ``d`` and summarise each cluster.

    NOTE(review): this function does not look callable as written. It reads
    ``d`` from the enclosing scope instead of taking it as a parameter, it calls
    ``kmeans_plusplus`` (only ``KMeans`` is imported at the top of the notebook)
    and it references ``self`` although it is not a method. It appears to have
    been lifted out of a class -- confirm the intended owner before using it.

    Args:
        n_clusters: number of clusters requested.
        w_interactions: forwarded to ``self.unscale_values``.

    Returns:
        ``(cluster_summary, kmeans, d)``.
    """
    # NOTE(review): presumably meant to be KMeans(...), since fit_predict is called on it below.
    kmeans = kmeans_plusplus(n_clusters=n_clusters, random_state=42)
    
    # Adds a 'clusters' column to the shared frame ``d`` in place.
    d['clusters'] = kmeans.fit_predict(d)
    some_features = [x for x in d.columns if x not in ['pcs/m','clusters', 'streets']]
    
    # Per-cluster mean of the target, then converted back by unscale_target.
    means = d.groupby(['clusters'])['pcs/m'].mean()
    # NOTE(review): the module-level unscale_target also requires a scaler argument.
    means_unscaled = self.unscale_target(means)
    
    counts = d.groupby(['clusters'])['pcs/m'].count()
    
    # Per-cluster mean of every remaining feature.
    cluster_summary = d.groupby('clusters').agg({x:'mean' for x in some_features}).reset_index()
    cluster_summary = self.unscale_values(cluster_summary, columns=some_features, w_interactions=w_interactions)
    cluster_summary['pcs/m'] = means_unscaled
    cluster_summary['samples'] = counts.values
    # Move the two columns appended last ('pcs/m', 'samples') to the front.
    cluster_summary = cluster_summary[['samples', 'pcs/m', *cluster_summary.columns[:-2]]]
           
    return cluster_summary, kmeans, d

def unscale_target(means, ascaler):
    """Undo the scaling of target values and return them in their original shape.

    Args:
        means: scaled values (Series, DataFrame, array or scalar).
        ascaler: object exposing ``inverse_transform``.

    Returns:
        ``numpy.ndarray`` of unscaled values with the same shape as ``means``.

    The original computed ``means_unscaled.reshape(means_shape)`` but discarded
    the result and returned row 0, so 2-D input lost every row but the first.
    For 1-D input the result is unchanged.
    """
    values = np.asarray(means)
    original_shape = values.shape

    # inverse_transform expects 2-D input; 0-D / 1-D values are passed as a
    # single row. Assumes the scaler broadcasts over that row (e.g. it was
    # fitted on one column) -- TODO confirm for the scaler in use.
    if values.ndim < 2:
        values = values.reshape(1, -1)

    unscaled = ascaler.inverse_transform(values)
    return np.asarray(unscaled).reshape(original_shape)

def _regression_row(label, y_true, y_pred):
    """Build one results row holding the R² and MSE of ``y_pred`` against ``y_true``."""
    return {'Model': label, 'R²': r2_score(y_true, y_pred), 'MSE': mean_squared_error(y_true, y_pred)}


def perform_regression_analysis(d, features: [] = None, target_var: str = 'pcs/m'):
    """Fit several regressors on an 80/20 split and compare them on the test set.

    Four base models are fitted (linear, random forest, gradient boosting,
    Theil-Sen). The one with the highest test R² is then wrapped in a
    ``BaggingRegressor``, and a ``VotingRegressor`` is built from the linear,
    random-forest and gradient-boosting models.

    Every stochastic estimator is seeded. The gradient-boosting and bagging
    models were previously unseeded, so scores changed from run to run.

    Args:
        d: DataFrame with the feature columns and the target column.
        features: feature column names. When omitted, every numeric column
            other than ``target_var`` is used (``None`` used to raise).
        target_var: name of the target column.

    Returns:
        ``(regression_results, best_model, the_name, predictions,
        X_test, y_test, X_train, y_train)`` where ``regression_results`` is a
        list of ``{'Model', 'R²', 'MSE'}`` dicts and ``predictions`` maps the
        best model's name, ``'Bagging'`` and ``'voting'`` to test predictions.
    """
    seed = 42

    if features is None:
        features = [col for col in d.select_dtypes(include='number').columns if col != target_var]

    gbr_params = {
        "n_estimators": 100,
        "max_depth": 4,
        "min_samples_split": 5,
        "learning_rate": 0.01,
        "loss": "huber",
        "alpha": .9,
        "random_state": seed,
    }
    these_models = {
        'Linear Regression': LinearRegression(),
        'Random Forest Regression': RandomForestRegressor(n_estimators=100, random_state=seed),
        'Gradient Boosting Regression': GradientBoostingRegressor(**gbr_params),
        'Theil-Sen Regressor': TheilSenRegressor(random_state=seed)
    }

    X = d[features]
    y = d[target_var].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

    regression_results = []
    best_model = None
    best_r2 = -np.inf
    the_name = None

    # base models: fit each one and remember the best test R²
    for model_name, model in these_models.items():
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", ConvergenceWarning)
            model.fit(X_train, y_train)
            row = _regression_row(model_name, y_test, model.predict(X_test))
        regression_results.append(row)

        if row['R²'] > best_r2:
            best_r2 = row['R²']
            best_model = model
            the_name = model_name

    # bagging around the best base model (BaggingRegressor clones the estimator)
    bag = BaggingRegressor(estimator=these_models[the_name], random_state=seed)
    bag.fit(X_train, y_train)
    bag_pred = bag.predict(X_test)
    predictions = {
        the_name: best_model.predict(X_test),
        'Bagging': bag_pred
    }
    regression_results.append(_regression_row(f'Bagging:{the_name}', y_test, bag_pred))

    # voting ensemble over the linear, random-forest and gradient-boosting models
    voting = VotingRegressor([
        ('lnr', these_models['Linear Regression']),
        ('rf', these_models['Random Forest Regression']),
        ('gbr', these_models['Gradient Boosting Regression']),
    ])
    voting.fit(X_train, y_train)
    voting_pred = voting.predict(X_test)
    predictions.update({'voting': voting_pred})
    regression_results.append(_regression_row('Voting', y_test, voting_pred))

    return regression_results, best_model, the_name, predictions, X_test, y_test, X_train, y_train

def create_interaction_terms(data, interaction_terms=None, target='pcs/m'):
    """Combine every plain feature with the interaction terms.

    For each column that is neither the target, ``'use'`` nor an interaction
    term, the generated column is ``col + sum(col * term for term in
    interaction_terms)``. Its name is the column name followed by one
    ``_inter_<term>`` suffix per term.

    Fixes compared with the previous version:
      * the accumulation no longer writes through ``data[col].values``, which
        modified the caller's frame in place (and made later terms use already
        modified values) and failed on integer columns;
      * the result keeps ``data``'s index, so the target and ``'use'`` columns
        are no longer misaligned when ``data`` has a non-default index.

    Args:
        data: DataFrame with numeric feature columns, the interaction-term
            columns, the target column and a ``'use'`` column.
        interaction_terms: column names to interact with; defaults to
            ``['streets', 'public-services', 'recreation']``.
        target: name of the target column.

    Returns:
        ``(interaction_data, interaction_columns)``: the new frame (generated
        columns, then target, then ``'use'``) and the generated column names.
    """
    if interaction_terms is None:
        interaction_terms = ['streets', 'public-services', 'recreation']

    # the same suffix and the same term arrays are reused for every feature
    suffix = ''.join(f'_inter_{term}' for term in interaction_terms)
    term_arrays = [data[term].to_numpy(dtype=float) for term in interaction_terms]

    generated = {}
    for col in data.columns:
        if col in (target, 'use') or col in interaction_terms:
            continue
        base = data[col].to_numpy(dtype=float)
        # accumulate on a private copy so that ``data`` is never written to
        combined = base.copy()
        for term_array in term_arrays:
            combined += base * term_array
        generated[f'{col}{suffix}'] = combined

    interaction_columns = list(generated)

    interaction_data = pd.DataFrame(generated, index=data.index)
    interaction_data[target] = data[target]
    interaction_data['use'] = data['use']
    return interaction_data, interaction_columns



def clusters_by_use_case(cluster_data, scaled_cols: [] = None, columns_to_cluster: [] = None, scalers: {} = None):
    """Cluster the scaled data, convert it back to original units and summarise the clusters.

    Args:
        cluster_data: DataFrame of scaled values; it is copied, not modified.
        scaled_cols: columns to pass through ``scalers['feature_scaler']``.
        columns_to_cluster: columns handed to KMeans.
        scalers: dict with the keys ``'feature_scaler'``, ``'target_scaler'``
            and ``'street_scaler'``, each exposing ``inverse_transform``.

    Returns:
        ``(cluster_p, df)``: the labelled, unscaled copy of the data and a
        per-cluster summary (mean ``pcs/m``, ``nsamps`` and the
        ``columns_to_cluster`` values) indexed by cluster.
    """
    def _as_column(name):
        # current values of one column as an (n, 1) array
        return cluster_p[name].values.reshape(-1, 1)

    cluster_p = cluster_data.copy()

    # number of clusters from the elbow scan, then the final fit
    clustering_input = cluster_p[columns_to_cluster]
    optimal_k = determine_optimal_clusters(clustering_input)[0]
    kmeans = KMeans(n_clusters=optimal_k, random_state=42).fit(clustering_input)
    cluster_p['cluster'] = kmeans.labels_

    # back to original units; the order of these three steps is significant
    cluster_p[scaled_cols] = scalers['feature_scaler'].inverse_transform(cluster_p[scaled_cols])
    cluster_p['pcs/m'] = scalers['target_scaler'].inverse_transform(_as_column('pcs/m'))
    cluster_p['streets'] = scalers['street_scaler'].inverse_transform(_as_column('streets'))
    # streets is then rescaled to the 0-1 range
    cluster_p['streets'] = MinMaxScaler().fit_transform(_as_column('streets'))

    # NOTE(review): the feature values reported per cluster are those of the
    # first row of each cluster, not a cluster average -- confirm this is intended.
    first_rows = (
        cluster_p
        .drop_duplicates('cluster')
        .sort_values('cluster')
        .set_index('cluster', drop=True)
    )

    # mean target value and sample count per cluster
    per_cluster = cluster_p.groupby(['cluster']).agg(
        **{'pcs/m': ('pcs/m', 'mean'), 'nsamps': ('pcs/m', 'count')}
    )
    df = per_cluster.merge(first_rows[columns_to_cluster], left_index=True, right_index=True)

    return cluster_p, df



def append_to_markdown(filename, content, encoding: str = 'utf-8'):
    """Append ``content`` to the text file ``filename``, creating it if needed.

    Args:
        filename: path of the markdown file.
        content: text to append.
        encoding: file encoding. Defaults to UTF-8 so that non-ASCII text
            (for example 'R²' or 'Genève') is written the same way on every
            platform instead of depending on the locale's default encoding.
    """
    with open(filename, 'a', encoding=encoding) as f:
        f.write(content)

def use_chat_completion(client, model: str = 'gpt-3.5-turbo-0125', messages: [{}] = None):
    """Request a chat completion from ``client`` and return the response unchanged.

    Args:
        client: object exposing ``chat.completions.create``.
        model: model identifier passed to the endpoint.
        messages: list of message dicts passed to the endpoint.
    """
    request = {'model': model, 'messages': messages}
    return client.chat.completions.create(**request)

def messages_for_chat_completion(system_prompt: str = None, user_prompt: str = None):
    """Return the two-message list (system first, then user) for a chat completion."""
    role_and_content = (("system", system_prompt), ("user", user_prompt))
    return [{"role": role, "content": content} for role, content in role_and_content]

def create_system_prompt(prompt, context="") -> str:
    """Return ``prompt`` immediately followed by ``context`` (no separator)."""
    return "{}{}".format(prompt, context)



# Date windows used by the analysis: the observation period and the earlier
# period that serves as the prior.
o_dates = {'start':'2020-01-01', 'end':'2021-12-31'}
prior_dates = {'start':'2015-11-15', 'end':'2019-12-31'}

# NOTE(review): o_dates, prior_dates, canton, this_feature_type and d are all
# assigned again in a later cell (canton becomes 'Bern' there).
canton = 'Vaud'
this_feature_type = 'l'

# Working copy of the survey data with a fresh 0..n-1 index.
d = datax.reset_index(drop=True)

# Restrict to five cantons and parse the dates on the copy only; d keeps its
# original 'date' values.
dc = d[d.canton.isin(['Genève', 'Valais', 'Vaud', 'Zürich', 'Bern'])].copy()
dc['date'] = pd.to_datetime(dc['date'])
# Unique feature names among the records whose feature_type is 'l'.
dc_lakes = dc[dc.feature_type == 'l'].feature_name.unique()

In [3]:


def _filter_operand(df, name):
    """Return the column ``name``, or the index level of that name if it is not a column."""
    if name in df.columns:
        return df[name]
    if name in df.index.names:
        return pd.Series(df.index.get_level_values(name), index=df.index)
    raise KeyError(f"{name!r} is neither a column nor an index level")


def filter_dataframe(df, filters):
    """
    Filters the DataFrame based on the provided dictionary of column-value pairs.
    Special handling for 'start' and 'end' keys to filter date ranges.

    The rows are selected with boolean masks instead of a ``df.query`` string.
    The string form failed on an empty ``filters`` dict, on values containing a
    quote (e.g. "Val-d'Illiez") and on column names that are not valid Python
    identifiers (e.g. 'pcs/m', 'public-services').

    Args:
    df (pd.DataFrame): The DataFrame to filter.
    filters (dict): A dictionary where keys are column names and values are the values of interest.
        'start' / 'end' are inclusive bounds on the 'date' column. A list-like
        value selects the rows whose column value is one of its members; any
        other value is compared with ``==``.

    Returns:
    pd.DataFrame: The filtered DataFrame (all rows when ``filters`` is empty).

    Raises:
    KeyError: if a key names neither a column nor an index level.
    """
    date_column = 'date'  # Assuming 'date' is the column name for dates
    list_like = (list, tuple, set, frozenset, np.ndarray, pd.Index, pd.Series)

    keep = np.ones(len(df), dtype=bool)
    for col, value in filters.items():
        if col == 'start':
            matches = _filter_operand(df, date_column) >= value
        elif col == 'end':
            matches = _filter_operand(df, date_column) <= value
        elif isinstance(value, list_like):
            # mirrors df.query, where ``col == [a, b]`` means membership
            matches = _filter_operand(df, col).isin(value)
        else:
            matches = _filter_operand(df, col) == value
        # combine positionally so a duplicated index cannot cause misalignment
        keep &= np.asarray(matches, dtype=bool)

    return df.loc[keep]




# data = filter_dataframe(dc.copy(), result_dates)
# info_columns =  ['canton', 'city', 'feature_name']
# first_report, first_land_use = make_report_objects(data, info_columns=info_columns)

In [4]:
# from typing import Annotated, Literal, TypedDict
# from typing import List, Optional, Dict

# from langchain_core.tools import tool
# from langgraph.checkpoint.memory import MemorySaver
# from langgraph.graph import END, StateGraph, START, MessagesState
# from langgraph.prebuilt import ToolNode
# from langchain_openai import ChatOpenAI

# from langchain_core.messages import (
#     BaseMessage,
#     HumanMessage,
#     ToolMessage,
# )
# from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder





In [6]:
# Observation window (likelihood) and the earlier window used as the prior.
o_dates = {'start':'2020-01-01', 'end':'2021-12-31'}
prior_dates = {'start':'2015-11-15', 'end':'2019-12-31'}

# Region and feature type under analysis. Every filter below reads these two
# names; the canton and feature type used to be repeated as the literals
# 'Bern' and 'l', so changing `canton` here left part of the cell on Bern.
canton = 'Bern'
this_feature_type = 'l'

d = datax.reset_index(drop=True)

d = d[d.canton.isin(['Genève', 'Valais', 'Vaud', 'Zürich', 'Bern'])]

# make complete report
params_l = {'canton':canton, 'date_range':o_dates, 'feature_type': this_feature_type}
params_p = {'canton':canton, 'date_range':prior_dates, 'feature_type':this_feature_type}

# set the parameters for the weighted prior
# exclude records in the likelihood, set date range and feature type
lu_catalogue = d[
    (d.canton != canton)
    & (d['date'] <= o_dates['end'])
    & (d.feature_type == this_feature_type)
].copy()
catalog_surveys, catalog_features = make_report_objects(lu_catalogue)

# this is the prior data: all data collected from
# the same feature type. Lakes, rivers or parks
# NOTE(review): prior_feature is the same object as catalog_features.df_cat,
# so the column added below is also added to the catalogue's frame.
prior_feature = catalog_features.df_cat
prior_feature['feature_type'] = this_feature_type

# the prior and likelihood data from the region of interest
all_data_of_interest = d[
    (d['date'] >= prior_dates['start'])
    & (d['date'] <= o_dates['end'])
    & (d.feature_type == this_feature_type)
].copy()
all_data_of_interest = all_data_of_interest[all_data_of_interest.canton == canton].copy()

# create a variable for different code group totals
all_data_of_interest = all_data_of_interest[all_data_of_interest.code.isin([*indus, *tobo_snacks])].copy()

# professional ('pro') and personal ('pers') use groups
all_data_of_interest_i = all_data_of_interest[all_data_of_interest.code.isin(indus)].copy()
all_data_of_interest_i['use'] = 'pro'

all_data_of_interest_p = all_data_of_interest[all_data_of_interest.code.isin(tobo_snacks)].copy()
all_data_of_interest_p['use'] = 'pers'

all_data_of_interest = pd.concat([all_data_of_interest_i, all_data_of_interest_p])

all_data_of_interest = all_data_of_interest.reset_index(drop=True)

# NOTE(review): replaces the land_covers list defined in the first cell.
land_covers = ['buildings', 'forest', 'undefined', 'public-services', 'streets', 'orchards', 'use', 'canton', 'city', 'feature_name']

all_report, all_land_use = make_report_objects(all_data_of_interest, info_columns = ['use', 'canton', 'city', 'feature_name'])


args = {
    'likelihood': {'canton':canton, 'date_range':o_dates},
    'prior' : {'canton':canton, 'date_range':prior_dates},
    'data' : all_data_of_interest.copy(),
    'land-use-inventory' : prior_feature.copy()
}


combined_results = reports_and_forecast(args['likelihood'], args['prior'], ldata=args['data'])
standard_combined = make_standard_report(combined_results, args)


lake_report = combined_results['this_report']
lake_prior_report = combined_results['prior_report']
lake_land_use = combined_results['this_land_use']
# Columns that were scaled with the feature scaler.
scaled_cols = ['public-services', 'buildings', 'forest', 'undefined', 'vineyards', 'orchards', 'streets', 'recreation']

# System prompt template; '{context}' is left as a literal placeholder to be
# filled in later. Adjacent string literals are joined with no separator, so
# each fragment ends with an explicit space (the text used to read
# "form.Being" and "theparagraph").
system_prompt = (
    "Transcribe the values from tables and put them in paragraph form. "
    "Being careful that each value in the table is accounted for in the "
    "paragraph. You are to do this in a narrative form. Answers must be concise."
    "\n\n"
    "{context}"
)



In [7]:
context_r = f"""

## Litter surveys in Switzerland 2020-2021 - IQAASL

Identification, quantification and analysis of anthropogenic Swiss litter (IQAASL) is a project commissioned by the Swiss 
Federal Office for the Environment to collect data concerning visible pollutants along Swiss lakes and rivers. All 
discarded materials were collected and identified using litter survey techniques. in total there were 406 samples from 163 
locations in 95 municipalities.

This report is a summary and analysis of the litter surveys conducted and the methods employed in Switzerland from March
2020 through August 2021. This sampling phase overlaps with the Swiss Litter Report (SLR) survey period, which ran
from April 2017 to March 2018. The SLR was the first project on a national level to use the standard protocol described in 
the Guide to monitoring beach litter or any other comparable method. This overlap allows the results of the 
present study to be compared with those of the SLR.

## Lakes and rivers

The lakes and rivers were sampled from 2020-03 through 2021-05, a total of 54,744 objects were removed and classified over 
the course of 386 surveys. The survey locations were divided into survey areas for regional analysis and defined by the Aare, 
Rhône, Ticino and Linth/Limmat rivers. Surveys were conducted at 143 different locations, representing 77 municipalities. 
The total linear distance surveyed was 20 km with a surface area of 9 hectares and a total municipal population of 1.7 million.

Most surveys were along lake shorelines (331 samples) as lakes offer more consistent and safe year-round access with 
respect to rivers. Additionally, lakes are large areas of reduced flow that receive input from multiple rivers, streams 
and drainage systems providing ideal locations to assess the variety of objects in and around the water bodies.

In total 316 samples came from seven principal lakes in 3 major river basins. Twenty locations were selected to sample 
monthly for a twelve-month period with the exception of Lago Maggiore, which was sampled every three months. 
Surveys were also conducted on Lago di Lugano, Lac des Quatre cantons, Brienzersee and Zugersee. In addition, there 
were 55 surveys on 16 rivers.

### The sampling locations - type and description

The land use is reported as the percent of total area attributed to each land use category within a 1500m radius of the 
survey location. The ratio of the number of samples completed at the different land use profiles is an indicator of the 
environmental and economic conditions around the survey locations.

The land use around the survey locations had a higher attribution to buildings as opposed to agriculture and woods. For 
example, half of all the surveys had at least 37% of land use devoted to buildings as opposed to 19% for agriculture or 
13% to woods. Land use devoted to recreation was at least 6% for half of all samples.

The length of the road network within the buffer zone differentiates between locations that have other wise similar land 
use characteristics. The length of road per buffer ranges from 13km to 212km, 50% of the surveys had less than 67km of road network.

The number of intersections ranges from zero to 23, 50% of the surveys had 3 or fewer intersections within 1500m of the 
survey location. The size of the intersecting river or canal was not taken into consideration. Survey locations on rivers 
have zero intersections.

The population (not shown) is taken from statpop 2018 and represents the population of the municipality surrounding the 
survey location. The smallest population was 442 and the maximum was 415,367, 50% of the surveys come from 
municipalities with a population of at least 12,812.

Overall, surveys at locations with more buildings and more recreation sites were more likely to facilitate the accumulation 
of trash on the shoreline. When the most common objects are considered, only four of the twelve were found at higher rates 
in the presence of more buildings. All of those objects are likely related to food or tobacco consumption near the location. 
Suggesting that there are still gains to be made in prevention and attenuation efforts in areas of high traffic near the water.

However, six of the twelve objects have no positive association to land use attributed to buildings but were found in at 
least 50% of all the surveys. These objects are generally associated with professional use or in the case of cotton swabs 
personal hygiene:

* plastic construction waste
* fragmented plastics
* industrial sheeting
* expanded polystyrene
* cotton bud/swabs
* insulation, includes spray foams

Furthermore, compared to products related to tobacco or food consumption these objects have fewer positive associations in 
general. Indicating that the appropriate land use feature is not currently accounted for and/or these objects are found 
at similar quantities indifferent of the land use features. Suggesting that these objects are ubiquitous in the environment.

Finally, two of the twelve most common objects were found in less than 50% of the surveys and have few positive associations:

* industrial pellets
* expanded foams < 5mm

These objects are found in large quantities sporadically at specific locations. They have been found in all survey areas 
and in all lakes. Industrial pellets have a very specific use and client base making it possible to find partners based 
on the density of the pellets found and the location of the nearest consumer or producer of pellets, see Shared responsibility.

### Median survey total

The results are in units of pieces of litter per 100 meters (p/100m). The median survey result of all data was approximately
189 p/100m. The maximum recorded value was 6,617 p/100m (Rhône survey area) and the minimum recorded was 2p/100m (Aare survey area).
The Rhône survey area had the highest median survey total of 442p/100m, this can in part be explained by the high number
of urban survey locations with respect to the other survey areas and the deposition of fragmented plastics and foamed 
plastics at the Rhône River out flow in the upper lake region.

A reference value was calculated excluding the results from samples that were less than 10m and objects less than 2.5cm. 
This method, described in EU Marine Beach Litter Baselines was used to calculate the reference and threshold 
values for all European beaches in 2015 and 2016 resulting in a median value of 131 p/100m. The results from the European 
baseline value lie outside the 95% confidence interval (CI) of 147 - 213p/100m established using the data from IQAASL.

Surveys in Switzerland were on average, smaller scale than in marine environments and in locations that would be 
considered urban under most circumstances. To date monitoring of lakes and rivers upstream of coastal regions has 
not generalized on the European continent. However, there is a concerted effort by a group of associations in 
Switzerland and France to establish a common monitoring and data exchange protocol for the Rhône basin. Additionally, 
the Wageningen University & Research has begun analyzing data collected in the Meusse - Rhine delta using 
protocols like those in IQAASL.

### The most common objects

The most common objects are defined as those objects identified in at least 50% of all surveys and/or are among the ten 
most abundant by quantity. As a group the most common objects represent 68% of all objects identified in the sampling period. 
Of the most common items 27% are food, drink and tobacco related and 24% are infrastructure and agriculture related.

Objects related to food, drink and tobacco are identified at higher rates at survey locations with a greater percentage 
of land attributed to buildings or fixed infrastructure, the inverse is true of the locations with a higher percentage 
of land attributed to woods or agriculture. However, infrastructure material and fragmented plastics, are found at similar 
rates throughout all survey areas indifferent of land use surrounding the survey locations.

The most common objects identified in the surveys were:

* cigarette ends: total 8'485, % of all objects 15.5%, fail-rate 87%, p/100m 20
* fragmented plastics: total 7'400, 13% of all objects, fail-rate 86%, p/100m 18
* expanded polystyrene: total 5'563, 10% of all objects, fail-rate 68%, p/100m ,
* snack wrappers: total 3'325, 6% of all objects, fail-rate 85%, p/100m 9
* industrial sheeting: total 2'534, 4% of all objects, fail-rate 69%, p/100m 5
* glass drink bottles, pieces: total 2'136, 3% of all objects, fail-rate 65%, p/100m 3
* industrial pellets: total 1'968, 3% of all objects, fail-rate 30%, p/100m 4
* insulation, includes spray foams: total 1'702, 3% of all objects, fail-rate 53%, p/100m 1
* cotton bud/swabs: total 1'406, 2% of all objects, fail-rate 50%, p/100m 1
* expanded foams < 5mm: total 1'209, 2% of all objects, fail-rate 25%, p/100m 0
* plastic construction waste: total 992, 1% of all objects, fail-rate 52%, p/100m 1
* metal bottle caps: total 700, 1% of all objects, fail-rate 52%, p/100m 1



Industrial pellets and expanded foams < 5mm both occurred in significant quantities but identified in less than 50% of 
the surveys (median of 0), indicating high counts at specific locations. While both are micro plastics, their use, 
origin and rate of occurrence are different depending on the survey area region. Industrial pellets are raw materials 
used in injection molding processes whereas foamed plastic beads are the result of fragmentation of expanded polystyrene.

### Conclusions

At the national level, the IQAASL results are stable compared to the surveys that were carried out in 2017 as part of the 
SLR study. However, there was a general decrease in the quantity of food, drink and tobacco objects. Infrastructure 
objects and fragmented plastics and foams did not decline and some locations may have experienced sharp increases. 
Pandemic restrictions limiting large outdoor gatherings may have had a beneficial effect on the reduction of food, drink 
and tobacco items. The greatest increases in infrastructure related objects were in Valais, Vaud and Brienz, which are 
locations near the Rhône and Aare rivers discharge points.

The land use around a survey location has a measurable effect on the deposition of certain objects. The more buildings 
and fixed infrastructure there are the more tobacco and food products are found. Objects like fragmented plastics and 
industrial sheeting do not have the same association and are identified at approximately equal rates indifferent of the 
land use with increases near river/canal discharge points.

Currently three of the four survey areas in the IQAASL are actively monitored by research and governmental agencies 
downstream of Switzerland using similar methods presented in this report. Additionally, regional associations in 
Switzerland are actively pursuing a standardization of reporting and protocols with partner organizations in the EU.

The IQAASL is a citizen-science project that only uses open-source tools and shares data on GNU public license, 
enabling collaboration with stakeholders. At the end of the mandate, December 31, 2021, Hammerdirt will assume the 
responsibility of maintaining the code and data repository which is hosted publicly on Github.

The associations that participated in the IQAASL are actively seeking ways to incorporate the data collection process 
and/or the results into their own business model. However, there is a shortage of data scientists within many regional 
associations which may lengthen the process of integration and stifle the rate of innovation at the level where it is needed most.
"""

# Store the context document under the key 'context-r' without displaying it
# here; a {glue:md} directive must use exactly this key to render it.
glue('context-r', context_r, display=False)

## Context: Litter surveys in Switzerland 2020-2021 - IQAASL

We provide one document for the context here. Its opening section is reproduced below; the rest is available in the dropdown.

Identification, quantification and analysis of anthropogenic Swiss litter (IQAASL) is a project commissioned by the Swiss 
Federal Office for the Environment to collect data concerning visible pollutants along Swiss lakes and rivers. All 
discarded materials were collected and identified using litter survey techniques. In total there were 406 samples from 163 
locations in 95 municipalities.

This report is a summary and analysis of the litter surveys conducted and the methods employed in Switzerland from March
2020 through August 2021. This sampling phase overlaps with the Swiss Litter Report (SLR) survey period, which ran
from April 2017 to March 2018. The SLR was the first project on a national level to use the standard protocol described in 
the Guide to monitoring beach litter or any other comparable method. This overlap allows the results of the 
present study to be compared with those of the SLR.

:::{dropdown} See the rest of the context document

```{glue:md} context-r
:format: myst
```
:::

In [8]:
# Look up the 'en' entry of every industrial code and join the entries into one
# comma-separated string (presumably the English descriptions -- confirm
# against codes.csv).
indus_code_defs = codes.loc[indus, 'en']
pro_codes = (', ').join(indus_code_defs.values)

# Same lookup for the tobacco / snack codes.
rec_code_defs = codes.loc[tobo_snacks, 'en']
rec_codes = (', ').join(rec_code_defs.values)

# Description string for each use group.
use_groups = {'personal':rec_codes, 'professional':pro_codes}

In [9]:
def a_model_feature_importance_prompt(table):
    """Return the prompt asking for a plain transcription of the model feature-importance table.

    Args:
        table: the table to embed; it is formatted into the prompt as text.
    """
    prompt_lines = [
        "",
        "The following table details the model feature importance. ",
        "",
        "Table has the following format:",
        "",
        "1. Feature: the name of the land-use feature",
        "2. importance: The model feature importance",
        "",
        "Convert the following table into a paragraph, reporting the values for each row without any comments or analysis:",
        "",
        f"{table}",
        "",
        "",
        "",
    ]
    return "\n".join(prompt_lines)




def a_permutation_feature_importance_prompt(table):
    """Return the prompt asking for a plain transcription of the permutation-importance table.

    The description of the second column now names the permutation feature
    importance; it used to repeat "model feature importance", copied from the
    model-importance prompt.

    Args:
        table: the table to embed; it is formatted into the prompt as text.
    """
    feature_importance_prompt = f"""
The following table details the permutation feature importance. 

Table has the following format:

1. Feature: the name of the land-use feature
2. importance: The permutation feature importance

Convert the following table into a paragraph, reporting the values for each row without any comments or analysis:

{table}


"""
    return feature_importance_prompt

def a_forecast_prompt(table, style_guide=None):
    """Return the prompt asking for a narrative summary of the forecast table.

    The prompt used to end its instructions with "Use the following style
    guide:" and then show the table, with no style guide supplied. That
    sentence is now only emitted when a guide is actually passed in.

    Args:
        table: the forecast table to embed; it is formatted into the prompt as text.
        style_guide: optional style instructions, inserted before the table.
    """
    instructions = (
        "Generate a narrative summary based on the following table. Include all values. "
        "Reply in paragraph format, do not comment do not embelish."
    )
    if style_guide is not None:
        instructions = f"{instructions} Use the following style guide:\n\n{style_guide}"

    forecast_prompt = f"""
The following contains the expected distribution of survey results.

The table has the following format:

1. average: the expected average sample total
2. hdi min: the minimum of the 90% Highest Density Interval
3. hdi max: the maximum of the 90% of the Highest Density Interval
4. 5th, 25th, 50th, 75th, 95th : the percentile rankings based on the expected distribution
5. max predicted: the maximum value predicted by the model

{instructions}

{table}


"""
    return forecast_prompt

def admin_prompt(table, place_names):
    """Return the user prompt asking for a narrative of the administrative-boundaries table.

    ``table`` and ``place_names`` are both interpolated into the prompt text as-is.
    """
    return f"""
The following table details the number of survey locations, cities, cantons and survey areas present in the data under analysis. 

Please provide a concise narrative of the contents of the following table. In your narrative be sure to include the list of cities, 
and the names of the canton and survey areas.

 {table}

The following is the names of the cities, cantons, and survey areas.

{place_names}    
"""

def feature_count_prompt(table, place_names):
    """Return the user prompt asking for a narrative of the named-features table.

    ``table`` and ``place_names`` are both interpolated into the prompt text as-is.
    """
    return f"""
The following table details the number and the name of the lakes, rivers and parks in the survey data under analysis. 

Please provide a concise narrative of the contents of the following table. In your narrative be sure to the name of each park, lake or river
that is mentioned.

{table}


The following is the names of the lakes, rivers and parks included in the data.

{place_names}


"""

def survey_result_summary_prompt(table):
    """Return the user prompt asking for a narrative summary of the survey totals table.

    ``table`` is interpolated into the prompt text as-is.
    """
    return f"""
These are the survey totals for the data we are studying. We are analyzing count data from beach-litter surveys. The table has the following format:

1. total (quantity) = the total number of objects identified
2. nsamples = the numner of samples collected
3. average = average objects per meter\n
4. 5th, 25th, 50th,	75th, 95th = the objects per meter percentile ranking
5. std = standard deviation in objects per meter
6. max = the maximum recorded objects per meter
7. start = the date of the first sample
8. end = the date of the las sample

Generate a narrative summary based on the following table.

{table}


"""

def inventory_prompt(table):
    """Return the user prompt asking for a narrative of the object inventory table.

    ``table`` is interpolated into the prompt text as-is.
    """
    return f"""
This is the list of all objects found at the beach. The table has the following format:   

1. code: object identifier
1. quantity: the total number found
2. pcs/m = average objects per meter
3. % of total = the proportion of the total for for this object
4. sample_id = the number of samples
5. fails = the number of times at least one of the object was found at a survey
6. rate =  fails/the number of samples
7. object: the plain english name of the object type

Generate a narrative summary based on the following table. You need to list all the codes starting from the top and working down that make up at least 50% of the total.
Provide the code quantity and % of total.

{table}


"""

def landuse_profile_prompt(table):
    """Return the user prompt asking for the land-use profile table as a paragraph.

    ``table`` is interpolated into the prompt text as-is.
    """
    return f"""

The following table describes the distribution of samples according to the proportion of the buffer area (index) attributed to a topographical feature (columns) for example if
if the index is 1 (0-20%) and the forest column is .1 that means that 10% of the samples took place in locations where 0-20% of the buffer was attributed to forest.

Convert the following table into a paragraph, reporting the values for each column along with their respective index values without any comments or analysis:

The table has the following format:

1. Index = proportion of buffer occupied by feature: (1 - 5) or (0-20%, 20-40%, 40-60%, 60-80%, 80-100%)
2. Columns = The named topographical feature that maybe in the buffer
3. Values = The proportion of all the samples that were conducted at the magnitude and feature.


{table}


"""

def landuse_rates_prompt(table):
    """Return the user prompt asking for the land-use density table as a paragraph.

    ``table`` is interpolated into the prompt text as-is.
    """
    return f"""    
The table has the following format:

1. Index = proportion of buffer occupied by feature: (1 - 5) or (0-20%, 20-40%, 40-60%, 60-80%, 80-100%)
2. Columns = The named topographical feature that maybe in the buffer
3. Values = the average pieces of trash per meter that was observed at the magnitude and feature

Convert the following table into a paragraph, reporting the values for each column along with their respective index values without any comments or analysis:

{table}


"""

def cluster_composition_prompt(table):
    """Return the user prompt asking for the cluster-composition table as a paragraph.

    ``table`` is interpolated into the prompt text as-is.
    """
    return f"""    
The following are the summary results of a cluster analysis. The columns are the features that were used to make the clusters. The optimal number of clusters was
determined using the elbow method (you can check the docs for this: https://hammerdirt-analyst.github.io/feb_2024/titlepage.html). The table displays the average magnitude
of each feature in the cluster. For example if the value for forest, cluster 1 = .45 then that means that in cluster 1, the average sample was taken from a location that was
45% dedicated to forest.

Table has the following format:

1. the columns are the measured land use features
2. the index is the cluster number
3. the value is the proportion of the cluster that is attributed to that column. For example if buildings in cluster 1 = .17 it means that the average magnitude of
the buildings variable was 0.17 in cluster 1.

Convert the following table into a paragraph, reporting the values for each column along with their cluster number values without any comments or analysis:
   
{table}


"""

def cluster_rates_prompt(table):
    """Return the user prompt asking for the per-cluster density table as a paragraph.

    ``table`` is interpolated into the prompt text as-is.
    """
    return f"""    
The following are the observed sample average per cluster. The units is objects per meter of beach. The columns are the use case of the objects: personal or professional. The index is
the cluster number.

Table has the following format:

1. the columns are the object use case
2. the index is the cluster number
3. the value is the objects found per meter of beach

Convert the following table into a paragraph, reporting the values for each column along with their respective cluster values without any comments or analysis:
The narrative needs to be in paragraph format.
       
{table}   


"""

def regression_results_prompt(table):
    """Return the user prompt asking for a narrative summary of the regression results table.

    ``table`` is interpolated into the prompt text as-is.
    """
    return f"""
    
The following table details the results from different regression analysis of our data. 

Table has the following format:

1. Model: the type of regression model used
2. R²: The coefficient of determination
3. MSE: the mean squared error

Generate a narrative summary based on the following table. You need to include all the models and the R² and MSE result.
The narrative needs to be in paragraph format.
   

{table}


"""

# System prompt sent with the chat-completion requests.
# Adjacent string literals are concatenated with no separator, so each sentence
# fragment carries its own trailing space (the text previously ran together as
# "form.Being" and "in theparagraph").
# NOTE(review): the literal "{context}" placeholder is never filled in by the
# code visible here - confirm it is formatted elsewhere or drop it.
system_prompt = (
    "Transcribe the values from tables and put them in paragraph form. "
    "Being carefull that each value in the table is accounted for in the "
    "paragraph. You are to do this in a narrative form. Answers must be concise."
    "\n\n"
    "{context}"
)


# Name of the chat-completion model passed to use_chat_completion.
model = 'gpt-4o-mini'
class ReportTexts:
    """Build the text sections of a survey report, as prompts or as LLM narratives.

    Every section method builds its table(s) from the report objects and wraps
    them in the matching prompt. With ``self.chat`` falsy a method returns one
    string (section label followed by the prompt). With ``self.chat`` truthy the
    prompt is sent through ``use_chat_completion`` with ``self.client`` and the
    method returns a tuple of table(s), completion(s) and label(s).

    Call order matters: ``cluster_analysis`` stores the scaled data read by
    ``linear_and_ensemble_regression``, which stores the fitted model read by
    ``feature_importance``.
    """

    def __init__(self, name: str, start: str, end: str, groups: dict, standard_report: dict, report, landuse_report, client: callable = None):
        """Store the report inputs.

        Args:
            name: label used in every section heading.
            start: start date shown in the headings.
            end: end date shown in the headings.
            groups: mapping whose values are joined into the "Objects in data" line.
            standard_report: mapping holding the 'weighted-forecast' and
                'observed-99-forecast' tables.
            report: survey report object queried by the section methods.
            landuse_report: land-use report object queried by the section methods.
            client: chat-completion client; required only when ``chat`` is truthy.
        """
        self.name = name
        self.start = start
        self.end = end
        self.groups = groups
        self.report = report
        self.landuse_report = landuse_report
        self.client = client
        self.standard_report = standard_report
        self.chat = False
        # State filled in by cluster_analysis / linear_and_ensemble_regression.
        self.cluster_d = None
        self.filtered_columns = None
        self.scaled_cols = None
        self.best_model = None

    # ------------------------------------------------------------------ helpers

    def _heading(self, title):
        """Return the level-2 markdown heading used by every section."""
        return f"\n## {title} {self.name} {self.start} {self.end}\n\n"

    def _section_label(self, title, caption):
        """Return the section heading followed by its bold caption."""
        return f"{self._heading(title)}__{self.name}: {caption}__\n\n"

    def _title_and_objects(self):
        """Return the report title and the 'Objects in data' paragraph."""
        title = f"\n# Survey report {self.name} {self.start} {self.end}\n\n"
        object_names = ', '.join(self.groups.values())
        objects = f"\n__Objects in data__\n\n{object_names}\n\n"
        return title, objects

    def _complete(self, user_prompt, system_prompt_override=None):
        """Send one prompt to the chat model with this instance's client and return the completion."""
        active_client = getattr(self, 'client', None)
        if active_client is None:
            raise ValueError("chat mode requires a client: pass client=... when creating ReportTexts")
        # Fall back to the module-level system prompt when the caller gave none.
        chosen_system_prompt = system_prompt if system_prompt_override is None else system_prompt_override
        messages = messages_for_chat_completion(system_prompt=chosen_system_prompt, user_prompt=user_prompt)
        return use_chat_completion(active_client, model, messages)

    def _table_section(self, table, user_prompt, report_label, system_prompt_override=None):
        """Return (table, completion, label) in chat mode, else the label and prompt as one string."""
        if self.chat:
            return table, self._complete(user_prompt, system_prompt_override), report_label
        return f'{report_label}\n\n{user_prompt}'

    @staticmethod
    def _completion_text(completion):
        """Return the message text of the first choice of a chat completion."""
        return completion.choices[0].message.content

    # ----------------------------------------------------------------- sections

    def  the_admin_boundaries(self, system_prompt: str = None, user_prompt: str = None):
        """Administrative boundaries section.

        Args:
            system_prompt: optional system prompt for chat mode; the module-level
                system prompt is used when omitted.
            user_prompt: accepted for backward compatibility and ignored; the
                prompt is always rebuilt from the table.
        """
        boundaries = self.report.administrative_boundaries()
        # Work on copies so the report's own objects are not altered and the
        # method can be called again.
        d = boundaries[0].copy()
        d.loc['survey areas', 'count'] = d.loc['parent_boundary', 'count']
        d = d.drop('parent_boundary')

        d_names = boundaries[1].copy()
        d_names['survey_area'] = d_names.pop('parent_boundary')

        report_label = self._section_label("Administrative boundaries", "Political boundaries and survey locations")
        prompt_text = admin_prompt(d.to_markdown(), d_names)
        return self._table_section(d, prompt_text, report_label, system_prompt)

    def the_named_features(self):
        """Named features (lakes, rivers, parks) section."""
        inventory = self.report.feature_inventory()
        d, d_names = inventory[0], inventory[1]
        report_label = self._section_label("Named features", "The number and place names of lakes, rivers and parks")
        return self._table_section(d, feature_count_prompt(d.to_markdown(), d_names), report_label)

    def summary_statistics(self):
        """Summary statistics section, built from the transposed sampling results summary."""
        d = self.report.sampling_results_summary.T
        report_label = self._section_label("Summary statistics", "The distribution of the sample totals")
        return self._table_section(d, survey_result_summary_prompt(d.to_markdown()), report_label)

    def inventory(self):
        """Inventory section: the object summary with an added 'object' name column."""
        d = self.report.object_summary().copy()
        d['object'] = d.index.map(lambda x: codes.loc[x, 'en'])
        report_label = self._section_label("Inventory items", "The quantity, average density, % of total and fail rate per object")
        return self._table_section(d, inventory_prompt(d.to_markdown()), report_label)

    def landuse_profile(self):
        """Land use profile section: samples per feature divided by the number of samples."""
        d = self.landuse_report.n_samples_per_feature()/self.report.number_of_samples
        d = d.sort_index()
        report_label = self._section_label("Land use profile", "The landuse profile of the surveys.")
        return self._table_section(d, landuse_profile_prompt(d.to_markdown()), report_label)

    def landuse_rates(self):
        """Land use and trash density section."""
        d = self.landuse_report.rate_per_feature()
        report_label = self._section_label("Land use and trash density", "The density of trash by feature and proportion of buffer.")
        # Both modes use landuse_rates_prompt; the non-chat branch used to call
        # an undefined name.
        return self._table_section(d, landuse_rates_prompt(d.to_markdown()), report_label)

    def cluster_analysis(self, scaled_cols: [] = None):
        """Cluster analysis section.

        Scales the land-use data, clusters it with ``clusters_by_use_case`` and
        stores ``cluster_d``, ``filtered_columns`` and the scalers on the
        instance for the regression step.

        Returns:
            Chat mode: (composition label, composition completion, composition
            table, density label, density completion). Otherwise one string
            holding the heading, both labels and both prompts.
        """
        report_label_cluster_features = f"\n__{self.name}: Cluster composition__"
        report_label_cluster_averages = f"\n__{self.name}: Average density per cluster__"

        # Remembered so __repr__ can rebuild the report without a global.
        self.scaled_cols = scaled_cols

        cluster_d, filtered_columns = filter_features(self.landuse_report.df_cont.copy(), terms=scaled_cols)

        self.target_scaler = StandardScaler()
        self.feature_scaler = StandardScaler()
        self.street_scaler = StandardScaler()

        cluster_d['pcs/m'] = self.target_scaler.fit_transform(cluster_d[['pcs/m']])
        cluster_d['streets'] = self.street_scaler.fit_transform(cluster_d[['streets']])
        cluster_d[filtered_columns] = self.feature_scaler.fit_transform(cluster_d[filtered_columns])
        self.cluster_d = cluster_d
        self.filtered_columns = filtered_columns

        args = {
            'cluster_data': cluster_d,
            'columns_to_cluster': self.filtered_columns,
            'scaled_cols': self.filtered_columns,
            'scalers': {'street_scaler': self.street_scaler, 'target_scaler': self.target_scaler, 'feature_scaler': self.feature_scaler}
        }

        _, summary_pro = clusters_by_use_case(**args)

        summary_pro.drop(['nsamps'], inplace=True, axis=1)
        cols = [x for x in summary_pro.columns if x not in ['pcs/m']]
        cluster_features = summary_pro[cols].drop_duplicates()
        cluster_results = summary_pro[['pcs/m']].copy()

        user_prompt_f = cluster_composition_prompt(cluster_features.to_markdown())
        user_prompt_r = cluster_rates_prompt(cluster_results.to_markdown())

        if self.chat:
            completed_chat_comp = self._complete(user_prompt_f)
            completed_chat_rate = self._complete(user_prompt_r)
            self.cluster_comp = completed_chat_comp
            self.cluster_results = completed_chat_rate
            return report_label_cluster_features, completed_chat_comp, cluster_features, report_label_cluster_averages, completed_chat_rate

        return f"{self._heading('Cluster analysis')}{report_label_cluster_features}\n{user_prompt_f}\n\n{report_label_cluster_averages}\n{user_prompt_r}"

    def linear_and_ensemble_regression(self):
        """Regression results section; requires ``cluster_analysis`` to have run.

        Stores the best model, its name, the predictions and the train/test
        splits on the instance for ``feature_importance``.

        Raises:
            RuntimeError: if ``cluster_analysis`` has not been called yet.
        """
        if getattr(self, 'cluster_d', None) is None or getattr(self, 'filtered_columns', None) is None:
            raise RuntimeError("run cluster_analysis() before linear_and_ensemble_regression()")

        d, best_model, the_name, predictions, X_test, y_test, X_train, y_train = perform_regression_analysis(self.cluster_d, features=self.filtered_columns)
        d = pd.DataFrame(d)
        report_label = self._section_label("Summary of regression results", "The performance of each regression model.")

        self.best_model = best_model
        self.best_model_name = the_name
        self.predictions = predictions
        self.x_train = X_train
        self.x_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        return self._table_section(d, regression_results_prompt(d.to_markdown()), report_label)

    def feature_importance(self):
        """Model and permutation feature importance; requires the regression step to have run.

        Returns:
            Chat mode: (model label, model completion, model table, permutation
            label, permutation completion, permutation table). Otherwise one
            string holding both labels and both prompts.

        Raises:
            RuntimeError: if ``linear_and_ensemble_regression`` has not been called yet.
        """
        if getattr(self, 'best_model', None) is None:
            raise RuntimeError("run linear_and_ensemble_regression() before feature_importance()")

        d1, d2 = evaluate_feature_importance(self.best_model, self.best_model_name, self.x_test, self.y_test, self.x_train, self.y_train)
        report_label_model_f = f"\n__Model feature importance__"
        report_label_model_p = f"\n__Permutation feature importance__"

        user_prompt_f = a_model_feature_importance_prompt(d1.to_markdown())
        # The second table is the permutation importance and gets its own prompt.
        user_prompt_p = a_permutation_feature_importance_prompt(d2.to_markdown())

        if self.chat:
            completed_chat_model_features = self._complete(user_prompt_f)
            completed_chat_model_permutation = self._complete(user_prompt_p)
            return report_label_model_f, completed_chat_model_features, d1, report_label_model_p, completed_chat_model_permutation, d2

        return f"{report_label_model_f}\n{user_prompt_f}\n\n{report_label_model_p}\n{user_prompt_p}"

    def grid_approximation(self):
        """Forecast section built from the two forecast tables in ``standard_report``.

        Returns:
            Chat mode: (weighted label, weighted completion, weighted table,
            observed-99 label, observed-99 completion, observed-99 table).
            Otherwise one string holding the heading, both labels and both prompts.
        """
        d1 = self.standard_report['weighted-forecast'].copy()
        d2 = self.standard_report['observed-99-forecast'].copy()
        report_label_model_f = f"\n__{self.name}: Weighted prior forecast__"
        report_label_model_p = f"\n__{self.name}: Observed 99th percentile forecast__"

        # Both tables are rendered as markdown in both modes.
        user_prompt_f = a_forecast_prompt(d1.to_markdown())
        user_prompt_p = a_forecast_prompt(d2.to_markdown())

        if self.chat:
            completed_chat_weighted = self._complete(user_prompt_f)
            completed_chat_99 = self._complete(user_prompt_p)
            return report_label_model_f, completed_chat_weighted, d1, report_label_model_p, completed_chat_99, d2

        # The labels already carry their own bold markers.
        return f"{self._heading('Forecasts')}{report_label_model_f}\n{user_prompt_f}\n\n{report_label_model_p}\n{user_prompt_p}"

    # ------------------------------------------------------------------ reports

    def chat_rep(self, scaled_cols, file_name):
        """Write the full LLM-narrated report to ``file_name`` as markdown.

        The file is created (or truncated) with the title, then each section is
        appended with ``append_to_markdown``. Chat mode is switched on for the
        duration of the call and the previous ``chat`` value is restored afterwards.
        """
        title, objects = self._title_and_objects()
        with open(file_name, 'w') as file:
            file.write(title)

        append_to_markdown(file_name, objects)

        text_of = self._completion_text
        previous_chat = self.chat
        self.chat = True
        try:
            single_table_sections = (
                self.the_admin_boundaries,
                self.the_named_features,
                self.summary_statistics,
                self.inventory,
                self.landuse_profile,
                self.landuse_rates,
            )
            for make_section in single_table_sections:
                table, completion, label = make_section()
                entry = f'{label}{text_of(completion)}\n\n{table.to_markdown()}'
                append_to_markdown(file_name, entry + '\n\n')

            label_comp, chat_comp, table_comp, label_rate, chat_rate = self.cluster_analysis(scaled_cols=scaled_cols)
            entry = (
                f"{self._heading('Cluster analysis')}{label_comp}\n\n{text_of(chat_comp)}\n\n"
                f"{table_comp.to_markdown()}\n\n{label_rate}\n\n{text_of(chat_rate)}"
            )
            append_to_markdown(file_name, entry + '\n\n')

            table, completion, label = self.linear_and_ensemble_regression()
            entry = f'{label}\n\n{text_of(completion)}\n\n{table.to_markdown()}'
            append_to_markdown(file_name, entry + '\n\n')

            two_table_sections = (
                ("Feature and permutation importance", self.feature_importance),
                ("Forecasts", self.grid_approximation),
            )
            for heading, make_section in two_table_sections:
                label_a, chat_a, table_a, label_b, chat_b, table_b = make_section()
                entry = (
                    f"{self._heading(heading)}{label_a}\n\n{text_of(chat_a)}\n\n{table_a.to_markdown()}"
                    f"\n\n{label_b}\n\n{text_of(chat_b)}\n\n{table_b.to_markdown()}"
                )
                append_to_markdown(file_name, entry + '\n\n')
        finally:
            self.chat = previous_chat

        print(f"file saved as {file_name}")

    def string_rep(self, scaled_cols: [] = None):
        """Return the report as one string of section labels and prompts (no LLM calls).

        Chat mode is switched off for the duration of the call, so every section
        contributes its prompt text, and the previous ``chat`` value is restored
        afterwards. Runs the cluster analysis and the regression as a side effect.
        """
        title, objects = self._title_and_objects()
        previous_chat = self.chat
        self.chat = False
        try:
            admin_boundaries = self.the_admin_boundaries()
            feature_names = self.the_named_features()
            summary_statistics = self.summary_statistics()
            inventory = self.inventory()
            clusteranalysis = self.cluster_analysis(scaled_cols=scaled_cols)
            linear_ensemble = self.linear_and_ensemble_regression()
            forecasts = self.grid_approximation()
        finally:
            self.chat = previous_chat
        astring = f"""
        {title}
        {objects}
        {admin_boundaries}
        {feature_names}
        {summary_statistics}
        {inventory}
        {clusteranalysis}
        {linear_ensemble}
        {forecasts}
        """
        return astring

    def __repr__(self):
        """Return ``string_rep`` for the columns last used by ``cluster_analysis``.

        Note that this re-runs the cluster analysis and the regression.
        """
        cols = getattr(self, 'scaled_cols', None)
        if cols is None:
            # Backward compatibility: earlier versions read a module-level
            # `scaled_cols`; use it when present instead of failing.
            cols = globals().get('scaled_cols')
        return self.string_rep(scaled_cols=cols)
        
# Chat-completion client handed to the report builder.
client = openai.OpenAI()

# Keyword arguments for the ReportTexts constructor.
args = dict(
    report=lake_report,
    landuse_report=lake_land_use,
    client=client,
    start=prior_dates['start'],
    end=o_dates['end'],
    groups=use_groups,
    standard_report=standard_combined,
    name=canton,
)

achatter = ReportTexts(**args)

# chat_rep unpacks (table, completion, label) tuples, which the section
# methods only return when chat mode is on.
achatter.chat = True

file_name = 'report_results.md'
achatter.chat_rep(scaled_cols, file_name)



file saved as report_results.md
