In [1]:
# sys, file and nav packages:
import os
import datetime as dt
import csv
import json

# math packages:
import pandas as pd
import numpy as np

# charting:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# home brew utilities
import utilities.utility_functions as ut
import utilities.abundance_classes as ac

# keyword arguments reused by the charts: titles and x-axis labels, at 12pt and 14pt
title_k = dict(loc='left', pad=14, linespacing=1.5, fontsize=12)
title_k14 = dict(loc='left', pad=16, linespacing=1.5, fontsize=14)
xlab_k = dict(labelpad=10, fontsize=12)
xlab_k14 = dict(labelpad=14, fontsize=14)

# unpack the six values returned by the project utilities, they are used as path prefixes below
(
    survey_data,
    location_data,
    code_defs,
    stat_ent,
    geo_data,
    output,
) = ut.make_local_paths()

In [2]:
# date range of the surveys to keep (inclusive, compared as YYYY-MM-DD strings):
start_date, end_date = '2020-04-01', '2021-04-01'

# folder name for this project:
name_of_project = 'keyindicators'

# everything generated here is stored under this directory:
project_directory = ut.make_project_folder(output, name_of_project)

# bookkeeping for generated output: the list of records and the running counters
files_generated = []
figure_num = data_num = 0

# keyword arguments for the abundance class:
# code group name -> json file that holds the codes of that group
group_names_locations = dict([
    ("waste water", "wastewater.json"),
    ("less than 5mm", "codeListMicros.json"),
    ("construction", "construction.json"),
    ("food", "foodstuff.json"),
    ("agg-con-trans", "cat.json"),
    ("agriculture", "ag.json"),
    ("tobacco", "tobac.json"),
    ("recreation", "recreation.json"),
    ("packaging non food", "packaging.json"),
    ("personal items", "pi.json"),
])

# an additional code group defined directly by its codes
frag_plas = dict([("fragmented plastics", ["G79", "G78", "G75"])])


def add_output(**kwargs):
    """Write one output to disk and record it in ``files_generated``.

    Keyword arguments read:
        tag: label stored with the record.
        figure_num: stored under the 'number' key of the record.
        file: output path without the extension.
        a_type: 'data' writes ``kwargs['data']`` to "<file>.csv" without the index,
            any other value saves the current matplotlib figure to "<file>.jpeg" at 300 dpi.
        data: object with a ``to_csv`` method, only read when a_type is 'data'.

    Raises:
        KeyError: when one of the required keywords is missing.
    """
    # build the record first so that a missing keyword fails before anything is written
    record = {
        'tag': kwargs['tag'],
        'number': kwargs['figure_num'],
        'file': kwargs['file'],
        'type': kwargs['a_type'],
    }

    if record['type'] == 'data':
        kwargs['data'].to_csv(F"{record['file']}.csv", index=False)
    else:
        plt.savefig(F"{record['file']}.jpeg", dpi=300)

    # only log the output once the write succeeded, otherwise the log would list
    # a file that does not exist
    files_generated.append(record)

In [3]:
# non aggregated survey data
# Zero values are assigned for all codes not identified at a survey, for each survey
# see the notebook 'getdataforrepo' to see how this is done
dfSurveys = pd.read_csv(F"{survey_data}/results_with_zeroes.csv")

# house keeping
# slice the data by the start and end date (string comparison on YYYY-MM-DD values)
in_date_range = (dfSurveys.date >= start_date) & (dfSurveys.date <= end_date)

# .copy() makes the slice an independent frame, so the column assignment below is
# not a write on a slice of the frame that was read (SettingWithCopyWarning)
dfSurveys = dfSurveys[in_date_range].copy()

# convert date to datetime
dfSurveys['date'] = pd.to_datetime(dfSurveys['date'], format="%Y-%m-%d")

# import river basin labels
river_bassins = ut.json_file_get(F"{location_data}/river_basins.json")

# import beach data, indexed by the location slug
dfBeaches = pd.read_csv(F"{location_data}/beaches_with_ranks.csv").set_index('slug')

# assign river basin labels to dfBeaches:
# each key of river_bassins is a label, each value is a collection of water names
for k,v in river_bassins.items():
    dfBeaches.loc[dfBeaches.water_name.isin(v), 'river_bassin'] = k

# code definitions
dfCodes = pd.read_csv(F"{code_defs}/mlw_codes.csv", index_col='code')

# map codes to descriptions and material type:
material_map = dfCodes.material
desc_map = dfCodes.description

# project lakes and beaches:
project_lakes = ut.json_file_get(F"{location_data}/project_lakes.json")
project_beaches = ut.json_file_get(F"{location_data}/project_beaches.json")

In [4]:
# make a copy of the survey data and tag the records as either lake or river:
data = dfSurveys.copy()

# names of the lakes and rivers according to dfBeaches ('l' and 'r' in the water column)
lakes = dfBeaches[dfBeaches.water == 'l'].water_name.unique()
rivers = dfBeaches[dfBeaches.water == 'r'].water_name.unique()

# map values to the new column 'type', 't' marks a record that is not classified yet
# isin() tags every record in one pass, no loop over the water names is needed
data['type'] = 't'
data.loc[data.water_name.isin(lakes), 'type'] = 'l'
data.loc[data.water_name.isin(rivers), 'type'] = 'r'

# check if there are any uncategorized records:
unclassified = data[data['type'] == 't']
if len(unclassified) > 0:
    print(F"\nThere are {len(unclassified)} records that were not classified as either lake or river:\n\n{unclassified}\n")

## <span style="color:#008891">Description of survey results</span>

### <span style="color:#008891"> Total number of surveys and observations, total number of objects found, number of cities and total population concerned</span>

In [5]:
# the survey data and the beach data are handed to the abundance class
# together with these keyword arguments
a_class_kwargs = {
    'code_group_data': group_names_locations,
    'new_code_group': frag_plas,
    'levels': ['river_bassin', 'water_name', 'city'],
    'river_bassins': river_bassins,
    'exp_variables': [
        'population',
        'buildings',
        'streets',
        'intersects',
        'pop_group_proj',
        'pop_group_rip',
        'streets_rank',
        'buildings_rank',
    ],
    'code_group_loc': output,
}
a = ac.PreprocessData(data, dfBeaches, **a_class_kwargs)

In [9]:
# define the final data set here: all records except those of the 'reuss' river bassin
a_data = a.survey_data[a.survey_data.river_bassin != 'reuss'].copy()

# the locations that remain in the final data set
surveyed_locations = a_data.location.unique()

# describe the data set:
num_obs = len(a_data)
num_samps = len(a_data.loc_date.unique())
num_obj = a_data.quantity.sum()
num_locs = len(surveyed_locations)

# number of municipalities
a_map = dfBeaches['city']
munis = [a_map[x] for x in surveyed_locations]
munis = list(set(munis))
num_munis = len(munis)

# population: each city is counted once
total_pop_d = dfBeaches.loc[surveyed_locations][['city', 'population']]
total_pop_c = total_pop_d.drop_duplicates('city')
total_pop = total_pop_c.population.sum()

# number of rivers
# NOTE(review): num_rivs was displayed but never assigned in the notebook, which
# raises a NameError on a fresh kernel. It is presumably the number of distinct
# rivers among the surveyed locations - confirm this definition against the report.
surveyed_beaches = dfBeaches.loc[surveyed_locations]
num_rivs = surveyed_beaches.loc[surveyed_beaches.water == 'r', 'water_name'].nunique()

num_rivs

14

In [None]:
# create an intermediary group
foams_bg = ut.json_file_get(F"{code_defs}/all_foams.json")
sheeting_bg = ut.json_file_get(F"{code_defs}/sheeting.json")
fragplas_bg = ut.json_file_get(F"{code_defs}/fragmented_plastics.json")

# put all the group members into one list
the_big_group = ['G27','G30', *foams_bg, *sheeting_bg, *fragplas_bg]


# map intermediary group to survey data
# the assignments are applied in order, a later one overwrites an earlier one
a_data['big_group'] = 'no group'
a_data.loc[a_data.code.isin(foams_bg), 'big_group'] = 'foams'
a_data.loc[a_data.code.isin(sheeting_bg), 'big_group'] = 'sheeting'
a_data.loc[a_data.code.isin(fragplas_bg), 'big_group'] = 'frag plastic'
a_data.loc[a_data.code.isin(['G27']), 'big_group'] = 'cigarette ends'
a_data.loc[a_data.code.isin(['G30']), 'big_group'] = 'snack wrapper'

# if an object is not in "the_big_group" it retains its code name
# this is one vectorized assignment instead of one full-column comparison per code
# notna() leaves records with a missing code at 'no group', which is what the
# per-code equality test did
keeps_code = a_data.code.notna() & ~a_data.code.isin(the_big_group)
a_data.loc[keeps_code, 'big_group'] = a_data.loc[keeps_code, 'code']


## <span style="color:#008891">The top ten objects identified</span>

In [None]:
# flag the records where at least one object was counted
a_data['fail'] = a_data.quantity.gt(0)
w_bassin_total = a_data.groupby('river_bassin')['quantity'].sum()

# the code totals, largest first
national_topten = a.code_totals.sort_values(ascending=False)

# the ten codes at the head of that list
national_topten_codes = national_topten.head(10).index

# the survey results for those ten codes, limited to the columns that are aggregated:
top_ten_columns = ['code', 'quantity', 'fail', 'pcs_m', 'loc_date']
top_ten_national = a_data.loc[a_data.code.isin(national_topten_codes), top_ten_columns].copy()

# look up the material and the description of each code:
top_ten_national = top_ten_national.assign(
    material=top_ten_national.code.map(lambda x: material_map.loc[x]),
    description=top_ten_national.code.map(lambda x: desc_map.loc[x]),
)

# the parameters for the convenience method that aggregates the survey data
# the groups and the level where quantity is calculated:
groups = dict(quantity_level='code', columns=['code', 'description'])

# how each column is aggregated
aggs = dict(pcs_m='mean', quantity='sum', fail='sum', loc_date='nunique')

# one column divided by another
rates = [
    dict(rate_name='fail rate', columns=dict(this='fail', over_that='loc_date')),
    dict(rate_name='% of total', columns=dict(this='quantity', over_that='feature_total')),
]

# one column multiplied by another
products = [
    dict(rate_name='rating', columns=dict(this='pcs_m', times_that='fail rate')),
]

def get_the_project_total(x , adf):
    """Return the sum of the ``quantity`` column of ``adf``.

    ``x`` is accepted but not used.
    """
    project_total = adf["quantity"].sum()
    return project_total

# the fail rate and the % of total for each of the top ten codes:
top_ten_agg = ac.calculate_rates(
    top_ten_national,
    feature_total_map=get_the_project_total,
    feature_map=a_data,
    groups=groups,
    aggs=aggs,
    rates=rates,
    products=products,
)

# the description is the index for display purposes
top_ten_agg = top_ten_agg.set_index('description')
top_ten_share = round((top_ten_agg.quantity.sum()/num_obj)*100, 2)
print(F"\nThe top ten objects are {top_ten_share}% of all objects identified\n")

# the quantities, largest first
tt_agg = top_ten_agg[['quantity']].sort_values(by='quantity', ascending=False).round(2)
tt_agg

#### <span style="color:#008891">Survey results: consolidating object categories</span>

The top ten list gives a very high-level summary of the survey results, based entirely on the total amount of an object found. It is a good indicator of the overall abundance of an object in the ecosystem. Using this method we can account for ~60% of objects identified and very quickly identify the most abundant objects.

The top ten list can be improved to account for a greater percentage of the objects found. Currently, polystyrene is present in three forms in the top ten list, extruded polystyrene and two size variants of expanded polystyrene. Fragmented plastics are also present twice in the top ten list. The survey method was designed to get a maximum amount of detail for each survey. Grouping like objects by size is a common way to differentiate observations.

Some of the objects in the top ten list that can be consolidated:

1. There are three objects that are made from expanded or extruded polystyrene
2. Fragmented plastics are present in two different size ranges.

By combining like objects or objects that have the same or similar origin the impact of that group can be better appreciated. When the foam and the fragmented plastics are combined, that liberates three new places in the top ten list. Foams replace cigarette ends as the most abundant (pcs/m and quantity) but cigarette ends still retain the title as most frequently found (fail rate). The top ten list now accounts for ~70% of the objects identified.

#### <span style="color:#008891"> There is more room at the top: a consolidated top ten list</span>

In [None]:
# pieces per meter and quantity of each consolidated group, per river bassin and survey
fts_levels = ['river_bassin', 'big_group', 'loc_date']
fts = a_data.groupby(fts_levels, as_index=False).agg({'pcs_m':'sum', 'quantity':'sum'})

# a group fails a survey when at least one of its objects was counted
fts['fail'] = fts.quantity.gt(0)

def assign_descriptions(x, **kwargs):
    """Return the description to display for ``x``.

    Keyword arguments read:
        exclude: container of values that are returned unchanged.
        som_keys: mapping used to look up every other value.
    """
    # som_keys is only consulted when x is not excluded
    return x if x in kwargs['exclude'] else kwargs['som_keys'][x]


# the rates are now calculated for each consolidated group
groups = dict(quantity_level='big_group', columns=['big_group'])

ftsx = ac.calculate_rates(
    fts,
    feature_total_map=get_the_project_total,
    feature_map=a_data,
    groups=groups,
    aggs=aggs,
    rates=rates,
    products=products,
)

# the consolidated groups keep their own label, every other code gets its description
bg_desc = ['foams', 'cigarette ends', 'frag plastic', 'snack wrapper', 'sheeting']
som_kwargs = {'exclude': bg_desc, 'som_keys': desc_map}

ftsx['description'] = ftsx.big_group.map(lambda x: assign_descriptions(x, **som_kwargs))

# keep the indicator columns, index by description and take the ten largest quantities
indicator_columns = ['description','quantity', '% of total', 'pcs_m','fail rate']
ftsx = (
    ftsx[indicator_columns]
    .set_index('description')
    .sort_values(by='quantity', ascending=False)
    .head(10)
    .round(2)
)
consolidated_share = round((ftsx.quantity.sum()/num_obj)*100, 2)
print(F"\nThe consolidated top ten objects are {consolidated_share}% of all objects identified\n")
ftsx

The *fail rate* is a **key indicator** as well as *% of total* and *pieces per meter*. Each indicator can be used to understand different aspects of the survey results. 

## <span style="color:#008891">Using the survey results: key indicators of the top ten objects</span>

### <span style="color:#008891">Key indicators for the most frequent questions:</span>

1. What do you find?
2. How often do you find it?
3. Do you find a lot of it?
4. What else do you find?
5. Where do you find the most? 

The key indicators provide reasonable answers to those questions using parameters that are taken directly from the survey data. The key indicators are used throughout the report. In this section we explain the key indicators and use them to answer some common questions about the national results.  There is a detailed example of each calculation in annex A.

### <span style="color:#008891">Key indicators definition and use</span>


#### **Fail rate:** How often at least one of a particular item was found at a survey

There are 76,466 observations from 346 surveys or 221 observations per survey. Observations that have a quantity greater than 0 for a survey are scored with a 1, if the quantity is 0 it is scored with a zero. The sum of the score is the number of times that an object was identified. 

_The fail rate is the number of times that an object was found divided by the number of samples taken._

__what does it mean:__ The fail rate describes how likely you are to find at least one of an object in the course of the survey

__how to use it:__ The objects with a high fail rate are those that are most likely to be found at a particular aggregation level

__Why is this important?__ The fail rate alerts us to the presence of items that are identified regularly even though they may not be found in large quantities or the inverse.
<br/><br />

#### **Pieces per meter:** How many objects were found within a defined distance

The pieces per meter ratio is the total number of objects found divided by the length in meters of the survey. This ratio is calculated for each record in the survey. Objects that were not identified during a survey have a pcs/m ratio of 0.

_Pieces per meter is the number of objects found divided by the length of the survey in meters._

__what does it mean:__ The pcs/m ratio describes the average amount of an object you are likely to find if you multiply pcs/m by the survey length

__how to use it:__ Objects with a high pcs/m ratio have a higher minimum value per survey (if they are found: see fail rate)

__Why is this important?__ A high pcs/m ratio indicates either proximity to a source or a zone of accumulation
<br/><br />

#### **% of total:** The amount of an object relative to the other objects identified

The percent of total describes the value of an object when all the other objects are considered.

__what does it mean:__ The % of total describes how much of the problem can be attributed to an object or group of objects

__how to use it:__ The % of total is often the first indicator used to prioritize mitigation campaigns

__Why is this important:__ This helps define the problem at different levels
<br/><br />

#### <span style="color:#008891">Key indicators of the top ten objects</span>

In [None]:
fig, axs = plt.subplots(1,3, figsize=(5.4,8))

# the settings shared by the three heat maps
heatmap_k = dict(cmap='YlOrRd', linewidth=.01, linecolor='white', annot=True, square=True, cbar=False)

# one panel per key indicator: the column, the annotation format and the extra arguments
# only the first panel keeps its y tick labels
panels = [
    ('% of total', ".0%", {}),
    ('pcs_m', ".2", dict(yticklabels=False)),
    ('fail rate', ".0%", dict(yticklabels=False)),
]

for ax, (indicator, a_format, extras) in zip(axs, panels):
    sns.heatmap(pd.DataFrame(ftsx[indicator]), ax=ax, fmt=a_format, **heatmap_k, **extras)

    # no axis labels, the column name is shown as a tick label at the top
    ax.set_ylabel("")
    ax.set_xlabel("")
    ax.xaxis.tick_top()
    ax.xaxis.set_label_position('top')

plt.tight_layout()

plt.show()
plt.close()

#### Hopefully that just worked for you

if not contact analyst@hammerdirt.ch