In [37]:
%load_ext watermark

import pandas as pd
import numpy as np

from review_methods_tests import collect_vitals, find_missing, find_missing_loc_dates
from review_methods_tests import use_gfrags_gfoams_gcaps, make_a_summary,combine_survey_files

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark


# Review of previous data 

## Notes

After the end of the IQAASL project the data was stored separately and other surveys were conducted. Those results were stored in separate places. The codes used for the different projects were not all the same.

Here we standardize, indentify locations or records that cannot be verified and define the set of codes used since 2015. The results will be used to define the data model for the next itteration.

This work considers the results from experiences with the Solid-Waste-Team, Wagenigen Research and Univeristy, the previous collaboration with OFEV and current projects with the ASL.

### Eliminated survey locations

The following locations were either duplicated or the length of the shoreline could not be verified.

```python

not_these = [
    'sihlsee_einsiedeln_schilligerllacherl',
    'schiffenensee_duedingen_hirschij',
    'lac-leman-hammerdirt',
    'thur_schoenenberg_schaera',
    'katzenbach_zuerich_sanesim',
    'inn_pradella_kohlt',
    'emme_luterbach_huggenbergerk',
    'lotschebach_bern_scheurerk',
    'mammern-swisslitterreport',
    'berlingen-swisslitterreport'    
]
```

### Eliminated codes

The codes G909, G910, G911 and G912 were eliminated. The recorded value was placed under the parent code:

1. G909, G910 => G74
2. G911 => G81
3. G912 => G82

### Gfoams, Gfrags, Gcaps

These are aggregate groups. It is difficult to infer how well a participant differentiates between size or use of the following codes.

1. Gfrags: G80, G79, G78, G75, G76, G77
2. Gfoams: G83, G82, G81
3. Gcaps: G21, G22, G23, G24

These aggregate groups are used when comparing values between sampling campaigns.

### Sampling campaigns

The dates of the sampling campaigns are expanded to include the surveys that happened between large organized campaigns. The start and end dates are defined below.

__Attention!!__ The codes used for each survey campaign are different. Different groups organized and conducted surveys using the MLW protocol. The data was then sent to us.

__MCBP:__ November 2015 - November 2016. The initial sampling campaign. Fragmented plastics (Gfrags/G79/G78/G76) were not sorted by size. All unidentified hard plastic items were classified in this manner.

* start_date = 2015-11-15
* end_date = 2017-03-31

__SLR:__ April 2017 - May 2018. Sampling campaign by the WWF. Objects less than 2.5 cm were not counted.

* start_date = 2017-04-01
* end_date = 2020-03-31

__IQAASL:__ April 2020 - May 2021. Sampling campaign mandated by the Swiss confederation. Additional codes were added for regional objects.

* start_date = 2020-04-01
* end_date = 2021-05-31

__Plastock (not added yet):__ January 2022 - December 2022. Sampling campaign from the Association pour la Sauvegarde du Léman. Not all objects were counted, They only identified a limited number of objects.

### Feature type

The feature type is a label that applies to general conditions of use for the location and other locations in the region

* r: rivers: surveys on river banks
* l: lake: surveys on the lake shore
* p: parcs: surveys in recreational areas

### Parent boundary

Designates the larger geographic region of the survey location. For lakes and rivers it is the name of the catchment area or river basin. For parcs it is the the type of park ie.. les Alpes. Recall that each feature has a name, for example Alpes Lépontines is the the name of a feature in the geographic region of _Les Alpes_.

### Language

The code descriptions are available in three languages

* en: english
* fr: french
* de: german

In [38]:
group_by_columns = [
    'loc_date', 
    'date', 
    'feature_name',
    'feature_type',
    'slug',     
    'parent_boundary',
    'length',
    'groupname',
    'city',
    'code', 
]
agg_this = {
    "quantity":"sum",
    "pcs_m": "sum"
}

codes = pd.read_csv("data/end_process/codes.csv")
codes.set_index("code", drop=True, inplace=True)


data_sources = [
    "data/end_process/after_may_2021.csv",
    "data/end_process/iqaasl.csv",
    "data/end_process/mcbp.csv",
    "data/end_process/slr.csv",
]

data_source = [
    "data/end_process/new_allx.csv"
]

old_surveys = combine_survey_files(data_source)
new_surveys = combine_survey_files(data_sources)

In [58]:
codes.loc["G76", "parent_code"] = "Gfrags"

In [59]:
codes.loc["G77", "parent_code"] = "Gfrags"

In [60]:
codes.to_csv("data/end_process/codes.csv", index=True)

In [40]:
olds = old_surveys.loc_date.unique()
news = new_surveys.loc_date.unique()
fmap = new_surveys[["slug", "feature_type"]].drop_duplicates().set_index("slug")

In [41]:
old_surveys["feature_type"] = old_surveys.location.apply(lambda x: fmap.loc[x, "feature_type"])

In [42]:
old_surveys.rename(columns={"water_name_slug":"feature_name", "location":"slug", "river_bassin":"parent_boundary"}, inplace=True)
old_surveys['length'] = old_surveys.length.astype("int")

In [43]:
[x for x in old_surveys.code.unique() if x not in new_surveys.code.unique()]

['G82',
 'G76',
 'G81',
 'G83',
 'G80',
 'G79',
 'G75',
 'G21',
 'G23',
 'G22',
 'G24',
 'G78']

In [44]:
old_surveys.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318478 entries, 0 to 318477
Data columns (total 12 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   loc_date         318478 non-null  object 
 1   date             318478 non-null  object 
 2   feature_name     318478 non-null  object 
 3   slug             318478 non-null  object 
 4   code             318478 non-null  object 
 5   pcs_m            318478 non-null  float64
 6   quantity         318478 non-null  int64  
 7   parent_boundary  318478 non-null  object 
 8   length           318478 non-null  int64  
 9   groupname        318478 non-null  object 
 10  city             318478 non-null  object 
 11  feature_type     318478 non-null  object 
dtypes: float64(1), int64(2), object(9)
memory usage: 29.2+ MB


In [45]:
new_surveys.info()

<class 'pandas.core.frame.DataFrame'>
Index: 318478 entries, 0 to 200785
Data columns (total 12 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   loc_date         318478 non-null  object 
 1   date             318478 non-null  object 
 2   feature_name     318478 non-null  object 
 3   slug             318478 non-null  object 
 4   code             318478 non-null  object 
 5   pcs_m            318478 non-null  float64
 6   quantity         318478 non-null  int64  
 7   parent_boundary  318478 non-null  object 
 8   length           318478 non-null  int64  
 9   groupname        318478 non-null  object 
 10  city             318478 non-null  object 
 11  feature_type     318478 non-null  object 
dtypes: float64(1), int64(2), object(9)
memory usage: 31.6+ MB


In [46]:
len(new_surveys)

318478

In [47]:
len(old_surveys)

318478

## Survey data

In [48]:
# aggregate to object totals per day per sample
# groupby eliminates the objects with a quantity of zero
# split the data into two parts

# assign the new code values to the results
g_frag_foam = use_gfrags_gfoams_gcaps(surveys, codes)

# separate the values greater than zero and the new code values
gthan_zero = g_frag_foam[(g_frag_foam.quantity > 0) | (g_frag_foam.code.isin(["Gfrags", "Gfoams", "Gcaps"]))].copy()

# separate the values = to zero and the codes that are not being changed
t_t0 = g_frag_foam[(g_frag_foam.quantity == 0) & (~g_frag_foam.code.isin(["Gfrags", "Gfoams", "Gcaps"]))].copy()

# group the codes that have a value greater
t_ti = gthan_zero.groupby(group_by_columns, as_index=False).agg(agg_this)

t_th = pd.concat([t_t0, t_ti])

NameError: name 'surveys' is not defined

### Summary all data

In [49]:
vitals_all = collect_vitals(new_surveys)
print(make_a_summary(vitals_all))


    Number of objects: 196842
    
    Median pieces/meter: 0.0
    
    Number of samples: 1352
    
    Number of unique codes: 227
    
    Number of sample locations: 245
    
    Number of features: 59
    
    Number of cities: 142
    
    Start date: 2015-11-23
    
    End date: 2022-10-06
    
    


In [50]:
vitals_old = collect_vitals(old_surveys)
print(make_a_summary(vitals_old))


    Number of objects: 196842
    
    Median pieces/meter: 0.0
    
    Number of samples: 1352
    
    Number of unique codes: 239
    
    Number of sample locations: 245
    
    Number of features: 59
    
    Number of cities: 142
    
    Start date: 2015-11-23
    
    End date: 2022-10-06
    
    


### Summary MCBP

start_date = 2015-11-15

end_date = 2017-03-31

In [51]:
mcbpn = new_surveys[(new_surveys["date"] >= "2015-11-15")&(new_surveys["date"] <= "2017-03-31")].copy()
# mcbpn.to_csv("data/end_process/mcbp.csv", index=False)
vitals_mcbpn = collect_vitals(mcbpn)
print(make_a_summary(vitals_mcbpn, add_summary_name="MCBPn"))


        Summary name = MCBPn

        
    Number of objects: 35837
    
    Median pieces/meter: 0.0
    
    Number of samples: 94
    
    Number of unique codes: 226
    
    Number of sample locations: 21
    
    Number of features: 1
    
    Number of cities: 9
    
    Start date: 2015-11-23
    
    End date: 2017-03-20
    
    
        


In [52]:
mcbpo = old_surveys[(old_surveys["date"] >= "2015-11-15")&(old_surveys["date"] <= "2017-03-31")].copy()
mcbpo.to_csv("data/end_process/mcbp.csv", index=False)
vitals_mcbpo = collect_vitals(mcbpo)
print(make_a_summary(vitals_mcbpo, add_summary_name="MCBPl"))


        Summary name = MCBPl

        
    Number of objects: 35837
    
    Median pieces/meter: 0.0
    
    Number of samples: 94
    
    Number of unique codes: 235
    
    Number of sample locations: 21
    
    Number of features: 1
    
    Number of cities: 9
    
    Start date: 2015-11-23
    
    End date: 2017-03-20
    
    
        


### Summary SLR

start_date = 2017-04-01

end_date = 2018-05-31

In [53]:
slrn = new_surveys[(new_surveys["date"] >= "2017-04-01")&(new_surveys["date"] <= "2020-03-31")].copy()
# slrn.to_csv("data/end_process/slr.csv", index=False)
vitals_slrn = collect_vitals(slrn)
print(make_a_summary(vitals_slrn, add_summary_name="SLRn"))


        Summary name = SLRn

        
    Number of objects: 96851
    
    Median pieces/meter: 0.0
    
    Number of samples: 853
    
    Number of unique codes: 226
    
    Number of sample locations: 114
    
    Number of features: 38
    
    Number of cities: 79
    
    Start date: 2017-04-02
    
    End date: 2020-03-20
    
    
        


In [54]:
slro = old_surveys[(old_surveys["date"] >= "2017-04-01")&(old_surveys["date"] <= "2020-03-31")].copy()
slro.to_csv("data/end_process/slr.csv", index=False)
vitals_slro = collect_vitals(slro)
print(make_a_summary(vitals_slro, add_summary_name="SLRo"))


        Summary name = SLRo

        
    Number of objects: 96851
    
    Median pieces/meter: 0.0
    
    Number of samples: 853
    
    Number of unique codes: 235
    
    Number of sample locations: 114
    
    Number of features: 38
    
    Number of cities: 79
    
    Start date: 2017-04-02
    
    End date: 2020-03-20
    
    
        


### summary IQAASL

start_date = 2020-04-01

end_date = 2021-05-31


In [55]:
iqaasln = new_surveys[(new_surveys["date"] >= "2020-04-01")&(new_surveys["date"] <= "2021-05-31")].copy()
# iqaasln.to_csv("data/end_process/iqaasln.csv", index=False)

# the_rest = surveys[(surveys["date"] > "2021-05-31")]
# the_rest.to_csv("data/end_process/after_may_2021.csv", index=False)
vitals_iqaasln = collect_vitals(iqaasln)
print(make_a_summary(vitals_iqaasln, add_summary_name="IQAASLn"))


        Summary name = IQAASLn

        
    Number of objects: 54773
    
    Median pieces/meter: 0.0
    
    Number of samples: 387
    
    Number of unique codes: 227
    
    Number of sample locations: 149
    
    Number of features: 34
    
    Number of cities: 83
    
    Start date: 2020-04-09
    
    End date: 2021-05-29
    
    
        


In [57]:
iqaaslo = old_surveys[(old_surveys["date"] >= "2020-04-01")&(old_surveys["date"] <= "2021-05-31")].copy()
iqaaslo.to_csv("data/end_process/iqaasl.csv", index=False)

the_rest = old_surveys[(old_surveys["date"] > "2021-05-31")]
the_rest.to_csv("data/end_process/after_may_2021.csv", index=False)
vitals_iqaaslo = collect_vitals(iqaaslo)
print(make_a_summary(vitals_iqaaslo, add_summary_name="IQAASLo"))


        Summary name = IQAASLo

        
    Number of objects: 54773
    
    Median pieces/meter: 0.0
    
    Number of samples: 387
    
    Number of unique codes: 238
    
    Number of sample locations: 149
    
    Number of features: 34
    
    Number of cities: 83
    
    Start date: 2020-04-09
    
    End date: 2021-05-29
    
    
        


### Summary parent-boundary

In [34]:
parent_boundary = "aare"

aaren = new_surveys[new_surveys.parent_boundary == "aare"].copy()
vitals_aaren = collect_vitals(aaren)
print(make_a_summary(vitals_aaren, add_summary_name="Aaren"))


        Summary name = Aaren

        
    Number of objects: 33446
    
    Median pieces/meter: 0.0
    
    Number of samples: 363
    
    Number of unique codes: 226
    
    Number of sample locations: 70
    
    Number of features: 15
    
    Number of cities: 48
    
    Start date: 2017-04-02
    
    End date: 2021-04-23
    
    
        


In [35]:
parent_boundary = "aare"

aareo = old_surveys[old_surveys.parent_boundary == "aare"].copy()
vitals_aareo = collect_vitals(aareo)
print(make_a_summary(vitals_aareo, add_summary_name="Aareo"))


        Summary name = Aareo

        
    Number of objects: 33446
    
    Median pieces/meter: 0.0
    
    Number of samples: 363
    
    Number of unique codes: 235
    
    Number of sample locations: 70
    
    Number of features: 15
    
    Number of cities: 48
    
    Start date: 2017-04-02
    
    End date: 2021-04-23
    
    
        


## Location data

In [36]:
beaches = pd.read_csv("data/end_process/beaches.csv")
beaches.head()

Unnamed: 0,slug,latitude,longitude,country,feature_type,display_feature_name,city_slug,feature_name,city,parent_boundary,canton
0,aabach,47.220989,8.940365,CH,l,Zürichsee,schmerikon,zurichsee,Schmerikon,linth,St. Gallen
1,aare-limmatspitz,47.50106,8.237371,CH,r,Aare,gebenstorf,aare,Gebenstorf,aare,Aargau
2,aare-port,47.11617,7.26955,CH,r,Nidau-Büren-Kanal,port,aarenidau-buren-kanal,Port,aare,Bern
3,aare-solothurn-lido-strand,47.196949,7.521643,CH,r,Aare,solothurn,aare,Solothurn,aare,Solothurn
4,aare_bern_caveltin,46.923579,7.473319,CH,r,Aare,muri-bei-bern,aare,Muri bei Bern,aare,Bern


### Summary by feature type

In [14]:
r_summary = t_th[t_th.feature_type == "r"].copy()
vitals_r = collect_vitals(r_summary)
print(make_a_summary(vitals_r, add_summary_name="Rivers"))


        Summary name = Rivers

        
    Number of objects: 42073
    
    Median pieces/meter: 0.0
    
    Number of samples: 574
    
    Number of unique codes: 226
    
    Number of sample locations: 98
    
    Number of features: 36
    
    Number of cities: 69
    
    Start date: 2017-04-02
    
    End date: 2021-05-06
    
    
        


In [15]:
l_summary = t_th[t_th.feature_type == "l"].copy()
vitals_l = collect_vitals(l_summary)
print(make_a_summary(vitals_l, add_summary_name="lake"))


        Summary name = lake

        
    Number of objects: 146993
    
    Median pieces/meter: 0.0
    
    Number of samples: 758
    
    Number of unique codes: 227
    
    Number of sample locations: 127
    
    Number of features: 17
    
    Number of cities: 68
    
    Start date: 2015-11-23
    
    End date: 2022-10-06
    
    
        


In [16]:
p_summary = t_th[t_th.feature_type == "p"].copy()
vitals_p = collect_vitals(p_summary)
print(make_a_summary(vitals_p, add_summary_name="parcs"))


        Summary name = parcs

        
    Number of objects: 7776
    
    Median pieces/meter: 0.0
    
    Number of samples: 20
    
    Number of unique codes: 227
    
    Number of sample locations: 20
    
    Number of features: 7
    
    Number of cities: 18
    
    Start date: 2021-04-24
    
    End date: 2021-08-28
    
    
        


In [17]:
def vitals_to_sum(vitals):
    return np.array([vitals[0], vitals[2], vitals[4], vitals[5], vitals[6]])

t = vitals_to_sum(vitals_l) + vitals_to_sum(vitals_r) + vitals_to_sum(vitals_p)
list(t)

[196842, 1352, 245, 60, 155]

In [18]:
print(make_a_summary(vitals_all))


    Number of objects: 196842
    
    Median pieces/meter: 0.0
    
    Number of samples: 1352
    
    Number of unique codes: 227
    
    Number of sample locations: 245
    
    Number of features: 59
    
    Number of cities: 142
    
    Start date: 2015-11-23
    
    End date: 2022-10-06
    
    


## Codes

In [19]:
codes.head()

Unnamed: 0_level_0,material,en,source,parent_code,single_use,groupname,fr,de
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
G708,metal,Batons de ski,Usagers,G199,False,recreation,Batons de ski,Skistöcke
G712,cloth,Gants de ski,Usagers,G135,False,recreation,Gants de ski,Skihandschuhe
G902,cloth,"Mask medical, cloth",Personal hygiene,G145,False,personal items,"Masque médical, tissu réutilisable","Medizinische Masken, Stoff"
G917,glass,Terracotta balls,Utility items,G210,False,unclassified,Boules de terre cuite,Blähton
G921,glass,Ceramic tile and pieces,Construction,G204,False,infrastructure,Carreaux et pièces de céramique,Keramikfliesen und Bruchstücke


In [20]:
# language

codes.loc["G79", "fr"]

'Plastiques fragmentés x > 25mm'

In [21]:
codes.loc["G79", "de"]

'Objekte aus Kunststoff 2,5 - 50 cm'

In [22]:
codes.rename(columns={"description":"en"}, inplace=True)

In [23]:
codes.loc["G79", "en"]

'Plastic pieces 2.5cm - 50cm'

In [24]:
%watermark -a hammerdirt-analyst -co --iversions

Author: hammerdirt-analyst

conda environment: cantonal_report

numpy : 1.25.2
pandas: 2.0.3

