In [1]:
import pandas as pd
import numpy as np

# Review of previous data

## Notes

After the end of the IQAASL project the data was stored separately and other surveys were conducted. Those results were stored in separate places. The codes used for the different projects were not all the same.

Here we standardize, identify locations or records that cannot be verified, and define the set of codes used since 2015.

### Eliminated survey locations

The following locations were either duplicated or the length of the shoreline could not be verified.

```python

not_these = [
    'sihlsee_einsiedeln_schilligerllacherl',
    'schiffenensee_duedingen_hirschij',
    'lac-leman-hammerdirt',
    'thur_schoenenberg_schaera',
    'katzenbach_zuerich_sanesim',
    'inn_pradella_kohlt',
    'emme_luterbach_huggenbergerk',
    'lotschebach_bern_scheurerk',
    'mammern-swisslitterreport',
    'berlingen-swisslitterreport'    
]
```

### Eliminated codes

The codes G909, G910, G911 and G912 were eliminated. The recorded value was placed under the parent code:

1. G909, G910 => G74
2. G911 => G81
3. G912 => G82

### Sampling campaigns

__Attention!!__ The codes used for each survey campaign are different. Different groups organized and conducted surveys using the MLW protocol. The data was then sent to us.

__MCBP:__ November 2015 - November 2016. The initial sampling campaign. Fragmented plastics (Gfrags/G79/G78/G76) were not sorted by size. All unidentified hard plastic items were classified in this manner.

__SLR:__ April 2017 - May 2018. Sampling campaign by the WWF. Objects less than 2.5 cm were not counted.

__IQAASL:__ April 2020 - May 2021. Sampling campaign mandated by the Swiss confederation. Additional codes were added for regional objects.

__Plastock:__ January 2022 - December 2022. Sampling campaign from the Association pour la Sauvegarde du Léman. Not all objects were counted: they only identified a limited number of objects.

### quantity

In [25]:
def collect_vitals(data):
    """Return the headline figures for a set of survey records.

    Parameters
    ----------
    data : DataFrame
        Must provide the columns ``quantity``, ``pcs_m``, ``loc_date``,
        ``code``, ``location``, ``water_name_slug``, ``city`` and ``date``.

    Returns
    -------
    tuple
        Nine values, in this order: sum of ``quantity``, median of
        ``pcs_m``, number of unique ``loc_date``, ``code``, ``location``,
        ``water_name_slug`` and ``city`` values, then the minimum and the
        maximum of ``date``.

    NOTE(review): the median is computed over every row of ``pcs_m`` as it
    is stored in ``data``, not over per-sample totals -- confirm this is the
    statistic that is wanted.
    """
    # The order of this tuple fixes positions 2-6 of the returned tuple.
    counted_columns = ("loc_date", "code", "location", "water_name_slug", "city")
    unique_counts = tuple(data[name].nunique() for name in counted_columns)

    dates = data["date"]

    return (
        data["quantity"].sum(),
        data["pcs_m"].median(),
        *unique_counts,
        dates.min(),
        dates.max(),
    )

def find_missing(more_than, less_than):
    """Return the values of ``more_than`` that do not occur in ``less_than``.

    The work is delegated to ``numpy.setdiff1d``, so the result is a sorted
    array of unique values.
    """
    only_in_first = np.setdiff1d(more_than, less_than)
    return only_in_first
def find_missing_loc_dates(done, dtwo):
    """Return the ``loc_date`` values present in ``done`` but absent from ``dtwo``.

    Both arguments must expose a ``loc_date`` column; the unique values of
    each are compared with ``find_missing``.
    """
    return find_missing(done.loc_date.unique(), dtwo.loc_date.unique())

def aggregate_gcaps_gfoams_gfrags(data, codes,columns=["Gfoams", "Gfrags", "Gcaps"]):
    """Relabel child codes in ``data`` with their aggregate parent code.

    For each name in ``columns``, in order, the index labels of the rows of
    ``codes`` whose ``parent_code`` equals that name are collected, and
    every row of ``data`` whose ``code`` is one of those labels has its
    ``code`` overwritten with the parent name.

    ``data`` is modified in place and the same object is returned.

    NOTE(review): matching is done against the *index* of ``codes``, so the
    frame presumably has to be indexed by code value -- confirm callers set
    that index before calling.
    """
    for parent in columns:
        belongs_to_parent = codes.parent_code == parent
        child_labels = codes.loc[belongs_to_parent].index

        rows_to_relabel = data.code.isin(child_labels)
        data.loc[rows_to_relabel, "code"] = parent

    return data
# Column names relating to the code definitions.
# NOTE(review): not used in the cells shown here -- confirm where it is consumed.
code_cols = ["material", "description", "source", "parent_code", "single_use", "groupname"]

# The columns of the survey records, in the order they appear in the data file.
a_cols = [
    "loc_date", "date", "water_name_slug", "location", "code", "pcs_m",
    "quantity", "river_bassin", "length", "groupname", "city",
]

# Grouping keys: every descriptive column, leaving out the two measured values.
c_cols = [
    "loc_date", "location", "date", "water_name_slug", "river_bassin",
    "length", "groupname", "city", "code",
]

# Both measured values are summed within each group.
agg_this = dict.fromkeys(("quantity", "pcs_m"), "sum")

# Code definitions and the combined survey records.
dfCodes = pd.read_csv("data/end_process/codes.csv")
test_this = pd.read_csv("data/end_process/new_all.csv")

# One row per unique combination of the grouping keys, with summed values.
t_t = test_this.groupby(c_cols, as_index=False).agg(agg_this)

## Survey data

In [26]:
# Preview the first rows of the combined survey records.
test_this.head()

Unnamed: 0,loc_date,date,water_name_slug,location,code,pcs_m,quantity,river_bassin,length,groupname,city
0,"('anarchy-beach', '2018-04-02')",2018-04-02,lac-leman,anarchy-beach,G70,0.0,0,rhone,70.042373,recreation,La Tour-de-Peilz
1,"('maladaire', '2021-05-01')",2021-05-01,lac-leman,maladaire,G48,0.0,0,rhone,74.113445,recreation,La Tour-de-Peilz
2,"('maladaire', '2021-05-01')",2021-05-01,lac-leman,maladaire,G170,0.0,0,rhone,74.113445,agriculture,La Tour-de-Peilz
3,"('maladaire', '2021-05-01')",2021-05-01,lac-leman,maladaire,G709,0.0,0,rhone,74.113445,recreation,La Tour-de-Peilz
4,"('maladaire', '2021-05-01')",2021-05-01,lac-leman,maladaire,G100,0.0,0,rhone,74.113445,waste water,La Tour-de-Peilz


In [27]:
# Column dtypes, non-null counts and memory use of the combined survey records.
test_this.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318478 entries, 0 to 318477
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   loc_date         318478 non-null  object 
 1   date             318478 non-null  object 
 2   water_name_slug  318478 non-null  object 
 3   location         318478 non-null  object 
 4   code             318478 non-null  object 
 5   pcs_m            318478 non-null  float64
 6   quantity         318478 non-null  int64  
 7   river_bassin     318478 non-null  object 
 8   length           318478 non-null  float64
 9   groupname        318478 non-null  object 
 10  city             318478 non-null  object 
dtypes: float64(2), int64(1), object(8)
memory usage: 26.7+ MB


### Summary all data



In [31]:
# Headline figures computed over every record in the combined data.
vitals_all = collect_vitals(test_this)


def make_a_summary(vitals):
    """Format nine summary figures as a labelled, indented text block.

    ``vitals`` is read by position, 0 through 8, in the order: number of
    objects, median pieces/meter, number of samples, number of unique
    codes, number of sample locations, number of features, number of
    cities, start date, end date.

    The returned string starts with a newline; every entry sits on its own
    line indented by four spaces and is followed by a line holding only
    the indent.
    """
    labels = (
        "Number of objects",
        "Median pieces/meter",
        "Number of samples",
        "Number of unique codes",
        "Number of sample locations",
        "Number of features",
        "Number of cities",
        "Start date",
        "End date",
    )
    indent = "    "

    # Each entry is its labelled line plus the indent-only spacer line after it.
    entries = [
        f"{indent}{label}: {vitals[position]}\n{indent}\n"
        for position, label in enumerate(labels)
    ]

    return "\n" + "".join(entries) + indent


# Print the formatted summary for the complete data set.
print(make_a_summary(vitals_all))


    Number of objects: 196842
    
    Median pieces/meter: 0.0
    
    Number of samples: 1352
    
    Number of unique codes: 239
    
    Number of sample locations: 245
    
    Number of features: 59
    
    Number of cities: 142
    
    Start date: 2015-11-23
    
    End date: 2022-10-06
    
    


### Summary MCBP

In [36]:
# MCBP campaign: records dated after 2015-11-15, up to and including 2016-12-01.
# The date column holds ISO-formatted text, so the comparisons are string comparisons.
mcbp = test_this.loc[
    lambda records: (records["date"] > "2015-11-15") & (records["date"] <= "2016-12-01")
].copy()
vitals_mcbp = collect_vitals(mcbp)
print(make_a_summary(vitals_mcbp))


    Number of objects: 33347
    
    Median pieces/meter: 0.0
    
    Number of samples: 85
    
    Number of unique codes: 235
    
    Number of sample locations: 20
    
    Number of features: 1
    
    Number of cities: 9
    
    Start date: 2015-11-23
    
    End date: 2016-11-28
    
    


### Summary SLR

In [37]:
# SLR campaign: records dated 2017-04-01 through 2018-05-01, both ends included.
# The date column holds ISO-formatted text, so the comparisons are string comparisons.
# NOTE(review): the campaign notes give "April 2017 - May 2018" while the upper
# bound here is the first of May -- confirm the intended end date.
slr = test_this[test_this["date"].between("2017-04-01", "2018-05-01")].copy()
vitals_slr = collect_vitals(slr)
print(make_a_summary(vitals_slr))


    Number of objects: 93401
    
    Median pieces/meter: 0.0
    
    Number of samples: 837
    
    Number of unique codes: 235
    
    Number of sample locations: 105
    
    Number of features: 38
    
    Number of cities: 73
    
    Start date: 2017-04-02
    
    End date: 2018-04-02
    
    
