In [1]:
%load_ext watermark
import pandas as pd
import numpy as np

from review_methods_tests import collect_vitals, find_missing, find_missing_loc_dates, indexed_feature_data
from review_methods_tests import use_gfrags_gfoams_gcaps, make_a_summary,combine_survey_files

# Testing data models

The methods used in the version of the federal report were tested, but there was no specific set of validation criteria beforehand. Tests were done as the work progressed, which wasted a lot of time.

Here we test the land use and survey data models.

1. is the land use data complete for each survey location?
2. does the survey data aggregate correctly to sample level?
   * what happens to objects with a quantity of zero?
   * aggregating to cantonal, municipal or survey area
     * are all locations included?
     * are lakes and rivers distinguished?
3. Does the aggregated data for iqaasl match the federal report?

### Gfoams, Gfrags, Gcaps

These are aggregate groups. It is difficult to infer how well a participant differentiates between size or use of the following codes.

1. Gfrags: G79, G78, G75
2. Gfoams: G81, G82, G76
3. Gcaps: G21, G22, G23, G24

These aggregate groups are used when comparing values between sampling campaigns.

### Sampling campaigns

The dates of the sampling campaigns are expanded to include the surveys that happened between large organized campaigns. The start and end dates are defined below.

__Attention!!__ The codes used for each survey campaign are different. Different groups organized and conducted surveys using the MLW protocol. The data was then sent to us.

__MCBP:__ November 2015 - November 2016. The initial sampling campaign. Fragmented plastics (Gfrags/G79/G78/G76) were not sorted by size. All unidentified hard plastic items were classified in this manner.

* start_date = 2015-11-15
* end_date = 2017-03-31

__SLR:__ April 2017 - May 2018. Sampling campaign by the WWF. Objects less than 2.5 cm were not counted.

* start_date = 2017-04-01
* end_date = 2020-02-29

__IQAASL:__ April 2020 - May 2021. Sampling campaign mandated by the Swiss confederation. Additional codes were added for regional objects.

* start_date = 2020-03-01
* end_date = 2021-05-31

__Plastock (not added yet):__ January 2022 - December 2022. Sampling campaign from the Association pour la Sauvegarde du Léman. Not all objects were counted; only a limited number of objects were identified.

### Feature type

The feature type is a label that applies to the general conditions of use for the location and for other locations in the region.

* r: rivers: surveys on river banks
* l: lake: surveys on the lake shore
* p: parcs: surveys in recreational areas

### Parent boundary

Designates the larger geographic region of the survey location. For lakes and rivers it is the name of the catchment area or river basin. For parcs it is the type of park, e.g. les Alpes. Recall that each feature has a name; for example, Alpes Lépontines is the name of a feature in the geographic region of _Les Alpes_.

In [2]:
# Sampling-period boundaries as [start_date, end_date]. Both bounds are
# inclusive (the date filter in this notebook uses >= start and <= end), and
# each period starts the day after the previous one ends so that no survey
# date falls between two periods.
period_dates = {
    "mcbp": ["2015-11-15", "2017-03-31"],
    # 2020 is a leap year: ending on 02-28 left 2020-02-29 in no period.
    "slr": ["2017-04-01", "2020-02-29"],
    "iqaasl": ["2020-03-01", "2021-05-31"],
    "2022": ["2021-06-01", "2022-12-01"]
}

# Attribute columns of the code table.
code_cols = ['material', 'description', 'source', 'parent_code', 'single_use', 'groupname']

# Group-by keys used when aggregating survey records to one row per code and sample.
group_by_columns = [
    'loc_date',
    'date',
    'feature_name',
    'slug',
    'parent_boundary',
    'length',
    'groupname',
    'city',
    'code',
]

# Aggregation applied to every group: both measures are summed.
agg_this = {
    "quantity": "sum",
    "pcs_m": "sum"
}

# Survey record files, handed together to combine_survey_files.
survey_data = [
    "data/end_process/after_may_2021.csv",
    "data/end_process/iqaasl.csv",
    "data/end_process/mcbp.csv",
    "data/end_process/slr.csv",
]
surveys = combine_survey_files(survey_data)

# Code definitions, indexed on "code".
code_data = "data/end_process/codes.csv"
codes = indexed_feature_data(code_data, index="code")

# Survey locations, indexed on "slug".
beach_data = "data/end_process/beaches.csv"
beaches = indexed_feature_data(beach_data, index="slug")

# Remaining attribute tables are read without further processing.
land_cover_data = "data/end_process/land_cover.csv"
land_cover = pd.read_csv(land_cover_data)

land_use_data = "data/end_process/land_use.csv"
land_use = pd.read_csv(land_use_data)

street_data = "data/end_process/streets.csv"
streets = pd.read_csv(street_data)

intersection_attributes = "data/end_process/river_intersect_lakes.csv"
river_intersect_lakes = pd.read_csv(intersection_attributes)

In [3]:
# # assign the new code values to the results
# g_frag_foam = use_gfrags_gfoams_gcaps(surveys, codes)

# # separate the values greater than zero and the new code values
# gthan_zero = g_frag_foam[(g_frag_foam.quantity > 0) | (g_frag_foam.code.isin(["Gfrags", "Gfoams", "Gcaps"]))].copy()

# # separate the values = to zero and the codes that are not being changed
# t_t0 = g_frag_foam[(g_frag_foam.quantity == 0) & (~g_frag_foam.code.isin(["Gfrags", "Gfoams", "Gcaps"]))].copy()

# # group the codes that have a value greater
# t_ti = gthan_zero.groupby(group_by_columns, as_index=False).agg(agg_this)

# t_th = pd.concat([t_t0, t_ti])

In [4]:
# Display the code-table row for G76 (the table is indexed on "code") to inspect its attributes.
codes.loc["G76"]
# Disabled on purpose: running this would write the code table back to data/end_process/codes.csv.
# codes.to_csv("data/end_process/codes.csv", index=True)

material                                             plastic
en                     Plastic/foamed polystyrene 2.5 > < 50
source                                             Undefined
parent_code                                           Gfrags
single_use                                             False
groupname                                     plastic pieces
fr                  Plastique/polystyrène expansé 2,5 > < 50
de             Objekte aus Kunststoff/Polystyrol 2,5 - 50 cm
Name: G76, dtype: object

## Aggregate a set of data by sample (location and date)

Use the loc_date column in the survey data. Use the IQAASL period and test against the federal report

### Before aggregating does the number of locations, cities, samples and quantity match the federal report?

__The feature types include lakes and rivers; the Alps were considered separately__

From https://hammerdirt-analyst.github.io/IQAASL-End-0f-Sampling-2021/lakes_rivers.html#

1. cities = yes
2. samples = yes
3. locations = yes
4. quantity = No, it is short by 50 pieces

In [5]:
# Select the sampling period and the survey areas under test.
period = "iqaasl"
survey_areas = ["rhone", "ticino", "linth", "aare"]

# Period boundaries come from the period_dates lookup.
start, end = period_dates[period]

# Independent copy of the records whose parent boundary is one of the survey areas.
survey_data = surveys.loc[surveys["parent_boundary"].isin(survey_areas)].copy()

def slice_data_by_date(data, start, end):
    """Return the rows of ``data`` whose ``date`` value lies between ``start`` and ``end``.

    Both bounds are inclusive. The row order and index of ``data`` are kept.
    """
    return data[data.date.between(start, end)]

# Keep only the survey-area records dated within the selected period (bounds included).
feature_d = slice_data_by_date(survey_data.copy(), start, end)
# NOTE(review): the expression below is cut off in this view (the `isin(` call is never closed) — TODO restore the missing code list or remove the line.
feature_d[feature_d.code.isin(

In [6]:
# Apply the aggregate-group recoding helper to the period data
# (presumably maps member codes to Gfrags/Gfoams/Gcaps — confirm in review_methods_tests).
feature_data = use_gfrags_gfoams_gcaps(feature_d, codes)
# NOTE(review): the vitals are collected from feature_d, not from the recoded feature_data — confirm this is intended.
feature_vitals = collect_vitals(feature_d)
# Print the text summary built from the vitals.
print(make_a_summary(feature_vitals))


    Number of objects: 54694
    
    Median pieces/meter: 0.0
    
    Number of samples: 386
    
    Number of unique codes: 226
    
    Number of sample locations: 143
    
    Number of features: 28
    
    Number of cities: 77
    
    Start date: 2020-03-08
    
    End date: 2021-05-12
    
    


### aggregate to sample

The assessments are made on a per sample basis. That means that we can look at an individual object value at each sample. The sum of all the individual objects in a survey is the total for that survey. Dividing the totals by the length of the survey gives the assessment metric: _pieces of trash per meter_.

1. Are the quantiles of the current data = to the federal report? Yes
2. Are the material totals = to the federal report? No, plastics is off by 50 pcs

In [7]:
import pandas as pd
from typing import Type, Optional
from typing import List, Dict, Union

def aggregate_dataframe(
    df: pd.DataFrame,
    groupby_columns: List[str],
    aggregation_functions: Dict[str, Union[str, callable]],
    index: bool = False
) -> pd.DataFrame:
    """
    Group a DataFrame and reduce the requested columns, one aggregation per column.

    Args:
        df (pd.DataFrame): Table to aggregate.
        groupby_columns (List[str]): Names of the columns that define the groups.
        aggregation_functions (Dict[str, Union[str, callable]]):
            Maps each column to aggregate to the reduction applied to it: either
            the name of a built-in aggregation ('sum', 'mean', 'max', 'min', ...)
            or a callable.
        index (bool, optional): If True the group keys become the index of the
            result; if False (the default) they stay as ordinary columns.

    Returns:
        pd.DataFrame: One row per group holding the aggregated values.
    """
    return df.groupby(by=groupby_columns, as_index=index).agg(func=aggregation_functions)
    
def merge_dataframes_on_column_and_index(
    left_df: pd.DataFrame,
    right_df: pd.DataFrame,
    left_column: str,
    how: str = 'inner',
    validate: str = 'many_to_one'
) -> pd.DataFrame:
    """
    Merge two DataFrames where the left DataFrame is merged on a specified column and
    the right DataFrame is merged on its index.

    Args:
        left_df (pd.DataFrame): The left DataFrame to be merged.
        right_df (pd.DataFrame): The right DataFrame (or named Series) to be merged on its index.
        left_column (str): The column in the left DataFrame to merge on.
        how (str, optional): The type of merge to be performed ('left', 'right', 'outer', or 'inner').
            Default is 'inner'.
        validate (str, optional): Key-uniqueness check forwarded to ``DataFrame.merge``
            ('one_to_one', 'one_to_many', 'many_to_one', 'many_to_many' or None for no check).
            Default is 'many_to_one', i.e. the index of ``right_df`` must be unique.
            For backward compatibility ``True`` is read as 'many_to_one' and ``False`` as None.

    Returns:
        pd.DataFrame: A new DataFrame resulting from the merge operation.

    Raises:
        pandas.errors.MergeError: If the keys violate the requested ``validate`` relationship.
    """
    # Existing callers pass booleans; pandas only accepts the relationship names or None.
    if validate is True:
        validate = 'many_to_one'
    elif validate is False:
        validate = None

    # Forward `validate`: it used to be accepted but silently ignored, so a duplicated
    # right-hand index could multiply rows (and quantities) without any warning.
    merged_df = left_df.merge(
        right_df,
        left_on=left_column,
        right_index=True,
        how=how,
        validate=validate,
    )
    return merged_df

# First pass: one row per code within each sample.
code_result_df = aggregate_dataframe(
    df=feature_data.copy(),
    groupby_columns=group_by_columns,
    aggregation_functions=agg_this,
)

# Second pass: collapse the code rows into one total per sample.
sample_totals = aggregate_dataframe(
    df=code_result_df.copy(),
    groupby_columns=["loc_date", "slug", "parent_boundary"],
    aggregation_functions=agg_this,
)

# Distribution of the per-sample pcs_m totals.
sample_totals["pcs_m"].describe()

count    386.000000
mean       3.952073
std        7.063422
min        0.020000
25%        0.822500
50%        1.895000
75%        3.865000
max       66.170000
Name: pcs_m, dtype: float64

In [8]:
# Attach the material of each code to the per-code results. `validate` takes a
# relationship name, not a boolean: many result rows map to one code-table row.
merged_result = merge_dataframes_on_column_and_index(
    code_result_df.copy(),
    codes["material"],
    'code',
    how='inner',
    validate='many_to_one',
)
# Total quantity per material.
materials = aggregate_dataframe(merged_result.copy(), ["material"], {"quantity": "sum"})

In [9]:
def get_top_x_records_with_max_quantity(df, quantity_column, id_column, x):
    """
    Return the ``x`` rows of ``df`` with the largest values in ``quantity_column``.

    Args:
        df (pd.DataFrame): Table to rank.
        quantity_column (str): Column whose values decide the ranking.
        id_column (str): Column that identifies each record.
        x (int): How many records to keep.

    Returns:
        pd.DataFrame: The ``x`` largest records in descending order of quantity,
        reduced to the ID column followed by the quantity column.
    """
    wanted_columns = [id_column, quantity_column]
    ranked = df.nlargest(n=x, columns=quantity_column)
    return ranked[wanted_columns]



# Total quantity per code over all samples.
code_totals = aggregate_dataframe(
    df=code_result_df.copy(),
    groupby_columns=["code"],
    aggregation_functions={"quantity": "sum"},
)

# The ten codes with the largest totals.
result = get_top_x_records_with_max_quantity(
    df=code_totals.copy(),
    quantity_column="quantity",
    id_column="code",
    x=10,
)
print(result)


       code  quantity
111     G27      8485
225  Gfrags      7289
224  Gfoams      5670
115     G30      3325
147     G67      2534
97     G200      2136
14     G112      1968
223   Gcaps      1844
165     G74      1656
217     G95      1406


In [10]:
# Load the earlier code table and show its rows for the plastic-piece codes.
cx = pd.read_csv("data/from_prev/codes_with_group_names_2015.csv")
cx.loc[cx["code"].isin(['G75', 'G78', 'G79', 'G80', 'Gfrags'])]

Unnamed: 0,code,material,description,source,source_two,source_three,parent_code,direct,single_use,micro,ospar_code,groupname
183,G75,Plastic,Plastic/polystyrene pcs 0.5 - 2.5 cm,Undefined,Where does it come from,none,Parent code,True,False,False,117.0,plastic pieces
186,G78,Plastic,Plastic pieces 0.5cm - 2.5cm,Undefined,Where does it come from,none,Parent code,True,False,False,117.0,plastic pieces
187,G79,Plastic,Plastic pieces 2.5cm - 50cm,Undefined,,,Parent code,True,False,False,46.0,plastic pieces
189,G80,Plastic,Plastic pieces > 50cm,Undefined,Where does it come from,,Parent code,True,False,False,,plastic pieces
273,Gfrags,Plastic,Fragmented plastics,Undefined,,,Parent code,True,False,False,46.0,plastic pieces


In [11]:
# Rows of the earlier code table for G76 and the polystyrene codes G81-G83.
cx.loc[cx["code"].isin(['G76', 'G81', 'G82', 'G83'])]

Unnamed: 0,code,material,description,source,source_two,source_three,parent_code,direct,single_use,micro,ospar_code,groupname
184,G76,Plastic,Plastic/foamed polystyrene 2.5 > < 50,Undefined,Where does it come from,Indirect,Parent code,True,False,False,46.0,plastic pieces
190,G81,Plastic,Foamed polystyrene pieces 0.5cm - 2.5cm,Packaging,Construction,,Parent code,False,False,False,117.0,infrastructure
191,G82,Plastic,Foam polystyrene 2.5 - 50cm,Undefined,Packaging,Construction,Parent code,False,False,False,46.0,infrastructure
192,G83,Plastic,Polystyrene pieces > 50cm,Undefined,Where does it come from,,Parent code,True,False,False,,infrastructure


In [12]:
# Record the author, the conda environment and the versions of the imported packages for this run.
%watermark -a hammerdirt-analyst -co --iversions

Author: hammerdirt-analyst

conda environment: cantonal_report

pandas: 2.0.3
numpy : 1.25.2



In [13]:
# Shortfall in the object count: 54744 (presumably the federal-report total — confirm)
# minus the 54694 objects reported in the summary printed earlier in this notebook.
54744-54694

50

In [14]:
# Gfoams total found in this notebook (5670) minus 5563
# (presumably the corresponding federal-report value — confirm).
5670-5563

107

In [15]:
# Gfrags total found in this notebook (7289) plus the 107-piece Gfoams difference computed in the previous cell.
7289 + 107

7396

In [16]:
# Codes whose parent_code in the code table is Gfrags.
codes.loc[codes["parent_code"] == "Gfrags"].index

Index(['G75', 'G76', 'G78', 'G79', 'G80', 'Gfrags'], dtype='object', name='code')

In [17]:
# Codes whose parent_code in the code table is Gfoams.
codes.loc[codes["parent_code"] == "Gfoams"].index

Index(['G81', 'G82', 'G83', 'Gfoams'], dtype='object', name='code')

In [18]:
# Quantity per code for G76 and G81-G83 in the recoded data.
(
    feature_data
    .loc[feature_data["code"].isin(['G76', 'G81', 'G82', 'G83'])]
    .groupby("code")["quantity"]
    .sum()
)

Series([], Name: quantity, dtype: int64)