In [1]:
%load_ext watermark
import pandas as pd
import numpy as np
from typing import Type, Optional
from typing import List, Dict, Union

from review_methods_tests import collect_vitals, find_missing, find_missing_loc_dates
from review_methods_tests import use_gfrags_gfoams_gcaps, make_a_summary,combine_survey_files

from setvariables import *

In [16]:
def slice_data_by_date(data: pd.DataFrame, start: str, end: str):
    """
    Keep the rows whose 'date' value lies between start and end, both ends included.

    Args:
        data (pd.DataFrame): The input DataFrame; it must have a 'date' column.
        start (str): The lower bound of the date range (inclusive).
        end (str): The upper bound of the date range (inclusive).

    Returns:
        pd.DataFrame: The rows of data for which start <= date <= end.
    """
    dates = data["date"]
    on_or_after_start = dates >= start
    on_or_before_end = dates <= end
    return data[on_or_after_start & on_or_before_end]

def aggregate_dataframe(df: pd.DataFrame,
                        groupby_columns: List[str],
                        aggregation_functions: Dict[str, Union[str, callable]],
                        index: bool = False) -> pd.DataFrame:
    """
    Group a DataFrame and reduce selected columns with the requested functions.

    Args:
        df (pd.DataFrame): The input DataFrame.
        groupby_columns (List[str]): The column names that define the groups.
        aggregation_functions (Dict[str, Union[str, callable]]):
            Maps each column to aggregate to the function applied to it,
            either a pandas function name ('sum', 'mean', 'max', 'min', ...)
            or a callable.
        index (bool, optional): Passed to groupby as ``as_index``. When False
            (the default) the groupby columns are returned as regular columns,
            when True they become the index of the result.

    Returns:
        pd.DataFrame: One row per group with the aggregated values.
    """
    grouper = df.groupby(by=groupby_columns, as_index=index)
    return grouper.agg(aggregation_functions)
    
def merge_dataframes_on_column_and_index(left_df: pd.DataFrame,
                                         right_df: pd.DataFrame,
                                         left_column: str,
                                         how: str = 'inner',
                                         validate: Union[str, bool, None] = 'many_to_one') -> pd.DataFrame:
    """
    Merge two DataFrames where the left DataFrame is merged on a specified column and
    the right DataFrame is merged on its index.

    Args:
        left_df (pd.DataFrame): The left DataFrame to be merged.
        right_df (pd.DataFrame): The right DataFrame (or named Series) to be merged on its index.
        left_column (str): The column in the left DataFrame to merge on.
        how (str, optional): The type of merge to be performed ('left', 'right', 'outer', or 'inner').
            Default is 'inner'.
        validate (str | bool | None, optional): The relationship that pandas checks
            before merging ('one_to_one', 'one_to_many', 'many_to_one', 'many_to_many').
            For backward compatibility a boolean is also accepted: True means the
            default 'many_to_one' check, False (or None) disables the check.
            Default is 'many_to_one'.

    Returns:
        pd.DataFrame: A new DataFrame resulting from the merge operation.

    Raises:
        pandas.errors.MergeError: If the keys do not satisfy the requested relationship,
            for example a duplicated index in right_df with 'many_to_one'.
    """
    # pandas only accepts a relationship string (or None) for ``validate``;
    # translate the boolean flag that existing callers pass.
    if validate is True:
        validate = 'many_to_one'
    elif validate is False:
        validate = None

    # The validation argument was previously documented but never forwarded,
    # so duplicated keys on the right side silently multiplied rows.
    return left_df.merge(right_df,
                         left_on=left_column,
                         right_index=True,
                         how=how,
                         validate=validate)

def get_top_x_records_with_max_quantity(df: pd.DataFrame, quantity_column: str, id_column: str, x: int):
    """
    Return the x records with the largest quantity, their ID and their share.

    Args:
        df (pd.DataFrame): The input DataFrame.
        quantity_column (str): The name of the quantity column.
        id_column (str): The name of the ID column.
        x (int): The number of records to return.

    Returns:
        A data frame with the columns id_column, quantity_column and "%",
        ordered from the largest to the smallest quantity. "%" is the quantity of
        a record divided by the summed quantity of the x records returned.
    """
    keep = [id_column, quantity_column]

    # nlargest orders the rows by descending quantity and truncates to x rows
    leaders = df.nlargest(x, quantity_column)[keep]

    # the share is relative to the selected records only, not to all of df
    selected_total = leaders[quantity_column].sum()
    leaders["%"] = leaders[quantity_column] / selected_total

    return leaders[[*keep, "%"]]

def calculate_object_occurrence_rates(df: pd.DataFrame,
                                      objects_to_check: List[str],
                                      y: int,
                                      j: int) -> Dict[str, float]:
    """
    For each object in a list, compute how often it was recorded with a quantity
    of at least 'y', expressed as a fraction of 'j' samples.

    Args:
        df (pd.DataFrame): The input DataFrame; the columns 'code' and 'quantity' are read.
        objects_to_check (List[str]): The codes to calculate occurrence rates for.
        y (int): The minimum quantity required for a record to be counted.
        j (int): The total number of samples (the denominator of the rate).

    Returns:
        Dict[str, float]: One entry per code in objects_to_check. The value is the
        number of qualifying records divided by j, or 0 when j is not positive.
    """
    # keep only the records of the requested codes that reach the threshold
    meets_threshold = df['quantity'] >= y
    is_requested = df['code'].isin(objects_to_check)
    candidates = df[is_requested & meets_threshold]

    def _rate(obj):
        # guard against a zero (or negative) number of samples
        if j > 0:
            return len(candidates[candidates['code'] == obj]) / j
        return 0

    return {obj: _rate(obj) for obj in objects_to_check}

def calculate_quantity_proportions(df: pd.DataFrame,
                                   objects_to_check: List[str]
                                   ) -> Dict[str, float]:
    """
    Express the summed quantity of each requested object as a fraction of the
    total quantity in the DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame; the columns 'code' and 'quantity' are read.
        objects_to_check (List[str]): The codes to calculate proportions for.

    Returns:
        Dict[str, float]: Maps each requested code that is present in df to its
        quantity divided by the quantity of all records in df. Requested codes
        that do not occur in df are not included.
    """
    # the denominator is the quantity of every record, not only the requested codes
    grand_total = df['quantity'].sum()

    # summed quantity per requested code
    requested = df.loc[df['code'].isin(objects_to_check)]
    per_object = requested.groupby('code')['quantity'].sum().to_dict()

    return {obj: per_object[obj] / grand_total for obj in per_object}

def calculate_rate_per_unit(df: pd.DataFrame,
                            objects_to_check: List[str],
                            column_of_interest: str = "code",
                            groupby_columns: List[str] = ['code'],
                            unit_measurement: str = "pcs_m",
                            method: Dict[str, str] = {"pcs_m": "median"},
                            ) -> pd.DataFrame:
    """
    Calculate the rate of occurrence of object(s) for a given unit measurement.

    Args:
        df (pd.DataFrame): The input DataFrame. It must contain column_of_interest,
            the groupby_columns and the unit_measurement column.
        objects_to_check (List[str]): The values of column_of_interest to keep.
        column_of_interest (str): The column label of the objects being compared.
            It must be one of groupby_columns, otherwise it is not present in the
            aggregated result.
        groupby_columns (List[str]): The columns used for the aggregation.
        unit_measurement (str): The column label of the unit of measurement.
        method (Dict[str, str]): Maps column labels to aggregation functions.
            It must contain an entry for unit_measurement.

    Returns:
        pd.DataFrame: A dataframe where index is column_of_interest and the value
        column (named unit_measurement) is the rate.
    """
    # Keep only the rows that describe one of the requested objects
    filtered_df = df[df[column_of_interest].isin(objects_to_check)]

    # Aggregate on the grouped frame: the dict selects the column(s) and the
    # function(s). Passing a dict to a single grouped column is deprecated in pandas 2.
    object_rates = filtered_df.groupby(groupby_columns, as_index=False).agg(method)

    # The rate column is identified by unit_measurement (the body previously
    # referred to an undefined name, `rate`).
    rates = object_rates[[column_of_interest, unit_measurement]].set_index(column_of_interest, drop=True)

    return rates



def count_objects_with_positive_quantity(df: pd.DataFrame) -> pd.Series:
    """
    Calculate, for each object, the share of samples in which it had a quantity
    greater than zero.

    The number of records with a positive quantity is counted per code and divided
    by the number of unique 'loc_date' values in df. Codes that only occur with a
    quantity of zero are included with a rate of 0.

    Args:
        df (pd.DataFrame): The input DataFrame; the columns 'loc_date', 'code'
            and 'quantity' are read.

    Returns:
        pd.Series: Indexed by code. Codes found at least once come first, ordered
        by descending count, followed by the codes that were never found.
    """
    n_samples = df['loc_date'].nunique()

    # Number of records with a positive quantity for each code, as a rate
    found_counts = df.loc[df['quantity'] > 0, 'code'].value_counts()
    found_rates = found_counts / n_samples

    # Codes that were recorded, but never with a quantity above zero
    zero_counts = df.loc[df['quantity'] == 0, 'code'].value_counts()
    never_found = zero_counts.index[~zero_counts.index.isin(found_counts.index)]

    # Build the zero rates as a new Series rather than overwriting the counts in place
    never_found_rates = pd.Series(0.0, index=never_found, name=found_rates.name)

    return pd.concat([found_rates, never_found_rates])

def aggregate_boundaries(df: pd.DataFrame, unit_columns: list, unit_agg: dict, boundary_labels: list, boundary_columns: list, group_agg: dict)-> pd.DataFrame:
    """
    Aggregate data from a dataframe by boundaries and groups.

    Aggregates a dataframe in two steps. First, it performs
    aggregation at the 'unit' level defined by 'unit_columns' and 'unit_agg' to obtain
    test statistics. Then, it aggregates these 'unit' statistics further at the
    'boundary' level defined by 'boundary_labels' and 'boundary_columns', and computes
    the test statistics for each boundary.

    Args:
        df (pd.DataFrame): The input DataFrame containing data to be aggregated.
        unit_columns (list): List of columns for 'unit' level aggregation. It must
            include 'parent_boundary', which is used to select the rows of each boundary.
        unit_agg (dict): Dictionary specifying the aggregation functions for 'unit' level.
        boundary_labels (list): The 'parent_boundary' values to summarize.
        boundary_columns (list): List of columns for 'boundary' level aggregation.
        group_agg (dict): Dictionary specifying the aggregation functions for 'boundary' level.

    Returns:
        pd.DataFrame: A DataFrame containing aggregated data at the 'boundary' level with
        additional 'label' column indicating the boundary label.

    Raises:
        ValueError: Raised by pd.concat when boundary_labels is empty.
    """
    # Step 1: unit level statistics, computed from the frame that was passed in
    # (not from a notebook global), with the grouping columns kept as columns.
    unit_aggregate = df.groupby(unit_columns, as_index=False).agg(unit_agg)

    # Step 2: one summary per boundary, using the group_agg argument
    boundary_summaries = []
    for label in boundary_labels:
        in_boundary = unit_aggregate['parent_boundary'] == label
        boundary_aggregate = unit_aggregate[in_boundary].groupby(boundary_columns, as_index=False).agg(group_agg)
        boundary_aggregate['label'] = label
        boundary_summaries.append(boundary_aggregate)

    return pd.concat(boundary_summaries)

# Testing data models

The methods used in the version of the federal report were tested, but there was not a specific set of validation criteria beforehand. Tests were done as the work progressed. This wasted a lot of time.

Here we test the land use and survey data models.

1. is the land use data complete for each survey location?
2. does the survey data aggregate correctly to sample level?
   * what happens to objects with a quantity of zero?
   * aggregating to cantonal, municipal or survey area
     * are all locations included?
     * are lakes and rivers distinguished?
3. Does the aggregated data for iqaasl match the federal report?

### Gfoams, Gfrags, Gcaps

These are aggregate groups. It is difficult to infer how well a participant differentiates between size or use of the following codes.

1. Gfrags: G79, G78, G75
2. Gfoams: G81, G82, G76
3. Gcaps: G21, G22, G23, G24

These aggregate groups are used when comparing values between sampling campaigns.

### Sampling campaigns

The dates of the sampling campaigns are expanded to include the surveys that happened between large organized campaigns. The start and end dates are defined below.

__Attention!!__ The codes used for each survey campaign are different. Different groups organized and conducted surveys using the MLW protocol. The data was then sent to us.

__MCBP:__ November 2015 - November 2016. The initial sampling campaign. Fragmented plastics (Gfrags/G79/G78/G75) were not sorted by size. All unidentified hard plastic items were classified in this manner.

* start_date = 2015-11-15
* end_date = 2017-03-31

__SLR:__ April 2017 - May 2018. Sampling campaign by the WWF. Objects less than 2.5 cm were not counted.

* start_date = 2017-04-01
* end_date = 2020-03-31

__IQAASL:__ April 2020 - May 2021. Sampling campaign mandated by the Swiss confederation. Additional codes were added for regional objects.

* start_date = 2020-04-01
* end_date = 2021-05-31

__Plastock (not added yet):__ January 2022 - December 2022. Sampling campaign from the Association pour la Sauvegarde du Léman. Not all objects were counted; only a limited number of objects were identified.

### Feature name

The feature name is the name of a river, lake or other regional label that you would find on a map. People in the region know the name.

### Feature type

The feature type is a label that applies to general conditions of use for the location and other locations in the region.

* r: rivers: surveys on river banks
* l: lake: surveys on the lake shore
* p: parcs: surveys in recreational areas

### Parent boundary

Designates the larger geographic region of the survey location. For lakes and rivers it is the name of the catchment area or river basin. For parcs it is the type of park, i.e. les Alpes. Recall that each feature has a name; for example, Alpes Lépontines is the name of a feature in the geographic region of _Les Alpes_.

In [3]:
# Load the source tables. The path variables (survey_data, code_data, beach_data, ...)
# are not defined in this notebook; presumably they come from the
# `from setvariables import *` import - TODO confirm.

# all survey records in one frame (combine_survey_files is a project helper)
surveys = combine_survey_files(survey_data)
# object code definitions, indexed by code so they can be joined on the survey "code" column
codes = pd.read_csv(code_data).set_index("code")
# survey location attributes, indexed by slug so they can be selected by location
beaches = pd.read_csv(beach_data).set_index("slug")
# land cover, land use, street and river/lake intersection attributes (read as-is)
land_cover = pd.read_csv(land_cover_data)
land_use = pd.read_csv(land_use_data)
streets = pd.read_csv(street_data)
river_intersect_lakes = pd.read_csv(intersection_attributes)

## Aggregate a set of data by sample (location and date)

Use the loc_date column in the survey data. Use the IQAASL period and the four river basins to test against the federal report.

### Before aggregating does the number of locations, cities, samples and quantity match the federal report?

__The feature types include lakes and rivers; alpes were considered separately__

From https://hammerdirt-analyst.github.io/IQAASL-End-0f-Sampling-2021/lakes_rivers.html#

1. cities = yes
2. samples = yes
3. locations = yes
4. quantity = No it is short 50 pieces
5. start and end date = yes

In [4]:
# starting variables
period = "iqaasl"
survey_areas = ["rhone", "ticino", "linth", "aare"]
start, end = period_dates[period]

# the surveys from the survey areas of interest
# note: this is kept under its own name. `survey_data` is the input of
# combine_survey_files in the loading cell, rebinding it here to a DataFrame
# would break a re-run of that cell.
area_surveys = surveys[surveys.parent_boundary.isin(survey_areas)].copy()

# the survey data sliced by the start and end date
feature_d = slice_data_by_date(area_surveys.copy(), start, end)

# convert codes to gfrags, gcaps and gfoams
feature_data = use_gfrags_gfoams_gcaps(feature_d.copy(), codes)

# check the numbers, these are computed on the data before the codes are
# converted to gfrags, gcaps and gfoams
feature_vitals = collect_vitals(feature_d)
print(make_a_summary(feature_vitals))


    Number of objects: 54694
    
    Median pieces/meter: 0.0
    
    Number of samples: 386
    
    Number of unique codes: 235
    
    Number of sample locations: 143
    
    Number of features: 28
    
    Number of cities: 77
    
    Start date: 2020-03-08
    
    End date: 2021-05-12
    
    


### Number of lakes, rivers, parcs, cities and cantons

In [5]:
# the number of distinct features, feature types and cities for each canton,
# limited to the locations that have survey data in the selected period
feature_columns = ["feature_name", "feature_type", "city"]
locations = feature_data["slug"].unique()

surveyed_beaches = beaches.loc[locations]
surveyed_beaches.groupby("canton").agg(dict.fromkeys(feature_columns, "nunique"))

Unnamed: 0_level_0,feature_name,feature_type,city
canton,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aargau,2,1,4
Bern,7,2,21
Fribourg,1,1,2
Genève,2,2,2
Glarus,3,2,2
Luzern,1,1,1
Neuchâtel,2,1,4
Schwyz,1,1,1
Solothurn,1,1,2
St. Gallen,5,2,5


### aggregate to sample

The assessments are made on a per sample basis. That means that we can look at an individual object value at each sample. The sum of all the individual objects in a survey is the total for that survey. Dividing the totals by the length of the survey gives the assessment metric: _pieces of trash per meter_.

1. Are the quantiles of the current data  = to the federal report? Yes
2. Are the material totals = to the federal report? No, plastics is off by 50 pcs
3. Are the fail rates of the most common objects = to the federal report? Yes
4. Is the % of total of the most common objects = to the federal report? yes
5. Is the median pieces/meter of the most common objects = to the federal report? yes
6. Is the quantity of the most common objects = to the federal report? yes

#### The summary of survey totals

fig 1.5 in IQAASL

In [6]:
# when the codes are changed to gfrags, gfoams and gcaps that creates 
# multiple code results for the same code at the same sample
# note that the code_result_columns do not have the groupname column
# this is because the code is changed and not the groupname

# one record per code per sample, with the group name of the code attached
code_result_df = (
    aggregate_dataframe(feature_data.copy(), code_result_columns, unit_agg)
    .merge(codes.groupname, left_on="code", right_index=True)
)

# the total of each sample, to be checked against the federal report
sample_keys = ["loc_date", "slug", "parent_boundary"]
sample_totals = aggregate_dataframe(code_result_df.copy(), sample_keys, unit_agg)

# the distribution of pcs_m over the samples plus the total number of objects
sample_summary = sample_totals["pcs_m"].describe()
sample_summary["total"] = sample_totals["quantity"].sum()
sample_summary.to_frame()

Unnamed: 0,pcs_m
count,386.0
mean,3.952073
std,7.063422
min,0.02
25%,0.8225
50%,1.895
75%,3.865
max,66.17
total,54694.0


#### Material totals and proportions

fig 1.5 iqaasl

In [7]:
# add the material label to each code
merged_result = merge_dataframes_on_column_and_index(code_result_df.copy(), codes["material"], 'code', how='inner', validate=True)

# sum the materials for the data frame
materials = aggregate_dataframe(merged_result.copy(), ["material"], {"quantity":"sum"})
materials["%"] = materials.quantity/materials.quantity.sum()
materials

Unnamed: 0,material,quantity,%
0,chemicals,140,0.00256
1,cloth,343,0.006271
2,glass,2919,0.05337
3,metal,1874,0.034263
4,paper,1527,0.027919
5,plastic,47093,0.861027
6,rubber,390,0.007131
7,unidentified,2,3.7e-05
8,wood,406,0.007423


#### Quantity, median pcs/m, fail rate, and % of total

fig 1.6 iqaasl

In [8]:
# sum the cumulative quantity for each code and calculate the median pcs/meter
code_totals = aggregate_dataframe(code_result_df.copy(), ["code"], {"quantity":"sum", "pcs_m":"median"})

# find the top ten codes
abundant = get_top_x_records_with_max_quantity(code_totals.copy(), "quantity", "code", len(code_totals))
abundant

Unnamed: 0,code,quantity,%
111,G27,8485,0.155136
225,Gfrags,7400,0.135298
224,Gfoams,5559,0.101638
115,G30,3325,0.060793
147,G67,2534,0.046330
...,...,...,...
162,G712,0,0.000000
163,G713,0,0.000000
169,G88,0,0.000000
78,G180,0,0.000000


In [9]:
# identify the objects that were found in at least 50% of the samples
# calculate the quantity per sample for each code and sample
# the quantity of each code at each sample (loc_date)
occurrences = aggregate_dataframe(code_result_df, ["loc_date", "code"], {"quantity":"sum"})

# for each code: the number of samples where the quantity was greater than zero
# divided by the total number of samples (the fail rate), indexed by code
event_counts  = count_objects_with_positive_quantity(occurrences)

# attach the fail rate to each code. No rows are selected here, and this adds a
# column to the `abundant` frame created in the previous cell.
abundant["fail"] = abundant.code.apply(lambda x: event_counts.loc[x])

Unnamed: 0,code,quantity,%,fail
111,G27,8485,0.155136,0.878238
225,Gfrags,7400,0.135298,0.862694
224,Gfoams,5559,0.101638,0.686528
115,G30,3325,0.060793,0.852332
147,G67,2534,0.046330,0.696891
...,...,...,...,...
162,G712,0,0.000000,0.000000
163,G713,0,0.000000,0.000000
169,G88,0,0.000000,0.000000
78,G180,0,0.000000,0.000000


In [10]:
# order the codes from the most to the least abundant and renumber the rows from 0,
# so that the row label is the rank of the code
abundant = (
    abundant
    .sort_values(by="quantity", ascending=False)
    .reset_index(drop=True)
)

### The most common objects

fig 1.6 iqaasl

In [11]:
# The most common objects are the ten most abundant by quantity and/or the objects
# that were found in at least 50% of the samples. The second criterion is the fail
# rate ("fail"), not "%" which is the share of the total quantity.
# `abundant` is sorted by descending quantity with a 0..n-1 index, so row 10 is the
# eleventh code: anything with a larger quantity is in the top ten.
top_ten_cutoff = abundant.loc[10, "quantity"]
in_top_ten = abundant.quantity > top_ten_cutoff
found_in_half_the_samples = abundant["fail"] >= 0.5

the_most_common = abundant[in_top_ten | found_in_half_the_samples]
the_most_common

Unnamed: 0,code,quantity,%,fail
0,G27,8485,0.155136,0.878238
1,Gfrags,7400,0.135298,0.862694
2,Gfoams,5559,0.101638,0.686528
3,G30,3325,0.060793,0.852332
4,G67,2534,0.04633,0.696891
5,G200,2136,0.039054,0.650259
6,G112,1968,0.035982,0.30829
7,Gcaps,1844,0.033715,0.65285
8,G74,1656,0.030278,0.533679
9,G95,1406,0.025707,0.507772


### Results by groupname and feature boundary

In [12]:
# aggregate by parent_boundary

unit_columns = ["parent_boundary", "loc_date", "groupname"]
boundary_columns = ["groupname"]
boundary_labels = code_result_df.parent_boundary.unique()

boundary_summaries = aggregate_boundaries(code_result_df.copy(), unit_columns, unit_agg, boundary_labels, boundary_columns, agg_groups)
boundary_summaries.pivot(index="groupname", columns="label", values="pcs_m")

label,aare,linth,rhone,ticino
groupname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
agriculture,0.06,0.03,0.14,0.06
food and drink,0.25,0.28,0.7,0.28
infrastructure,0.14,0.12,0.545,0.205
micro plastics (< 5mm),0.01,0.0,0.115,0.0
packaging non food,0.09,0.13,0.205,0.075
personal items,0.04,0.04,0.1,0.065
plastic pieces,0.185,0.105,0.48,0.095
recreation,0.06,0.04,0.165,0.035
tobacco,0.15,0.265,0.5,0.18
unclassified,0.0,0.0,0.02,0.0


In [13]:
%watermark -a hammerdirt-analyst -co --iversions

Author: hammerdirt-analyst

conda environment: cantonal_report

numpy : 1.25.2
pandas: 2.0.3

