In [1]:
%load_ext watermark
import pandas as pd
import numpy as np
from typing import Type, Optional, Callable
from typing import List, Dict, Union

from review_methods_tests import collect_vitals, find_missing, find_missing_loc_dates
from review_methods_tests import use_gfrags_gfoams_gcaps, make_a_summary,combine_survey_files

import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.colors
from matplotlib.colors import LinearSegmentedColormap, ListedColormap

from setvariables import *

In [2]:
def slice_data_by_date(data: pd.DataFrame, start: str, end: str):
    mask = (data.date >= start) & (data.date <= end)
    return data[mask]

top = mpl.colormaps['Oranges'].resampled(2000)
bottom = mpl.colormaps['Greys'].resampled(2000)

newcolors = np.vstack((bottom(np.linspace(0, 1, 2000)),
                   top(np.linspace(0, 1, 2000))))
newcmp = ListedColormap(newcolors, name='OrangeBlue')

my_cmap = newcmp(np.arange(newcmp.N))
my_cmap[:, -1] = np.linspace(0, 1, newcmp.N)
newcmp = ListedColormap(my_cmap)

format_kwargs = dict(precision=2, thousands="'", decimal=",")

# this defines the css rules for the note-book table displays
header_row = {'selector': 'th:nth-child(1)', 'props': f'background-color: #FFF; font-size:12px; text-align:left;'}
even_rows = {"selector": 'tr:nth-child(even)', 'props': f'background-color: rgba(139, 69, 19, 0.08);'}
odd_rows = {'selector': 'tr:nth-child(odd)', 'props': 'background: #FFF;'}
table_font = {'selector': 'tr', 'props': 'font-size: 10px;'}
table_data = {'selector': 'td', 'props': 'padding:4px; font-size:12px;'}
table_css_styles = [even_rows, odd_rows, table_font, header_row, table_data]

def aggregate_dataframe(df: pd.DataFrame,
                        groupby_columns: List[str],
                        aggregation_functions: Dict[str, Union[str, callable]],
                        index: bool = False) -> pd.DataFrame:
    """
    Aggregate specified columns in a Pandas DataFrame using given aggregation functions.

    Args:
        df (pd.DataFrame): The input DataFrame.
        groupby_columns (List[str]): List of column names to group by.
        aggregation_functions (Dict[str, Union[str, callable]]): 
            A dictionary where keys are column names to aggregate, 
            and values are either aggregation functions (e.g., 'sum', 'mean', 'max', 'min')
            or custom aggregation functions (callable functions).
        index (bool, optional): Whether to use the groupby columns as an index.
            Default is False.

    Returns:
        pd.DataFrame: A new DataFrame with aggregated values.
    """
    grouped = df.groupby(groupby_columns, as_index=index).agg(aggregation_functions)
    
    return grouped
    
def merge_dataframes_on_column_and_index(left_df: pd.DataFrame,
                                         right_df: pd.DataFrame,
                                         left_column: str,
                                         how: str = 'inner',
                                         validate: str = 'many_to_one') -> pd.DataFrame:
    """
    Merge two DataFrames where the left DataFrame is merged on a specified column and 
    the right DataFrame is merged on its index.

    Args:
        left_df (pd.DataFrame): The left DataFrame to be merged.
        right_df (pd.DataFrame): The right DataFrame to be merged on its index.
        left_column (str): The column in the left DataFrame to merge on.
        how (str, optional): The type of merge to be performed ('left', 'right', 'outer', or 'inner'). 
            Default is 'inner'.
        validate (str, optional): Whether to perform merge validation checks. 
            Default is 'many_to_one'.

    Returns:
        pd.DataFrame: A new DataFrame resulting from the merge operation.
    """
  
    merged_df = left_df.merge(right_df, left_on=left_column, right_index=True, how=how)
    return merged_df

def get_top_x_records_with_max_quantity(df: pd.DataFrame, quantity_column: str, id_column: str, x: int):
    """
    Get the top x records with the greatest quantity and their proportion to the total from a DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame.
        quantity_column (str): The name of the quantity column.
        id_column (str): The name of the ID column.
        x (int): The number of records to return.

    Returns:
        A data frame
    """
    # Sort the DataFrame by the quantity column in descending order, take the top x records, and select the ID column
    top_x_records = df.nlargest(x, quantity_column)[[id_column, quantity_column]]
    top_x_records["%"] = top_x_records[quantity_column]/top_x_records[quantity_column].sum()
    
    return top_x_records[[id_column, quantity_column, "%"]]



def calculate_rate_per_unit(df: pd.DataFrame,
                            objects_to_check: List[str],
                            column_of_interest: str = "code",
                            groupby_columns: List[str] = ['code'],
                            method: Dict[str, str] = {"pcs_m": "median", "quantity":"sum"},
                            )-> pd.DataFrame:
    """
    Calculate the rate of occurence of object(s) for a given unit measurement. Adds the label
    'all' to each record.

    Args:
        df (pd.DataFrame): The input DataFrame with columns 'sample,' 'object,' and 'quantity.'
        objects_to_check (List[str]): The list of objects to calculate proportions for.
        column_of_interest (str): The column label of the objects being compared.
        groupby_columns Dict[str]: The columns used for the aggregation.
        method (Dict[str]): Dictionary specifying the aggregation functions for the unit_measurement.

    Returns:
        pd.DataFrame: A dataframe where index is column_of_interest and the value column is the rate
            and the label is 'all'.
    """
    # Filter the DataFrame to include rows where 'object' is in 'objects_to_check'
    filtered_df = df[df[column_of_interest].isin(objects_to_check)]

    # Calculate the total quantity for each object
    object_rates = filtered_df.groupby(groupby_columns, as_index=False).agg(method)

    # Calculate the proportion for each object
    rates = object_rates[[column_of_interest, *method.keys()]].set_index(column_of_interest, drop=True)
    rates["label"] = "all"    

    return rates


# fail rate: quantity > 0
def count_objects_with_positive_quantity(df: pd.DataFrame, value_column: str = 'quantity', object_column: str = 'code') -> Dict[str, int]:
    """
    Count how many times each object had a quantity greater than zero in a DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame with columns 'sample,' 'object,' and 'quantity.'

    Returns:
        pd.Series: A Series with the count of positive quantity occurrences for each object.
    """
    # Filter the DataFrame to include rows where quantity is greater than zero
    positive_quantity_df = df[df[value_column] > 0]
    no_count_df = df[(df[value_column] == 0)]

    # Count the occurrences of positive quantities for each object
    object_counts = positive_quantity_df[object_column].value_counts()
    failed = object_counts/df.loc_date.nunique()

    # identify the objects with a zero count
    no_counts = no_count_df[object_column].value_counts()
    zeroes = no_counts[~no_counts.index.isin(object_counts.index)]
    zeroes.loc[:] = 0

    return pd.concat([failed, zeroes])

# pieces per merter for a set of data
def rate_per_unit_cumulative(df: pd.DataFrame, groupby_columns: List, object_labels: List, objects: List, agg_methods: Dict)-> pd.DataFrame:
    """
    Calculate cumulative rates per unit for specific objects and aggregation methods.

    This function takes a DataFrame and calculates cumulative rates per unit based on
    the specified groupby columns, object labels, objects of interest, and aggregation methods.

    Args:
        df (pd.DataFrame): The input DataFrame containing data for analysis.
        groupby_columns (List): List of columns to group by in the DataFrame.
        object_labels (List): List of labels to identify objects of interest.
        objects (List): List of objects for which cumulative rates are calculated.
        agg_methods (Dict): Dictionary specifying aggregation methods for calculating rates.

    Returns:
        pd.DataFrame: A DataFrame containing the cumulative rates per unit.

    Example:
        groupby_columns = ['Region', 'Year']
        object_labels = ['Object A', 'Object B']
        objects = ['A', 'B']
        agg_methods = {'Value': 'sum', 'Count': 'count'}

        cumulative_rates = rate_per_unit_cumulative(df, groupby_columns, object_labels, objects, agg_methods)
    """
    parent_summary = aggregate_dataframe(df, groupby_columns, agg_methods)
    parent_boundary_summary = calculate_rate_per_unit(parent_summary, object_labels, objects[0], objects)
    parent_boundary_summary.reset_index(drop=False, inplace=True)

    return parent_boundary_summary


def aggregate_boundaries(df: pd.DataFrame, unit_columns: list, unit_agg: dict, boundary_labels: list, boundary_columns: list, group_agg: dict)-> pd.DataFrame:
    """
    Aggregate data from a dataframe by boundaries and groups.

    Aggregates a dataframe in two steps. First, it performs
    aggregation at the 'unit' level defined by 'unit_columns' and 'unit_agg' to obtain
    test statistics. Then, it aggregates these 'unit' statistics further at the
    'boundary' level defined by 'boundary_labels' and 'boundary_columns', and computes
    the test statistics for each boundary.

    Args:
        df (pd.DataFrame): The input DataFrame containing data to be aggregated.
        unit_columns (list): List of columns for 'unit' level aggregation.
        unit_agg (dict): Dictionary specifying the aggregation functions for 'unit' level.
        boundary_labels (list): List of boundary labels to define 'boundaries' for further aggregation.
        boundary_columns (list): List of columns for 'boundary' level aggregation.
        group_agg (dict): Dictionary specifying the aggregation functions for 'boundary' level.

    Returns:
        pd.DataFrame: A DataFrame containing aggregated data at the 'boundary' level with
        additional 'label' column indicating the boundary label. 
    """
    

    unit_aggregate = aggregate_dataframe(df, unit_columns, unit_agg)
    boundary_summaries = []
    for label in boundary_labels:
        boundary_mask = unit_aggregate[unit_columns[0]] == label
        boundary_aggregate = unit_aggregate[boundary_mask].groupby(boundary_columns, as_index=False).agg(agg_groups)
        boundary_aggregate['label'] = label
        boundary_summaries.append(boundary_aggregate)

    return pd.concat(boundary_summaries)

def color_gradient(val, cmap: ListedColormap = None, min: float = 0.0, max: float = .9):
    """
    Apply a color gradient to a numerical value for cell styling.

    This function takes a numerical value 'val' and applies a color gradient based on the provided
    colormap ('cmap') and the specified range defined by 'min' and 'max'. It returns a CSS style
    for cell background color.

    Args:
        val (float): The numerical value to be colored using the gradient.
        cmap (ListedColormap, optional): The colormap to use for the color gradient. Defaults to None.
        min (float, optional): The minimum value of the data range. Defaults to 0.0.
        max (float, optional): The maximum value of the data range. Defaults to 1.0.

    Returns:
        str: A CSS style string for cell background color and text color.

    Example:
        # Apply a color gradient using a custom colormap 'cmap' to the DataFrame
        df.style.applymap(color_gradient, cmap=my_colormap, min=0.0, max=100.0)
    """
    # print(type(cmap))
    # Normalize the value to a range [0, 1] 
    # min, max should be the min max for the
    # data frame in question
    # print(type(val))
    normalized_val = (val - min) / max
    
    r, g, b, a = cmap(normalized_val)

    
    # Calculate the color based on the normalized value    
    hex_color = f"rgba({int(r*255)},{int(g*255)},{int(b*255)}, .5)"
    
       
    # Return the CSS style with the background color
    return f'background-color: {hex_color}; color:black'

def apply_gradient_style(df, gradient_func: Callable = None, **kwargs):
    """
    Apply a custom gradient style to a DataFrame using a specified gradient function.

    Args:
        df (pd.DataFrame): The input DataFrame.
        gradient_func (callable): A function that takes a value and returns CSS styles for styling.

    Returns:
        pd.io.formats.style.Styler: A styled representation of the DataFrame.

    Example:
        def gradient(val):
            cmap = 'coolwarm'
            vmin = df.values.min().min()
            vmax = df.values.max().max()
            norm = plt.Normalize(vmin, vmax)
            colormap = plt.cm.get_cmap(cmap)
            color = colormap(norm(val))
            hex_color = "#{:02x}{:02x}{:02x}".format(int(color[0] * 255), int(color[1] * 255), int(color[2] * 255))
            return f'background-color: {hex_color}'

        styled_df = apply_gradient_style(df, gradient)
    """
    # Use the .style.applymap method to apply the gradient function to each cell in the DataFrame
    styled_df = df.style.applymap(gradient_func, **kwargs)

    return styled_df
    
def apply_table_style_format(df, style: List = None, format_kwargs: Dict = None):
    
       
    return df.style.set_table_styles(style).format(**format_kwargs)

# a decorator for data frames
def apply_style_to_dataframe(mymethod, **kwargs):
    """
    Apply styling to a DataFrame using a custom method or gradient function.
    
    This decorator function allows you to apply styling to a DataFrame returned by a function.
    It takes a custom method and a gradient function as input and applies these functions to
    the DataFrame to style it based on specified criteria.
    
    Args:
    mymethod (callable): A custom method that applies styling to the DataFrame.
    gradient_func (callable): A function that defines a color gradient for styling.
    **kwargs: Additional keyword arguments to pass to the custom method.
    
    Returns:
    callable: A decorator function that can be used to style a DataFrame returned by a function.
    
    Example:
    def my_custom_styling(df, gradient_func, **kwargs):
        # Apply styling to the DataFrame using the gradient function and optional kwargs
        styled_df = df.style.applymap(gradient_func, **kwargs)
        return styled_df
    
    @apply_style_to_dataframe(my_custom_styling, my_gradient_function, colormap='coolwarm')
    def create_dataframe():
        # Create and return a DataFrame
        ...
    
    styled_result = create_dataframe()
    """
    def decorator(function):
    
        def wrapper(*args):
            # Call the original function to get the DataFrame
            df = function(*args)
                                 
            return mymethod(df,**kwargs)
        
        return wrapper
    return decorator


# @apply_style_to_dataframe(apply_gradient_style, gradient_func=color_gradient, min=0, max=1, cmap=newcmp)
def boundary_summary(parent_boundary: pd.DataFrame, boundary_summary: pd.DataFrame, object_columns: List, unit: str)-> pd.DataFrame:
    """
    Create a boundary summary DataFrame based on parent and boundary summaries.

    This function combines parent and boundary summaries to create a consolidated boundary summary
    DataFrame. The aggregation is based on the specified object columns and the 'unit' of interest.

    Args:
        parent_boundary (pd.DataFrame): The parent boundary summary DataFrame.
        boundary_summaries (pd.DataFrame): The boundary summaries for individual objects.
        object_columns (List): List of columns identifying the objects.
        unit (str): The unit of interest for aggregation.

    Returns:
        pd.DataFrame: A boundary summary DataFrame that combines parent and individual object summaries.

    Example:
        parent_boundary = ...
        boundary_summaries = ...
        object_columns = ['Object']
        unit = 'pcs_m'

        boundary_result = boundary_summary(parent_boundary, boundary_summaries, object_columns, unit)
    """
    boundary_limits = pd.concat([parent_boundary, boundary_summary])
    objects = boundary_limits[object_columns[0]].nunique()
    boundaries = boundary_limits.label.nunique()
    if objects >= boundaries:
        b = boundary_limits.pivot(index=object_columns[0], columns="label", values=unit)
        b = b[[*boundary_summary.label.unique(), *parent_boundary.label.unique()]]
    else:
        b = boundary_limits.pivot(columns=object_columns[0], index="label", values=unit)
        
    return b


def translate_word(X: str, map: pd.DataFrame, lan: str):
    """
    Translate a word or phrase using a language mapping DataFrame.

    This function takes a word or phrase 'X' and attempts to translate it into another language
    specified by 'lan' using a language mapping DataFrame 'map'. If the word is found in the index
    of the mapping DataFrame, the translation is returned; otherwise, the original word is returned.

    Args:
        X (str): The word or phrase to be translated.
        map (pd.DataFrame): A DataFrame containing language mappings.
        lan (str): The target language code for translation.

    Returns:
        str: The translated word or phrase, or the original word if not found in the mapping.

    Example:
        # Create a DataFrame for language mapping
        language_map = pd.DataFrame({'English': ['apple', 'banana', 'cherry'],
                                    'French': ['pomme', 'banane', 'cerise']})

        # Translate a word into French
        translated_word = translate_word('apple', language_map, 'French')
    """    
    if X in map.index:
        return map.loc[X,  lan]
    else:
        return X

def capitalize_index(X):
    return X.title()

def translate_for_display(df: pd.DataFrame, map: pd.DataFrame, lan: str):
    """
    Translate column names and index labels of a DataFrame for display.

    This function takes a DataFrame 'df' and translates its column names and index labels using a
    language mapping DataFrame 'map' for display in a specified language 'lan'. The translated
    column names are used as new column names in the DataFrame, and the index labels are replaced
    with their translations.

    Args:
        df (pd.DataFrame): The input DataFrame for translation.
        map (pd.DataFrame): A DataFrame containing language mappings.
        lan (str): The target language code for translation.

    Returns:
        pd.DataFrame: The DataFrame with translated column names and index labels for display.

    Example:
        # Create a DataFrame to be translated
        data = {'apple': [1, 2, 3], 'banana': [4, 5, 6]}
        original_df = pd.DataFrame(data)

        # Create a language mapping DataFrame
        language_map = pd.DataFrame({'English': ['apple', 'banana'],
                                    'French': ['pomme', 'banane']})

        # Translate the column names and index labels for display in French
        translated_df = translate_for_display(original_df, language_map, 'French')
    """
    
    old_columns = df.columns    
    changed_c = [x for x in old_columns if x in map.index]
        
    new_columns = {x: map.loc[x, lan] for x in changed_c}
    df.rename(columns=new_columns, inplace=True)
    
    new_index = {x:translate_word(x, map, lan) for x in df.index}
    df['new_index'] = df.index.map(lambda x: new_index[x])
    df.set_index('new_index', drop=True, inplace=True)
    df.index.name = None
    df.columns.name = None
    
    return df

def translated_and_style_for_display(df, map, lan, gradient: bool = True):
    """
    Translate, style, and format a DataFrame for display.

    This function translates column names and index labels, applies styling, and optionally
    adds a color gradient to a DataFrame to prepare it for display in a specified language 'lan'.
    
    Args:
        df (pd.DataFrame): The input DataFrame to be translated and styled.
        map (pd.DataFrame): A DataFrame containing language mappings.
        lan (str): The target language code for translation.
        gradient (bool, optional): Whether to apply a color gradient to the DataFrame. Defaults to True.

    Returns:
        Styler: A styled DataFrame ready for display with translated labels and styling.

    Example:
        # Create a DataFrame to be translated and styled
        data = {'apple': [1, 2, 3], 'banana': [4, 5, 6]}
        original_df = pd.DataFrame(data)

        # Create a language mapping DataFrame
        language_map = pd.DataFrame({'English': ['apple', 'banana'],
                                    'French': ['pomme', 'banane']})

        # Translate, style, and format the DataFrame for display in French
        styled_df = translated_and_style_for_display(original_df, language_map, 'French', gradient=True)
    """
    d = translate_for_display(df, map, lan)
    d = d.style.format(**format_kwargs).set_table_styles(table_css_styles)
    if gradient:
        d = d.applymap(color_gradient, cmap=newcmp)
    return d.format_index(str.title, axis=1).format_index(str.title, axis=0)


# @apply_style_to_dataframe(apply_table_style_format, style=table_css_styles, format_kwargs=format_kwargs)
def display_tabular_data_by_column_values(df, column_one: dict, column_two: dict, index: str):
    """
    Display tabular data based on column values.

    This function filters a DataFrame 'df' to include rows where either 'column_one' or 'column_two'
    meet specified conditions. The resulting DataFrame is then set to have 'index' as the index, and
    the index name is removed for cleaner tabular display.

    Args:
        df (pd.DataFrame): The input DataFrame containing tabular data.
        column_one (dict): A dictionary specifying the column and value condition for 'column_one'.
        column_two (dict): A dictionary specifying the column and value condition for 'column_two'.
        index (str): The column to be set as the index for the resulting DataFrame.

    Returns:
        pd.DataFrame: The filtered DataFrame with 'index' as the index and the index name removed.

    Example:
        # Create a sample DataFrame 'data_df'
        data_df = pd.DataFrame({'Name': ['Alice', 'Bob', 'Charlie'],
                                'Age': [25, 30, 35],
                                'Salary': [50000, 60000, 70000]})

        # Define filtering conditions for 'Age' and 'Salary'
        column_one = {'column': 'Age', 'val': 30}
        column_two = {'column': 'Salary', 'val': 65000}

        # Display filtered tabular data by 'Name' where either 'Age' or 'Salary' meets the conditions
        filtered_data = display_tabular_data_by_column_values(data_df, column_one, column_two, 'Name')
    """
    d = df[(df[column_one["column"]] >= column_one["val"]) | (df[column_two["column"]] >= column_two["val"])].copy()
    d.set_index(index, inplace=True, drop=True)
    d.index.name = None       
    return d



def summary_of_parent_and_child_features(df: pd.DataFrame,
                                         cumulative_columns: List = None,
                                         boundary_labels: List = None,
                                         object_labels: List = None,
                                         object_columns: List = None,
                                         unit_agg: dict = None,
                                         unit_columns: List = None,
                                         agg_groups: dict = None)-> pd.DataFrame:
    """
    Generate a summary of parent and child features based on a DataFrame.

    This function computes a summary of parent and child features based on the provided DataFrame 'df'.
    It calculates cumulative values, aggregates boundary summaries, and generates a comprehensive summary
    DataFrame that includes both parent and child features.

    Args:
        df (pd.DataFrame): The input DataFrame containing data for analysis.
        cumulative_columns (List, optional): List of columns to be considered for cumulative values.
        boundary_labels (List, optional): List of labels for boundary summaries.
        object_labels (List, optional): List of labels for individual objects.
        object_columns (List, optional): List of columns identifying objects.
        unit_agg (dict, optional): Aggregation methods for unit summaries.
        unit_columns (List, optional): List of columns for unit summaries.
        agg_groups (dict, optional): Aggregation methods for boundary summaries.

    Returns:
        pd.DataFrame: A summary of parent and child features with comprehensive information.

    Example:
        # Define parameters for generating the summary
        cumulative_columns = ['quantity', 'total_weight']
        boundary_labels = ['Boundary 1', 'Boundary 2']
        object_labels = ['Object 1', 'Object 2']
        object_columns = ['object_id', 'object_name']
        unit_agg = {'quantity': 'sum', 'total_weight': 'mean'}
        unit_columns = ['unit_id', 'unit_name']
        agg_groups = {'quantity': 'sum', 'total_weight': 'mean'}

        # Generate the summary of parent and child features
        summary_df = summary_of_parent_and_child_features(data_df, cumulative_columns, boundary_labels,
                                                         object_labels, object_columns, unit_agg, unit_columns, agg_groups)
    """
                                            

    parent_boundary = rate_per_unit_cumulative(df, cumulative_columns, object_labels, object_columns, unit_agg)
    boundary_summaries = aggregate_boundaries(df, unit_columns, unit_agg, boundary_labels, object_columns, agg_groups)
    x = boundary_summary(parent_boundary, boundary_summaries, object_columns, "pcs_m")

    return x

def a_summary_of_one_vector(df, unit_columns, unit_agg, describe='pcs_m'):
    
    sample_totals = aggregate_dataframe(code_result_df.copy(), unit_columns, unit_agg)
    sample_summary = sample_totals[describe].describe()
    sample_summary["total"] = sample_totals.quantity.sum()
    sample_summary = pd.DataFrame(sample_summary)
    sample_summary[describe] = sample_summary[describe].astype(object)
    sample_summary.loc['count', describe] = int(sample_summary.loc['count', describe])
    sample_summary.loc['total', describe] = int(sample_summary.loc['total',describe])
    
    return sample_summary

def summarize_work_data(df, labels):
    dfs = []
    for alabel in labels:
        data = df[df.canton == alabel].copy()
        n_rivers = data[data.feature_type == 'r'].feature_name.nunique()
        n_lakes = data[data.feature_type == 'l'].feature_name.nunique()
        n_parks = data[data.feature_type == 'p'].feature_name.nunique()
        n_cities = data.city.nunique()
        n_samples = data.loc_date.nunique()
        total = data.quantity.sum()
        big_picture = {alabel: {'rivers':n_rivers, 'lakes':n_lakes, 'parks': n_parks, 'cities': n_cities, 'samples': n_samples, 'quantity': total}}
        dfs.append(pd.DataFrame(big_picture))
    return pd.concat(dfs, axis=1).T
format_kwargs = dict(precision=2, thousands="'", decimal=",")

# Testing data models

The methods used in the version of the federal report were tested, but their was not a specific set of validation criteria beforehand. Test were done as the work progressed. This wasted alot of time

here we test the land use and survey data models.

1. is the land use data complete for each survey location?
2. does the survey data aggregate correctly to sample level?
   * what happens to objects with a quantity of zero?
   * aggregating to cantonal, municipal or survey area
     * are all locations included?
     * are lakes and rivers distinguished?
3. Does the aggregated data for iqaasl match the federal report?

### Gfoams, Gfrags, Gcaps

These are aggregate groups. It is difficult to infer how well a participant differentiates between size or use of the following codes.

1. Gfrags: G79, G78, G75
2. Gfoams: G81, G82, G76
3. Gcaps: G21, G22, G23, G24

These aggregate groups are used when comparing values between sampling campaigns.

### Sampling campaigns

The dates of the sampling campaigns are expanded to include the surveys that happened between large organized campaigns. The start and end dates are defined below.

__Attention!!__ The codes used for each survey campaign are different. Different groups organized and conducted surveys using the MLW protocol. The data was then sent to us.

__MCBP:__ November 2015 - November 2016. The initial sampling campaign. Fragmented plastics (Gfrags/G79/G78/G76) were not sorted by size. All unidentified hard plastic items were classified in this manner.

* start_date = 2015-11-15
* end_date = 2017-03-31

__SLR:__ April 2017 - May 2018. Sampling campaign by the WWF. Objects less than 2.5 cm were not counted.

* start_date = 2017-04-01
* end_date = 2020-03-31

__IQAASL:__ April 2020 - May 2021. Sampling campaign mandated by the Swiss confederation. Additional codes were added for regional objects.

* start_date = 2020-04-01
* end_date = 2021-05-31

__Plastock (not added yet):__ January 2022 - December 2022. Sampling campaign from the Association pour la Sauvegarde du Léman. Not all objects were counted, They only identified a limited number of objects.

### Feature name

The feature name is the name of a river lake or other regional label that you would find on a map. People in the region know the name.

### Feature type

The feature type is a label that applies to general conditions of use for the location and other locations in the region

* r: rivers: surveys on river banks
* l: lake: surveys on the lake shore
* p: parcs: surveys in recreational areas

### Parent boundary

Designates the larger geographic region of the survey location. For lakes and rivers it is the name of the catchment area or river basin. For parcs it is the the type of park ie.. les Alpes. Recall that each feature has a name, for example Alpes Lépontines is the the name of a feature in the geographic region of _Les Alpes_.

In [3]:
surveys = combine_survey_files(survey_files)
codes = pd.read_csv(code_data).set_index("code")
beaches = pd.read_csv(beach_data).set_index("slug")
land_cover = pd.read_csv(land_cover_data)
land_use = pd.read_csv(land_use_data)
streets = pd.read_csv(street_data)
river_intersect_lakes = pd.read_csv(intersection_attributes)

## Aggregate a set of data by sample (location and date)

Use the loc_date column in the survey data. Use the IQAASL period and four river baisns test against the federal report.

### Before aggregating does the number of locations, cities, samples and quantity match the federal report?

__The feature types include lakes and rivers, alpes were condsidered separately__

From https://hammerdirt-analyst.github.io/IQAASL-End-0f-Sampling-2021/lakes_rivers.html#

1. cities = yes
2. samples = yes
3. locations = yes
4. quantity = No it is short 50 pieces
5. start and end date = yes

In [4]:
# startint varaibles
period = "iqaasl"
survey_areas = ["rhone", "ticino", "linth", "aare"]
start, end = [*period_dates[period]]
# the surveys from the survey areas of intersest
survey_data = slice_data_by_date(surveys.copy(), start, end)

# the survey data sliced by the start and end data
feature_d= survey_data[survey_data.parent_boundary.isin(survey_areas)].copy()

# convert codes to gfrags, gcaps and gfoams
feature_data = use_gfrags_gfoams_gcaps(feature_d.copy(), codes)

# check the numbers
feature_vitals = collect_vitals(feature_d)
print(make_a_summary(feature_vitals))


    Number of objects: 54694
    
    Median pieces/meter: 0.0
    
    Number of samples: 386
    
    Number of unique codes: 235
    
    Number of sample locations: 143
    
    Number of features: 28
    
    Number of cities: 77
    
    Start date: 2020-03-08
    
    End date: 2021-05-12
    
    


In [5]:
# when the codes are changed to gfrags, gfoams and gcaps that creates 
# multiple code results for the same code at the same sample
# note that the code_result_columns do not have the groupname column
# this is because the code is changed and not the groupname
code_result_df = aggregate_dataframe(feature_data.copy(), code_result_columns, unit_agg)
code_result_df = code_result_df.merge(codes.groupname, left_on="code", right_index=True)
code_result_df = code_result_df.merge(beaches[["canton","feature_type"]], left_on='slug', right_index=True, validate="many_to_one")

### Number of lakes, rivers, parcs, cities and cantons

In [6]:
l_map = pd.read_csv('data/end_process/french_code_translations.csv')

lang = 'fr'

l_mapi = l_map.set_index('en', drop=True)

In [7]:
summary_of_work_data = summarize_work_data(code_result_df, code_result_df.canton.unique())
summary_of_work_data = summary_of_work_data[['samples', 'cities', 'lakes', 'rivers', 'parks', 'quantity']]
translated_and_style_for_display(summary_of_work_data, l_mapi, lang, gradient=False)

Unnamed: 0,Échantillons,Municipalités,Lacs,Rivières,Parcs,Quantité
St. Gallen,38,5,2,3,0,3'614
Aargau,4,4,0,2,0,101
Bern,88,21,4,4,0,8'786
Solothurn,3,2,0,1,0,66
Vaud,85,14,2,2,0,16'911
Tessin,28,7,2,3,0,3'023
Genève,20,2,1,1,0,4'962
Neuchâtel,16,4,2,0,0,2'375
Glarus,16,2,1,2,0,1'016
Valais,17,6,1,1,0,8'141


### aggregate to sample

The assessments are made on a per sample basis. That means that we can look at an individual object value at each sample. The sum of all the individual objects in a survey is the total for that survey. Dividing the totals by the length of the survey gives the assessment metric: _pieces of trash per meter_.

1. Are the quantiles of the current data  = to the federal report? Yes
2. Are the material totals = to the federal report? No,plastics if off by 50 pcs
3. Are the fail rates of the most common objects = to the federal report? Yes
4. Is the % of total of the most common objects = to the fedral report? yes
5. Is the median pieces/meter of the most common objects = to the federal report? yes
6. Is the quantity of the most common objects = to the federal report? yes

#### The summary of survey totals

fig 1.5 in IQAASL

In [8]:
# the sample is the basic unit
# loc_date is the unique identifier for each sample
unit_columns = ["loc_date", "slug", "parent_boundary"]

# the quantiles of the sample-total pcs/m  
vector_summary = a_summary_of_one_vector(code_result_df.copy(), unit_columns, unit_agg, describe='pcs_m')

translated_and_style_for_display(vector_summary,l_mapi, lang, gradient=False)

Unnamed: 0,Pcs/M
Échantillons,386
Moyenne,395
Écart-Type,706
Min,002
25%,082
50%,190
75%,387
Max,6617
Total,54'694


#### Material totals and proportions

fig 1.5 iqaal

In [9]:
# add the material label to each code
merged_result = merge_dataframes_on_column_and_index(code_result_df.copy(), codes["material"], 'code', how='inner', validate=True)

# sum the materials for the data frame
materials = aggregate_dataframe(merged_result.copy(), ["material"], {"quantity":"sum"})

# add 5 of total for display
materials["%"] = materials.quantity/materials.quantity.sum()

translated_and_style_for_display(materials.set_index('material', drop=True),l_mapi, lang, gradient=False)

Unnamed: 0,Quantité,% Du Total
Chimique,140,0
Tissu,343,1
Verre,2'919,5
Métal,1'874,3
Papier,1'527,3
Plastique,47'093,86
Caoutchouc,390,1
Non-Identifié,2,0
Bois,406,1


#### Quantity, median pcs/m, fail rate, and % of total

Sumary results for all the codes in the parent_boundary

In [10]:
# sum the cumulative quantity for each code and calculate the median pcs/meter
code_totals = aggregate_dataframe(code_result_df.copy(), ["code"], {"quantity":"sum", "pcs_m":"median"})

# collect 
abundant = get_top_x_records_with_max_quantity(code_totals.copy(), "quantity", "code", len(code_totals))

# identify the objects that were found in at least 50% of the samples
# calculate the quantity per sample for each code and sample
occurrences = aggregate_dataframe(code_result_df, ["loc_date", "code"], {"quantity":"sum"})

# count the number of times that an object was counted > 0
# and divide it by the total number of samples 
event_counts  = count_objects_with_positive_quantity(occurrences)

# calculate the rate of occurence per unit of measure
rates = calculate_rate_per_unit(code_result_df, code_result_df.code.unique())

# add the unit rates and fail rates
abundance = merge_dataframes_on_column_and_index(abundant, rates["pcs_m"], left_column="code", validate="one_to_one")
abundance["fail rate"] = abundance.code.apply(lambda x: event_counts.loc[x])

# this is the complete inventory with summary
# statistics for each objecabundance.sort_values(by="quantity", inplace=True, ascending=False)
abundance.reset_index(inplace=True, drop=True)

In [11]:
# codes = pd.read_csv(code_data)
# codes.rename(columns={"en":"code"}, inplace=True)
# codes.set_index("code", drop=True, inplace=True)
# codes.drop('code', inplace=True)
# codes.to_csv('data/end_process/codes.csv', index=True)

### The most common objects

fig 1.6 iqaasl

In [12]:
# arguments to slice the data by column
column_one = {
    'column': 'quantity',
    'val': abundance.loc[10, 'quantity']
}

column_two = {
    'column':'fail rate',
    'val': 0.5
}

# use the inventory to find the most common objects
the_most_common = display_tabular_data_by_column_values(abundance.copy(), column_one, column_two, 'code')

translated_and_style_for_display(the_most_common.copy(),l_mapi, lang, gradient=False)

Unnamed: 0,Quantité,% Du Total,Pcs/M,Taux D'Échec
Mégots Et Filtres À Cigarettes,8'485,16,20,88
"Fragments De Plastique: G80, G79, G78, G75",7'400,14,18,86
"Fragments De Polystyrène Expansé: G76, G81, G82, G83",5'559,10,5,69
"Emballages De Bonbons, De Snacks",3'325,6,9,85
"Bâche, Feuille Plastique Industrielle",2'534,5,5,70
Verre Brisé,2'136,4,3,65
Pellets Industriels (Gpi),1'968,4,0,31
"Couvercles En Plastique Bouteille: G21, G22, G23, G24",1'844,3,3,65
Mousse De Plastique Pour L'Isolation Thermique,1'656,3,1,53
Coton-Tige,1'406,3,1,51


### Results by groupname and feature boundary

In [13]:
cumulative_columns = ["loc_date", "groupname"]
unit_columns = ["parent_boundary", "loc_date", "groupname"]
object_labels = code_result_df.groupname.unique()
object_columns = ["groupname"]
boundary_labels = code_result_df.parent_boundary.unique()

args = {
    'cumulative_columns':cumulative_columns,
    'object_labels':object_labels,
    'boundary_labels':boundary_labels,
    'object_columns':object_columns,
    'unit_agg':unit_agg,
    'unit_columns':unit_columns,
    'agg_groups':agg_groups
}

tix = summary_of_parent_and_child_features(code_result_df.copy(), **args)
translated_and_style_for_display(tix,l_mapi, lang, gradient=True)

Unnamed: 0,Linth,Aare,Rhône,Ticino,Cumulé
Agriculture,3,6,14,6,7
Nourriture Et Boissons,28,25,70,28,34
Infrastructures,12,14,55,21,20
Micro-Plastiques (< 5Mm),0,1,11,0,1
Emballage Non Alimentaire,13,9,21,8,13
Articles Personnels,4,4,10,7,6
Morceaux De Plastique,11,18,48,10,18
Loisirs,4,6,17,4,6
Tabac,27,15,50,18,25
Non Classé,0,0,2,0,0


### Most common codes by feature boundary

In [14]:
cumulative_columns = ["loc_date", "code"]
unit_columns = ["parent_boundary", "loc_date", "code"]
codes_of_interest = the_most_common.index
object_columns = ["code"]
boundary_labels = code_result_df.parent_boundary.unique()

data = code_result_df[code_result_df.code.isin(codes_of_interest)].copy()

args = {
    'cumulative_columns':cumulative_columns,
    'object_labels':codes_of_interest,
    'boundary_labels':boundary_labels,
    'object_columns':object_columns,
    'unit_agg':unit_agg,
    'unit_columns':unit_columns,
    'agg_groups':agg_groups
}

tix = summary_of_parent_and_child_features(data.copy(), **args)

translated_and_style_for_display(tix,l_mapi, lang, gradient=True)

Unnamed: 0,Linth,Aare,Rhône,Ticino,Cumulé
Pellets Industriels (Gpi),0,0,0,0,0
Polystyrène < 5Mm,0,0,0,0,0
"Bouchons De Bouteilles En Métal, Couvercles Et Tirettes",1,0,3,1,1
Verre Brisé,4,3,2,8,3
Mégots Et Filtres À Cigarettes,23,11,42,15,20
"Emballages De Bonbons, De Snacks",6,8,19,4,9
"Bâche, Feuille Plastique Industrielle",2,5,9,4,5
Mousse De Plastique Pour L'Isolation Thermique,0,0,7,3,1
Déchets De Construction En Plastique,0,0,6,3,1
Coton-Tige,0,0,11,0,1


### Most common codes by canton



In [15]:
unit_columns = ["canton", "loc_date", "code"]
object_columns = ["code"]
boundary_labels = code_result_df.canton.unique()

data = code_result_df[code_result_df.code.isin(codes_of_interest)].copy()

args = {
    'cumulative_columns':cumulative_columns,
    'object_labels':codes_of_interest,
    'boundary_labels':boundary_labels,
    'object_columns':object_columns,
    'unit_agg':unit_agg,
    'unit_columns':unit_columns,
    'agg_groups':agg_groups
}

tix = summary_of_parent_and_child_features(data, **args)

translated_and_style_for_display(tix.T,l_mapi, lang, gradient=True)

Unnamed: 0,Aargau,Bern,Fribourg,Genève,Glarus,Luzern,Neuchâtel,Schwyz,Solothurn,St. Gallen,Tessin,Valais,Vaud,Zug,Zürich,Cumulé
Pellets Industriels (Gpi),0,0,0,3,0,0,0,16,0,0,0,0,0,0,0,0
Polystyrène < 5Mm,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"Bouchons De Bouteilles En Métal, Couvercles Et Tirettes",0,0,0,7,1,0,5,0,3,0,1,0,2,0,5,1
Verre Brisé,0,3,3,1,0,0,21,0,0,5,8,0,4,0,12,3
Mégots Et Filtres À Cigarettes,1,11,11,17,17,8,33,0,6,23,15,4,47,7,46,20
"Emballages De Bonbons, De Snacks",3,8,6,16,7,5,12,20,2,10,4,45,13,1,7,9
"Bâche, Feuille Plastique Industrielle",0,8,0,0,4,0,7,15,0,11,4,53,8,0,0,5
Mousse De Plastique Pour L'Isolation Thermique,0,1,0,1,1,0,1,0,0,0,3,11,9,0,0,1
Déchets De Construction En Plastique,0,1,0,0,1,0,4,0,0,1,3,25,4,1,0,1
Coton-Tige,0,0,0,2,0,0,2,1,3,0,0,53,10,0,0,1


### Most common codes: canton-municipal

#### Bern

In [16]:
canton = "Bern"

with_cantons = code_result_df[code_result_df.canton == canton].copy()

unit_columns = ["city", "loc_date", "code"]
# the column that holds the labels of interest
object_columns = ["code"]
# the labels of interest for the boundary conditions
boundary_labels = with_cantons.city.unique()

ddata = with_cantons[(with_cantons.code.isin(codes_of_interest)) & (with_cantons.canton == "Bern")].copy()

args = {
    'cumulative_columns':cumulative_columns,
    'object_labels':codes_of_interest,
    'boundary_labels':boundary_labels,
    'object_columns':object_columns,
    'unit_agg':unit_agg,
    'unit_columns':unit_columns,
    'agg_groups':agg_groups
}

tix = summary_of_parent_and_child_features(ddata, **args)
translated_and_style_for_display(tix.T,l_mapi, lang, gradient=True)

Unnamed: 0,Beatenberg,Bern,Biel/Bienne,Brienz (Be),Brügg,Burgdorf,Bönigen,Erlach,Gals,Kallnach,Köniz,Ligerz,Lüscherz,Nidau,Port,Rubigen,Spiez,Thun,Unterseen,Vinelz,Walperswil,Cumulé
Pellets Industriels (Gpi),3,0,4,0,0,0,4,0,0,10,0,0,0,8,1,0,0,0,0,10,0,0
Polystyrène < 5Mm,0,0,6,0,0,0,0,0,17,0,0,0,0,4,0,0,0,0,2,0,0,0
"Bouchons De Bouteilles En Métal, Couvercles Et Tirettes",0,0,2,2,0,0,0,2,6,3,0,7,0,0,3,0,0,0,0,0,0,0
Verre Brisé,2,0,5,0,36,0,0,2,7,0,0,100,20,12,1,0,13,0,2,8,0,3
Mégots Et Filtres À Cigarettes,55,1,81,0,28,7,119,44,9,0,9,76,4,0,63,6,4,23,55,4,0,11
"Emballages De Bonbons, De Snacks",12,1,34,39,0,2,6,5,12,8,0,78,2,60,7,11,0,9,11,18,0,8
"Bâche, Feuille Plastique Industrielle",3,1,17,67,0,22,15,0,3,32,0,119,5,40,0,0,1,13,13,37,0,8
Mousse De Plastique Pour L'Isolation Thermique,42,0,5,2,0,0,10,0,0,0,0,21,0,0,1,0,0,0,9,0,19,1
Déchets De Construction En Plastique,0,0,6,0,0,0,4,2,0,0,0,0,0,4,0,0,0,0,1,7,0,1
Coton-Tige,4,0,5,6,0,0,6,12,6,7,0,12,0,0,0,0,0,1,2,0,0,0


#### Valais

In [17]:
canton = "Valais"

with_cantons = code_result_df[code_result_df.canton == canton].copy()

unit_columns = ["city", "loc_date", "code"]
# the column that holds the labels of interest
object_columns = ["code"]
# the labels of interest for the boundary conditions
boundary_labels = with_cantons.city.unique()

ddata = with_cantons[(with_cantons.code.isin(codes_of_interest))].copy()

args = {
    'cumulative_columns':cumulative_columns,
    'object_labels':codes_of_interest,
    'boundary_labels':boundary_labels,
    'object_columns':object_columns,
    'unit_agg':unit_agg,
    'unit_columns':unit_columns,
    'agg_groups':agg_groups
}

tix = summary_of_parent_and_child_features(ddata, **args)
translated_and_style_for_display(tix,l_mapi, lang, gradient=True)

Unnamed: 0,Saint-Gingolph,Lavey-Morcles,Riddes,Sion,Leuk,Salgesch,Cumulé
Pellets Industriels (Gpi),3,0,0,0,0,0,0
Polystyrène < 5Mm,18,0,0,0,0,0,0
"Bouchons De Bouteilles En Métal, Couvercles Et Tirettes",3,0,3,0,0,0,0
Verre Brisé,3,0,0,0,0,0,0
Mégots Et Filtres À Cigarettes,13,1,0,4,0,0,4
"Emballages De Bonbons, De Snacks",77,10,0,2,0,0,45
"Bâche, Feuille Plastique Industrielle",92,50,0,0,6,30,53
Mousse De Plastique Pour L'Isolation Thermique,58,0,0,0,0,0,11
Déchets De Construction En Plastique,85,13,0,0,8,0,25
Coton-Tige,125,0,0,0,0,0,53


In [18]:
%watermark -a hammerdirt-analyst -co --iversions

Author: hammerdirt-analyst

conda environment: cantonal_report

matplotlib: 3.7.1
pandas    : 2.0.3
numpy     : 1.25.2

