In [1]:
%load_ext watermark
import pandas as pd
import numpy as np
from typing import Type, Optional, Callable
from typing import List, Dict, Union, Tuple

from review_methods_tests import collect_vitals, find_missing, find_missing_loc_dates
from review_methods_tests import use_gfrags_gfoams_gcaps, make_a_summary

import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.colors
from matplotlib.colors import LinearSegmentedColormap, ListedColormap

import setvariables as conf_
import methods_iqaasl as mi

# Code

In [2]:
def slice_data_by_date(data: pd.DataFrame, start: str, end: str):
    """
    Return the rows of `data` whose `date` column lies in [start, end].

    Both bounds are inclusive. The comparison is delegated to pandas, so
    `start` and `end` must be comparable with the values in `data.date`.
    """
    on_or_after_start = data.date >= start
    on_or_before_end = data.date <= end
    return data[on_or_after_start & on_or_before_end]


def aggregate_dataframe(df: pd.DataFrame,
                        groupby_columns: List[str],
                        aggregation_functions: Dict[str, Union[str, callable]],
                        index: bool = False) -> pd.DataFrame:
    """
    Group a DataFrame and aggregate selected columns.

    Args:
        df (pd.DataFrame): The data to aggregate.
        groupby_columns (List[str]): Column names that define the groups.
        aggregation_functions (Dict[str, Union[str, callable]]):
            Maps each column to aggregate to either the name of a pandas
            aggregation ('sum', 'mean', 'max', 'min', ...) or a callable.
        index (bool, optional): When True the group keys become the index of
            the result; when False (default) they stay as regular columns.

    Returns:
        pd.DataFrame: One row per group with the aggregated columns.
    """
    return df.groupby(groupby_columns, as_index=index).agg(aggregation_functions)
    
    
def merge_dataframes_on_column_and_index(left_df: pd.DataFrame,
                                         right_df: pd.DataFrame,
                                         left_column: str,
                                         how: str = 'inner',
                                         validate: str = 'many_to_one') -> pd.DataFrame:
    """
    Merge two DataFrames: the left one on a column, the right one on its index.

    Args:
        left_df (pd.DataFrame): The left DataFrame to be merged.
        right_df (pd.DataFrame): The right DataFrame, matched on its index.
        left_column (str): The column in the left DataFrame to merge on.
        how (str, optional): The type of merge ('left', 'right', 'outer' or 'inner').
            Default is 'inner'.
        validate (str, optional): Relationship check forwarded to `DataFrame.merge`
            ('one_to_one', 'one_to_many', 'many_to_one', 'many_to_many').
            Default is 'many_to_one', i.e. the right index must be unique.
            Pass None to skip the check.

    Returns:
        pd.DataFrame: A new DataFrame resulting from the merge operation.

    Raises:
        pandas.errors.MergeError: If the data violates the `validate` relationship.
    """
    # `validate` was previously accepted but silently ignored; it is now forwarded
    # so that duplicate keys in the lookup table are detected instead of silently
    # multiplying rows.
    return left_df.merge(
        right_df,
        left_on=left_column,
        right_index=True,
        how=how,
        validate=validate,
    )


def get_top_x_records_with_max_quantity(df: pd.DataFrame, quantity_column: str, id_column: str, x: int):
    """
    Return the x records with the largest quantity and their relative share.

    Args:
        df (pd.DataFrame): The input DataFrame.
        quantity_column (str): The name of the quantity column.
        id_column (str): The name of the ID column.
        x (int): The number of records to return.

    Returns:
        pd.DataFrame: Columns [id_column, quantity_column, "%"], ordered from the
        largest quantity down. Note that "%" is the share of each record in the
        sum of the *selected* x records, not in the total of `df`.
    """
    leaders = df.nlargest(x, quantity_column).loc[:, [id_column, quantity_column]]
    subtotal = leaders[quantity_column].sum()

    return leaders.assign(**{"%": leaders[quantity_column] / subtotal})




def calculate_rate_per_unit(df: pd.DataFrame,
                            objects_to_check: List[str],
                            column_of_interest: str = "code",
                            groupby_columns: List[str] = ['code'],
                            method: Dict[str, str] = {"pcs_m": "median", "quantity":"sum"},
                            )-> pd.DataFrame:
    """
    Aggregate the unit measurement for a selection of objects and label the result 'all'.

    Args:
        df (pd.DataFrame): The input data; must hold `column_of_interest`, the
            `groupby_columns` and every column named in `method`.
        objects_to_check (List[str]): Values of `column_of_interest` to keep.
        column_of_interest (str): The column that identifies the objects.
        groupby_columns (List[str]): The columns used for the aggregation.
        method (Dict[str, str]): Column -> aggregation function.

    Returns:
        pd.DataFrame: Indexed by `column_of_interest`, one column per key of
        `method`, plus a constant 'label' column set to 'all'.
    """
    # keep only the requested objects
    is_requested = df[column_of_interest].isin(objects_to_check)

    # aggregate the measurements per group
    per_object = df.loc[is_requested].groupby(groupby_columns, as_index=False).agg(method)

    # index by object and tag every row as belonging to the complete data set
    keep = [column_of_interest] + list(method)

    return per_object[keep].set_index(column_of_interest).assign(label="all")



def count_objects_with_positive_quantity(df: pd.DataFrame,
                                         value_column: str = 'quantity',
                                         object_column: str = 'code',
                                         sample_column: str = 'loc_date') -> pd.Series:
    """
    Rate at which each object was found (quantity > 0) across the samples.

    Args:
        df (pd.DataFrame): One row per (sample, object) with the observed value.
        value_column (str): Column holding the observed quantity.
        object_column (str): Column identifying the object.
        sample_column (str): Column identifying the sample. Defaults to
            'loc_date', the column that was previously hard-coded.

    Returns:
        pd.Series: Indexed by object. For objects found at least once the value
        is (number of rows with a positive quantity) / (number of unique
        samples in `df`); objects that only have zero quantities get 0.
    """
    n_samples = df[sample_column].nunique()

    # number of records with a positive quantity, per object
    positive_counts = df.loc[df[value_column] > 0, object_column].value_counts()
    found_rate = positive_counts / n_samples

    # objects that were recorded but never with a positive quantity
    zero_counts = df.loc[df[value_column] == 0, object_column].value_counts()
    never_found = zero_counts[~zero_counts.index.isin(positive_counts.index)]
    never_found.loc[:] = 0

    return pd.concat([found_rate, never_found])


# pieces per meter for a set of data
def rate_per_unit_cumulative(df: pd.DataFrame, groupby_columns: List, object_labels: List, objects: List, agg_methods: Dict)-> pd.DataFrame:
    """
    Rates per unit for the requested objects over the complete data set.

    The data is first aggregated with `aggregate_dataframe` on `groupby_columns`
    using `agg_methods`. The result is handed to `calculate_rate_per_unit`,
    which keeps the rows whose `objects[0]` value is in `object_labels`, groups
    them on `objects` and applies its default aggregation methods.

    Args:
        df (pd.DataFrame): The input DataFrame containing data for analysis.
        groupby_columns (List): Columns for the first aggregation step.
        object_labels (List): The object values to keep.
        objects (List): Column(s) identifying the objects; the first entry is
            the column that `object_labels` is matched against.
        agg_methods (Dict): Column -> aggregation function for the first step.

    Returns:
        pd.DataFrame: The rates with the object column restored as a regular
        column (the index is reset) and the 'label' column set by
        `calculate_rate_per_unit`.

    Example:
        cumulative = rate_per_unit_cumulative(
            df,
            groupby_columns=['loc_date', 'groupname'],
            object_labels=df.groupname.unique(),
            objects=['groupname'],
            agg_methods={'quantity': 'sum', 'pcs_m': 'sum'})
    """
    per_sample = aggregate_dataframe(df, groupby_columns, agg_methods)
    rates = calculate_rate_per_unit(
        per_sample,
        object_labels,
        column_of_interest=objects[0],
        groupby_columns=objects,
    )

    return rates.reset_index(drop=False)



def aggregate_boundaries(df: pd.DataFrame, unit_columns: list, unit_agg: dict, boundary_labels: list, boundary_columns: list, group_agg: dict)-> pd.DataFrame:
    """
    Two-step aggregation: first per unit, then per boundary.

    Step one aggregates `df` on `unit_columns` with `unit_agg`. Step two takes,
    for every label in `boundary_labels`, the unit rows whose first unit column
    equals that label, groups them on `boundary_columns` and applies `group_agg`.

    Args:
        df (pd.DataFrame): The input DataFrame containing data to be aggregated.
        unit_columns (list): Columns for the unit-level aggregation. The first
            entry is the column the boundary labels are matched against.
        unit_agg (dict): Aggregation functions for the unit level.
        boundary_labels (list): Values of `unit_columns[0]` to summarize.
        boundary_columns (list): Columns for the boundary-level aggregation.
        group_agg (dict): Aggregation functions for the boundary level.

    Returns:
        pd.DataFrame: The per-boundary summaries stacked on top of each other,
        each row tagged with its boundary in the 'label' column.
    """
    unit_level = aggregate_dataframe(df, unit_columns, unit_agg)
    boundary_key = unit_columns[0]

    def _summarize(label):
        # summary of the unit results that belong to one boundary
        members = unit_level[unit_level[boundary_key] == label]
        summary = members.groupby(boundary_columns, as_index=False).agg(group_agg)
        summary['label'] = label
        return summary

    return pd.concat([_summarize(label) for label in boundary_labels])


def color_gradient(val, cmap: ListedColormap = None, min: float = 0.0, max: float = .9):
    """
    Map a numerical value to a CSS background color for cell styling.

    The value is rescaled linearly so that `min` maps to 0 and `max` maps to 1,
    then passed to `cmap`. The resulting color is emitted with a fixed alpha
    of .5 and black text.

    Args:
        val (float): The numerical value to be colored.
        cmap (ListedColormap): A callable that maps a float to an
            (r, g, b, a) tuple with components in [0, 1]. Required.
        min (float, optional): Value mapped to the start of the colormap.
            Defaults to 0.0.
        max (float, optional): Value mapped to the end of the colormap.
            Defaults to 0.9.

    Returns:
        str: A CSS style string for cell background color and text color.

    Raises:
        TypeError: If no colormap is supplied.

    Example:
        # Apply a color gradient using a custom colormap to the DataFrame
        df.style.applymap(color_gradient, cmap=my_colormap, min=0.0, max=100.0)
    """
    if cmap is None:
        raise TypeError("color_gradient requires a colormap: pass cmap=<callable returning (r, g, b, a)>")

    # Normalize to [0, 1] over the span [min, max]. The previous form divided by
    # `max` alone, which is only correct when min == 0. A zero-width span maps
    # everything to the start of the colormap instead of dividing by zero.
    span = max - min
    normalized_val = (val - min) / span if span else 0.0

    r, g, b, _alpha = cmap(normalized_val)

    # the alpha channel of the colormap is replaced by a fixed transparency
    rgba = f"rgba({int(r*255)},{int(g*255)},{int(b*255)}, .5)"

    return f'background-color: {rgba}; color:black'



def boundary_summary(parent_boundary: pd.DataFrame, boundary_summary: pd.DataFrame, object_columns: List, unit: str)-> pd.DataFrame:
    """
    Pivot the parent and the per-boundary summaries into one comparison table.

    Both inputs are stacked and pivoted on the first object column and the
    'label' column, with `unit` as the cell value. The longer of the two
    dimensions is placed on the rows.

    Args:
        parent_boundary (pd.DataFrame): Summary of the complete data set; needs
            the object column, `unit` and 'label'.
        boundary_summary (pd.DataFrame): Summaries per boundary, same columns.
        object_columns (List): Columns identifying the objects; only the first
            entry is used.
        unit (str): The column whose values fill the table.

    Returns:
        pd.DataFrame: If there are at least as many objects as labels: objects
        on the rows and labels on the columns, with the boundary labels first
        and the parent label(s) last. Otherwise labels on the rows and objects
        on the columns.

    Example:
        table = boundary_summary(parent_boundary, boundary_summaries, ['groupname'], 'pcs_m')
    """
    object_column = object_columns[0]
    stacked = pd.concat([parent_boundary, boundary_summary])

    n_objects = stacked[object_column].nunique()
    n_labels = stacked.label.nunique()

    if n_objects < n_labels:
        # more boundaries than objects: one row per boundary
        return stacked.pivot(columns=object_column, index="label", values=unit)

    # one row per object; children first, the parent summary in the last column(s)
    column_order = [*boundary_summary.label.unique(), *parent_boundary.label.unique()]
    return stacked.pivot(index=object_column, columns="label", values=unit)[column_order]



def translate_word(X: str, amap: pd.DataFrame, lan: str):
    """
    Look up the translation of a word or phrase in a language map.

    Args:
        X (str): The word or phrase to be translated.
        amap (pd.DataFrame): Language map whose index holds the source words
            and whose columns hold the translations, one column per language.
        lan (str): The column of `amap` to read the translation from.

    Returns:
        The value of `amap.loc[X, lan]` if `X` is in the index of `amap`,
        otherwise `X` unchanged.

    Example:
        language_map = pd.DataFrame({'fr': ['pomme', 'banane', 'cerise']},
                                    index=['apple', 'banana', 'cherry'])

        translate_word('apple', language_map, 'fr')   # 'pomme'
        translate_word('kiwi', language_map, 'fr')    # 'kiwi'
    """
    return amap.loc[X, lan] if X in amap.index else X


def capitalize_index(X):
    """Return `X` in title case (first letter of every word capitalized)."""
    titled = X.title()
    return titled

def translate_for_display(df: pd.DataFrame, amap: pd.DataFrame, lan: str):
    """
    Translate the column names and index labels of a DataFrame for display.

    Every column name and index label is passed through `translate_word`;
    labels that are not in the index of `amap` are left unchanged. The names
    of the index and of the columns axis are removed.

    Note: `df` is modified in place and also returned.

    Args:
        df (pd.DataFrame): The DataFrame whose labels are translated.
        amap (pd.DataFrame): Language map indexed by the source labels with one
            column per language.
        lan (str): The column of `amap` to read the translations from.

    Returns:
        pd.DataFrame: `df`, with translated column names and index labels.

    Example:
        data = pd.DataFrame({'apple': [1, 2], 'banana': [4, 5]}, index=['lake', 'river'])
        language_map = pd.DataFrame({'fr': ['pomme', 'banane', 'lac']},
                                    index=['apple', 'banana', 'lake'])

        translate_for_display(data, language_map, 'fr')
        # columns: pomme, banane -- index: lac, river
    """
    column_labels = {label: translate_word(label, amap, lan) for label in df.columns}
    index_labels = {label: translate_word(label, amap, lan) for label in df.index}

    # One rename for both axes. The previous implementation assigned the index
    # mapping (a dict) to a temporary 'new_index' column, which depends on how
    # the installed pandas version interprets dict assignment.
    df.rename(columns=column_labels, index=index_labels, inplace=True)

    # the axis names carry no meaning for the reader of the table
    df.index.name = None
    df.columns.name = None

    return df


def translated_and_style_for_display(df, amap, lan, gradient: bool = True):
    """
    Translate, style, and format a DataFrame for display.

    The labels of `df` are translated with `translate_for_display` (which
    modifies `df` in place), then the number format and table styles from the
    project configuration (`conf_.format_kwargs`, `conf_.table_css_styles`) are
    applied. Optionally every cell is colored with `color_gradient` using
    `conf_.newcmp`. Column and index labels are displayed in title case.

    Args:
        df (pd.DataFrame): The input DataFrame to be translated and styled.
        amap (pd.DataFrame): Language map indexed by the source labels.
        lan (str): The column of `amap` to read the translations from.
        gradient (bool, optional): Whether to apply the color gradient.
            Defaults to True.

    Returns:
        Styler: A styled DataFrame ready for display.

    Example:
        styled = translated_and_style_for_display(summary_table, language_map, 'fr', gradient=True)
    """
    translated = translate_for_display(df, amap, lan)
    styler = translated.style.format(**conf_.format_kwargs).set_table_styles(conf_.table_css_styles)

    if gradient:
        # Styler.applymap was renamed Styler.map in pandas 2.1 and the old name
        # is deprecated; use the new name when the installed pandas has it.
        elementwise = getattr(styler, "map", None) or styler.applymap
        styler = elementwise(color_gradient, cmap=conf_.newcmp)

    return styler.format_index(str.title, axis=1).format_index(str.title, axis=0)



def display_tabular_data_by_column_values(df, column_one: dict, column_two: dict, index: str):
    """
    Select the rows that rank high on one column or exceed a threshold on another.

    A row is kept when either of these holds:

    * its value in `column_one['column']` is at least the value found at
      position `column_one['val']` (0-based) when that column is sorted in
      descending order, or
    * its value in `column_two['column']` is at least `column_two['val']`.

    Args:
        df (pd.DataFrame): The input DataFrame containing tabular data.
        column_one (dict): {'column': <name>, 'val': <0-based rank of the cutoff row>}.
            A rank beyond the last row is treated as the last row.
        column_two (dict): {'column': <name>, 'val': <minimum value>}.
        index (str): The column to be set as the index of the result.

    Returns:
        pd.DataFrame: A copy of the selected rows, in their original order,
        indexed by `index` with the index name removed.

    Example:
        # the objects with the ten highest quantities (rank 0-9) or found in
        # at least half of the samples
        most_common = display_tabular_data_by_column_values(
            results,
            {'column': 'quantity', 'val': 9},
            {'column': 'fail rate', 'val': 0.5},
            'code')
    """
    rank_column = column_one["column"]
    ranked = df[rank_column].sort_values(ascending=False)

    if ranked.empty:
        # nothing to rank: return the (empty) frame in the expected layout
        selected = df.copy()
    else:
        # The cutoff is read from the ranking column itself (it used to be read
        # from a hard-coded 'quantity' column) and the rank is clamped to the
        # last row so that short tables do not raise an IndexError.
        position = min(int(column_one["val"]), len(ranked) - 1)
        cutoff = ranked.iloc[position]
        ranks_high = df[rank_column] >= cutoff
        exceeds_threshold = df[column_two["column"]] >= column_two["val"]
        selected = df[ranks_high | exceeds_threshold].copy()

    selected.set_index(index, inplace=True, drop=True)
    selected.index.name = None
    return selected




def summary_of_parent_and_child_features(df: pd.DataFrame,
                                         cumulative_columns: List = None,
                                         boundary_labels: List = None,
                                         object_labels: List = None,
                                         object_columns: List = None,
                                         unit_agg: dict = None,
                                         unit_columns: List = None,
                                         agg_groups: dict = None)-> pd.DataFrame:
    """
    Compare the rate of the complete data set with the rate of each boundary.

    Three steps, each delegated to a helper defined in this notebook:

    1. `rate_per_unit_cumulative`: the rate per object over all of `df`.
    2. `aggregate_boundaries`: the rate per object within each boundary.
    3. `boundary_summary`: both results pivoted into one table on 'pcs_m'.

    Args:
        df (pd.DataFrame): The input DataFrame containing data for analysis.
        cumulative_columns (List): Group-by columns for the cumulative step.
        boundary_labels (List): The boundaries to summarize.
        object_labels (List): The object values to include.
        object_columns (List): Columns identifying the objects.
        unit_agg (dict): Aggregation methods for the unit level.
        unit_columns (List): Group-by columns for the unit level.
        agg_groups (dict): Aggregation methods for the boundary level.

    Returns:
        pd.DataFrame: The table produced by `boundary_summary`.

    Example:
        summary_df = summary_of_parent_and_child_features(
            data_df,
            cumulative_columns=['loc_date', 'groupname'],
            boundary_labels=data_df.feature_type.unique(),
            object_labels=data_df.groupname.unique(),
            object_columns=['groupname'],
            unit_agg={'quantity': 'sum', 'pcs_m': 'sum'},
            unit_columns=['feature_type', 'loc_date', 'groupname'],
            agg_groups={'quantity': 'sum', 'pcs_m': 'median'})
    """
    # the complete data set (the "parent")
    overall = rate_per_unit_cumulative(
        df,
        groupby_columns=cumulative_columns,
        object_labels=object_labels,
        objects=object_columns,
        agg_methods=unit_agg,
    )

    # each boundary on its own (the "children")
    per_boundary = aggregate_boundaries(
        df,
        unit_columns=unit_columns,
        unit_agg=unit_agg,
        boundary_labels=boundary_labels,
        boundary_columns=object_columns,
        group_agg=agg_groups,
    )

    return boundary_summary(overall, per_boundary, object_columns, "pcs_m")


In [3]:
def collect_survey_data_for_report(a_func: Callable = None, **kwargs)-> pd.DataFrame:
    """
    Load the survey data for a report.

    If `a_func` is given, its result for `**kwargs` is returned as is.
    Otherwise the files listed in `conf_.survey_files` are combined with
    `mi.combine_survey_files`.
    """
    if a_func is None:
        return mi.combine_survey_files(conf_.survey_files)

    return a_func(**kwargs)


def collect_env_data_for_report(a_func: Callable = None, **kwargs)-> pd.DataFrame:
    """
    Load the environmental and reference data for a report.

    If `a_func` is given, its result for `**kwargs` is returned as is.
    Otherwise the CSV files named in the project configuration are read and
    returned as a tuple:
    (codes, beaches, land_cover, land_use, streets, river_intersect_lakes).
    `codes` is indexed by 'code' and `beaches` by 'slug'.
    """
    if a_func is not None:
        return a_func(**kwargs)

    codes = pd.read_csv(conf_.code_data).set_index("code")
    beaches = pd.read_csv(conf_.beach_data).set_index("slug")
    land_cover = pd.read_csv(conf_.land_cover_data)
    land_use = pd.read_csv(conf_.land_use_data)
    streets = pd.read_csv(conf_.street_data)
    river_intersect_lakes = pd.read_csv(conf_.intersection_attributes)

    return codes, beaches, land_cover, land_use, streets, river_intersect_lakes


def language_maps(func: Callable = None, **kwargs):
    """
    Load the language maps used to translate labels for display.

    If `func` is given, its result for `**kwargs` is returned as is. Otherwise
    every CSV file in `conf_.language_maps` is read and indexed on its 'en'
    column; the result is a dict with the same keys as `conf_.language_maps`.
    """
    if func is None:
        return {name: pd.read_csv(path).set_index('en') for name, path in conf_.language_maps.items()}

    return func(**kwargs)


def check_for_top_label(alabel: str = None, df: pd.DataFrame = None, a_map: pd.DataFrame = None)-> pd.DataFrame:
    """
    Make sure the column `alabel` is present in the survey data.

    If `df` already has the column it is returned unchanged. Otherwise the
    column is taken from `a_map` and merged onto `df` by matching `df.slug`
    with the index of `a_map` (which must be unique).
    """
    if alabel in df.columns:
        return df

    label_by_slug = a_map[alabel]
    return df.merge(label_by_slug, left_on='slug', right_index=True, validate='many_to_one')
    
    
def use_parent_groups_or_gfrags(df, label: str = None, gfrags: bool = True, parent_group: bool = False, func: Callable = None, **kwargs)-> pd.DataFrame:
    """
    Optionally combine the fragment codes and aggregate the surveys per object.

    Args:
        df (pd.DataFrame): The survey data.
        label (str, optional): The reporting column; added to the group-by keys.
        gfrags (bool): When True the codes are recoded with
            `use_gfrags_gfoams_gcaps` before aggregating. When False the data
            is aggregated as is.
        parent_group (bool): Currently has no effect on the result.
        func (Callable, optional): If given, `func(**kwargs)` is returned and
            nothing else is done.

    Returns:
        pd.DataFrame: The data aggregated with `conf_.unit_agg`.
    """
    if func is not None:
        return func(**kwargs)

    if gfrags:
        # NOTE(review): `codes` is read from the notebook's global namespace, so
        # this only works after the environmental data has been loaded.
        d = use_gfrags_gfoams_gcaps(df, codes)
    else:
        # previously `d` was left undefined on this path (UnboundLocalError)
        d = df

    # The surveys need to be aggregated to the object level: after changing
    # code names there will be duplicates on the columns loc_date and code,
    # which is not allowed. A missing label must not become a group-by key.
    keys = [*d.columns, *conf_.code_result_columns]
    if label is not None:
        keys.append(label)
    groupby_cols = list(set(keys))
    d = aggregate_dataframe(d, groupby_cols, conf_.unit_agg)

    return d
def add_column_to_work_data(df, key: str = 'slug', feature: str = None, amap: pd.DataFrame = None)-> pd.DataFrame:
    """
    Add one column from a lookup table to the survey data.

    The column `feature` of `amap` is merged onto `df` by matching `df[key]`
    with the index of `amap`. The index of `amap` must be unique.
    """
    lookup = amap[feature]

    return df.merge(lookup, left_on=key, right_index=True, validate='many_to_one')


def add_columns_to_work_data(df, keys_features)-> pd.DataFrame:
    """
    Add several lookup columns to a copy of the survey data.

    `keys_features` is a sequence of dicts with the entries 'key' (column of
    the survey data), 'feature' (column of the lookup table) and 'map' (the
    lookup table). Each one is applied in turn with `add_column_to_work_data`.
    The result has a fresh 0..n-1 index; `df` itself is not modified.
    """
    enriched = df.copy()

    for spec in keys_features:
        enriched = add_column_to_work_data(
            enriched, key=spec['key'], feature=spec['feature'], amap=spec['map'])

    return enriched.reset_index(drop=True)


def report_data(a_start, df, add_columns: List = None, use_gfrags: bool = True):
    """
    Prepare the survey data for one report.

    Args:
        a_start (dict): The request. Its first key/value pair names the report
            level and label (for example canton='Valais'); it must also hold
            'start_date', 'end_date' and 'language'.
        df (pd.DataFrame): The survey data.
        add_columns (List, optional): Column specifications for
            `add_columns_to_work_data`.
        use_gfrags (bool): Whether to pass the data through
            `use_parent_groups_or_gfrags`.

    Returns:
        tuple: ([level, label], language, the rows that belong to the report,
        all rows in the date range).
    """
    # The first entry of the request sets the limit of the report: we are
    # interested in a summary of this data or of something contained within it.
    level = list(a_start.keys())[0]
    top_label = [level, list(a_start.values())[0]]

    # restrict to the requested period straight away to save memory
    in_range = slice_data_by_date(df.copy(), start=a_start['start_date'], end=a_start['end_date'])

    # add the report-level column to the survey data if it is missing
    # NOTE(review): `beaches` is read from the notebook's global namespace, so
    # the environmental data must be loaded before this function is called.
    in_range = check_for_top_label(level, df=in_range, a_map=beaches)

    # combine the fragment codes and aggregate to the object level
    if use_gfrags:
        in_range = use_parent_groups_or_gfrags(in_range, label=level)

    # attach any extra columns that are not stored with the survey data
    if add_columns is not None:
        in_range = add_columns_to_work_data(in_range, add_columns)

    # this is the data for the report
    report_rows = in_range[in_range[level].isin([top_label[1]])].copy()

    return top_label, a_start['language'], report_rows, in_range



# Reporting hierarchy read from the project configuration; used below as the
# default `columns_of_interest` of categorize_work_data.
geo_h = conf_.geo_h

def categorize_work_data(df, labels, columns_of_interest: List[str] = geo_h, sample_id: str = 'loc_date'):
    """
    List the reporting categories that are available for one report label.

    Args:
        df (pd.DataFrame): The survey data.
        labels: A pair (column, value); only the rows where `df[column] == value`
            are considered.
        columns_of_interest (List[str]): The reporting hierarchy. The last two
            entries are treated specially: if the report column is the last
            entry, both of them are dropped; if it is the second to last
            entry, only that entry is dropped.
        sample_id (str): The column that identifies a sample.

    Returns:
        dict: {value: {category: unique values of that category, ...,
        'samples': unique sample ids}}. The categories follow the order of
        `columns_of_interest`.
    """
    report_column, report_value = labels[0], labels[1]
    data = df[df[report_column] == report_value]

    summaries = columns_of_interest

    if report_column == columns_of_interest[-1]:
        summaries = columns_of_interest[:-2]
    if report_column == columns_of_interest[-2]:
        summaries = [*columns_of_interest[:-2], columns_of_interest[-1]]

    # de-duplicate while keeping a stable order (a set would make the order of
    # the result change between runs)
    new_columns = list(dict.fromkeys([sample_id, *summaries]))

    res = {column: data[column].unique() for column in new_columns}

    # the sample ids are reported under a fixed key; use the sample_id argument
    # rather than a hard-coded column name
    res['samples'] = res.pop(sample_id)

    return {report_value: res}


def a_summary_of_one_vector(df, unit_columns, unit_agg, describe='pcs_m', label: str = None):
    """
    Descriptive statistics of one measurement over the aggregated samples.

    `df` is aggregated with `aggregate_dataframe(df, unit_columns, unit_agg)`;
    the column `describe` of the result is summarized with `Series.describe`
    and a 'total' row holding the sum of the `quantity` column is added.

    Returns:
        pd.DataFrame: One column named after `describe` (object dtype, with
        'count' and 'total' stored as int) and, if `label` is given, a constant
        'label' column.
    """
    totals = aggregate_dataframe(df, unit_columns, unit_agg)

    stats = totals[describe].describe()
    stats["total"] = totals.quantity.sum()

    table = stats.to_frame()
    # object dtype so that the two whole-number rows can be stored as int
    table[describe] = table[describe].astype(object)
    for row in ('count', 'total'):
        table.loc[row, describe] = int(table.loc[row, describe])

    if label is not None:
        table['label'] = label

    return table


def a_cumulative_report(df, feature_name: str = 'feature_type', 
                        object_column: str = 'groupname', 
                        sample_id: str = 'loc_date')-> pd.DataFrame:
    """
    Summary of the objects for the complete data set and for each feature.

    Builds the arguments for `summary_of_parent_and_child_features` from the
    column names: the boundaries are the unique values of `feature_name`, the
    objects are the unique values of `object_column`, and the aggregation
    methods come from `conf_.unit_agg` and `conf_.agg_groups`. The helper is
    called on a copy of `df`.
    """
    settings = dict(
        cumulative_columns=[sample_id, object_column],
        object_labels=df[object_column].unique(),
        boundary_labels=df[feature_name].unique(),
        object_columns=[object_column],
        unit_agg=conf_.unit_agg,
        unit_columns=[feature_name, sample_id, object_column],
        agg_groups=conf_.agg_groups,
    )

    return summary_of_parent_and_child_features(df.copy(), **settings)


## a report class

### basic requirements

1. define the limits of the request
   * temporal
   * geographic (includes features and parent boundaries)
   * object types
   * level of aggregation

2. define what codes are being used

The default setting is to combine all the fragmented plastics into one group (all sizes), and to do the same for fragmented expanded polystyrene and for plastic bottle tops. This results in three codes that represent objects that are very similar. This topic has been addressed many times. These groups register non-trivial quantities at most surveys. However, differentiating these objects into their respective subgroups, e.g. plastic caps for drinking vs. plastic caps for household cleaners, is not considered a priority by all groups that have collected data in the past.

* Gfrags
* Gfoams
* Gcaps

3. define the reporting language

The reporting language can be French, German or English. We would like to offer Italian, but we have no resource for that service in Ticino.

__Note:__ The reporting language is only applied at the moment of display. The column names, feature labels and other underlying identifying criteria for the data remain unchanged. The column name definitions and translations are in the _random variables_ section.
   
The testing_data_models notebook shows that, given the following set of variables, summary reports and test statistics can be generated for any combination of data:

   * `df (pd.DataFrame)`: The input DataFrame containing data for analysis.
   * `cumulative_columns (List, optional)`: List of columns to be considered for cumulative values.
   * `boundary_labels (List, optional)`: List of labels for boundary summaries.
   * `object_labels (List, optional)`: List of labels for individual objects.
   * `object_columns (List, optional)`: List of columns identifying objects.
   * `unit_agg (dict, optional)`: Aggregation methods for unit summaries.
   * `unit_columns (List, optional)`: List of columns for unit summaries.
   * `agg_groups (dict, optional)`: Aggregation methods for boundary summaries.


### Work data

A report can be defined by providing the temporal and geographic bounds of interest. Below is the current method. 

```python

# request
canton = 'Bern'
start_date = '2019-01-01'
end_date = '2022-01-01'
language = 'fr'

# starting data, can be MySQL or NoSQL calls
# the three methods accept Callables, as long
# as the out put is pd.DataFrame
c_l = language_maps()
surveys = collect_survey_data_for_report()
codes, beaches, land_cover, land_use, streets, river_intersect_lakes = collect_env_data_for_report()

# temporal and geographic boundaries
# user defined input
boundaries = dict(canton='Valais', language='fr', start_date='2019-01-01', end_date='2022-01-01')

# columns to be added to the survey data
# not stored with the survey data. Note that codes
# and beaches are part of the initial data. The index of
# codes contains the values of surveys.code and the index
# of beaches contains the values of surves.slug
add_columns = [
    {'key':'code', 'feature':'groupname', 'map':codes},
    {'key':'slug', 'feature':'feature_type', 'map':beaches}
]

# the level and label of the report
# the language for display
# the data for the report and all other
# from the data range
top_label, language, w_df, w_di = report_data(boundaries, surveys, add_columns=add_columns)

# define the language map
w_df.head().style.set_table_styles(conf_.table_css_styles)
```

Which produces the following untranslated output.

## Start

In [4]:
# Starting data; the source can be MySQL or NoSQL calls.
# The three loaders accept a Callable, as long
# as the output is a pd.DataFrame.
c_l = language_maps()
surveys = collect_survey_data_for_report()
codes, beaches, land_cover, land_use, streets, river_intersect_lakes = collect_env_data_for_report()

# work on a copy and attach the canton of each survey location
# (beaches is indexed by slug)
survey_data = surveys.copy()
survey_data = survey_data.merge(beaches['canton'], left_on='slug', right_index=True, validate='many_to_one')

# temporal and geographic boundaries
# user defined input
boundaries = dict(canton='Valais', language='fr', start_date='2019-01-01', end_date='2022-01-01')

# Columns that can be added to the survey data because they are
# not stored with it. Note that codes and beaches are part of the
# initial data: the index of codes contains the values of
# surveys.code and the index of beaches contains the values of
# surveys.slug. Not used in this run:
# add_columns = [
#     {'key':'code', 'feature':'groupname', 'map':codes},
#     {'key':'slug', 'feature':'feature_type', 'map':beaches}
# ]

# report_data returns: the level and label of the report,
# the language for display, the data for the report (w_df)
# and all other data from the date range (w_di)
top_label, language, w_df, w_di = report_data(boundaries, survey_data.copy())

# preview the first rows of the report data (untranslated)
w_df.head().style.set_table_styles(conf_.table_css_styles)

Unnamed: 0,parent_boundary,loc_date,feature_type,length,code,slug,groupname,date,city,feature_name,canton,quantity,pcs_m
33814,les-alpes,"('clean-up-tour-crans-montana', '2021-06-12')",p,43,G1,crans-montana,food and drink,2021-06-12,Lens,alpes-valaisannes,Valais,0,0.0
33815,les-alpes,"('clean-up-tour-crans-montana', '2021-06-12')",p,43,G10,crans-montana,food and drink,2021-06-12,Lens,alpes-valaisannes,Valais,0,0.0
33816,les-alpes,"('clean-up-tour-crans-montana', '2021-06-12')",p,43,G100,crans-montana,waste water,2021-06-12,Lens,alpes-valaisannes,Valais,0,0.0
33817,les-alpes,"('clean-up-tour-crans-montana', '2021-06-12')",p,43,G101,crans-montana,personal items,2021-06-12,Lens,alpes-valaisannes,Valais,0,0.0
33818,les-alpes,"('clean-up-tour-crans-montana', '2021-06-12')",p,43,G102,crans-montana,personal items,2021-06-12,Lens,alpes-valaisannes,Valais,0,0.0


### Reporting categories

The first variable of the input is used to define the hierarchy of the report. For administrative purposes a vertical approach that reflects areas of responsibility is important. For estimating values the geographic/topographic attributes are more important.

The survey data is labeled for these purposes. The columns `parent_boundary`, `feature_type` and `feature_name` are the topographic features. 

1. `parent_boundary`: the name of the river basin, catchment area, park, geographic region or other zone defined by Swiss geo admin.
2. `feature_type`: lake, river or park
3. `feature_name`: the name of the lake, river or park

The `geo_h` array sets the order for reporting. Reports for cantons can contain subreports for all the values in the array, by default the cantonal results will reference the IQAASL report for threshold or prior results. Reports for cities will contain only geographic categories with reference to cantonal results.

```python


geo_h = ['parent_boundary', 'feature_type',  'feature_name','canton', 'city']


def categorize_work_data(df, labels, columns_of_interest: List[str] = geo_h, sample_id: str = 'loc_date'):
    """
    Collect the unique values of each reporting category for one label.

    Args:
        df (pd.DataFrame): The survey data. It must contain the column
            labels[0], the sample_id column and every retained category.
        labels: A pair [column, value]. Only the rows where
            df[column] == value are considered.
        columns_of_interest (List[str]): The reporting hierarchy. The last
            two entries are treated as the administrative levels (canton and
            city in the default geo_h).
        sample_id (str): The column that identifies one sample.

    Returns:
        dict: {value: {category: array of unique values, ...,
            'samples': array of unique sample ids}}. The categories keep the
            order of columns_of_interest and 'samples' is always last.
    """
    data = df[df[labels[0]] == labels[1]]

    summaries = columns_of_interest

    # if city is selected the available boundaries
    # are geographic. A city is in only one canton
    # if canton is selected then city becomes a category
    # for which a report can be produced
    if labels[0] == columns_of_interest[-1]:
        summaries = columns_of_interest[:-2]
    if labels[0] == columns_of_interest[-2]:
        summaries = [*columns_of_interest[:-2], columns_of_interest[-1]]

    # dict.fromkeys removes duplicates and, unlike set(), keeps the order
    # so the keys of the result are the same on every run
    new_columns = list(dict.fromkeys([sample_id, *summaries]))
    res = {an_attribute: data[an_attribute].unique() for an_attribute in new_columns}

    # the sample ids are reported under a fixed key, whatever the name
    # of the sample_id column is
    res['samples'] = res.pop(sample_id)

    return {labels[1]: res}

# Categorize the survey data into search terms. For the label of the
# report, each reporting category maps to the array of values found
# in the data. The length of an array is the number of reports available
# for that category; a category that is not present has no data.
parent_categories = categorize_work_data(w_df, top_label)
p_vals = parent_categories[boundaries[top_label[0]]]

# the type and number of reports available
reporting_categories = {
    category: len(values)
    for category, values in p_vals.items()
}
reporting_categories
```

Which gives the following result:

In [5]:
# Categorize the survey data into search terms. For the label of the
# report, each reporting category maps to the array of values found
# in the data. The length of an array is the number of reports available
# for that category; a category that is not present has no data.
parent_categories = categorize_work_data(w_df, top_label)
p_vals = parent_categories[boundaries[top_label[0]]]

# the type and number of reports available
reporting_categories = {
    category: len(values)
    for category, values in p_vals.items()
}
reporting_categories

{'parent_boundary': 2,
 'feature_type': 3,
 'city': 10,
 'feature_name': 3,
 'samples': 22}

The same operation can be performed at each level. The first call to `categorize_work_data` gives the structure of the report. For each key of the reporting categories there will be a set of descriptive statistics.

For example, a detailed report on all feature types within the canton would include the following summary data for each feature type:

```python
# restrict the data to the first feature type listed in p_vals
# (the parcs) and categorize it in the same way as the full data
parc_features = categorize_work_data(
    w_df[w_df.feature_type == p_vals['feature_type'][0]],
    top_label,
)

# the number of distinct values in each attribute
{attribute: len(values) for attribute, values in parc_features[top_label[1]].items()}

# out =>

{'city': 6,
 'feature_type': 1,
 'parent_boundary': 1,
 'feature_name': 1,
 'samples': 7}

```

In this example there are 7 samples from 6 cities in the parcs feature_type. 

The summary of each label for each feature in the current data set can be obtained by providing the feature of interest to the groupby columns. By default the sample id: `loc_date` and the location name `slug` are required.

In summary: the `boundaries` variable defines the top-level data structure. For example, `{'canton': 'Valais', 'language': 'fr', 'start_date': '2019-01-01', 'end_date': '2022-01-01'}` will produce a dataframe with data only from the canton of Valais within the dates defined. Tables and charts will be translated to French.

The report class uses the resulting data structure to define reports for the different features within the dataframe.

## The report class

```python
class ReportClass:
    """
    ReportClass is a class for generating reports and summaries from dataframes.

    Parameters:
        w_df (pd.DataFrame, optional): A dataframe containing the data of interest.
        w_di (pd.DataFrame, optional): A dataframe containing data for all locations within the same date range.
        boundaries (dict, optional): Dictionary of boundaries and labels. The boundaries dictionary contains
            the variables necessary to define w_df.
        top_label (List[str], optional): List of top-level labels, [column, value]. The top level labels define
            the parent structure of the report and the argument used to identify the data of interest.
        language (str, optional): Language for translation. One of 'de', 'fr' or 'en'.
        lang_maps (pd.DataFrame, optional): A dataframe for language mapping. The index is english and there is a
            column for each language.
        mc_criteria_one (dict, optional): First selection criterion for most_common, a dict with the keys
            'column' and 'val'. Defaults to ReportClass.column_one.
        mc_criteria_two (dict, optional): Second selection criterion for most_common, same layout.
            Defaults to ReportClass.column_two.
        ooi (str, optional): The column that identifies the objects of interest. Defaults to 'code'.

    Attributes (read-only properties, recomputed on every access):
        features (dict): Categorized features of the data of interest. The features dictionary uses the column
            labels of the dataframe as keys and the unique values of the columns as values. Columns/features
            that are not present in the data of interest will not be in the features variable.
        available_features (List[str]): List of available features. Those features that appear at least once
            in the data of interest.
        inventory (pd.DataFrame): An inventory report based on code and sample data. Includes % of total,
            median pcs/m, fail rate and quantity.
        most_common: The objects of the inventory selected with criteria_one and criteria_two.

    Methods:
        summarize_feature_labels(feature: str = None, sample_id: str = 'loc_date', location: str = 'slug')
            Summarize feature labels for a specific feature. Provides the quantiles, number of samples and
            the total quantity of the vector for a given feature.
        the_number_of_attributes_in_a_feature(feature: str = None)
            Calculate the number of attributes in a feature.

    __repr__(): A string representation of the class instance.

    Example usage:
        report = ReportClass(w_df, w_di, boundaries, top_label, language, lang_maps)
        report.summarize_feature_labels()
    """
    # default arguments that define the most common objects
    # this assumes that the columns quantity and fail rate exist
    column_one = {
        'column': 'quantity',
        'val': 10
    }

    column_two = {
        'column':'fail rate',
        'val': 0.5
    }

    object_of_interest = 'code'

    def __init__(self, w_df: pd.DataFrame = None,
                 w_di: pd.DataFrame = None,
                 boundaries: dict = None,
                 top_label: List[str] = None,
                 language: str = None,
                 lang_maps: pd.DataFrame = None,
                 mc_criteria_one: dict = column_one,
                 mc_criteria_two: dict = column_two,
                 ooi = object_of_interest
                ):
        self.w_df = w_df
        self.w_di = w_di
        self.boundaries = boundaries
        self.top_label = top_label
        self.language = language
        self.lang_maps = lang_maps
        # each report keeps its own copy of the criteria: editing them on one
        # report must not change the class-level defaults used by the others
        self.criteria_one = dict(mc_criteria_one)
        self.criteria_two = dict(mc_criteria_two)
        self.ooi = ooi

    @property
    def features(self):
        """The unique values of each reporting category for the label of this report."""
        # geo_h and categorize_work_data are defined at the notebook level
        args = dict(df=self.w_df, labels=self.top_label, columns_of_interest=geo_h)
        some_features = categorize_work_data(**args)
        # the label comes from the instance, not from a notebook global
        return some_features[self.top_label[1]]

    @property
    def available_features(self):
        """The entries of conf_.geo_h that are keys of features, in the order of conf_.geo_h."""
        available = [x for x in conf_.geo_h if x in self.features.keys()]
        return available

    @property
    def inventory(self, code: str = 'code', sample_id: str = 'loc_date'):
        """
        The complete inventory with summary statistics for each object.

        This is a property: it is read as report.inventory, so code and
        sample_id always take their default values.
        """
        # sum the cumulative quantity for each code and calculate the median pcs/meter
        code_totals = aggregate_dataframe(self.w_df.copy(), [code], conf_.agg_groups)

        # collect every code, ordered by the helper on quantity
        abundant = get_top_x_records_with_max_quantity(code_totals.copy(), "quantity", code, len(code_totals[code].unique()))

        # identify the objects that were found in at least 50% of the samples
        # calculate the quantity per sample for each code and sample
        occurrences = aggregate_dataframe(self.w_df, [sample_id, code], {"quantity":"sum"})

        # count the number of times that an object was counted > 0
        # and divide it by the total number of samples
        event_counts = count_objects_with_positive_quantity(occurrences)

        # calculate the rate of occurrence per unit of measure
        rates = calculate_rate_per_unit(self.w_df.copy(), self.w_df[code].unique())

        # add the unit rates and fail rates
        abundance = merge_dataframes_on_column_and_index(abundant, rates["pcs_m"], left_column=code, validate="one_to_one")
        abundance["fail rate"] = abundance[code].apply(lambda x: event_counts.loc[x])

        # this is the complete inventory with summary
        # statistics for each object
        abundance = abundance.sort_values(by="quantity", ascending=False).reset_index(drop=True)

        return abundance

    @property
    def most_common(self):
        """The objects of the inventory selected with criteria_one and criteria_two."""
        # use the criteria to find the objects of interest
        mc = display_tabular_data_by_column_values(self.inventory, self.criteria_one, self.criteria_two, self.ooi)
        return mc

    def summarize_feature_labels(self, feature: str = None, sample_id: str = 'loc_date', location: str = 'slug'):
        """
        Summarize pcs_m for each label of a feature.

        Args:
            feature (str, optional): The column to summarize. Defaults to the
                first entry of available_features; the choice is printed.
            sample_id (str): The column that identifies one sample.
            location (str): The column that identifies one location.

        Returns:
            pd.DataFrame: The summaries of all labels, pivoted on 'label'.
        """
        if feature is None:
            # available_features is recomputed on each access: read it once
            available = self.available_features
            feature = available[0]
            print('\nThis is the default summary. A column label can be specified.')
            print(f'This summary is for {feature}')
            print('To specify a feature call class.summarize_feature_labels(<column-label>)')
            print(f'these are your choices {available}\n')

        unit_columns = [sample_id, location, feature]
        labels = self.features[feature]

        X = []
        for the_label in labels:
            d = self.w_df[self.w_df[feature] == the_label].copy()
            ds = a_summary_of_one_vector(d, unit_columns, conf_.unit_agg, describe='pcs_m',label=the_label)
            X.append(ds)

        return pd.concat(X).pivot(columns='label')

    def the_number_of_attributes_in_a_feature(self, feature: str = None):
        """
        Count the distinct values of every reporting category for each label of a feature.

        Args:
            feature (str, optional): The column of interest. Defaults to the
                first entry of available_features; the choice is printed.

        Returns:
            pd.DataFrame: One row per label of the feature and one column per
                reporting category. The column of the feature itself is dropped.
        """
        if feature is None:
            # available_features is recomputed on each access: read it once
            available = self.available_features
            feature = available[0]
            print('\nThis the default attribute count. A column label can be specified.')
            print(f'This count is for {feature}')
            print('To specify call the_number_of_attributes_in_a_feature(<column-label>)')
            print(f'these are your choices {available}\n')

        labels = self.features[feature]

        feature_attributes = []
        for a_label in labels:
            these_attributes = categorize_work_data(self.w_df[self.w_df[feature] == a_label].copy(), self.top_label)
            summed = {k:len(v) for k, v in these_attributes[self.top_label[1]].items()}
            feature_attributes.append(summed)
        counts = pd.DataFrame(feature_attributes, index=labels)
        return counts.drop(columns=feature)

    def __repr__(self):
        # report the boundaries of this instance, not a notebook global
        return f'Report: {self.boundaries}, features: {self.available_features}'


```

### The number and types of features in a report

For example calling the following code once a class is instantiated will report the number and type of each feature

```python
a_report = ReportClass(w_df, w_di, boundaries, top_label, 'fr', c_l)
a_report.the_number_of_attributes_in_a_feature('feature_type')
```

In [6]:
class ReportClass:
    """
    ReportClass is a class for generating reports and summaries from dataframes.

    Parameters:
        w_df (pd.DataFrame, optional): A dataframe containing the data of interest.
        w_di (pd.DataFrame, optional): A dataframe containing data for all locations within the same date range.
        boundaries (dict, optional): Dictionary of boundaries and labels. The boundaries dictionary contains
            the variables necessary to define w_df.
        top_label (List[str], optional): List of top-level labels, [column, value]. The top level labels define
            the parent structure of the report and the argument used to identify the data of interest.
        language (str, optional): Language for translation. One of 'de', 'fr' or 'en'.
        lang_maps (pd.DataFrame, optional): A dataframe for language mapping. The index is english and there is a
            column for each language.
        mc_criteria_one (dict, optional): First selection criterion for most_common, a dict with the keys
            'column' and 'val'. Defaults to ReportClass.column_one.
        mc_criteria_two (dict, optional): Second selection criterion for most_common, same layout.
            Defaults to ReportClass.column_two.
        ooi (str, optional): The column that identifies the objects of interest. Defaults to 'code'.

    Attributes (read-only properties, recomputed on every access):
        features (dict): Categorized features of the data of interest. The features dictionary uses the column
            labels of the dataframe as keys and the unique values of the columns as values. Columns/features
            that are not present in the data of interest will not be in the features variable.
        available_features (List[str]): List of available features. Those features that appear at least once
            in the data of interest.
        inventory (pd.DataFrame): An inventory report based on code and sample data. Includes % of total,
            median pcs/m, fail rate and quantity.
        most_common: The objects of the inventory selected with criteria_one and criteria_two.

    Methods:
        summarize_feature_labels(feature: str = None, sample_id: str = 'loc_date', location: str = 'slug')
            Summarize feature labels for a specific feature. Provides the quantiles, number of samples and
            the total quantity of the vector for a given feature.
        the_number_of_attributes_in_a_feature(feature: str = None)
            Calculate the number of attributes in a feature.

    __repr__(): A string representation of the class instance.

    Example usage:
        report = ReportClass(w_df, w_di, boundaries, top_label, language, lang_maps)
        report.summarize_feature_labels()
    """
    # default arguments that define the most common objects
    # this assumes that the columns quantity and fail rate exist
    column_one = {
        'column': 'quantity',
        'val': 10
    }

    column_two = {
        'column':'fail rate',
        'val': 0.5
    }

    object_of_interest = 'code'

    def __init__(self, w_df: pd.DataFrame = None,
                 w_di: pd.DataFrame = None,
                 boundaries: dict = None,
                 top_label: List[str] = None,
                 language: str = None,
                 lang_maps: pd.DataFrame = None,
                 mc_criteria_one: dict = column_one,
                 mc_criteria_two: dict = column_two,
                 ooi = object_of_interest
                ):
        self.w_df = w_df
        self.w_di = w_di
        self.boundaries = boundaries
        self.top_label = top_label
        self.language = language
        self.lang_maps = lang_maps
        # each report keeps its own copy of the criteria: editing them on one
        # report must not change the class-level defaults used by the others
        self.criteria_one = dict(mc_criteria_one)
        self.criteria_two = dict(mc_criteria_two)
        self.ooi = ooi

    @property
    def features(self):
        """The unique values of each reporting category for the label of this report."""
        # geo_h and categorize_work_data are defined at the notebook level
        args = dict(df=self.w_df, labels=self.top_label, columns_of_interest=geo_h)
        some_features = categorize_work_data(**args)
        return some_features[self.top_label[1]]

    @property
    def available_features(self):
        """The entries of conf_.geo_h that are keys of features, in the order of conf_.geo_h."""
        available = [x for x in conf_.geo_h if x in self.features.keys()]
        return available

    @property
    def inventory(self, code: str = 'code', sample_id: str = 'loc_date'):
        """
        The complete inventory with summary statistics for each object.

        This is a property: it is read as report.inventory, so code and
        sample_id always take their default values.
        """
        # sum the cumulative quantity for each code and calculate the median pcs/meter
        code_totals = aggregate_dataframe(self.w_df.copy(), [code], conf_.agg_groups)

        # collect every code, ordered by the helper on quantity
        abundant = get_top_x_records_with_max_quantity(code_totals.copy(), "quantity", code, len(code_totals[code].unique()))

        # identify the objects that were found in at least 50% of the samples
        # calculate the quantity per sample for each code and sample
        occurrences = aggregate_dataframe(self.w_df, [sample_id, code], {"quantity":"sum"})

        # count the number of times that an object was counted > 0
        # and divide it by the total number of samples
        event_counts = count_objects_with_positive_quantity(occurrences)

        # calculate the rate of occurrence per unit of measure
        rates = calculate_rate_per_unit(self.w_df.copy(), self.w_df[code].unique())

        # add the unit rates and fail rates
        abundance = merge_dataframes_on_column_and_index(abundant, rates["pcs_m"], left_column=code, validate="one_to_one")
        abundance["fail rate"] = abundance[code].apply(lambda x: event_counts.loc[x])

        # this is the complete inventory with summary
        # statistics for each object
        abundance = abundance.sort_values(by="quantity", ascending=False).reset_index(drop=True)

        return abundance

    @property
    def most_common(self):
        """The objects of the inventory selected with criteria_one and criteria_two."""
        # use the criteria to find the objects of interest
        mc = display_tabular_data_by_column_values(self.inventory, self.criteria_one, self.criteria_two, self.ooi)
        return mc

    def summarize_feature_labels(self, feature: str = None, sample_id: str = 'loc_date', location: str = 'slug'):
        """
        Summarize pcs_m for each label of a feature.

        Args:
            feature (str, optional): The column to summarize. Defaults to the
                first entry of available_features; the choice is printed.
            sample_id (str): The column that identifies one sample.
            location (str): The column that identifies one location.

        Returns:
            pd.DataFrame: The summaries of all labels, pivoted on 'label'.
        """
        if feature is None:
            # available_features is recomputed on each access: read it once
            available = self.available_features
            feature = available[0]
            print('\nThis is the default summary. A column label can be specified.')
            print(f'This summary is for {feature}')
            print('To specify a feature call class.summarize_feature_labels(<column-label>)')
            print(f'these are your choices {available}\n')

        unit_columns = [sample_id, location, feature]
        labels = self.features[feature]

        X = []
        for the_label in labels:
            d = self.w_df[self.w_df[feature] == the_label].copy()
            ds = a_summary_of_one_vector(d, unit_columns, conf_.unit_agg, describe='pcs_m',label=the_label)
            X.append(ds)

        return pd.concat(X).pivot(columns='label')

    def the_number_of_attributes_in_a_feature(self, feature: str = None):
        """
        Count the distinct values of every reporting category for each label of a feature.

        Args:
            feature (str, optional): The column of interest. Defaults to the
                first entry of available_features; the choice is printed.

        Returns:
            pd.DataFrame: One row per label of the feature and one column per
                reporting category, including the feature itself.
        """
        if feature is None:
            # available_features is recomputed on each access: read it once
            available = self.available_features
            feature = available[0]
            print('\nThis the default attribute count. A column label can be specified.')
            print(f'This count is for {feature}')
            print('To specify call the_number_of_attributes_in_a_feature(<column-label>)')
            print(f'these are your choices {available}\n')

        labels = self.features[feature]

        feature_attributes = []
        for a_label in labels:
            these_attributes = categorize_work_data(self.w_df[self.w_df[feature] == a_label].copy(), self.top_label)
            summed = {k:len(v) for k, v in these_attributes[self.top_label[1]].items()}
            feature_attributes.append(summed)
        # the column of the selected feature is kept: each row is restricted
        # to one label, so that column holds the count of that single label
        counts = pd.DataFrame(feature_attributes, index=labels)
        return counts

    def __repr__(self):
        # report the boundaries of this instance, not a notebook global
        return f'Report: {self.boundaries}, features: {self.available_features}'

# `boundaries` is defined earlier in the notebook, in the form:
# boundaries = dict(canton='Valais', language='fr', start_date='2019-01-01', end_date='2022-01-01')

# Columns to be added to the survey data that are not stored with the
# survey data. Note that codes and beaches are part of the initial data.
# The index of codes contains the values of surveys.code and the index
# of beaches contains the values of surveys.slug
# add_columns = [
#     {'key':'code', 'feature':'groupname', 'map':codes},
#     {'key':'slug', 'feature':'feature_type', 'map':beaches}
# ]

# report_data returns, in this order: the level and label of the report,
# the language for display, the data for the report and all the other
# data from the date range
top_label, language, w_df, w_di = report_data(boundaries, survey_data.copy())
a_report = ReportClass(w_df, w_di, boundaries, top_label, 'fr', c_l)

# the number of distinct values per category, for each feature type
translated_and_style_for_display(
    a_report.the_number_of_attributes_in_a_feature('feature_type'),
    a_report.lang_maps[a_report.language],
    a_report.language,
    gradient=False,
)

Unnamed: 0,Région,Zone,Ville,Feature_Name,Échantillons
Parc,1,1,6,1,7
Lac,1,1,1,1,11
Rivière,1,1,4,1,4


#### A top level description

The first output says there are three feature types (lakes, rivers and parks) in the data. There is one lake that was sampled 11 times, a river was sampled 4 times and the parks were sampled 7 times. Samples were taken in 4 cities on the river, 6 in the parcs feature and 1 on the lake. These counts sum to 11, but the data contains 10 distinct cities: one city has samples in two feature types.

Recall that the geographic column names are: `['parent_boundary', 'feature_type', 'feature_name']`. The survey results from each sector can be compared by selecting the column name of interest. Depending on the value of `boundaries`, not all of the column names may be available. The `available_features` property identifies exactly which features are available. Note that in the example below, canton is not an option. This is because the boundaries were set for a canton.

```python
# available_features is a property: it is read, not called
my_features = my_report_class.available_features

print(my_features)

# => ['parent_boundary', 'feature_type', 'feature_name', 'city']
```

The `summarize_feature_labels` method in the `ReportClass` creates a summary of the sample totals for each label of the selected feature. Calling the `translated_and_style_for_display` method puts the table to html and applies language specific formatting using the `dataframe.style` method. The index and column names are translated using the language maps.

```python
# one column of summary statistics for each label of feature_type
feature_type_summary = my_report_class.summarize_feature_labels('feature_type')

# the language map is the column of lang_maps for the language of the report
translated_and_style_for_display(feature_type_summary, my_report_class.lang_maps[my_report_class.language], my_report_class.language, gradient=False)
```

Combined with the output from above, a description of the data and how it was collected can be constructed. The highlighted text is from the active variables.
> There were `13'782` objects identified for the period between `2019-01-01` and `2021-12-31` in the `canton` of `Valais`. In total, `22` samples were recorded, `11` on the `lake-shore`, `7` in `ski-areas` and `4` on `riverbanks`. The lake samples were recorded from `one` `city`; on the other hand, the alpes and river samples were taken from `10` `different cities`. The `median` sample total of _pieces of trash per meter_ `pcs/m` is highest at the `lakeside`, followed by the `parcs` and `rivers`.

In [7]:
# summary statistics of pcs_m for each label of feature_type,
# translated and styled for the language of the report
translated_and_style_for_display(
    a_report.summarize_feature_labels('feature_type'),
    a_report.lang_maps[a_report.language],
    a_report.language,
    gradient=False,
)

Unnamed: 0_level_0,Pcs_M,Pcs_M,Pcs_M
label,L,P,R
25%,1475,136,16
50%,1771,287,24
75%,3267,479,48
Échantillons,11,7,4
Max,5273,41505,102
Moyenne,2364,6161,40
Min,258,106,9
Écart-Type,1691,15586,42
Total,7'560,6'144,78


#### By criteria

Objects can be selected by criteria. The default criteria requires that the quantity be in the top ten or the fail rate >= .5. This can be changed at any time using the keywords when the class is called or setting the class variables in the form `my_report_class.criteria_one = anewvalue`.

```python
# most_common is a property: reading it once avoids rebuilding the inventory
objects_selected_by_criteria = my_report_class.most_common
translated_and_style_for_display(objects_selected_by_criteria, my_report_class.lang_maps[my_report_class.language], my_report_class.language, gradient=False)
``` 

Calling `my_report_class.most_common` will return a dataframe that has the test statistic and description of all objects that meet the criteria.

In [8]:
# the objects that meet the criteria of the report,
# translated and styled for the language of the report
translated_and_style_for_display(
    a_report.most_common,
    a_report.lang_maps[a_report.language],
    a_report.language,
    gradient=False,
)

Unnamed: 0,Quantité,% Du Total,Pcs/M,Taux D'Échec
Brosse De Télésiège,5'181,38,0,9
"Fragments De Polystyrène Expansé: G76, G81, G82, G83",1'476,11,2,59
"Fragments De Plastique: G80, G79, G78, G75",1'299,9,23,91
"Couvercles En Plastique Bouteille: G21, G22, G23, G24",589,4,6,64
"Emballages De Bonbons, De Snacks",564,4,35,82
"Bâche, Feuille Plastique Industrielle",516,4,15,59
Mousse De Plastique Pour L'Isolation Thermique,453,3,3,55
Coton-Tige,453,3,7,50
Déchets De Construction En Plastique,307,2,15,59
Mégots Et Filtres À Cigarettes,221,2,9,73


#### Results by criteria and feature type

Once the objects of interest are identified (criteria) they can be compared across the different feature_types and labels.

```python
# keep the rows whose code is in the index of the most common objects
# and report them for each label of feature_type
t = a_cumulative_report(
    w_df[w_df.code.isin(a_report.most_common.index)],
    feature_name='feature_type',
    object_column='code',
)
translated_and_style_for_display(
    t,
    a_report.lang_maps[a_report.language],
    a_report.language,
    gradient=True,
)
``` 
For example, the most common objects are found at different densities depending on the feature type.

In [9]:
# keep the rows whose code is in the index of the most common objects
# and report them for each label of feature_type
t = a_cumulative_report(
    w_df[w_df.code.isin(a_report.most_common.index)],
    feature_name='feature_type',
    object_column='code',
)
translated_and_style_for_display(
    t,
    a_report.lang_maps[a_report.language],
    a_report.language,
    gradient=True,
)

Unnamed: 0,Parc,Lac,Rivière,Cumulé
Emballage Fast Food,0,27,0,3
Médical Conteneurs/Tubes/ Emballages,0,15,0,2
"Bouchons De Bouteilles En Métal, Couvercles Et Tirettes",2,3,0,2
Tabac Emballages En Plastique,0,16,0,1
Mégots Et Filtres À Cigarettes,16,13,0,9
"Emballages De Bonbons, De Snacks",17,77,0,35
Bâtonnets De Sucette,0,28,0,1
Jouets Et Faveurs De Fête,0,14,0,3
"Gobelets, Couvercles, Mousse À Usage Unique Et Plastique Dur",5,21,0,7
Pailles Et Agitateurs,0,16,0,3


#### Alternate object groups

If the data has another column of labeled values for object identification, it can be used to aggregate results for each sample id. Here we consider the `groupname` column. There is more than one object in each group, and the groups represent use cases.

```python
# all the objects are used: the results are aggregated by groupname
# for each label of feature_type
t = a_cumulative_report(w_df, feature_name='feature_type', object_column='groupname')
translated_and_style_for_display(t, a_report.lang_maps[a_report.language], a_report.language, gradient=True)
``` 
For example, the object groups are found at different densities depending on the feature type.

In [10]:
# all the objects, aggregated by groupname for each label of feature_type
t = a_cumulative_report(
    w_df,
    feature_name='feature_type',
    object_column='groupname',
)
translated_and_style_for_display(
    t,
    a_report.lang_maps[a_report.language],
    a_report.language,
    gradient=True,
)

Unnamed: 0,Parc,Lac,Rivière,Cumulé
Agriculture,2,96,3,23
Nourriture Et Boissons,26,314,1,102
Infrastructures,58,572,6,216
Micro-Plastiques (< 5Mm),3,54,0,7
Emballage Non Alimentaire,12,61,1,21
Articles Personnels,7,34,1,14
Morceaux De Plastique,23,256,1,68
Loisirs,11,99,3,38
Tabac,18,28,0,20
Non Classé,2,6,0,2


In [11]:
# keep the rows whose code is in the index of the most common objects
# and report them for each label of parent_boundary
t = a_cumulative_report(
    w_df[w_df.code.isin(a_report.most_common.index)],
    feature_name='parent_boundary',
    object_column='code',
)
translated_and_style_for_display(
    t,
    a_report.lang_maps[a_report.language],
    a_report.language,
    gradient=True,
)

Unnamed: 0,Alpes Et Jura,Rhône,Cumulé
Emballage Fast Food,0,18,3
Médical Conteneurs/Tubes/ Emballages,0,11,2
"Bouchons De Bouteilles En Métal, Couvercles Et Tirettes",2,2,2
Tabac Emballages En Plastique,0,13,1
Mégots Et Filtres À Cigarettes,16,6,9
"Emballages De Bonbons, De Snacks",17,50,35
Bâtonnets De Sucette,0,11,1
Jouets Et Faveurs De Fête,0,9,3
"Gobelets, Couvercles, Mousse À Usage Unique Et Plastique Dur",5,11,7
Pailles Et Agitateurs,0,11,3


In [12]:
# keep the rows whose code is in the index of the most common objects
# and report them for each label of feature_name
t = a_cumulative_report(
    w_df[w_df.code.isin(a_report.most_common.index)],
    feature_name='feature_name',
    object_column='code',
)
translated_and_style_for_display(
    t,
    a_report.lang_maps[a_report.language],
    a_report.language,
    gradient=True,
)

Unnamed: 0,Alpes-Valaisannes,Lac-Leman,Rhône,Cumulé
Emballage Fast Food,0,27,0,3
Médical Conteneurs/Tubes/ Emballages,0,15,0,2
"Bouchons De Bouteilles En Métal, Couvercles Et Tirettes",2,3,0,2
Tabac Emballages En Plastique,0,16,0,1
Mégots Et Filtres À Cigarettes,16,13,0,9
"Emballages De Bonbons, De Snacks",17,77,0,35
Bâtonnets De Sucette,0,28,0,1
Jouets Et Faveurs De Fête,0,14,0,3
"Gobelets, Couvercles, Mousse À Usage Unique Et Plastique Dur",5,21,0,7
Pailles Et Agitateurs,0,16,0,3


In [13]:
# keep the rows whose code is in the index of the most common objects
# and report them for each label of city
t = a_cumulative_report(
    w_df[w_df.code.isin(a_report.most_common.index)],
    feature_name='city',
    object_column='code',
)
translated_and_style_for_display(
    t,
    a_report.lang_maps[a_report.language],
    a_report.language,
    gradient=True,
)

Unnamed: 0,Lens,Riddes,Val-D'Illiez,Troistorrents,Nendaz,Val De Bagnes,Saint-Gingolph,Sion,Leuk,Salgesch,Cumulé
Emballage Fast Food,0,0,0,0,0,0,27,0,0,0,3
Médical Conteneurs/Tubes/ Emballages,0,1,0,3,4,0,15,0,0,0,2
"Bouchons De Bouteilles En Métal, Couvercles Et Tirettes",2,1,0,4,5,14,3,0,0,0,2
Tabac Emballages En Plastique,0,1,0,0,18,0,16,0,0,0,1
Mégots Et Filtres À Cigarettes,16,4,0,24,155,152,13,4,0,0,9
"Emballages De Bonbons, De Snacks",9,6,6,17,118,24,77,2,0,0,35
Bâtonnets De Sucette,0,0,0,0,0,0,28,0,0,0,1
Jouets Et Faveurs De Fête,0,0,0,0,0,0,14,0,2,0,3
"Gobelets, Couvercles, Mousse À Usage Unique Et Plastique Dur",5,0,5,3,7,0,21,0,0,0,7
Pailles Et Agitateurs,2,0,0,1,0,0,16,0,0,0,3


## Testing

There are 318'478 rows in the survey data. We can test the sorting and grouping functions by running a report class on all possible combinations of the features of interest.

```python
some_features = ['feature_type', 'parent_boundary', 'feature_name', 'canton', 'city']

def produce_reports_for_testing(survey_data, some_features):
    """
    Build a report for every label of every feature and collect its features.

    Args:
        survey_data (pd.DataFrame): The survey data. It must contain a 'date'
            column and every column named in some_features.
        some_features (List[str]): The columns whose labels are reported on.

    Returns:
        dict: {feature: {label: features of the ReportClass built for that
            label}}. The date range of each report runs from the first to the
            last date recorded for the label.
    """
    reports = {}
    for a_feature in some_features:
        label_reports = {}
        for label in survey_data[a_feature].unique():
            # select the dates of the label once and use them for both limits
            label_dates = survey_data.loc[survey_data[a_feature] == label, 'date']

            boundaries = {
                a_feature: label,
                'language': 'fr',
                'start_date': label_dates.min(),
                'end_date': label_dates.max(),
            }
            top_label, language, w_df, w_di = report_data(boundaries, survey_data.copy())
            a_report = ReportClass(w_df, w_di, boundaries, top_label, 'fr', c_l)
            label_reports[label] = a_report.features
        reports[a_feature] = label_reports
    return reports
   
t = produce_reports_for_testing(survey_data, some_features)
```

In [20]:
survey_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 318478 entries, 0 to 200785
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   loc_date         318478 non-null  object 
 1   date             318478 non-null  object 
 2   feature_name     318478 non-null  object 
 3   slug             318478 non-null  object 
 4   code             318478 non-null  object 
 5   pcs_m            318478 non-null  float64
 6   quantity         318478 non-null  int64  
 7   parent_boundary  318478 non-null  object 
 8   length           318478 non-null  int64  
 9   groupname        318478 non-null  object 
 10  city             318478 non-null  object 
 11  feature_type     318478 non-null  object 
 12  canton           318478 non-null  object 
dtypes: float64(1), int64(2), object(10)
memory usage: 34.0+ MB


In [14]:
some_features = ['feature_type', 'parent_boundary', 'feature_name', 'canton', 'city']

def produce_reports_for_testing(survey_data, some_features):
    """
    Build a report for every label of every feature and collect its features.

    Args:
        survey_data (pd.DataFrame): The survey data. It must contain a 'date'
            column and every column named in some_features.
        some_features (List[str]): The columns whose labels are reported on.

    Returns:
        dict: {feature: {label: features of the ReportClass built for that
            label}}. The date range of each report runs from the first to the
            last date recorded for the label.
    """
    reports = {}
    for a_feature in some_features:
        label_reports = {}
        for label in survey_data[a_feature].unique():
            # select the dates of the label once and use them for both limits
            label_dates = survey_data.loc[survey_data[a_feature] == label, 'date']

            boundaries = {
                a_feature: label,
                'language': 'fr',
                'start_date': label_dates.min(),
                'end_date': label_dates.max(),
            }
            top_label, language, w_df, w_di = report_data(boundaries, survey_data.copy())
            a_report = ReportClass(w_df, w_di, boundaries, top_label, 'fr', c_l)
            label_reports[label] = a_report.features
        reports[a_feature] = label_reports
    return reports
   
t = produce_reports_for_testing(survey_data, some_features)

In [16]:
%watermark -a hammerdirt-analyst -co --iversions

Author: hammerdirt-analyst

conda environment: cantonal_report

numpy     : 1.25.2
matplotlib: 3.7.1
pandas    : 2.0.3

