In [1]:
%load_ext watermark
import pandas as pd
import numpy as np
from typing import Type, Optional, Callable
from typing import List, Dict, Union, Tuple

from review_methods_tests import collect_vitals, find_missing, find_missing_loc_dates
from review_methods_tests import make_a_summary

import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.colors
from matplotlib.colors import LinearSegmentedColormap, ListedColormap

import setvariables as conf_
import reportclass as r_class

# Report class

The report class is used to generate descriptive statistics and identify objects of interest for a query defined by geographic, administrative and temporal bounds. The reference is the Swiss federal report published in 2022.

## Defining and collecting the data

The survey records are stored separately from the environmental data. Each survey is identified by an id that is a combination of the location and date of the survey `loc_date`. There can be up to 228 codes associated to one `loc_date`, most of them will be zero. The combination of `loc_date` and `code` should result in a unique value.

### basic requirements

__Define the limits of the request:__
   * temporal
   * geographic (includes features and parent boundaries)
   * object types
   * level of aggregation

Define what codes are being used

The default setting is to combine all the fragmented plastics into one group (all sizes) and the same for fragmented expanded polystyrene and plastic bottle tops. This results in three codes that represent objects that are very similar. This topic has been addressed many times. These groups register non-trivial quantities at most surveys. However, the differentiation of these objects into their respective subgroups, i.e. plastic caps for drinking vs. plastic caps for household cleaners, is not considered a priority by all groups that have collected data in the past.

* Gfrags
* Gfoams
* Gcaps

__Define the reporting language:__

The reporting language can be either French, German or English.

__Note:__ The reporting language is only applied at the moment of display. The column names, feature labels and other underlying identifying criteria for the data remain unchanged. The column name definitions and translations are in the _random variables_ section.

__Summary__
   
From the Annex in `testing_data_models` we identified the column combinations needed to slice the data depending on the report request. At the same time we identify the operations to be performed and when they are to be performed as different columns are used to group the data. This was summarised as follows:

   * `df (pd.DataFrame)`: The input DataFrame containing data for analysis.
   * `cumulative_columns (List, optional)`: List of columns to be considered for cumulative values.
   * `boundary_labels (List, optional)`: List of labels for boundary summaries.
   * `object_labels (List, optional)`: List of labels for individual objects.
   * `object_columns (List, optional)`: List of columns identifying objects.
   * `unit_agg (dict, optional)`: Aggregation methods for unit summaries.
   * `unit_columns (List, optional)`: List of columns for unit summaries.
   * `agg_groups (dict, optional)`: Aggregation methods for boundary summaries.


### Work data

A report can be defined by providing the temporal and geographic bounds of interest. Below is the current method.

__code sample:__

```python
# starting data, can be MySQL or NoSQL calls
# the three methods accept Callables, as long
# as the output is a pd.DataFrame
c_l = r_class.language_maps()
surveys = r_class.collect_survey_data_for_report()
codes, beaches, land_cover, land_use, streets, river_intersect_lakes = r_class.collect_env_data_for_report()

survey_data = surveys.copy()
survey_data = survey_data.merge(beaches['canton'], left_on='slug', right_index=True, validate='many_to_one')

# temporal and geographic boundaries
# user defined input
boundaries = dict(canton='Valais', language='fr', start_date='2019-01-01', end_date='2022-01-01')

# the level and label of the report
# the language for display
# the data for the report and all other
# from the data range
top_label, language, w_df, w_di = r_class.report_data(boundaries, survey_data.copy(), beaches, codes)

# display the first rows of the working data
w_df.head().style.set_table_styles(conf_.table_css_styles)
```

Which produces the following untranslated output.

In [2]:
# Source tables. The collectors can be backed by MySQL or NoSQL calls,
# or by any Callable, as long as the output is a pd.DataFrame.
c_l = r_class.language_maps()
surveys = r_class.collect_survey_data_for_report()
(
    codes,
    beaches,
    land_cover,
    land_use,
    streets,
    river_intersect_lakes,
) = r_class.collect_env_data_for_report()

# Add the canton of each survey location to the survey records.
# validate='many_to_one' makes the merge fail if a location appears
# more than once in the index of `beaches`.
survey_data = surveys.copy().merge(
    beaches['canton'],
    left_on='slug',
    right_index=True,
    validate='many_to_one',
)

# User defined input: the temporal and geographic boundaries of the report.
boundaries = {
    'feature_name': 'lac-leman',
    'language': 'fr',
    'start_date': "2020-03-01",
    'end_date': "2021-05-31",
}

# In order: the level and label of the report, the language for display,
# the data for the report and all other data from the date range.
top_label, language, w_df, w_di = r_class.report_data(
    boundaries,
    survey_data.copy(),
    beaches,
    codes,
)

# Preview the first rows of the report data with the shared table styles.
w_df.head().style.set_table_styles(conf_.table_css_styles)

Unnamed: 0,length,parent_boundary,feature_type,date,loc_date,city,canton,code,feature_name,groupname,slug,quantity,pcs_m
228,5,rhone,l,2020-06-30,"('vidy-ruines', '2020-06-30')",Lausanne,Vaud,G1,lac-leman,food and drink,vidy-ruines,0,0.0
229,5,rhone,l,2020-06-30,"('vidy-ruines', '2020-06-30')",Lausanne,Vaud,G10,lac-leman,food and drink,vidy-ruines,0,0.0
230,5,rhone,l,2020-06-30,"('vidy-ruines', '2020-06-30')",Lausanne,Vaud,G101,lac-leman,personal items,vidy-ruines,0,0.0
231,5,rhone,l,2020-06-30,"('vidy-ruines', '2020-06-30')",Lausanne,Vaud,G102,lac-leman,personal items,vidy-ruines,0,0.0
232,5,rhone,l,2020-06-30,"('vidy-ruines', '2020-06-30')",Lausanne,Vaud,G104,lac-leman,micro plastics (< 5mm),vidy-ruines,0,0.0


## Reporting categories

The first variable of the input is used to define the hierarchy of the report. For administrative purposes a vertical approach that reflects areas of responsibility is important. For estimating values the geographic/topographic attributes are more important.

The survey data is labeled for these purposes. The columns `parent_boundary`, `feature_type` and `feature_name` are the topographic features. 

1. `parent_boundary`: the name of the river basin, catchment area, park, geographic region or other zone defined by Swiss geo admin.
2. `feature_type`: lake, river or park
3. `feature_name`: the name of the lake, river or park

The `geo_h` array sets the order for reporting. Reports for cantons can contain subreports for all the values in the array, by default the cantonal results will reference the IQAASL report for threshold or prior results. Reports for cities will contain only geographic categories with reference to cantonal results.

__code sample:__


```python


# The order of the reporting hierarchy. The last two entries are the
# administrative levels and are handled specially by categorize_work_data.
geo_h = ['parent_boundary', 'feature_type',  'feature_name','canton', 'city']


def categorize_work_data(df, labels, columns_of_interest: List[str] = geo_h, sample_id: str = 'loc_date'):
    """List the unique labels of every reporting category for one selection.

    Args:
        df: The survey records. Must contain the column ``labels[0]``, the
            ``sample_id`` column and the retained columns of interest.
        labels: Two elements, ``[column_name, value]``. Only the rows where
            ``df[column_name] == value`` are considered.
        columns_of_interest: The ordered category columns. The last two
            entries are treated as the administrative levels (``'canton'``
            then ``'city'`` with the default).
        sample_id: The column that identifies a sample.

    Returns:
        ``{labels[1]: categories}`` where ``categories`` maps each retained
        column to the array of its unique values, plus the key ``'samples'``
        holding the unique sample ids. Keys follow the order of
        ``columns_of_interest`` and ``'samples'`` comes last.
    """
    data = df[df[labels[0]] == labels[1]]

    # If city is selected the available boundaries are geographic only:
    # a city is in only one canton. If canton is selected then city becomes
    # a category for which a report can be produced.
    if labels[0] == columns_of_interest[-1]:
        summaries = columns_of_interest[:-2]
    elif labels[0] == columns_of_interest[-2]:
        summaries = [*columns_of_interest[:-2], columns_of_interest[-1]]
    else:
        summaries = columns_of_interest

    # dict.fromkeys removes duplicates and keeps the declared order, so the
    # keys of the result are the same from one run to the next.
    new_columns = list(dict.fromkeys([sample_id, *summaries]))
    res = {a_column: data[a_column].unique() for a_column in new_columns}

    # Use the sample_id argument here: a hard-coded 'loc_date' raises a
    # KeyError as soon as the caller provides a different id column.
    res['samples'] = res.pop(sample_id)

    return {labels[1]: res}

# this categorizes the survey data into search terms
# the available data or reporting categories are retrieved
# by getting the length of the array for each category
# if the category is not present then the data is not available
parent_categories = categorize_work_data(w_df, top_label)
p_vals = parent_categories[boundaries[top_label[0]]]

# the type and number of reports available
reporting_categories = {k:len(v) for k, v in p_vals.items()}
reporting_categories
```

Which gives the following result:

In [3]:
# Split the survey data into search terms: every reporting category is
# mapped to the array of labels that are present in the data. A category
# that is absent from the result has no data available.
parent_categories = r_class.categorize_work_data(w_df, top_label)
p_vals = parent_categories[boundaries[top_label[0]]]

# The type and number of reports available: the length of the array
# of labels for each category.
reporting_categories = dict(zip(p_vals, map(len, p_vals.values())))
reporting_categories

{'feature_type': 1,
 'parent_boundary': 1,
 'city': 13,
 'canton': 3,
 'feature_name': 1,
 'samples': 98}

The same operation can be performed at each level. The first call to `categorize_work_data` gives the structure of the report. For each key value of the reporting categories there will be a set of descriptive statistics.

For example, a detailed report on all feature types within the canton includes the following summary data for each feature type.

__code sample:__

```python
# identify and count the results from parcs
parc_features = categorize_work_data(w_df[w_df.feature_type == p_vals['feature_type'][0]], top_label)

# count the contents in each attribute
{k:len(v) for k, v in parc_features[top_label[1]].items()}

# out =>

{'city': 6,
 'feature_type': 1,
 'parent_boundary': 1,
 'feature_name': 1,
 'samples': 7}

```

In this example there are 7 samples from 6 cities in the parcs feature_type. 

The summary of each label for each feature in the current data set can be obtained by providing the feature of interest to the groupby columns. By default the sample id: `loc_date` and the location name `slug` are required.

__Essential:__ The boundaries variable defines the top level data structure, for example: `boundaries={'canton': 'Valais', 'language': 'fr', 'start_date': '2019-01-01', 'end_date': '2022-01-01'}` will produce a dataframe with records only from the canton of Valais within the dates defined. Tables and charts will be translated to french.

The report class uses the resulting data structure to define reports for the different features within the dataframe.

## The report class

The report class provides the set of arguments that define the structure of the report based on the user input. Those arguments are a property of the class `ReportClass.features`. The `ReportClass` also identifies objects that meet the criteria defined by `mc_criteria_one` and `mc_criteria_two`.

To start a `ReportClass` call it with the dataframes of interest, the boundaries, the top level report, the language and the language map.

```python
a_report = r_class.ReportClass(w_df, boundaries, top_label, 'fr', c_l)
a_report.the_number_of_attributes_in_a_feature('feature_type')
```

### The number and types of features in a report

Once a `ReportClass` is initiated a summary of the attributes can be obtained:

In [4]:
# Rebuild the report data from the boundaries and start the report class.
top_label, language, w_df, w_di = r_class.report_data(
    boundaries,
    survey_data.copy(),
    beaches,
    codes,
)
a_report = r_class.ReportClass(w_df, boundaries, top_label, 'fr', c_l)

# Count the attributes of each feature type, then translate and style
# the table for display in the report language.
r_class.translated_and_style_for_display(
    a_report.the_number_of_attributes_in_a_feature('feature_type'),
    a_report.lang_maps[a_report.language],
    a_report.language,
    gradient=False,
)
# a_report.features

Unnamed: 0,Zone,Région,Ville,Canton,Feature_Name,Échantillons
Lac,1,1,13,3,1,98


#### A top level description

The first output says there are three feature types (lakes, rivers and parks) in the data. There is one lake that was sampled 11 times, a river was sampled 4 times and the parks were sampled 7 times. In total there were 11 cities, 4 on the river, 6 in the parcs feature and 1 on the lake.

Recall that the geographic column names are: `['feature_type', 'feature_name', 'parent_boundary']`.  The survey results from each sector can be compared by selecting the column name of interest. Depending on the value of `boundaries` all the column names may not be available. There is a method to identify exactly what features are available. Note in the example below, canton is not an option. This is because the boundaries were set for a canton.

```python
my_features = my_report_class.available_features()

print(my_features)

=> ['parent_boundary', 'feature_type', 'feature_name', 'city']
```

The `summarize_feature_labels` method in the `ReportClass` creates a summary of the sample totals for each label of the selected feature. Calling the `translated_and_style_for_display` method puts the table to html and applies language specific formatting using the `dataframe.style` method. The index and column names are translated using the language maps.

```python
feature_type_summary =  my_report_class.summarize_feature_labels('feature_type')

translated_and_style_for_display(feature_type_summary, my_report_class.lang_maps[my_report_class.language], my_report_class.language, gradient=False)
```

Combined with the output from above a description of the data and how it was collected can be constructed; the highlighted text can be called from the active variables.
> There were `13'782` objects identified in the period between `2019-01-01` and `2021-12-31` in the `canton` of `Valais`. In total, `22` samples were recorded, `11` on the `lake-shore`, `7` in `ski-areas` and `4` on `riverbanks`.  The lake samples were recorded from `one` `city` on the other hand the alpes and rivers were taken from `10` `different cities`. The `median` sample total of _pieces of trash per meter_ `pcs/m` is highest at the `lakeside`, followed by the `parcs` and `rivers`.

```{admonition} Ships search terms
The `ReportClass.features` is a dictionary or .JSON file that contains the commonly labeled geographic name of the region in question. Structuring these into search terms is a way to integrate an LLM into the analysis.
```

In [5]:
# Summarize the sample totals for each label of `feature_type`, then
# translate and style the table for display in the report language.
r_class.translated_and_style_for_display(
    a_report.summarize_feature_labels('feature_type'),
    a_report.lang_maps[a_report.language],
    a_report.language,
    gradient=False,
)

Unnamed: 0_level_0,Pcs_M
label,L
25%,230
50%,451
75%,968
Échantillons,98
Max,6617
Moyenne,866
Min,022
Écart-Type,1167
Total,27'447


#### By criteria

Objects can be selected by criteria. The default criteria requires that the quantity be in the top ten or the fail rate >= .5. This can be changed at any time using the keywords when the class is called or setting the class variables in the form `my_report_class.criteria_one = anewvalue`.

```python
objects_selected_by_criteria = my_report_class.most_common
translated_and_style_for_display(a_report.most_common, a_report.lang_maps[a_report.language], a_report.language, gradient=False)
``` 

Calling `my_report_class.most_common` will return a dataframe that has the test statistic and description of all objects that meet the criteria.

In [6]:
# Display the objects that meet the selection criteria of the report,
# translated and styled for the report language.
r_class.translated_and_style_for_display(
    a_report.most_common,
    a_report.lang_maps[a_report.language],
    a_report.language,
    gradient=False,
)

Unnamed: 0,Quantité,% Du Total,Pcs/M,Taux D'Échec
"Fragments De Plastique: G80, G79, G78, G75",4'212,15,10,97
"Fragments De Polystyrène Expansé: G76, G81, G82, G83",3'586,13,4,84
Mégots Et Filtres À Cigarettes,3'116,11,47,95
"Emballages De Bonbons, De Snacks",1'679,6,21,96
Pellets Industriels (Gpi),1'387,5,0,47
"Couvercles En Plastique Bouteille: G21, G22, G23, G24",1'212,4,2,89
Coton-Tige,1'112,4,12,82
Mousse De Plastique Pour L'Isolation Thermique,1'097,4,4,78
"Bâche, Feuille Plastique Industrielle",1'058,4,9,78
Polystyrène < 5Mm,689,3,0,30


#### Results by criteria and feature type

Once the objects of interest are identified (criteria) they can be compared across the different feature types and labels.

```python
t = a_cumulative_report(w_df[w_df.code.isin(a_report.most_common.index)], feature_name='feature_type', object_column='code')
translated_and_style_for_display(t, a_report.lang_maps[a_report.language], a_report.language, gradient=True)
``` 
For example the most common objects are found at different densities depending on the feature type.

In [7]:
# Keep the records whose code is in the index of the most common objects
# and build the cumulative report per feature type.
t = r_class.a_cumulative_report(
    w_df[w_df['code'].isin(a_report.most_common.index)],
    feature_name='feature_type',
    object_column='code',
)
r_class.translated_and_style_for_display(
    t,
    a_report.lang_maps[a_report.language],
    a_report.language,
    gradient=True,
)

Unnamed: 0,Cumulé,Lac
Emballage Fast Food,1,1
Médical Conteneurs/Tubes/ Emballages,2,2
Pellets Industriels (Gpi),0,0
Polystyrène < 5Mm,0,0
Papier D'Emballage D'Aluminium,2,2
"Bouchons De Bouteilles En Métal, Couvercles Et Tirettes",3,3
Verre Brisé,3,3
Tabac Emballages En Plastique,2,2
Mégots Et Filtres À Cigarettes,47,47
"Emballages De Bonbons, De Snacks",21,21


#### Alternate object groups

If the column has other labeled values for object identification it can be used to aggregate results for each sample id. Here we consider `groupname`; there is more than one object in a group. They represent use cases.

```python
t = a_cumulative_report(w_df, feature_name='feature_type', object_column='groupname')
translated_and_style_for_display(t, a_report.lang_maps[a_report.language], a_report.language, gradient=True)
``` 
For example the different use cases are found at different densities depending on the feature type.

In [8]:
# Build the cumulative report per feature type from all the records,
# with the objects aggregated by `groupname` instead of `code`.
t = r_class.a_cumulative_report(
    w_df,
    feature_name='feature_type',
    object_column='groupname',
)
r_class.translated_and_style_for_display(
    t,
    a_report.lang_maps[a_report.language],
    a_report.language,
    gradient=True,
)

Unnamed: 0,Cumulé,Lac
Agriculture,14,14
Nourriture Et Boissons,88,88
Infrastructures,71,71
Micro-Plastiques (< 5Mm),16,16
Emballage Non Alimentaire,13,13
Articles Personnels,10,10
Morceaux De Plastique,61,61
Loisirs,19,19
Tabac,52,52
Non Classé,2,2


#### By Survey area or `parent_boundary`

There are two parent boundaries in the Valais, the _Alpes and Jura_ and the _Rhône_ river basin.

In [9]:
# Keep the records whose code is in the index of the most common objects
# and build the cumulative report per parent boundary.
t = r_class.a_cumulative_report(
    w_df[w_df['code'].isin(a_report.most_common.index)],
    feature_name='parent_boundary',
    object_column='code',
)
r_class.translated_and_style_for_display(
    t,
    a_report.lang_maps[a_report.language],
    a_report.language,
    gradient=True,
)

Unnamed: 0,Cumulé,Rhône
Emballage Fast Food,1,1
Médical Conteneurs/Tubes/ Emballages,2,2
Pellets Industriels (Gpi),0,0
Polystyrène < 5Mm,0,0
Papier D'Emballage D'Aluminium,2,2
"Bouchons De Bouteilles En Métal, Couvercles Et Tirettes",3,3
Verre Brisé,3,3
Tabac Emballages En Plastique,2,2
Mégots Et Filtres À Cigarettes,47,47
"Emballages De Bonbons, De Snacks",21,21


#### By feature name:

There are three different features with samples: The Alpes, the Rhône river and Lake Geneva.

In [10]:
# Keep the records whose code is in the index of the most common objects
# and build the cumulative report per feature name.
t = r_class.a_cumulative_report(
    w_df[w_df['code'].isin(a_report.most_common.index)],
    feature_name='feature_name',
    object_column='code',
)
r_class.translated_and_style_for_display(
    t,
    a_report.lang_maps[a_report.language],
    a_report.language,
    gradient=True,
)

Unnamed: 0,Cumulé,Lac-Leman
Emballage Fast Food,1,1
Médical Conteneurs/Tubes/ Emballages,2,2
Pellets Industriels (Gpi),0,0
Polystyrène < 5Mm,0,0
Papier D'Emballage D'Aluminium,2,2
"Bouchons De Bouteilles En Métal, Couvercles Et Tirettes",3,3
Verre Brisé,3,3
Tabac Emballages En Plastique,2,2
Mégots Et Filtres À Cigarettes,47,47
"Emballages De Bonbons, De Snacks",21,21


#### By city:

In [11]:
# Keep the records whose code is in the index of the most common objects
# and build the cumulative report per city.
t = r_class.a_cumulative_report(
    w_df[w_df['code'].isin(a_report.most_common.index)],
    feature_name='city',
    object_column='code',
)
r_class.translated_and_style_for_display(
    t,
    a_report.lang_maps[a_report.language],
    a_report.language,
    gradient=True,
)

Unnamed: 0,Allaman,Bourg-En-Lavaux,Genève,Gland,La Tour-De-Peilz,Lausanne,Montreux,Préverenges,Saint-Gingolph,Saint-Sulpice (Vd),Tolochenaz,Versoix,Vevey,Cumulé
Emballage Fast Food,3,0,0,3,4,0,0,0,27,5,3,0,2,1
Médical Conteneurs/Tubes/ Emballages,8,8,0,2,2,20,0,1,15,27,3,2,4,2
Pellets Industriels (Gpi),0,4,4,0,0,53,0,58,3,68,0,2,0,0
Polystyrène < 5Mm,0,0,0,0,0,0,0,0,18,2,19,0,26,0
Papier D'Emballage D'Aluminium,0,4,3,3,0,13,0,2,0,17,0,0,5,2
"Bouchons De Bouteilles En Métal, Couvercles Et Tirettes",5,0,7,3,3,12,3,2,3,5,4,0,5,3
Verre Brisé,0,21,1,3,4,40,29,0,3,43,8,1,9,3
Tabac Emballages En Plastique,8,0,0,1,1,7,0,4,16,0,0,3,4,2
Mégots Et Filtres À Cigarettes,31,34,11,17,35,121,119,76,13,86,107,38,111,47
"Emballages De Bonbons, De Snacks",62,8,15,8,12,50,9,16,77,49,1,17,42,21


## Retrieving properties 

There are 318'478 rows in the survey data. We can test the sorting and grouping functions by running a report class on all possible combinations of the features of interest. The test should produce the set of arguments that define the survey locations and surveys that define the boundaries of a report.

```python
some_features = ['feature_type', 'parent_boundary', 'feature_name', 'canton', 'city']

def produce_reports_for_testing(survey_data, some_features):
    reports = {}
    for a_feature in some_features:
        labels = survey_data[a_feature].unique()
        label_reports = {}
        for label in labels:
            start_date = survey_data[survey_data[a_feature] == label]['date'].min()
            end_date = survey_data[survey_data[a_feature] == label]['date'].max()
            
            boundaries = {a_feature:label, 'language':'fr', 'start_date':start_date, 'end_date':end_date}
            top_label, language, w_df, w_di = report_data(boundaries, survey_data.copy(), beaches, codes)
            a_report = ReportClass(w_df, boundaries, top_label, 'fr', c_l)
            label_reports.update({label:a_report.features})
        reports.update({a_feature:label_reports})
    return reports
   
t = produce_reports_for_testing(survey_data, some_features)

t['canton']['Valais']
```

In [12]:
# some_features = ['feature_type', 'parent_boundary', 'feature_name', 'canton', 'city']

# def produce_reports_for_testing(survey_data, some_features):
#     reports = {}
#     for a_feature in some_features:
#         labels = survey_data[a_feature].unique()
#         label_reports = {}
#         for label in labels:
#             start_date = survey_data[survey_data[a_feature] == label]['date'].min()
#             end_date = survey_data[survey_data[a_feature] == label]['date'].max()
            
#             boundaries = {a_feature:label, 'language':'fr', 'start_date':start_date, 'end_date':end_date}
#             top_label, language, w_df, w_di = r_class.report_data(boundaries, survey_data.copy(), beaches, codes)
#             a_report = r_class.ReportClass(w_df, boundaries, top_label, 'fr', c_l)
#             label_reports.update({label:a_report.features})
#         reports.update({a_feature:label_reports})
#     return reports
   
# t = produce_reports_for_testing(survey_data, some_features)

In [13]:
# from joblib import Parallel, delayed
# from reportclass import ReportClass

# def process_feature(a_feature, survey_data, beaches, codes):
#     labels = survey_data[a_feature].unique()
#     label_reports = {}

#     for label in labels:
#         start_date = survey_data[survey_data[a_feature] == label]['date'].min()
#         end_date = survey_data[survey_data[a_feature] == label]['date'].max()

#         boundaries = {a_feature: label, 'language': 'fr', 'start_date': start_date, 'end_date': end_date}
#         top_label, language, w_df, w_di = r_class.report_data(boundaries, survey_data.copy(), beaches, codes)
#         a_report = r_class.ReportClass(w_df, boundaries, top_label, 'fr', c_l)
#         label_reports.update({label: a_report.features})

#     return a_feature, label_reports

# def produce_reports_for_testing_parallel(survey_data, some_features, num_jobs=-1):
#     reports = Parallel(n_jobs=num_jobs)(delayed(process_feature)(feature, survey_data, beaches, codes) for feature in some_features)

#     result_dict = dict(reports)
#     return result_dict
# t = produce_reports_for_testing_parallel(survey_data, some_features)

In [14]:
# t['canton']['Valais']

The properties should contain the arguments for cities in the example report

In [15]:
# t['city']['Saint-Gingolph']

In [16]:
# Show the [label_column, label_value] pair returned by the last call to
# r_class.report_data; it identifies the top level of the current report.
top_label

['feature_name', 'lac-leman']

## Unit tests

### The report class

#### The `ReportClass` takes the following parameters:

* w_df (pd.DataFrame, optional): The survey data DataFrame for report generation.
* boundaries (dict, optional): A dictionary defining the reporting boundaries, including 'start_date', 'end_date', and 'language'.
* top_label (List, optional): A list containing two elements - [label_column, label_value].
* language (str, optional): The language in which the report is generated.
* lang_maps (pd.DataFrame, optional): A DataFrame containing language mapping data.
* mc_criteria_one (dict, optional): The first criteria for identifying objects of interest.
* mc_criteria_two (dict, optional): The second criteria for identifying objects of interest.
* ooi (str, optional): The name of the object of interest column.

#### And has the following methods:

- features: Get a list of available features for report generation.
- available_features: Get a list of available features based on predefined criteria.
- inventory: Get an inventory of objects with summary statistics.
- most_common: Find the most common objects based on criteria.
- summarize_feature_labels: Summarize data for a specific feature.
- the_number_of_attributes_in_a_feature: Count attributes in a feature.
- `__repr__` : Return a string representation of the ReportClass instance.

#### Other methods in the `reportclass` module.



```python
from inspect import getmembers, isfunction

functions_list = getmembers(reportclass, isfunction)
[x[0] for x in functions_list]

```


* __'a_cumulative_report'__,
* 'a_summary_of_one_vector',
* 'add_column_to_work_data',
* 'add_columns_to_work_data',
* __'aggregate_boundaries'__,
* __'aggregate_dataframe'__,
* __'calculate_rate_per_unit'__,
* 'capitalize_index',
* 'categorize_work_data',
* 'check_for_top_label',
* 'collect_env_data_for_report',
* 'collect_survey_data_for_report',
* 'color_gradient',
* 'combine_survey_files',
* 'count_objects_with_positive_quantity',
* 'display_tabular_data_by_column_values',
* 'get_top_x_records_with_max_quantity',
* 'language_maps',
* 'merge_dataframes_on_column_and_index',
* 'report_data',
* 'slice_data_by_date',
* 'translate_for_display',
* 'translate_word',
* 'translated_and_style_for_display',
* __'use_gfrags_gfoams_gcaps'__,
* __'use_parent_groups_or_gfrags'__

In [17]:
# from inspect import getmembers, isfunction

# functions_list = getmembers(r_class, isfunction)
# [x[0] for x in functions_list]

### Combining codes using parent groups

#### Specific to beach litter data

* `reportclass.use_parent_groups_or_gfrags`
* `reportclass.use_gfrags_gfoams_gcaps`

config setting: `setvariables.code_result_columns`

> Defines the set of columns to use when aggregating to the object level

```python
import unittest
class TestUseGfragsGfoamsGcaps(unittest.TestCase):

    def test_use_gfrags_gfoams_gcaps(self):
        # Sample data and code mappings
        data = pd.DataFrame({'code': ['A', 'B', 'C', 'D', 'E', 'F'],
                             'sample_id': [1, 2, 1, 2, 1, 1],
                             'density': [1.5, 0.5, 1.5, 0.5, 1.5, 1],
                             'quantity': [2, 1, 2, 1, 2, 1]})
        codes = pd.DataFrame({'parent_code': ['Gfoams', 'Gfrags', 'Gcaps','Gfoams', 'Gcaps', 'F'],
                              'code': ['A', 'B', 'C', 'D', 'E', 'F']})
        codes.set_index('code', inplace=True)
        # Expected result
        expected_result = pd.DataFrame(
            {'code': {0: 'Gfoams', 1: 'Gfrags', 2: 'Gcaps', 3: 'Gfoams',  4: 'Gcaps',  5: 'F'},
             'sample_id': {0: 1, 1: 2, 2: 1, 3: 2, 4: 1, 5: 1},
             'density': {0: 1.5, 1: 0.5, 2: 1.5, 3: 0.5, 4: 1.5, 5: 1.0},
             'quantity': {0: 2, 1: 1, 2: 2, 3: 1, 4: 2, 5: 1}}
        )

        # Call the function
        updated_data = r_class.use_gfrags_gfoams_gcaps(data, codes)

        # Check if the result matches the expected result
        pd.testing.assert_frame_equal(updated_data, expected_result)

test_suite = unittest.TestLoader().loadTestsFromTestCase(TestUseGfragsGfoamsGcaps)
test_runner = unittest.TextTestRunner(verbosity=3)
test_result = test_runner.run(test_suite)
```

In [18]:
import unittest
class TestUseGfragsGfoamsGcaps(unittest.TestCase):
    """Unit test for `r_class.use_gfrags_gfoams_gcaps`."""

    def test_use_gfrags_gfoams_gcaps(self):
        """The `code` of each record is expected to become its `parent_code`."""
        # Each child code and the parent code it is mapped to.
        child_codes = ['A', 'B', 'C', 'D', 'E', 'F']
        parent_codes = ['Gfoams', 'Gfrags', 'Gcaps', 'Gfoams', 'Gcaps', 'F']

        # These columns are expected to come back unchanged.
        measures = {
            'sample_id': [1, 2, 1, 2, 1, 1],
            'density': [1.5, 0.5, 1.5, 0.5, 1.5, 1],
            'quantity': [2, 1, 2, 1, 2, 1],
        }

        # Sample data and the code mappings, indexed by child code.
        data = pd.DataFrame({'code': child_codes, **measures})
        codes = pd.DataFrame({'parent_code': parent_codes, 'code': child_codes}).set_index('code')

        # Expected result: same rows, same order, parent codes in `code`.
        expected_result = pd.DataFrame({'code': parent_codes, **measures})

        updated_data = r_class.use_gfrags_gfoams_gcaps(data, codes)

        pd.testing.assert_frame_equal(updated_data, expected_result)


# Run the test case in the notebook and report each test by name.
test_suite = unittest.TestLoader().loadTestsFromTestCase(TestUseGfragsGfoamsGcaps)
test_runner = unittest.TextTestRunner(verbosity=3)
test_result = test_runner.run(test_suite)


test_use_gfrags_gfoams_gcaps (__main__.TestUseGfragsGfoamsGcaps) ... ok

----------------------------------------------------------------------
Ran 1 test in 0.005s

OK


### Aggregating values

* `reportclass.aggregate_dataframe`

#### config settings

Accepts the following arguments for the methods given the defaults `code_result_columns` and `work_columns`:

* `setvariables.agg_groups`
* `setvariables.unit_agg`


```python
class TestAggregateDataFrame(unittest.TestCase):

    def test_aggregate_dataframe(self):
        # Sample data
        data = pd.DataFrame({'code': ['A', 'B', 'C', 'D', 'E', 'F'],
                             'sample_id': [1, 2, 1, 2, 1, 1],
                             'density': [1.5, 0.5, 1.5, 0.5, 1.5, 1],
                             'quantity': [2, 1, 2, 1, 2, 1],
                             'prop a': ['s1', 's2','s1','s1','s2', 's3'],
                             'prop b': ['x' ,'x', 'z','z','z', 'q']})
        group_by_columns = ['sample_id','prop a']
        aggregation_functions = {'quantity': 'sum','density': 'median'}

        # Expected result
        expected_result = pd.DataFrame({
            'sample_id': {0: 1, 1: 1, 2: 1, 3: 2, 4: 2},
            'prop a': {0: 's1', 1: 's2', 2: 's3', 3: 's1', 4: 's2'},
            'quantity': {0: 4, 1: 2, 2: 1, 3: 1, 4: 1},
            'density': {0: 1.5, 1: 1.5, 2: 1.0, 3: 0.5, 4: 0.5}})

        # Call the function
        result = r_class.aggregate_dataframe(data, groupby_columns=group_by_columns, aggregation_functions=aggregation_functions)

        # Check if the result matches the expected result
        pd.testing.assert_frame_equal(result, expected_result)

test_suite = unittest.TestLoader().loadTestsFromTestCase(TestAggregateDataFrame)
test_runner = unittest.TextTestRunner(verbosity=3)
test_result = test_runner.run(test_suite)
```

In [19]:
class TestAggregateDataFrame(unittest.TestCase):

    def test_aggregate_dataframe(self):
        # Sample data
        data = pd.DataFrame({'code': ['A', 'B', 'C', 'D', 'E', 'F'],
                             'sample_id': [1, 2, 1, 2, 1, 1],
                             'density': [1.5, 0.5, 1.5, 0.5, 1.5, 1],
                             'quantity': [2, 1, 2, 1, 2, 1],
                             'prop a': ['s1', 's2','s1','s1','s2', 's3'],
                             'prop b': ['x' ,'x', 'z','z','z', 'q']})
        group_by_columns = ['sample_id','prop a']
        aggregation_functions = {'quantity': 'sum','density': 'median'}

        # Expected result
        expected_result = pd.DataFrame({
            'sample_id': {0: 1, 1: 1, 2: 1, 3: 2, 4: 2},
            'prop a': {0: 's1', 1: 's2', 2: 's3', 3: 's1', 4: 's2'},
            'quantity': {0: 4, 1: 2, 2: 1, 3: 1, 4: 1},
            'density': {0: 1.5, 1: 1.5, 2: 1.0, 3: 0.5, 4: 0.5}})

        # Call the function
        result = r_class.aggregate_dataframe(data, groupby_columns=group_by_columns, aggregation_functions=aggregation_functions)

        # Check if the result matches the expected result
        pd.testing.assert_frame_equal(result, expected_result)

# Load the tests of TestAggregateDataFrame and run them, reporting each test by name.
test_runner = unittest.TextTestRunner(verbosity=3)
test_suite = unittest.TestLoader().loadTestsFromTestCase(TestAggregateDataFrame)
test_result = test_runner.run(test_suite)


test_aggregate_dataframe (__main__.TestAggregateDataFrame) ... ok

----------------------------------------------------------------------
Ran 1 test in 0.006s

OK


### Calculating rate per unit

* `reportclass.calculate_rate_per_unit`

#### config settings

The methods argument accepts either of the following settings; the default is `setvariables.unit_agg`:

* `setvariables.agg_groups`
* `setvariables.unit_agg`


```python
class TestCalculateRatePerUnit(unittest.TestCase):

    def test_calculate_rate_per_unit(self):
        # Sample data
        data = pd.DataFrame({
            'sample': [1, 2, 3, 4, 5, 6],
            'object': ['A', 'B', 'A', 'A', 'B', 'B'],
            'quantity': [10, 20, 30, 40, 50, 60],
            'pcs_m': [1,1,2,1, 2, 2],
        })

        # Objects to check
        column_of_interest = 'object'
        objects_to_check = ['A', 'B']
        groupby_columns = ['object']
        
        # Aggregation functions
        aggregation_methods = {
            'quantity': 'sum',
            'pcs_m': 'median'
        }
        
        # Expected result
        expected_result = pd.DataFrame({
            
            'pcs_m': {'A': 1.0, 'B': 2.0},
            'quantity': {'A': 80, 'B': 130},
            'label': {'A': 'all', 'B': 'all'}})
        
        # Call the function
        result = r_class.calculate_rate_per_unit(data, objects_to_check,column_of_interest=column_of_interest, groupby_columns=groupby_columns)
        result.index.name = None

        # Check if the result matches the expected result
        pd.testing.assert_frame_equal(result, expected_result)

test_suite = unittest.TestLoader().loadTestsFromTestCase(TestCalculateRatePerUnit)
test_runner = unittest.TextTestRunner(verbosity=3)
test_result = test_runner.run(test_suite)

```

In [20]:
class TestCalculateRatePerUnit(unittest.TestCase):

    def test_calculate_rate_per_unit(self):
        # Sample data
        data = pd.DataFrame({
            'sample': [1, 2, 3, 4, 5, 6],
            'object': ['A', 'B', 'A', 'A', 'B', 'B'],
            'quantity': [10, 20, 30, 40, 50, 60],
            'pcs_m': [1,1,2,1, 2, 2],
        })

        # Objects to check
        column_of_interest = 'object'
        objects_to_check = ['A', 'B']
        groupby_columns = ['object']
        
        # Aggregation functions
        aggregation_methods = {
            'quantity': 'sum',
            'pcs_m': 'median'
        }
        
        # Expected result
        expected_result = pd.DataFrame({
            
            'pcs_m': {'A': 1.0, 'B': 2.0},
            'quantity': {'A': 80, 'B': 130},
            'label': {'A': 'all', 'B': 'all'}})
        
        # Call the function
        result = r_class.calculate_rate_per_unit(data, objects_to_check,column_of_interest=column_of_interest, groupby_columns=groupby_columns)
        result.index.name = None

        # Check if the result matches the expected result
        pd.testing.assert_frame_equal(result, expected_result)

# Load the tests of TestCalculateRatePerUnit and run them, reporting each test by name.
test_runner = unittest.TextTestRunner(verbosity=3)
test_suite = unittest.TestLoader().loadTestsFromTestCase(TestCalculateRatePerUnit)
test_result = test_runner.run(test_suite)

test_calculate_rate_per_unit (__main__.TestCalculateRatePerUnit) ... ok

----------------------------------------------------------------------
Ran 1 test in 0.006s

OK


### Aggregate boundaries

* `reportclass.aggregate_boundaries`

Is called by `reportclass.a_cumulative_report` and calls `reportclass.aggregate_dataframe`.

```python

class TestAggregateBoundaries(unittest.TestCase):
    
    def setUp(self):
        # Create a sample DataFrame for testing
        data = pd.DataFrame({
            'Region': ['A', 'A', 'B', 'B', 'A', 'B'],
            'Year': [2019, 2019, 2020, 2020, 2021, 2021],
            'city': ['a1', 'a2', 'b1', 'b1', 'a2', 'b2'],
            'sample id' : [1, 1, 2, 2, 3, 4],
            'Objects': ['X', 'Y', 'X', 'Y', 'Y', 'X'],
            'Density': [10, 20, 30, 40, 50, 60],
            'Quantity': [1, 1, 1, 1, 1, 1]
        })
        # test data
        self.df = pd.DataFrame(data)
        
        # from user input
        self.feature_name = 'Year'
        self.object_column = 'Objects'
        self.sample_id = 'sample id'

        # from default or user input
        self.unit_methods = {'Density': 'sum', 'Quantity': 'count'}
        self.group_methods = {'Quantity': 'sum','Density': 'median'}

        # the feature name, sample id and object columns make up the groupby columns
        # the feauture name is used to mask the different child boundaries
        self.groupby_columns=[self.feature_name, self.sample_id, self.object_column]

        # the labels of the child boundaries are collected
        # using the feature name variable
        self.boundary_labels = self.df[self.feature_name].unique()

        # the boundary columns are used when aggregating the child boundaries
        self.boundary_columns = [self.object_column]


    def test_aggregate_boundaries_without_labels(self):
       
        args = {
            'groupby_columns':self.groupby_columns,
            'unit_agg':self.unit_methods,
            'group_agg': self.group_methods,
            'boundary_labels': None,
            'boundary_columns': self.boundary_columns}

        expected = pd.DataFrame(
            {'Objects': {0: 'X', 1: 'Y'},
             'Quantity': {0: 3, 1: 3},
             'Density': {0: 30.0, 1: 40.0},
             'label': {0: 'all', 1: 'all'}
            })
        
        result = r_class.aggregate_boundaries(self.df, **args)
        
        # Check if the result matches the expected result
        pd.testing.assert_frame_equal(result, expected)

    def test_aggregate_boundaries_with_labels(self):
        args = {
            'groupby_columns':self.groupby_columns,
            'unit_agg':self.unit_methods,
            'group_agg': self.group_methods,
            'boundary_labels': self.boundary_labels,
            'boundary_columns': self.boundary_columns}

        expected = pd.DataFrame(
            {'Objects': ['X', 'Y', 'X', 'Y', 'X', 'Y'],
             'Quantity': [1, 1, 1, 1, 1, 1],
             'Density': [10., 20., 30., 40., 60., 50.],
             'label': [2019, 2019, 2020, 2020, 2021, 2021]
            })
        
        result = r_class.aggregate_boundaries(self.df, **args)
        result.reset_index(inplace=True, drop=True)
        
        
        # Check if the result matches the expected result
        pd.testing.assert_frame_equal(result, expected)

``` 

In [21]:


class TestAggregateBoundaries(unittest.TestCase):
    
    def setUp(self):
        # Create a sample DataFrame for testing
        data = pd.DataFrame({
            'Region': ['A', 'A', 'B', 'B', 'A', 'B'],
            'Year': [2019, 2019, 2020, 2020, 2021, 2021],
            'city': ['a1', 'a2', 'b1', 'b1', 'a2', 'b2'],
            'sample id' : [1, 1, 2, 2, 3, 4],
            'Objects': ['X', 'Y', 'X', 'Y', 'Y', 'X'],
            'Density': [10, 20, 30, 40, 50, 60],
            'Quantity': [1, 1, 1, 1, 1, 1]
        })
        # test data
        self.df = pd.DataFrame(data)
        
        # from user input
        self.feature_name = 'Year'
        self.object_column = 'Objects'
        self.sample_id = 'sample id'

        # from default or user input
        self.unit_methods = {'Density': 'sum', 'Quantity': 'count'}
        self.group_methods = {'Quantity': 'sum','Density': 'median'}

        # the feature name, sample id and object columns make up the groupby columns
        # the feauture name is used to mask the different child boundaries
        self.groupby_columns=[self.feature_name, self.sample_id, self.object_column]

        # the labels of the child boundaries are collected
        # using the feature name variable
        self.boundary_labels = self.df[self.feature_name].unique()

        # the boundary columns are used when aggregating the child boundaries
        self.boundary_columns = [self.object_column]


    def test_aggregate_boundaries_without_labels(self):
       
        args = {
            'groupby_columns':self.groupby_columns,
            'unit_agg':self.unit_methods,
            'group_agg': self.group_methods,
            'boundary_labels': None,
            'boundary_columns': self.boundary_columns}

        expected = pd.DataFrame(
            {'Objects': {0: 'X', 1: 'Y'},
             'Quantity': {0: 3, 1: 3},
             'Density': {0: 30.0, 1: 40.0},
             'label': {0: 'all', 1: 'all'}
            })
        
        result = r_class.aggregate_boundaries(self.df, **args)
        
        # Check if the result matches the expected result
        pd.testing.assert_frame_equal(result, expected)

    def test_aggregate_boundaries_with_labels(self):
        args = {
            'groupby_columns':self.groupby_columns,
            'unit_agg':self.unit_methods,
            'group_agg': self.group_methods,
            'boundary_labels': self.boundary_labels,
            'boundary_columns': self.boundary_columns}

        expected = pd.DataFrame(
            {'Objects': ['X', 'Y', 'X', 'Y', 'X', 'Y'],
             'Quantity': [1, 1, 1, 1, 1, 1],
             'Density': [10., 20., 30., 40., 60., 50.],
             'label': [2019, 2019, 2020, 2020, 2021, 2021]
            })
        
        result = r_class.aggregate_boundaries(self.df, **args)
        result.reset_index(inplace=True, drop=True)
        
        
        # Check if the result matches the expected result
        pd.testing.assert_frame_equal(result, expected)

# Load the tests of TestAggregateBoundaries and run them, reporting each test by name.
test_runner = unittest.TextTestRunner(verbosity=3)
test_suite = unittest.TestLoader().loadTestsFromTestCase(TestAggregateBoundaries)
test_result = test_runner.run(test_suite)


test_aggregate_boundaries_with_labels (__main__.TestAggregateBoundaries) ... ok
test_aggregate_boundaries_without_labels (__main__.TestAggregateBoundaries) ... ok

----------------------------------------------------------------------
Ran 2 tests in 0.018s

OK


### Cumulative reports

* `reportclass.a_cumulative_report`

Calls `reportclass.aggregate_boundaries`

```python

class TestAcumulativeReport(unittest.TestCase):
    
    def setUp(self):
        # Create a sample DataFrame for testing
        data = pd.DataFrame({
            'Region': ['A', 'A', 'B', 'B', 'A', 'B'],
            'Year': [2019, 2019, 2020, 2020, 2021, 2021],
            'city': ['a1', 'a2', 'b1', 'b1', 'a2', 'b2'],
            'sample id' : [1, 1, 2, 2, 3, 4],
            'Objects': ['X', 'Y', 'X', 'Y', 'Y', 'X'],
            'Density': [10, 20, 30, 40, 50, 60],
            'Quantity': [1, 1, 1, 1, 1, 1]
        })
        # test data
        self.df = pd.DataFrame(data)
        
        # from user input
        self.feature_name = 'Year'
        self.object_column = 'Objects'
        self.sample_id = 'sample id'

        # from default or user input
        self.unit_methods = {'Density': 'sum', 'Quantity': 'count'}
        self.group_methods = {'Quantity': 'sum','Density': 'median'}

        # the feature name, sample id and object columns make up the groupby columns
        # the feauture name is used to mask the different child boundaries
        self.groupby_columns=[self.feature_name, self.sample_id, self.object_column]

        # the labels of the child boundaries are collected
        # using the feature name variable
        self.boundary_labels = self.df[self.feature_name].unique()

        # the boundary columns are used when aggregating the child boundaries
        self.boundary_columns = [self.object_column]


    def test_acumulative_df(self):
       
        args = {
            'feature_name': self.feature_name,
            'object_column': self.object_column,
            'sample_id': self.sample_id,
            'unit_agg': self.unit_methods,
            'group_agg':self.group_methods,
            'pivot_values': 'Density'
          }

        expected = pd.DataFrame(
            {2019: {'X': 10.0, 'Y': 20.0},
            2020: {'X': 30.0, 'Y': 40.0},
            2021: {'X': 60.0, 'Y': 50.0},
            'all': {'X': 30.0, 'Y': 40.0}}
        )
        expected.index.name = self.object_column
        expected.columns.name = 'label'        
        
        result = r_class.a_cumulative_report(self.df, **args)
      
        # Check if the result matches the expected result
        pd.testing.assert_frame_equal(result, expected)

test_suite = unittest.TestLoader().loadTestsFromTestCase(TestAcumulativeReport)
test_runner = unittest.TextTestRunner(verbosity=3)
test_result = test_runner.run(test_suite)
``` 


In [22]:
class TestAcumulativeReport(unittest.TestCase):
    
    def setUp(self):
        # Create a sample DataFrame for testing
        data = pd.DataFrame({
            'Region': ['A', 'A', 'B', 'B', 'A', 'B'],
            'Year': [2019, 2019, 2020, 2020, 2021, 2021],
            'city': ['a1', 'a2', 'b1', 'b1', 'a2', 'b2'],
            'sample id' : [1, 1, 2, 2, 3, 4],
            'Objects': ['X', 'Y', 'X', 'Y', 'Y', 'X'],
            'Density': [10, 20, 30, 40, 50, 60],
            'Quantity': [1, 1, 1, 1, 1, 1]
        })
        # test data
        self.df = pd.DataFrame(data)
        
        # from user input
        self.feature_name = 'Year'
        self.object_column = 'Objects'
        self.sample_id = 'sample id'

        # from default or user input
        self.unit_methods = {'Density': 'sum', 'Quantity': 'count'}
        self.group_methods = {'Quantity': 'sum','Density': 'median'}

        # the feature name, sample id and object columns make up the groupby columns
        # the feauture name is used to mask the different child boundaries
        self.groupby_columns=[self.feature_name, self.sample_id, self.object_column]

        # the labels of the child boundaries are collected
        # using the feature name variable
        self.boundary_labels = self.df[self.feature_name].unique()

        # the boundary columns are used when aggregating the child boundaries
        self.boundary_columns = [self.object_column]


    def test_acumulative_df(self):
       
        args = {
            'feature_name': self.feature_name,
            'object_column': self.object_column,
            'sample_id': self.sample_id,
            'unit_agg': self.unit_methods,
            'group_agg':self.group_methods,
            'pivot_values': 'Density'
          }

        expected = pd.DataFrame(
            {2019: {'X': 10.0, 'Y': 20.0},
            2020: {'X': 30.0, 'Y': 40.0},
            2021: {'X': 60.0, 'Y': 50.0},
            'all': {'X': 30.0, 'Y': 40.0}}
        )
        expected.index.name = self.object_column
        expected.columns.name = 'label'        
        
        result = r_class.a_cumulative_report(self.df, **args)
      
        # Check if the result matches the expected result
        pd.testing.assert_frame_equal(result, expected)

# Load the tests of TestAcumulativeReport and run them, reporting each test by name.
test_runner = unittest.TextTestRunner(verbosity=3)
test_suite = unittest.TestLoader().loadTestsFromTestCase(TestAcumulativeReport)
test_result = test_runner.run(test_suite)

test_acumulative_df (__main__.TestAcumulativeReport) ... ok

----------------------------------------------------------------------
Ran 1 test in 0.017s

OK


In [23]:
# Sample records with the same values as the fixture of the test cases above.
data = pd.DataFrame({
    'Region': ['A', 'A', 'B', 'B', 'A', 'B'],
    'Year': [2019, 2019, 2020, 2020, 2021, 2021],
    'city': ['a1', 'a2', 'b1', 'b1', 'a2', 'b2'],
    'sample id': [1, 1, 2, 2, 3, 4],
    'Objects': ['X', 'Y', 'X', 'Y', 'Y', 'X'],
    'Density': [10, 20, 30, 40, 50, 60],
    'Quantity': [1, 1, 1, 1, 1, 1],
})

data

Unnamed: 0,Region,Year,city,sample id,Objects,Density,Quantity
0,A,2019,a1,1,X,10,1
1,A,2019,a2,1,Y,20,1
2,B,2020,b1,2,X,30,1
3,B,2020,b1,2,Y,40,1
4,A,2021,a2,3,Y,50,1
5,B,2021,b2,4,X,60,1


In [24]:
# from user input
feature_name = 'Year'
object_column = 'Objects'
sample_id = 'sample id'

# from default or user input
unit_methods = {'Density': 'sum', 'Quantity': 'count'}
group_methods = {'Quantity': 'sum', 'Density': 'median'}

# The feature name, sample id and object columns make up the groupby columns;
# the feature name is used to mask the different child boundaries.
groupby_columns = [feature_name, sample_id, object_column]

# The boundary columns are used when aggregating the child boundaries.
boundary_columns = [object_column]

# The labels of the child boundaries are the distinct values of the feature column.
boundary_labels = data[feature_name].unique()

# First step: aggregate the records over the groupby columns with the unit methods.
unit_aggregate = r_class.aggregate_dataframe(
    data,
    groupby_columns=groupby_columns,
    aggregation_functions=unit_methods,
)
unit_aggregate

Unnamed: 0,Year,sample id,Objects,Density,Quantity
0,2019,1,X,10,1
1,2019,1,Y,20,1
2,2020,2,X,30,1
3,2020,2,Y,40,1
4,2021,3,Y,50,1
5,2021,4,X,60,1


In [25]:
# Aggregate a copy of the unit-level results over the last groupby column only
# (the object column) with the group methods, and label every row 'all'.
d = r_class.aggregate_dataframe(
    unit_aggregate.copy(),
    groupby_columns=groupby_columns[-1:],
    aggregation_functions=group_methods,
).assign(label='all')
d

Unnamed: 0,Objects,Quantity,Density,label
0,X,3,30.0,all
1,Y,3,40.0,all


In [26]:
# Summarise each child boundary on its own: keep the unit-level rows whose
# feature value (the first groupby column) equals the boundary label, aggregate
# them over the boundary columns with the group methods, and tag the rows with
# that label.
boundary_summaries = [
    unit_aggregate[unit_aggregate[groupby_columns[0]] == label]
    .groupby(boundary_columns, as_index=False)
    .agg(group_methods)
    .assign(label=label)
    for label in boundary_labels
]

# Stack the per-boundary summaries; each keeps its own row index, so index
# values repeat across boundaries.
cumulative = pd.concat(boundary_summaries)

cumulative

Unnamed: 0,Objects,Quantity,Density,label
0,X,1,10.0,2019
1,Y,1,20.0,2019
0,X,1,30.0,2020
1,Y,1,40.0,2020
0,X,1,60.0,2021
1,Y,1,50.0,2021


In [27]:
# Put the per-boundary summaries and the 'all' summary in one frame, then lay
# the densities out with one row per object and one column per label.
a_test_report = pd.concat([cumulative, d])

(
    a_test_report
    .pivot(columns=['label'], index=['Objects'], values='Density')
    .to_dict()
)

{2019: {'X': 10.0, 'Y': 20.0},
 2020: {'X': 30.0, 'Y': 40.0},
 2021: {'X': 60.0, 'Y': 50.0},
 'all': {'X': 30.0, 'Y': 40.0}}

In [28]:
%watermark -a hammerdirt-analyst -co --iversions

Author: hammerdirt-analyst

conda environment: cantonal_report

matplotlib: 3.7.1
pandas    : 2.0.3
numpy     : 1.25.2

