# Aggregate Experiments

To visualize data aggregated by author or language, we need to sum up the counts in the distributions. As the data is quite large, it is worth finding the fastest way to do this. This notebook contains experiments to find that algorithm and to create some test data. The final implementation itself is in `utf8/postprocessors/gutenberg_aggregate.py`.

Some ideas on how to do it are from [a geeksforgeeks article](https://www.geeksforgeeks.org/python-sum-list-of-dictionaries-with-same-key/). Also there is a method with Pandas Series summing.

We are ignoring the `stats['doc']` data in this experiment as it has a different nesting level and doesn't contain much data.

The algorithm should:
1. get a list of all `stats_all` files from a folder and its subfolders
2. sum the numbers from each file
3. return a dict with the same structure as the source files but with summed values

In [1]:
import pandas as pd
import gzip
import json   
import os
from pathlib import Path
from collections import Counter

In [22]:
# SOURCE_DIRECTORY = '../../gutenberg_stats_examples/'
# os.listdir(SOURCE_DIRECTORY)[:3]

['35902-8.stats_all.json.gz', '39340-8.stats.json.gz', '27711-8.stats.json.gz']

In [30]:
# Benchmark on the Italian results: larger than the example directory, but much
# smaller than the English data, so each timing run stays reasonably short.
SOURCE_DIRECTORY = '../../gutenberg_stats/clean_results/it/'
# Peek at the first three entries of the directory as a sanity check on the path.
os.listdir(SOURCE_DIRECTORY)[:3]

['Panzini-Alfredo', 'Praga-Emilio', 'Levi-David']

## Collections Counter

In [40]:
%%timeit
def collections_counter(data_source_path):
    """Aggregate per-book distributions with ``collections.Counter``.

    Every file matching ``*stats_all*.json.gz`` below ``data_source_path``
    (sub-directories included) is read as gzip-compressed JSON, and the
    distributions found under its ``'stats_data'`` entry are added to running
    totals. The ``'doc'`` section of the stats is not aggregated.

    Args:
        data_source_path: directory to search recursively for stats files.

    Returns:
        dict: ``{level: {distribution_name: Counter}}`` for the levels
        ``'by_paragraph'``, ``'by_sentence'`` and ``'by_word'``, holding the
        summed counts over all files found.

    Raises:
        KeyError: if a file lacks ``'stats_data'`` or one of the expected
            levels / distributions.
    """
    # Levels and the distributions expected inside each of them.
    layout = (
        ('by_paragraph', ('sentence_length', 'word_length', 'token_length', 'char_length')),
        ('by_sentence', ('word_length', 'token_length', 'char_length')),
        ('by_word', ('words',)),
    )
    totals = {
        level: {dist_name: Counter() for dist_name in dist_names}
        for level, dist_names in layout
    }

    # Walk the directory tree and fold every matching book into the totals.
    for stats_file in Path(data_source_path).rglob("*stats_all*.json.gz"):
        with gzip.open(stats_file) as handle:
            book_stats = json.load(handle)['stats_data']

        for level, dist_names in layout:
            for dist_name in dist_names:
                # Counter.update with a mapping adds counts instead of replacing them.
                totals[level][dist_name].update(book_stats[level][dist_name])

    return totals


collections_counter(SOURCE_DIRECTORY)

1.18 s ± 16.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Python for-each

In [38]:
%%timeit
def python_for_each_and_sets(data_source_path):
    """Aggregate per-book distributions with plain dicts and key-set unions.

    Every file matching ``*stats_all*.json.gz`` below ``data_source_path``
    (sub-directories included) is read as gzip-compressed JSON. For each
    distribution, the union of the keys seen so far and the keys of the current
    book is walked and the two counts are added (a missing key counts as 0).
    The ``'doc'`` section of the stats is not aggregated.

    Args:
        data_source_path: directory to search recursively for stats files.

    Returns:
        dict: ``{level: {distribution_name: {key: summed_count}}}`` for the
        levels ``'by_paragraph'``, ``'by_sentence'`` and ``'by_word'``.

    Raises:
        KeyError: if a file lacks ``'stats_data'`` or one of the expected
            levels / distributions.
    """
    # Levels and the distributions expected inside each of them.
    layout = (
        ('by_paragraph', ('sentence_length', 'word_length', 'token_length', 'char_length')),
        ('by_sentence', ('word_length', 'token_length', 'char_length')),
        ('by_word', ('words',)),
    )
    totals = {
        level: {dist_name: {} for dist_name in dist_names}
        for level, dist_names in layout
    }

    for stats_file in Path(data_source_path).rglob("*stats_all*.json.gz"):
        with gzip.open(stats_file) as handle:
            book_stats = json.load(handle)['stats_data']

        for level, dists in totals.items():
            for dist_name, running in dists.items():
                current = book_stats[level][dist_name]

                # This variant deliberately visits every key known so far plus
                # every key of the current book; that is the approach being timed.
                for key in set(running) | set(current):
                    running[key] = running.get(key, 0) + current.get(key, 0)

    return totals


python_for_each_and_sets(SOURCE_DIRECTORY)

8.14 s ± 152 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Pandas

In [37]:
%%timeit
def pandas_series_add(data_source_path):
    """Aggregate per-book distributions by adding ``pandas.Series`` objects.

    Every file matching ``*stats_all*.json.gz`` below ``data_source_path``
    (sub-directories included) is read as gzip-compressed JSON. Each
    distribution is turned into a Series and added to the running total with
    ``fill_value=0`` so that keys present on only one side are kept. The totals
    are converted back to plain dicts before returning. The ``'doc'`` section of
    the stats is not aggregated.

    Args:
        data_source_path: directory to search recursively for stats files.

    Returns:
        dict: ``{level: {distribution_name: {key: summed_count}}}`` for the
        levels ``'by_paragraph'``, ``'by_sentence'`` and ``'by_word'``.

    Raises:
        KeyError: if a file lacks ``'stats_data'`` or one of the expected
            levels / distributions.
    """
    # Levels and the distributions expected inside each of them.
    layout = (
        ('by_paragraph', ('sentence_length', 'word_length', 'token_length', 'char_length')),
        ('by_sentence', ('word_length', 'token_length', 'char_length')),
        ('by_word', ('words',)),
    )
    totals = {
        level: {dist_name: pd.Series(dtype=int) for dist_name in dist_names}
        for level, dist_names in layout
    }

    for stats_file in Path(data_source_path).rglob("*stats_all*.json.gz"):
        with gzip.open(stats_file) as handle:
            book_stats = json.load(handle)['stats_data']

        for level, dist_names in layout:
            for dist_name in dist_names:
                book_series = pd.Series(book_stats[level][dist_name])
                # The addition is followed by a cast back to int on every step.
                running = totals[level][dist_name].add(book_series, fill_value=0)
                totals[level][dist_name] = running.astype(int)

    # Hand back plain dicts so the result has no pandas objects in it.
    return {
        level: {dist_name: series.to_dict() for dist_name, series in dists.items()}
        for level, dists in totals.items()
    }


pandas_series_add(SOURCE_DIRECTORY)
# print(res)

24.4 s ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Create Test Data

A simple example with the structure of the data

In [36]:
# Each stats file is gzip-compressed JSON with several levels of nesting.
# Only `*.json.gz` entries are read, so stray items in the folder (for example
# `.ipynb_checkpoints` or `.DS_Store`) no longer make `gzip.open` fail.
# Sorting makes the load order reproducible, and with it the file left in `book`.
books = []

for file in sorted(Path('../postprocessors/test_data/').glob('*.json.gz')):
    with gzip.open(file) as f:
        # `book` keeps the last file loaded; the following cells inspect it.
        book = json.load(f)
    books.append(book)

len(books)

3

In [27]:
# `book` is the last file loaded by the loop in the previous cell;
# list the top-level sections of its stats.
list(book['stats_data'].keys())

['doc', 'by_paragraph', 'by_sentence', 'by_word']

In [24]:
# The 'doc' section holds one total per name rather than a distribution,
# which is why the aggregation experiments above leave it out.
book['stats_data']['doc']

{'paragraph_count': 1060,
 'sentence_count': 3636,
 'token_count': 23290,
 'word_count': 23290,
 'char_count': 293612}

In [51]:
# NOTE(review): `test_book` is defined in a cell further down this notebook, so on a
# fresh Restart & Run All this cell raises NameError unless that cell is run first.
# Names of the distributions stored at sentence level.
list(test_book['stats_data']['by_sentence'].keys())

['word_length', 'token_length', 'char_length']

In [52]:
# NOTE(review): relies on `test_book`, which is defined in a later cell of this notebook.
# A single distribution: a mapping from a length (stored as a string key) to its count.
test_book['stats_data']['by_sentence']['char_length']

{'1': 1, '2': 2, '3': 3}

In [43]:
# Minimal book with the same layout as a real `stats_all` file. Every length
# distribution uses the same toy counts, and each one is built as its own dict
# object so that changing one never affects another.
test_book = {
    'metadata': {},
    'file_stats': {},
    'stats_data': {
        'doc': dict.fromkeys(
            ('paragraph_count', 'sentence_count', 'token_count', 'word_count', 'char_count'),
            1,
        ),
        'by_paragraph': {
            dist_name: {'1': 1, '2': 2, '3': 3}
            for dist_name in ('sentence_length', 'word_length', 'token_length', 'char_length')
        },
        'by_sentence': {
            dist_name: {'1': 1, '2': 2, '3': 3}
            for dist_name in ('word_length', 'token_length', 'char_length')
        },
        'by_word': {
            'words': {'my': 1, 'test': 2, 'words': 3},
        },
    },
    'tokens': {},
}

In [64]:
# create test data: three identical copies of `test_book`, written as gzip-compressed JSON

test_file_names = ['test_book_1_stats_all.json.gz', 'test_book_2_stats_all.json.gz', 'test_book_3_stats_all.json.gz']
# Serialise once and reuse the string for every file. A dedicated name is used
# so that `book` (a dict in the inspection cells of this notebook) is not
# rebound to a str, which would break those cells when they are re-run.
test_book_json = json.dumps(test_book)

for file in test_file_names:
    with gzip.open(f'../postprocessors/test_data/{file}', 'wt', encoding='utf8') as f:
        f.write(test_book_json)