# Setup notebook

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import pandas as pd

assert sys.version_info > (3,8)

# Project root: the part of the working directory that precedes the first
# occurrence of the substring 'text_analytics', with 'text_analytics' re-appended.
root_folder = os.path.join(os.getcwd().split('text_analytics')[0], 'text_analytics')
# Extend the import path with the project root and with the last entry of `_dh`
# taken from the notebook globals (presumably IPython's directory history, i.e.
# the notebook's own folder - TODO confirm).
sys.path = sys.path + [ root_folder, globals()['_dh'][-1] ]

import text_analytic_tools.utility.utils as utility
import text_analytic_tools.common.textacy_utility as textacy_utility

# Corpus files are read from the `data` folder below the project root.
corpus_folder = os.path.join(root_folder, 'data')
source_path = os.path.join(corpus_folder, 'legal_instrument_corpus.zip')
# Path of the preprocessed archive, derived from source_path and the suffix
# '_preprocessed' (the exact naming is decided by utility.path_add_suffix).
prepped_source_path = utility.path_add_suffix(source_path, '_preprocessed')

language = 'en'
# Set up the language model for `language`, asking for the 'ner' component to be disabled.
nlp = textacy_utility.setup_nlp_language_model(language, disable=('ner', ))
# File name for the stored textacy corpus, derived from the preprocessed path and language.
textacy_corpus_path = textacy_utility.generate_corpus_filename(prepped_source_path, language)
# Document index: semicolon-separated CSV whose first row holds the column names.
document_index = pd.read_csv(os.path.join(corpus_folder, 'legal_instrument_index.csv'), sep=';', header=0)

2020-10-06 09:34:40,606 : INFO : Loading model: en...
2020-10-06 09:34:41,341 : INFO : loaded 'en_core_web_sm' spaCy language pipeline
2020-10-06 09:34:41,846 : INFO : Using pipeline: tagger parser
2020-10-06 09:34:41,847 : INFO : Call time [setup_nlp_language_model]: 1.2407 secs


## Prepare and load `SSI Legal Instruments` corpus

In [2]:
import text_analytic_tools.common.text_corpus as text_corpus

def get_document_stream(prepped_source_path, document_index):
    """Generate one record per document read from the preprocessed archive.

    Parameters
    ----------
    prepped_source_path
        Path handed to `text_corpus.CompressedFileReader`.
    document_index : pandas.DataFrame
        Document metadata. Must have a `filename` column (used as lookup key
        for each document name) and a `unesco_id` column.

    Yields
    ------
    tuple
        `(document_name, document_id, text, metadata)`, where `metadata` is the
        index row of the document as a dict and `document_id` is that row's
        `unesco_id` value.
    """
    archive = text_corpus.CompressedFileReader(prepped_source_path)
    # Work on a re-indexed copy; the caller's frame is left as it was.
    index_by_filename = document_index.set_index('filename')

    for document_name, text in archive:
        row = index_by_filename.loc[document_name].to_dict()
        yield document_name, row['unesco_id'], text, row

# Create the preprocessed archive only when it does not exist yet.
if not os.path.isfile(prepped_source_path):
    textacy_utility.preprocess_text(source_path, prepped_source_path)

# Load the stored textacy corpus when there is one; otherwise build it from the
# preprocessed archive and store it under textacy_corpus_path.
if os.path.isfile(textacy_corpus_path):
    textacy_corpus = textacy_utility.load_corpus(textacy_corpus_path, nlp)
else:
    stream = get_document_stream(prepped_source_path, document_index)
    textacy_corpus = textacy_utility.create_textacy_corpus(stream, nlp)
    textacy_corpus.save(textacy_corpus_path)

print("Done! ")


2020-10-06 09:34:53,905 : INFO : Call time [load_corpus]: 8.4128 secs


Done! 


In [3]:
## Stats

In [4]:
import collections

# Part-of-speech tags tracked in the statistics, each mapped to an initial count of zero.
POS_TO_COUNT = dict.fromkeys(
    ('SYM', 'PART', 'ADV', 'NOUN', 'CCONJ', 'ADJ', 'DET', 'ADP', 'INTJ', 'VERB', 'NUM', 'PRON', 'PROPN'),
    0,
)

# The same tag names in alphabetical order.
POS_NAMES = sorted(POS_TO_COUNT)

def get_pos_statistics(doc):
    """Count part-of-speech tags for a single document.

    Tokens tagged NUM, PUNCT or SPACE are left out of the count. The result is
    whatever `utility.extend` returns for three dicts, in this order: the
    document id (read from `doc.user_data['textacy']['meta']`), a fresh copy of
    `POS_TO_COUNT`, and the observed tag counts.
    """
    skipped_tags = ['NUM', 'PUNCT', 'SPACE']
    tag_counter = collections.Counter(
        token.pos_ for token in doc if token.pos_ not in skipped_tags
    )
    document_id = doc.user_data['textacy']['meta']['document_id']

    return utility.extend(
        dict(document_id=document_id),
        dict(POS_TO_COUNT),  # copy, so the module-level defaults are never handed out
        dict(tag_counter),
    )

def get_corpus_documents(corpus, document_index):
    """Build a per-document table of POS counts joined with the document index.

    Each document in `corpus` is turned into a row by `get_pos_statistics`; the
    rows are keyed on `document_id` and inner-joined with `document_index` on
    its `unesco_id` column. A `words` column holds the row-wise sum of the
    `POS_NAMES` columns.
    """
    pos_rows = [get_pos_statistics(doc) for doc in corpus]
    pos_frame = pd.DataFrame(pos_rows).set_index('document_id')

    documents = pos_frame.merge(
        document_index,
        how='inner',
        left_index=True,
        right_on='unesco_id',
    )
    documents['words'] = documents[POS_NAMES].apply(sum, axis=1)

    return documents


def compute_corpus_statistics(
    documents_index,
    corpus,
    group_by_column='year',
    target='lemma',
    include_pos=None,
    stop_words=None
):
    """Aggregate per-document POS counts into one row per group.

    Parameters
    ----------
    documents_index : pandas.DataFrame
        Document metadata passed on to `get_corpus_documents`. The merged
        result must contain a `year` column.
    corpus
        Iterable of documents passed on to `get_corpus_documents`.
    group_by_column : str
        Column to group on. Besides the columns of the merged document table,
        the derived `signed_lustrum` (year floored to a multiple of 5) and
        `signed_decade` (year floored to a multiple of 10) are available.
    target, stop_words
        Accepted for interface compatibility; not used by this function.
    include_pos : sequence of str, optional
        POS columns to report. When empty or None, `textacy_utility.POS_NAMES`
        is used.

    Returns
    -------
    pandas.DataFrame
        One row per group: the summed POS columns (sorted by name) followed by
        `Total, max`, `Total, mean`, `Total, min`, `Total, size` and
        `Total, sum`, computed over the per-document total of the POS columns.
    """
    # Use the arguments that were passed in. Reading the notebook-level
    # `textacy_corpus` / `document_index` here would silently ignore them.
    documents = get_corpus_documents(corpus, documents_index)
    value_columns = list(textacy_utility.POS_NAMES) if (len(include_pos or [])) == 0 else list(include_pos)

    documents['signed_lustrum'] = (documents.year - documents.year.mod(5)).astype(int)
    documents['signed_decade'] = (documents.year - documents.year.mod(10)).astype(int)
    # Vectorised row sum instead of applying the Python builtin row by row.
    documents['total'] = documents[value_columns].sum(axis=1)

    aggregates = { x: ['sum'] for x in value_columns }
    aggregates['total'] = ['sum', 'mean', 'min', 'max', 'size' ]

    documents = documents.groupby(group_by_column).agg(aggregates)
    # Flatten the (column, aggregate) pairs: POS columns keep their name, the
    # aggregates of `total` become 'Total, <aggregate>'.
    documents.columns = [ ('Total, ' + x[1].lower()) if x[0] == 'total' else x[0] for x in documents.columns ]
    columns = sorted(value_columns) + sorted([ x for x in documents.columns if x.startswith('Total')])
    return documents[columns]

# POS statistics for the loaded corpus, grouped by 'year' (the default group_by_column).
corpus_stats = compute_corpus_statistics(document_index, textacy_corpus)

In [5]:
import notebooks.word_trends.ipyaggrid_plot as ipyaggrid_plot
# Show the aggregated statistics table as the cell output.
display(ipyaggrid_plot.simple_plot(corpus_stats))

Grid(columns_fit='auto', compress_data=True, export_mode='buttons', height='350px', menu={'buttons': [{'name':…