## tCoIR - Text Analysis
### <span style='color: green'>SETUP </span> Prepare and Set Up Notebook <span style='float: right; color: red'>MANDATORY</span>

In [None]:
# Setup
%load_ext autoreload
%autoreload 2

import sys
import os
import collections
import zipfile
import re
import types
import typing.re

import nltk
import textacy
import spacy
import pandas as pd
import ipywidgets as widgets

sys.path = list(set(['.', '..']) - set(sys.path)) + sys.path

import common.utility as utility
import common.widgets_utility as widgets_utility
import common.widgets_config as widgets_config
import common.config as config
import common.utility as utility
import text_corpus
import gui_utility
import textacy_corpus_utility as textacy_utility

from beakerx.object import beakerx
from beakerx import *
from IPython.display import display

# Logger shared by all cells of this notebook.
logger = utility.getLogger('corpus_text_analysis')

# Hand the pandas module to the project helper that configures its display settings.
utility.setup_default_pd_display(pd)


def current_corpus_container():
    """Return whatever ``textacy_utility.CorpusContainer.container()`` returns."""
    return textacy_utility.CorpusContainer.container()


def current_corpus():
    """Return whatever ``textacy_utility.CorpusContainer.corpus()`` returns."""
    return textacy_utility.CorpusContainer.corpus()

from domain_logic_config import current_domain as domain_logic


## <span style='color: green'>PREPARE </span> Load and Prepare Corpus <span style='float: right; color: red'>MANDATORY</span>


In [None]:
import textacy_corpus_gui

# Show the corpus loading GUI for the currently active corpus container.
try:
    container = current_corpus_container()
    textacy_corpus_gui.display_corpus_load_gui(domain_logic.DATA_FOLDER, container=container, domain_logic=domain_logic)
except Exception as ex:
    # Log before re-raising: a statement placed after a bare `raise` never runs.
    logger.error(ex)
    raise


## <span style='color: green'>PREPARE </span> Extract Text From Corpus <span style='float: right; color: green'>TRY IT</span>

In [None]:
import gui_utility
import textacy_corpus_utility as textacy_utility
#import domain_logic_vatican as domain_logic
from domain_logic_config import current_domain as domain_logic

# Tab-separated tagset table read from the domain's data folder; missing cells
# are replaced by empty strings.
DF_TAGSET = pd.read_csv(
    os.path.join(domain_logic.DATA_FOLDER, 'tagset.csv'),
    sep='\t',
).fillna('')

def chunks(l, n):
    '''Yield consecutive slices of `l`, each holding at most `n` items.

    When `n` is falsy (``None`` or ``0``) the whole of `l` is yielded once,
    unsliced.
    '''
    if not n:
        yield l
        return
    yield from (l[start:start + n] for start in range(0, len(l), n))

def tokenize_docs(docs, **opts):
    """Yield the extracted terms of each document, optionally split into chunks.

    Args:
        docs: iterable of ``(document_name, doc)`` pairs.
        **opts: extraction options.
            Required keys: ``normalize``, ``min_freq``, ``max_doc_freq``,
            ``ngrams``, ``named_entities``, ``include_pos``, ``filter_stops``,
            ``filter_punct``.
            Optional keys: ``term_substitutions`` (or the legacy key
            ``substitutions``), ``substitute_terms``, ``word_counts``,
            ``word_document_counts``, ``chunk_size``.

    Yields:
        Tuples ``(document_id, document_name, chunk_index, tokens)`` where
        ``document_id`` is the zero-based position of the document in `docs`
        and ``chunk_index`` is the zero-based position of the chunk within
        that document.

    Raises:
        Any exception raised during extraction is logged and re-raised.
    """
    # NOTE(review): callers in this notebook also pass a 'min_word' option,
    # but it is not consumed anywhere in this function - confirm whether a
    # minimum token length filter is expected here.
    try:
        # Key used to look up the pre-computed counters; a falsy `normalize`
        # maps to the 'orth' counters.
        normalize = opts['normalize'] or 'orth'

        # The GUI supplies the mapping under 'term_substitutions'; the older
        # 'substitutions' key is still honoured and wins when both are given.
        if 'substitutions' in opts:
            term_substitutions = opts['substitutions']
        else:
            term_substitutions = opts.get('term_substitutions', {})

        word_counts = opts.get('word_counts', {})
        word_document_counts = opts.get('word_document_counts', {})
        extra_stop_words = set()

        if opts['min_freq'] > 1:
            stop_words = utility.extract_counter_items_within_threshold(word_counts[normalize], 1, opts['min_freq'])
            extra_stop_words.update(stop_words)

        if opts['max_doc_freq'] < 100:
            stop_words = utility.extract_counter_items_within_threshold(word_document_counts[normalize], opts['max_doc_freq'], 100)
            extra_stop_words.update(stop_words)

        extract_args = dict(
            args=dict(
                ngrams=opts['ngrams'],
                named_entities=opts['named_entities'],
                normalize=opts['normalize'],
                as_strings=True
            ),
            kwargs=dict(
                min_freq=opts['min_freq'],
                include_pos=opts['include_pos'],
                filter_stops=opts['filter_stops'],
                filter_punct=opts['filter_punct']
            ),
            extra_stop_words=extra_stop_words,
            substitutions=(term_substitutions if opts.get('substitute_terms', False) else None),
        )

        # Loop-invariant, so looked up once.
        chunk_size = opts.get('chunk_size', 0)

        for document_id, (document_name, doc) in enumerate(docs):
            print(document_name)

            terms = list(textacy_utility.extract_document_terms(doc, extract_args))

            for chunk_index, tokens in enumerate(chunks(terms, chunk_size)):
                yield document_id, document_name, chunk_index, tokens

    except Exception as ex:
        # Log before re-raising: a statement placed after a bare `raise` never runs.
        logger.error(ex)
        raise
        
def store_tokenized_corpus(tokenized_docs, corpus_source_filepath, **opts):
    """Write every tokenized chunk as a separate text entry of a new zip archive.

    Args:
        tokenized_docs: iterable of ``(document_id, document_name, chunk_index,
            tokens)`` tuples, where ``tokens`` is a sequence of strings.
        corpus_source_filepath: path from which the archive path is derived via
            ``utility.path_add_timestamp`` and ``utility.path_add_suffix``
            (suffix ``'.tokenized'``).
        **opts: accepted for call compatibility; not used by this function.

    Returns:
        ``(filepath, df_summary)``: the archive path and a DataFrame with the
        columns ``document_id``, ``document_name``, ``chunk_index`` and
        ``n_tokens``, one row per stored entry.
    """
    filepath = utility.path_add_timestamp(corpus_source_filepath)
    filepath = utility.path_add_suffix(filepath, '.tokenized')

    file_stats = []

    # TODO: Enable store of all documents line-by-line in a single file
    with zipfile.ZipFile(filepath, "w") as zf:

        # Count from 1 so the progress message reports entries actually written.
        for process_count, (document_id, document_name, chunk_index, tokens) in enumerate(tokenized_docs, start=1):

            # Tokens are space-separated in the stored text, so spaces inside a
            # token (e.g. n-grams) are replaced by underscores.
            text = ' '.join(t.replace(' ', '_') for t in tokens)
            store_name = utility.path_add_sequence(document_name, chunk_index, 4)

            zf.writestr(store_name, text, zipfile.ZIP_DEFLATED)

            file_stats.append((document_id, document_name, chunk_index, len(tokens)))

            if process_count % 100 == 0:
                logger.info('Stored {} files...'.format(process_count))

    df_summary = pd.DataFrame(file_stats, columns=['document_id', 'document_name', 'chunk_index', 'n_tokens'])

    return filepath, df_summary

def display_generate_tokenized_corpus_gui(corpus, corpus_source_filepath, subst_filename=None):
    """Display the widget panel that tokenizes `corpus` and stores the result.

    Pressing the "Compute" button collects the widget values, streams the
    corpus through ``tokenize_docs`` and writes the result with
    ``store_tokenized_corpus``.

    Args:
        corpus: iterable of documents exposing ``doc._.meta['filename']``;
            ``corpus.spacy_lang.vocab`` is read when `subst_filename` is given.
        corpus_source_filepath: forwarded to ``store_tokenized_corpus``.
        subst_filename: optional path of a term substitution file, loaded with
            ``textacy_utility.load_term_substitutions``.

    Returns:
        A ``types.SimpleNamespace`` holding all widgets (``gui.boxes`` is the
        displayed top-level container).
    """

    def lw(width):
        # Shorthand for a layout that only fixes the widget width.
        return widgets.Layout(width=width)

    filenames = [doc._.meta['filename'] for doc in corpus]

    # NOTE(review): `document_index` is not referenced again in this function;
    # the calls are kept in case they are needed for their side effects - confirm.
    try:
        document_index = domain_logic.compile_documents(corpus)
    except Exception:
        # Fall back to building the index from the filenames alone.
        document_index = domain_logic.compile_documents_by_filename(filenames)

    term_substitutions = {}

    if subst_filename is not None:
        logger.info('Loading term substitution mappings...')
        term_substitutions = textacy_utility.load_term_substitutions(subst_filename, default_term='_masked_', delim=';', vocab=corpus.spacy_lang.vocab)

    # One label per POS tag: the tag followed by its first description in the tagset.
    pos_tags = DF_TAGSET.groupby(['POS'])['DESCRIPTION'].apply(list).apply(lambda x: ', '.join(x[:1])).to_dict()
    pos_options = [('(All)', None)] + sorted([(k + ' (' + v + ')', k) for k, v in pos_tags.items()])
    ngrams_options = {'1': [1], '1,2': [1, 2], '1,2,3': [1, 2, 3]}

    gui = types.SimpleNamespace(
        progress=widgets.IntProgress(value=0, min=0, max=5, step=1, description='', layout=lw('90%')),
        min_freq=widgets.IntSlider(description='Min word freq', min=0, max=10, value=2, step=1, layout=lw('400px')),
        # This slider feeds the `max_doc_freq` option, so it is labelled "Max".
        max_doc_freq=widgets.IntSlider(description='Max doc. %', min=75, max=100, value=100, step=1, layout=lw('400px')),
        substitute_terms=widgets.ToggleButton(value=False, description='Mask GPE',  tooltip='Replace geographical entites with `_gpe_`'),
        ngrams=widgets.Dropdown(description='n-grams', options=ngrams_options, value=[1], layout=lw('180px')),
        min_word=widgets.Dropdown(description='Min length', options=[1, 2, 3, 4], value=1, layout=lw('180px')),
        chunk_size=widgets.Dropdown(description='Chunk size', options=[('None', 0), ('500', 500), ('1000', 1000), ('2000', 2000)], value=0, layout=lw('180px')),
        normalize=widgets.Dropdown(description='Normalize', options=[None, 'lemma', 'lower'], value='lower', layout=lw('180px')),
        filter_stops=widgets.ToggleButton(value=False, description='Filter stops',  tooltip='Filter out stopwords'),
        filter_punct=widgets.ToggleButton(value=False, description='Filter punct',  tooltip='Filter out punctuations'),
        named_entities=widgets.ToggleButton(value=False, description='Merge entities',  tooltip='Merge entities'),
        include_pos=widgets.SelectMultiple(description='POS', options=pos_options, value=list(), rows=10, layout=lw('400px')),
        compute=widgets.Button(description='Compute', button_style='Success', layout=lw('100px')),
        output=widgets.Output(layout={'border': '1px solid black'}),
    )

    # The statistics are computed once, using the extreme slider values, so
    # that every slider setting can be served without recomputation.
    logger.info('Preparing corpus statistics...')
    logger.info('...word counts...')
    word_counts = {k: textacy_utility.generate_word_count_score(corpus, k, gui.min_freq.max) for k in ['lemma', 'lower', 'orth']}

    logger.info('...word document count...')
    word_document_counts = {k: textacy_utility.generate_word_document_count_score(corpus, k, gui.max_doc_freq.min) for k in ['lemma', 'lower', 'orth']}

    logger.info('...done!')

    gui.boxes = widgets.VBox([
        gui.progress,
        widgets.HBox([
            widgets.VBox([
                widgets.HBox([gui.normalize, gui.chunk_size]),
                widgets.HBox([gui.ngrams, gui.min_word]),
                gui.min_freq,
                gui.max_doc_freq
            ]),
            widgets.VBox([
                gui.include_pos
            ]),
            widgets.VBox([
                gui.filter_stops,
                gui.substitute_terms,
                gui.filter_punct,
                gui.named_entities
            ]),
            widgets.VBox([
                gui.compute
            ]),
        ]),
        gui.output
    ])

    display(gui.boxes)

    def compute_callback(*_args):
        # Tokenize the corpus with the current widget settings and store it.
        gui.compute.disabled = True
        filepath, df_summary = '', None
        try:
            opts = dict(
                min_freq=gui.min_freq.value,
                max_doc_freq=gui.max_doc_freq.value,
                substitute_terms=gui.substitute_terms.value,
                ngrams=gui.ngrams.value,
                min_word=gui.min_word.value,
                normalize=gui.normalize.value,
                filter_stops=gui.filter_stops.value,
                filter_punct=gui.filter_punct.value,
                named_entities=gui.named_entities.value,
                include_pos=gui.include_pos.value,
                chunk_size=gui.chunk_size.value,
                term_substitutions=term_substitutions,
                word_counts=word_counts,
                word_document_counts=word_document_counts
            )

            with gui.output:

                docs = ((doc._.meta['filename'], doc) for doc in corpus)

                tokenized_docs = tokenize_docs(docs, **opts)

                filepath, df_summary = store_tokenized_corpus(tokenized_docs, corpus_source_filepath, **opts)

            # NOTE(review): the Output context manager appears to display and
            # swallow exceptions raised inside it - confirm against ipywidgets.
            # If the step above failed, `df_summary` is still None: keep the
            # error visible instead of clearing it and reporting success.
            if df_summary is None:
                return

            gui.output.clear_output()

            with gui.output:
                logger.info('Process DONE!')
                logger.info("Result stored in '{}'".format(filepath))
                display(df_summary)

        finally:
            # Always hand the button back to the user, even after a failure.
            gui.compute.disabled = False

    gui.compute.on_click(compute_callback)

    return gui

# Show the tokenization GUI for the currently loaded corpus, if there is one.
try:
    subst_filename = os.path.join(domain_logic.DATA_FOLDER, 'term_substitutions.txt')
    corpus = current_corpus_container().textacy_corpus
    corpus_path = current_corpus_container().prepped_source_path
    if corpus is None:
        logger.info('Please load corpus!')
    else:
        # Hand over the substitution file only when it is actually present;
        # otherwise the GUI runs without term substitutions.
        display_generate_tokenized_corpus_gui(
            corpus,
            corpus_path,
            subst_filename=(subst_filename if os.path.isfile(subst_filename) else None),
        )
except Exception as ex:
    # Log before re-raising: a statement placed after a bare `raise` never runs.
    logger.error(ex)
    raise
