## The Culture of International Relations - Text Analysis
### <span style='color: green'>SETUP </span> Prepare and Setup Notebook <span style='float: right; color: red'>MANDATORY</span>

## <span style='color: green'>PREPARE </span> Load and Prepare Corpus <span style='float: right; color: red'>MANDATORY</span>


In [None]:
# Setup
%load_ext autoreload
%autoreload 2

import sys, os
import nltk, textacy, spacy 
import pandas as pd
import ipywidgets
import bokeh, bokeh.plotting, bokeh.models, matplotlib.pyplot as plt
import text_analytic_tools.utility as utility
import text_analytic_tools.utility.widgets as widgets
import text_analytic_tools.common.text_corpus as text_corpus
import text_analytic_tools.common.textacy_utility as textacy_utility
import warnings

from beakerx.object import beakerx
from beakerx import *
from IPython.display import display, set_matplotlib_formats

# Notebook-wide logger, obtained from the project's utility helper.
logger = utility.getLogger('corpus_text_analysis')

# Suppress deprecation and user warnings from here on.
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=UserWarning) 

# Domain-specific logic; referenced as `domain_logic` by the cells below.
from text_analytic_tools.domain_logic_config import current_domain as domain_logic

utility.setup_default_pd_display(pd)

# Data folder (relative path) and a filename pattern.
# NOTE(review): PATTERN is not referenced by any other visible cell -- confirm it is still needed.
DATA_FOLDER, PATTERN = '../../data',  '*.txt'
# Tag set table: read as a tab-separated file, missing values replaced by ''.
DF_TAGSET = pd.read_csv(os.path.join(DATA_FOLDER, 'tagset.csv'), sep='\t').fillna('')

%matplotlib inline

# set_matplotlib_formats('svg')   
# Direct Bokeh output to the notebook.
bokeh.plotting.output_notebook()

def current_corpus_container():
    """Return the result of ``textacy_utility.CorpusContainer.container()``."""
    return textacy_utility.CorpusContainer.container()


def current_corpus():
    """Return the result of ``textacy_utility.CorpusContainer.corpus()``."""
    return textacy_utility.CorpusContainer.corpus()


## <span style='color: green'>PREPARE </span> Load and Prepare Corpus <span style='float: right; color: red'>MANDATORY</span>


In [None]:
import text_analytic_tools.notebooks_gui.textacy_corpus_gui
# A plain `import a.b.c` only binds the top-level name `a`, so the bare name
# `textacy_corpus_gui` used below must be bound explicitly.
from text_analytic_tools.notebooks_gui import textacy_corpus_gui

try:
    container = current_corpus_container()
    # FIXME VARYING ASPECTS: document_index = WTI_INDEX for tCoIR
    textacy_corpus_gui.display_corpus_load_gui(DATA_FOLDER, document_index=None, container=container, compute_ner=True, domain_logic=domain_logic)
except Exception as ex:
    # Log first, then re-raise: the log call used to sit after `raise`,
    # where it could never run.
    logger.error(ex)
    raise

## <span style='color: green;'>MODEL</span> Display Named Entities<span style='color: green; float: right'>TRY IT</span>
Uses spaCy NER. Note that the "ner" component must be enabled in the corpus pipeline.

In [None]:
# Display Named Entities
from spacy import displacy

def display_document_entities_gui(corpus, document_index):
    """Display a document browser that renders named entities with displaCy.

    A dropdown selects the document and the `<<` / `>>` buttons step through
    the documents in filename order. The selected document is fetched with
    ``textacy_utility.get_document_by_id`` and rendered with
    ``displacy.render(..., style='ent')`` into an output widget.

    Parameters
    ----------
    corpus
        Corpus handed on to ``textacy_utility.get_document_by_id``.
    document_index
        Object whose ``filename`` attribute is iterable and has an ``index``
        (presumably a pandas DataFrame -- confirm against the caller).

    Raises
    ------
    ValueError
        If ``document_index`` contains no documents.
    """
    # Imported locally because the notebook's setup cell does not import `types`.
    import types

    # FIXME VARYING ASPECT: Add "document_name" to document_index, or function that creates name
    filenames = document_index.filename
    document_options = sorted(zip(filenames, filenames.index), key=lambda x: x[0])

    if not document_options:
        raise ValueError('document_index contains no documents')

    # Start on the second document as before, but fall back to the first one
    # when there is only a single document (index 1 would not exist).
    start_position = min(1, len(document_options) - 1)

    # Maps a document id to its position in the sorted option list, so that the
    # button position can follow selections made directly in the dropdown.
    position_by_id = {doc_id: pos for pos, (_, doc_id) in enumerate(document_options)}

    gui = types.SimpleNamespace(
        position=start_position,
        output=widgets.Output(layout={'border': '1px solid black'}),
        document_id=widgets.Dropdown(description='Document', options=document_options, value=document_options[start_position][1], layout=widgets.Layout(width='50%')),
        left=widgets.Button(description='<<', button_style='Success', layout=widgets.Layout(width='40px')),
        right=widgets.Button(description='>>', button_style='Success', layout=widgets.Layout(width='40px')),
    )

    def display_document_entities(corpus, document_id):
        # Keep the button position in step with the dropdown selection.
        gui.position = position_by_id.get(document_id, gui.position)
        gui.output.clear_output()
        with gui.output:
            doc = textacy_utility.get_document_by_id(corpus, document_id)
            displacy.render(doc, style='ent', jupyter=True)

    def select_position(position):
        # Changing the dropdown value triggers display_document_entities, which
        # clears and redraws the output; the handlers only move the selection.
        gui.position = position % len(document_options)
        gui.document_id.value = document_options[gui.position][1]

    def back_handler(*args):
        # Stepping back stops at the first document (no wrap-around).
        if gui.position == 0:
            return
        select_position(gui.position - 1)

    def forward_handler(*args):
        # Stepping forward wraps around to the first document.
        select_position(gui.position + 1)

    gui.left.on_click(back_handler)
    gui.right.on_click(forward_handler)

    display(widgets.VBox([
        widgets.HBox([gui.document_id, gui.left, gui.right]),
        widgets.VBox([gui.output], layout=widgets.Layout(margin_top='20px', height='600px',width='100%'))
    ]))

    itw = widgets.interactive(
        display_document_entities,
        corpus=widgets.fixed(corpus),
        document_id=gui.document_id
    )

    # Render the initially selected document.
    itw.update()
    
# Show the entity browser for the currently loaded corpus. Exceptions are
# passed to logger.error instead of propagating out of the cell.
try:
    corpus = current_corpus()
    document_index = domain_logic.compile_documents(corpus)
    display_document_entities_gui(corpus, document_index=document_index)
except Exception as ex:
    logger.error(ex)


In [None]:
def extract_entities(doc, include_types=None, drop_determiners=True):
    """Yield the named entities of ``doc``, optionally filtered and trimmed.

    Parameters
    ----------
    doc
        Object exposing ``ents``, an iterable of entity spans (presumably a
        spaCy ``Doc`` -- confirm against the caller).
    include_types
        A single entity label, or a set/list/tuple of labels, to keep.
        ``None`` keeps every label.
    drop_determiners
        When ``True``, an entity whose first token is a determiner is replaced
        by a new span that starts one token later.

    Yields
    ------
    Entity spans whose text is not whitespace-only.

    Raises
    ------
    TypeError
        If ``include_types`` is not ``None``, a string, or a set/list/tuple.
        As this is a generator, the error surfaces on first iteration.
    """
    entities = (ent for ent in doc.ents if not ent.text.isspace())

    if include_types is not None:
        if isinstance(include_types, str):
            # Wrap a single label so `in` tests equality, not substring match.
            include_types = {include_types}
        elif not isinstance(include_types, (set, list, tuple)):
            raise TypeError(
                'include_types must be None, a str, or a set/list/tuple of str'
            )
        entities = (ent for ent in entities if ent.label_ in include_types)

    if drop_determiners is True:
        # Imported here because neither name is imported by the notebook's
        # setup cell; spaCy is only needed on this branch.
        from spacy.symbols import DET
        from spacy.tokens import Span as SpacySpan

        entities = (
            ent if ent[0].pos != DET
            else SpacySpan(ent.doc, ent.start + 1, ent.end, label=ent.label, vector=ent.vector)
            for ent in entities
        )

    yield from entities

## <span style='color: green;'>MODEL</span> Extract Named Entities<span style='color: green; float: right'>TRY IT</span>
Uses spaCy NER. Note that the "ner" component must be enabled in the corpus pipeline.

In [None]:
%%bash
# Start run_ner_places.py as a background job; nohup makes it ignore hangup signals.
nohup python3 run_ner_places.py &


## <span style='color: green;'>DESCRIBE</span> Display Named Entity Statistics<span style='color: green; float: right'>TRY IT</span>
Uses spaCy NER. Note that the "ner" component must be enabled in the corpus pipeline.

In [None]:

def compile_named_entity_data(corpus, document_index, drop_determiners=True, min_freq=1):
    """Collect the named entities of every document into one data frame.

    Entities are extracted with ``textacy.extract.entities`` (``CARDINAL``
    excluded), joined to ``document_index`` on the document id, and limited
    to rows whose ``year`` is greater than zero.

    Parameters
    ----------
    corpus
        Iterable of documents; each must expose ``doc._.meta['document_id']``.
    document_index
        Frame indexed by document id that provides at least the columns
        ``pope``, ``year``, ``genre`` and ``filename``.
    drop_determiners, min_freq
        Passed through unchanged to ``textacy.extract.entities``.

    Returns
    -------
    Data frame with the columns ``pope``, ``year``, ``genre``, ``ent_type``,
    ``text``, ``lemma`` and ``filename``, after a final ``reset_index()``.
    """
    rows_per_document = []
    for doc in corpus:
        matches = textacy.extract.entities(
            doc,
            exclude_types=('CARDINAL',),
            drop_determiners=drop_determiners,
            min_freq=min_freq,
        )
        rows_per_document.append([
            (doc._.meta['document_id'], ent[0].ent_type_, ent.text, ent.lemma_)
            for ent in matches
        ])

    rows = utility.flatten(rows_per_document)

    entity_frame = pd.DataFrame(rows, columns=['document_id', 'ent_type', 'text', 'lemma'])
    entity_frame = entity_frame.set_index('document_id')

    # Inner join: keep only entities whose document is present in the index.
    merged = pd.merge(entity_frame, document_index, left_index=True, right_index=True, how='inner')

    has_year = merged.year > 0
    wanted_columns = ['pope', 'year', 'genre', 'ent_type', 'text', 'lemma', 'filename']
    return merged[has_year][wanted_columns].reset_index()

def display_grouped_by_entities_gui(corpus, document_index):
    """Display a GUI that counts named entities grouped by selected columns.

    The group-by choices are taken from the columns that
    ``compile_named_entity_data`` returns for the first document of the
    corpus, minus the entity columns themselves. On every change of a control
    the entities are recompiled for the whole corpus, counted per
    (group-by columns, ``ent_type``, ``lemma``) and shown sorted by count,
    largest first.

    Parameters
    ----------
    corpus
        Indexable corpus of documents; ``corpus[0]`` must exist.
    document_index
        Document index handed on to ``compile_named_entity_data``.
    """
    # Imported locally because the notebook's setup cell does not import `types`.
    import types

    # Only the column names are needed here, so a single document is enough.
    columns = compile_named_entity_data([corpus[0]], document_index).columns

    not_groupable = {'ent_type', 'text', 'lemma', 'filename', 'index'}
    group_by_options = [(x.title(), x) for x in columns if x not in not_groupable]
    group_by_values = [x for _, x in group_by_options]

    gui = types.SimpleNamespace(
        group_by=widgets.SelectMultiple(description='Group by', options=group_by_options, value=group_by_values, rows=3, layout=widgets.Layout(width='180px')),
        drop_determiners=widgets.ToggleButton(value=True, description='Drop DET',  tooltip='Drop_determiners`', icon='check'),
        output=widgets.Output(layout={'border': '1px solid black'}),
        min_freq=widgets.IntSlider(description='Min freq', min=1, max=10, value=2, step=1, layout=widgets.Layout(width='400px')),
    )

    def display_grouped_by_entities(corpus, group_by, drop_determiners, min_freq):
        gui.output.clear_output()
        named_entities = compile_named_entity_data(corpus, document_index, drop_determiners, min_freq)
        with gui.output:
            df = named_entities.groupby(list(group_by) + ['ent_type', 'lemma']).size().reset_index()
            # `size()` yields an unnamed series, which reset_index() labels 0.
            df = df.rename(columns={0: 'Count'})
            df = df.sort_values('Count', ascending=False)
            display(df)

    itw = widgets.interactive(
        display_grouped_by_entities,
        corpus=widgets.fixed(corpus),
        group_by=gui.group_by,
        drop_determiners=gui.drop_determiners,
        min_freq=gui.min_freq
    )

    display(widgets.VBox([
        widgets.HBox([gui.group_by, gui.drop_determiners, gui.min_freq]),
        widgets.VBox([gui.output]),
        itw.children[-1]
    ]))
    
    
try:
    corpus = current_corpus()
    document_index = domain_logic.compile_documents(corpus)
    display_grouped_by_entities_gui(corpus, document_index)
except Exception as ex:
    # Log first, then re-raise: the log call used to sit after `raise`,
    # where it could never run.
    logger.error(ex)
    raise


## <span style='color: green;'>MODEL</span> Stanford NER Tagger (CoreNLP)<span style='color: green; float: right'>SKIP</span>

### <span style='color: green;'>PREPARE</span> Verify that Stanford CoreNLP is up and running<span style='color: green; float: right'>SKIP</span>
The Stanford CoreNLP server must be started as described at: https://stanfordnlp.github.io/CoreNLP/corenlp-server.html

With docker:
```bash
docker pull frnkenstien/corenlp
docker run -p 9000:9000 --name coreNLP --rm -i -t frnkenstien/corenlp
```

In [None]:
STANFORD_CORE_NLP_URL = 'http://localhost:9000'

# Smoke test: tag a short sentence through the CoreNLP server at the URL above.
try:
    from nltk.parse import corenlp
    corenlp_tagger = corenlp.CoreNLPParser(url=STANFORD_CORE_NLP_URL, encoding='utf8', tagtype='ner')
    input_tokens = 'Stony Brook University in NY'.split()
    tagged_output = corenlp_tagger.tag(input_tokens)
    print('Stanford tagger is up and running!')
    print(' Result: ' + ' '.join([ x + '/' + y for x,y in tagged_output]))
except Exception as ex:
    # `except Exception` instead of a bare `except:` lets KeyboardInterrupt and
    # SystemExit through. The underlying error is logged too, since the failure
    # need not be a missing server (it could be a failed import, for example).
    logger.error('Server not found! Please start Stanford CoreNLP Server!')
    logger.error(ex)


In [None]:
def load_document_index(path):
    """Build the document index for the files in a compressed corpus archive.

    Parameters
    ----------
    path
        Path handed to ``text_corpus.CompressedFileReader``.

    Returns
    -------
    Data frame indexed by ``doc_id`` (32-bit integer) with the columns
    ``pope``, ``genre``, ``year`` and ``filename``.
    """
    reader = text_corpus.CompressedFileReader(path)
    df = domain_logic.compile_documents_by_filename(reader.filenames)
    # The dtype is given as the string 'int32' because `np` is not imported by
    # the notebook's setup cell; it selects the same dtype as `np.int32`.
    df['document_id'] = df.document_id.astype('int32')
    df = df.rename(columns={'document_id': 'doc_id'})
    df = df.set_index('doc_id')
    df = df[['pope', 'genre', 'year', 'filename']]
    return df

def load_ner_result(path):
    """Load a tab-separated NER result file.

    Parameters
    ----------
    path
        Path (or file-like object) accepted by ``pandas.read_csv``. The data
        must contain the columns ``id`` and ``doc_id``.

    Returns
    -------
    Data frame indexed by ``id``, with ``doc_id`` converted to 32-bit integers.
    """
    df = pd.read_csv(path, sep='\t')
    # The dtype is given as the string 'int32' because `np` is not imported by
    # the notebook's setup cell; it selects the same dtype as `np.int32`.
    df['doc_id'] = df.doc_id.astype('int32')
    df = df.set_index('id')
    return df

def compile_ner_result(source_path, result_path):
    """Join per-document entity counts with the document index.

    Parameters
    ----------
    source_path
        Corpus archive from which ``load_document_index`` builds the index.
    result_path
        NER result file read by ``load_ner_result``.

    Returns
    -------
    Data frame with one row per (``doc_id``, ``ent_type``, ``entity``)
    combination, carrying the index columns and an ``entity_count`` column.
    """
    index_frame = load_document_index(source_path)

    # Number of rows per (document, entity type, entity) combination.
    counts = (
        load_ner_result(result_path)
        .groupby(['doc_id', 'ent_type', 'entity'])
        .size()
        .reset_index()
        .rename(columns={0: 'entity_count'})
    )

    return index_frame.merge(counts, left_index=True, right_on='doc_id', how='inner')

# Each frame is named after the pope whose files it is built from; the two
# names were previously swapped relative to the file names.
df_benedict = compile_ner_result('../../data/benedict-xvi_curated_20190326.txt.zip', 'ner_benedict-xvi_curated_20190326.txt_20190319193045.txt')
df_francis = compile_ner_result('../../data/francis_curated_20190326.txt.zip', 'ner_francis_curated_20190326.txt_20190326145420.txt')

# Benedict first, which keeps the row order of the combined frame as it was.
df_popes = pd.concat([df_benedict, df_francis], ignore_index=True, axis=0)

# Total entity counts per (pope, entity) and per entity.
df_popes_entity_agg = df_popes.groupby(['pope', 'entity']).agg({'entity_count': 'sum'}).reset_index()
df_entity_agg = df_popes.groupby(['entity']).agg({'entity_count': 'sum'}).reset_index()

# Sanity check: both aggregations must add up to the same grand total.
print(df_popes_entity_agg.entity_count.sum(), df_entity_agg.entity_count.sum())

# One sheet per aggregation level. The trailing commas that used to follow
# each to_excel call only built throwaway tuples and have been removed.
with pd.ExcelWriter('ner_francis_and_benedict_stanford_20190326.xlsx') as writer:
    df_popes.to_excel(writer, sheet_name='Pivot (document)')
    df_popes_entity_agg.to_excel(writer, sheet_name='Pivot (pope, entity)')
    df_entity_agg.to_excel(writer, sheet_name='Pivot (entity)')
    


In [None]:
# NOTE(review): `df_ner` and `df_doc_agg` are not assigned at notebook level in
# any visible cell; they only appear as locals inside compile_ner_result. This
# cell presumably relies on names left over from an earlier session -- confirm.
# df_ner[df_ner.doc_id == 3]
# Number of rows per (doc_id, ent_type, entity) combination, stored as `count`.
df_agg = df_ner.groupby(['doc_id', 'ent_type', 'entity']).size().reset_index().rename(columns={0: 'count'})
df_agg

# Sum of `count` per pope, year, genre, entity type and entity.
# NOTE(review): compile_ner_result names its count column 'entity_count', not
# 'count' -- confirm which `df_doc_agg` this line expects.
df_pope_year_genre_type_agg = df_doc_agg.groupby(['pope', 'year', 'genre', 'ent_type', 'entity']).agg({'count': 'sum'}).reset_index()

