## Stanford Core NLP

### Download

Download CoreNLP and relevant languages models from https://stanfordnlp.github.io/CoreNLP/download.html

Dependencies: nltk python3-tk


In [24]:
%matplotlib inline

import os 
import warnings
import matplotlib as mpl

if os.environ.get('DISPLAY','') == '':
    print('no display found. Using non-interactive Agg backend')
    mpl.use('Agg')
    
import matplotlib.pyplot as plt

mpl.rcParams['figure.figsize'] = [9.0, 6.0]
warnings.simplefilter("ignore")


In [None]:
# Starting the CoreNLPServer 
from nltk.parse.corenlp import CoreNLPServer 


# The server needs to know the location of the following files:
#   - stanford-corenlp-X.X.X.jar 
#   - stanford-corenlp-X.X.X-models.jar 

STANFORD = '/home/roger/StanfordCoreNLP/stanford-corenlp-full-2018-10-05'

# Create the server 
server = CoreNLPServer(
    os.path.join(STANFORD, "stanford-corenlp-3.9.2.jar"),
    os.path.join(STANFORD, "stanford-corenlp-3.9.2-models.jar"),    
)

# Start the server in the background 
server.start()

In [65]:
from nltk.parse.corenlp import CoreNLPParser 
# https://www.nltk.org/_modules/nltk/parse/corenlp.html
mpl.use('Agg')
parser = CoreNLPParser()
parse = next(parser.raw_parse("I put the book in the box on the table."))
parse.pretty_print()

                         ROOT                              
                          |                                 
                          S                                
  ________________________|______________________________   
 |                        VP                             | 
 |    ____________________|________________              |  
 |   |       |            PP               PP            | 
 |   |       |         ___|____         ___|___          |  
 NP  |       NP       |        NP      |       NP        | 
 |   |    ___|___     |    ____|___    |    ___|____     |  
PRP VBD  DT      NN   IN  DT       NN  IN  DT       NN   . 
 |   |   |       |    |   |        |   |   |        |    |  
 I  put the     book  in the      box  on the     table  . 



In [69]:
from nltk.parse.corenlp import CoreNLPDependencyParser 

parser = CoreNLPDependencyParser()

parse = next(parser.raw_parse("I put the book in the box on the table."))

print(parse.tree())

print("")

print(parse.to_conll(4))

for governor, dep, dependent in parse.triples(): 
     print(governor, dep, dependent)
        
server.stop()  

(put I (book the) (box in the) (table on the) .)

I	PRP	2	nsubj
put	VBD	0	ROOT
the	DT	4	det
book	NN	2	dobj
in	IN	7	case
the	DT	7	det
box	NN	2	nmod
on	IN	10	case
the	DT	10	det
table	NN	2	nmod
.	.	2	punct

('put', 'VBD') nsubj ('I', 'PRP')
('put', 'VBD') dobj ('book', 'NN')
('book', 'NN') det ('the', 'DT')
('put', 'VBD') nmod ('box', 'NN')
('box', 'NN') case ('in', 'IN')
('box', 'NN') det ('the', 'DT')
('put', 'VBD') nmod ('table', 'NN')
('table', 'NN') case ('on', 'IN')
('table', 'NN') det ('the', 'DT')
('put', 'VBD') punct ('.', '.')


In [1]:
import spacy 
from spacy import displacy

# Load the english language models
# Required: python -m spacy download en 
EN = spacy.load("en")


# Parse and analyze the sentence/document 
doc = EN("The apple dropped not far from his head.")

# Draw the dependency tree into the notebook 
displacy.render(doc, style='dep', minify=True, jupyter=True)

In [71]:
%load_ext autoreload
%autoreload 2

import treaty_corpus
import os
source_folder = '../data'
source_path = os.path.join(source_folder, 'treaty_text_corpora_20181018.zip')

pattern = '132090_en.txt'
corpus_stream = treaty_corpus.CompressedFileReader(source_path, pattern=pattern)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [75]:
for document, content in corpus_stream:
    print(document)

In [84]:
spacy_doc = EN(content)

In [95]:
from pprint import pprint as pp
from collections import Counter

def document_statistics(spacy_doc):
    
    spacy_sentences = list(spacy_doc.sents)
    token_length_histogram = Counter((len(x) for x in spacy_doc))
    
    return {
        'word_count': len(spacy_doc),
        'sentence_count': len(spacy_sentences),
        'avg_words_per_sentence': len(spacy_doc) / len(spacy_sentences),
        'token_lengths': dict(token_length_histogram)
    }

pp(document_statistics(spacy_doc))

import pandas as pd

df = pd.DataFrame(spacy_doc)


{'avg_words_per_sentence': 31.396551724137932,
 'sentence_count': 58,
 'token_lengths': {1: 170,
                   2: 442,
                   3: 263,
                   4: 129,
                   5: 144,
                   6: 72,
                   7: 171,
                   8: 114,
                   9: 120,
                   10: 87,
                   11: 65,
                   12: 23,
                   13: 9,
                   14: 8,
                   15: 2,
                   17: 2},
 'word_count': 1821}


ValueError: DataFrame constructor not properly called!

In [116]:
from tabulate import tabulate 

def spacy_doc_2_df(spacy_doc, ignore_tags=None, ignore_pos=None):
    ignore_tags = ignore_tags or ['_SP']
    ignore_pos = ignore_pos or []
    df_source = ({
        "Text": w.text,
        "Lemma": w.lemma_,
        "POS": w.pos_,
        "Tag": w.tag_,
        "Dep": w.dep_,
        "Alpha": w.is_alpha,
        "Stop": w.is_stop
    } for w in spacy_doc
        if w.tag_ not in ignore_tags
        and w.pos_ not in ignore_pos)
    return pd.DataFrame(df_source)

def print_tablulated(spacy_doc, count):
    for token in spacy_doc[:count]:
        table.append([
            token.text, token.lemma_, token.pos_, token.tag_, 
            token.dep_, token.is_alpha, token.is_stop
        ])
    print(tabulate(table, tablefmt="simple", headers="firstrow"))
    
df = spacy_doc_2_df(spacy_doc, ignore_tags=['_SP'], ignore_pos=['PUNCT'])


In [124]:
df.groupby([df.POS]).size()

POS
ADJ      158
ADP      257
ADV       29
CCONJ     81
DET      189
INTJ       1
NOUN     362
NUM       38
PART      21
PRON      12
PROPN    149
VERB     221
dtype: int64

In [120]:
from spacy.parts_of_speech import IDS

In [122]:
IDS.keys()

dict_keys(['', 'VERB', 'ADJ', 'SYM', 'CCONJ', 'NUM', 'X', 'PART', 'AUX', 'NOUN', 'INTJ', 'ADP', 'ADV', 'CONJ', 'PROPN', 'SCONJ', 'PUNCT', 'PRON', 'EOL', 'SPACE', 'DET'])