### Linguistic Features

In [9]:
import spacy
import pandas as pd

In [None]:
# Load the "en_core_web_sm" spaCy pipeline; `nlp` is called on raw text in the cells below.
# NOTE(review): presumably the model package has to be installed in the kernel's
# environment before this cell can run — confirm the setup instructions.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Sample paragraph used by the parsing cells below. The literal is split
# across lines purely for readability; adjacent string literals are joined
# by the parser, so the resulting text is unchanged.
test_sentence = (
    "16. With improved financial management, the immediate focus of reform "
    "has now shifted toward increasing revenue mobilization. "
    "To achieve this, the authorities have started modernizing customs and "
    "are about to approve an ambitious customs policy reform package. "
    "Key elements of this package include using the market exchange rate "
    "for customs valuation and streamlining tariffs. "
    "This is to be followed by a reform of tax policy and administration. "
    "The authorities are preparing draft decrees to: "
    "(a) reduce the top marginal taxrate for individuals and increase the "
    "personal exemptions; "
    "(b) restore wage withholding on higher-income employees; "
    "(c) introduce a rent tax and an airport departure fee; "
    "and (d) expand the business receipts tax to cover certain services "
    "provided to expatriates."
)
print(test_sentence)

16. With improved financial management, the immediate focus of reform has now shifted toward increasing revenue mobilization. To achieve this, the authorities have started modernizing customs and are about to approve an ambitious customs policy reform package. Key elements of this package include using the market exchange rate for customs valuation and streamlining tariffs. This is to be followed by a reform of tax policy and administration. The authorities are preparing draft decrees to: (a) reduce the top marginal taxrate for individuals and increase the personal exemptions; (b) restore wage withholding on higher-income employees; (c) introduce a rent tax and an airport departure fee; and (d) expand the business receipts tax to cover certain services provided to expatriates.


In [19]:
def view_data(attributes, columns=None) -> pd.DataFrame:
    """Wrap a collection of records in a DataFrame for tabular display.

    Parameters
    ----------
    attributes
        Row data accepted by ``pd.DataFrame`` (for example a list of tuples,
        one tuple per row).
    columns : sequence of str, optional
        Column labels to apply. When omitted, pandas assigns its default
        labels, which is the behaviour callers got before this parameter
        existed.

    Returns
    -------
    pd.DataFrame
        A new frame built from ``attributes``.
    """
    return pd.DataFrame(attributes, columns=columns)


### Dependency parsing for Noun chunks

In [6]:
# Run the loaded pipeline over the sample paragraph; the noun-chunk cell
# that follows iterates over `doc.noun_chunks`.
doc = nlp(test_sentence)
# Plain-text alternative to the DataFrame view, kept for reference
# (prints each chunk with its root token, dependency label and head):
# for chunk in doc.noun_chunks:
#     print(chunk.text, chunk.root.text, chunk.root.dep_,
#             chunk.root.head.text)

In [14]:
# Column labels for the noun-chunk table, in the same order as the tuple
# fields collected below.
chunk_columns = ['text', 'lemma', 'root_text', 'dependency', 'head_text']

# One tuple per noun chunk: the chunk text and lemma, then its root token's
# text, dependency label, and the text of the root's syntactic head.
token_attributes = []
for chunk in doc.noun_chunks:
    chunk_root = chunk.root
    token_attributes.append(
        (chunk.text,
         chunk.lemma_,
         chunk_root.text,
         chunk_root.dep_,
         chunk_root.head.text)
    )

df = pd.DataFrame(token_attributes, columns=chunk_columns)
df.head(10)

Unnamed: 0,text,lemma,root_text,dependency,head_text
0,improved financial management,improved financial management,management,pobj,With
1,the immediate focus,the immediate focus,focus,nsubj,shifted
2,reform,reform,reform,pobj,of
3,increasing revenue mobilization,increase revenue mobilization,mobilization,pobj,toward
4,the authorities,the authority,authorities,nsubj,started
5,customs,custom,customs,dobj,modernizing
6,an ambitious customs policy reform package,an ambitious custom policy reform package,package,dobj,approve
7,Key elements,key element,elements,nsubj,include
8,this package,this package,package,pobj,of
9,the market exchange rate,the market exchange rate,rate,dobj,using


### Navigate the dependency tree

In [24]:
# Parse the sample paragraph again; `doc` is rebuilt here from `test_sentence`.
doc = nlp(test_sentence)

# One tuple per token: its text, its dependency label, the text and
# part-of-speech tag of its head, and a list of its child tokens.
features = []
for token in doc:
    head = token.head
    features.append(
        (token.text, token.dep_, head.text, head.pos_, list(token.children))
    )

viz_df = view_data(features)
viz_df[:10]

Unnamed: 0,0,1,2,3,4
0,16,ROOT,16,NUM,[.]
1,.,punct,16,NUM,[]
2,With,prep,shifted,VERB,[management]
3,improved,amod,management,NOUN,[]
4,financial,amod,management,NOUN,[]
5,management,pobj,With,ADP,"[improved, financial]"
6,",",punct,shifted,VERB,[]
7,the,det,focus,NOUN,[]
8,immediate,amod,focus,NOUN,[]
9,focus,nsubj,shifted,VERB,"[the, immediate, of]"


### Filtering based on dependency rules

In [26]:
from spacy.symbols import nsubj, VERB

doc = nlp(test_sentence)

# Work upward from each token: keep the tokens labelled as nominal subjects
# whose head is a verb, then collect those tokens and their heads.
subject_tokens = [
    tok for tok in doc
    if tok.dep == nsubj and tok.head.pos == VERB
]
verbs = {tok.head for tok in subject_tokens}
nsubjs = set(subject_tokens)

print("verbs {}".format(verbs))
print("nsubjs {}".format(nsubjs))

verbs {is, include, shifted, started, preparing}
nsubjs {This, focus, authorities, authorities, elements}


### Navigating local trees

In [27]:
doc = nlp("Credit and mortgage account holders must submit their requests")

# Tokens that are their own head; the first one is used as the root.
self_headed = [tok for tok in doc if tok.head == tok]
root = self_headed[0]

# The first child to the left of the root is treated as the subject.
left_children = list(root.lefts)
subject = left_children[0]

# Walk the subject's subtree and print, for every token in it: text,
# dependency label, number of left/right children, and its ancestors' texts.
for node in subject.subtree:
    # Every token in the subtree is either the subject itself or below it.
    assert subject is node or subject.is_ancestor(node)
    ancestor_texts = [anc.text for anc in node.ancestors]
    print(node.text, node.dep_, node.n_lefts, node.n_rights, ancestor_texts)

Credit nmod 0 2 ['holders', 'submit']
and cc 0 0 ['Credit', 'holders', 'submit']
mortgage compound 0 0 ['account', 'Credit', 'holders', 'submit']
account conj 1 0 ['Credit', 'holders', 'submit']
holders nsubj 1 0 ['submit']
