In [63]:
#from docx import Document
import os 
import pickle
import pandas as pd
import spacy
nlp = spacy.load('en_core_web_sm')

In [14]:
def read_doc(f_path):
    doc = Document(f_path)
    text_list = [p.text for p in doc.paragraphs if len(p.text)>0]#[3:]
    text_list = [p.replace('\xa0',' ') for p in text_list] # some clean up 
    text_list = [p for p in text_list if len(p.split()) > 15]
    text = ' '.join(text_list)
    return text

def get_file_paths(folder_path):
    files = os.listdir(folder_path)
    files = [f for f in files if '.docx' in f]
    files = [f for f in files if not '~' in f]
    files_path = [os.path.join(folder_path,f) for f in files]

    return files,files_path

def load_text(files_path):
    docs = [read_doc(f) for f in files_path]
    return docs

In [3]:
#data_folder = '../toy_sample'
#file_names,files_paths = get_file_paths(data_folder)
#docs = load_text(files_paths)


In [15]:

# for token in doc:
#     print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
#             token.shape_, token.is_alpha, token.is_stop)


In [16]:
test_doc = """Communiqué of the International Monetary and Financial Committee of the Board of Governors of the International Monetary Fund 1.The International Monetary and Financial Committee held its inaugural meeting in Washington, D.C. on April 16, 2000, under the Chairmanship of Mr. Gordon Brown, Chancellor of the Exchequér of the United Kingdom.The Committee's deliberations have taken place today against the background of a growing public debate about the directions in which the IMF and the international financial system should evolve to adapt to a rapidly changing economic environment."""
print(test_doc)


Communiqué of the International Monetary and Financial Committee of the Board of Governors of the International Monetary Fund 1.The International Monetary and Financial Committee held its inaugural meeting in Washington, D.C. on April 16, 2000, under the Chairmanship of Mr. Gordon Brown, Chancellor of the Exchequér of the United Kingdom.The Committee's deliberations have taken place today against the background of a growing public debate about the directions in which the IMF and the international financial system should evolve to adapt to a rapidly changing economic environment.


In [26]:
doc_nlp = [nlp(test_doc)]

In [28]:
## print sentances 
for num, sentence in enumerate(doc_nlp[0].sents):
    print ('Sentence {}:'.format(num + 1))
    print (sentence ,'\n')
    if num>5:
        break

Sentence 1:
Communiqué of the International Monetary and Financial Committee of the Board of Governors of the International Monetary Fund 

Sentence 2:
1.The International Monetary and Financial Committee held its inaugural meeting in Washington, D.C. on April 16, 2000, under the Chairmanship of Mr. Gordon Brown, Chancellor of the Exchequér of the United Kingdom. 

Sentence 3:
The Committee's deliberations have taken place today against the background of a growing public debate about the directions in which the IMF and the international financial system should evolve to adapt to a rapidly changing economic environment. 



In [29]:
### print all token features 
token_attributes = [(token.orth_,
                     token.lemma_,
                     token.pos_,
                     token.tag_,
                     token.ent_type_,
                     token.dep_,
                     token.prob,
                     token.is_stop,
                     token.is_punct,
                     token.is_space,
                     token.like_num,
                     token.is_oov)
                    for token in doc_nlp[0]]

df = pd.DataFrame(token_attributes,
                  columns=['text',
                           'lemma',
                           'pos',
                           'tag',
                           'entity',
                           'dependency',
                           'log_probability',
                           'stop?',
                           'punctuation?',
                           'whitespace?',
                           'number?',
                           'out of vocab.?'])
## clean it up 
df.loc[:, 'stop?':'out of vocab.?'] = (df.loc[:, 'stop?':'out of vocab.?']
                                       .applymap(lambda x: 'Yes' if x else ''))

In [24]:
df.head(40)

Unnamed: 0,text,lemma,pos,tag,entity,dependency,log_probability,stop?,punctuation?,whitespace?,number?,out of vocab.?
0,Communiqué,communiqué,PROPN,NNP,GPE,ROOT,-20.0,,,,,Yes
1,of,of,ADP,IN,,prep,-20.0,Yes,,,,Yes
2,the,the,DET,DT,ORG,det,-20.0,Yes,,,,Yes
3,International,international,PROPN,NNP,ORG,nmod,-20.0,,,,,Yes
4,Monetary,monetary,PROPN,NNP,ORG,nmod,-20.0,,,,,Yes
5,and,and,CCONJ,CC,ORG,cc,-20.0,Yes,,,,Yes
6,Financial,financial,PROPN,NNP,ORG,conj,-20.0,,,,,Yes
7,Committee,committee,PROPN,NNP,ORG,pobj,-20.0,,,,,Yes
8,of,of,ADP,IN,ORG,prep,-20.0,Yes,,,,Yes
9,the,the,DET,DT,ORG,det,-20.0,Yes,,,,Yes


#### Counting number of words

In [25]:
total_words = [token.orth_ for doc_n in doc_nlp for token in doc_n]
print('Total number of workds: {}'.format(len(total_words)))

##### filter specific pos

In [30]:
total_nun = [token.orth_ for doc_n in doc_nlp for token in doc_n if token.pos_ == 'PROPN' or token.pos_ == 'NOUN' ] 

In [32]:
total_nun[:10]

['Communiqué',
 'International',
 'Monetary',
 'Financial',
 'Committee',
 'Board',
 'Governors',
 'International',
 'Monetary',
 'Fund']

#### How to do noun trunking manuely

In [33]:
doc_nlp[0][0].left_edge.i

0

In [34]:
doc_nlp[0][0].right_edge.i+1

18

In [35]:
span = doc_nlp[0][doc_nlp[0][0].left_edge.i:doc_nlp[0][0].right_edge.i+1]

In [36]:
span.merge()

Communiqué of the International Monetary and Financial Committee of the Board of Governors of the International Monetary Fund

In [42]:
for token in doc_nlp[0][:10]:
    print(token.text)

Communiqué of the International Monetary and Financial Committee of the Board of Governors of the International Monetary Fund
1.The
International
Monetary
and
Financial
Committee
held
its
inaugural


#### get noun chunks automatically 

In [47]:
nt=list(doc_nlp[0].noun_chunks)
nt[:2]

[Communiqué of the International Monetary and Financial Committee of the Board of Governors of the International Monetary Fund,
 1.The International Monetary and Financial Committee]

In [48]:
len(nt)

18

#### add phrases

In [51]:
from spacy.matcher import PhraseMatcher

In [52]:
matcher = PhraseMatcher(nlp.vocab)
terminology_list = ['Barack Obama', 'Angela Merkel', 'Washington, D.C.']
patterns = [nlp(text) for text in terminology_list]
matcher.add('TerminologyList', None, *patterns)

doc = nlp(u"converse in the Oval Office inside the White House in Washington, D.C.")
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

Washington, D.C.


#### Use displacy to display parsing results

In [60]:
from spacy import displacy

In [65]:
test = 'This is just a test sentence'
doc = nlp(test)
displacy.serve(doc, style='dep')


[93m    Serving on port 5000...[0m
    Using the 'dep' visualizer



127.0.0.1 - - [06/Aug/2019 12:34:32] "GET / HTTP/1.1" 200 4556
127.0.0.1 - - [06/Aug/2019 12:34:33] "GET /favicon.ico HTTP/1.1" 200 4556



    Shutting down server on port 5000.

