In [13]:
from docx import Document
import os 
import pickle
import pandas as pd

In [2]:
def read_doc(f_path):
    """Return the text of a .docx file as a single space-joined string.

    Paragraphs are taken from ``Document(f_path).paragraphs``. Empty
    paragraphs are skipped, non-breaking spaces are replaced with ordinary
    spaces, and only paragraphs with more than 15 whitespace-separated words
    are kept (presumably to drop headings and other short lines).
    """
    kept = []
    for para in Document(f_path).paragraphs:
        raw = para.text
        if len(raw) == 0:
            continue
        cleaned = raw.replace('\xa0', ' ')  # normalise non-breaking spaces
        if len(cleaned.split()) > 15:
            kept.append(cleaned)
    return ' '.join(kept)

def get_file_paths(folder_path):
    """List the Word documents located directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A ``(files, files_path)`` tuple: the matching file names, and the
        same names joined onto ``folder_path``. Both lists are in the same
        sorted order.
    """
    # Match on the extension instead of a substring so names such as
    # 'report.docx.bak' are not picked up. Names containing '~' are skipped
    # (e.g. the '~$name.docx' lock files Word leaves next to open documents).
    # os.listdir() returns entries in arbitrary order, so sort to keep the
    # document indices reproducible between runs and machines.
    files = sorted(
        f for f in os.listdir(folder_path)
        if f.endswith('.docx') and '~' not in f
    )
    files_path = [os.path.join(folder_path, f) for f in files]

    return files, files_path

def load_text(files_path):
    """Read each path in ``files_path`` with ``read_doc`` and return the texts as a list."""
    texts = []
    for path in files_path:
        texts.append(read_doc(path))
    return texts

In [3]:
# Folder holding the sample .docx files, relative to this notebook.
data_folder = '../toy_sample'

# Resolve the document names and paths, then read the text of every document.
file_names, files_paths = get_file_paths(folder_path=data_folder)
docs = load_text(files_path=files_paths)


In [5]:
import en_core_web_md
# Load the spaCy pipeline from the installed `en_core_web_md` model package.
nlp = en_core_web_md.load()

In [8]:
# Run the spaCy pipeline over every raw document text, keeping input order.
doc_nlp = list(map(nlp, docs))

In [25]:
## print sentences
# Preview the first seven sentences (indices 0-6) of the first document.
for num, sentence in zip(range(7), doc_nlp[0].sents):
    print('Sentence {}:'.format(num + 1))
    print(sentence, '\n')

Sentence 1:
Communiqué of the International Monetary and Financial Committee of the Board of Governors of the International Monetary Fund 1.   

Sentence 2:
The International Monetary and Financial Committee held its inaugural meeting in Washington, D.C. on April 16, 2000, under the Chairmanship of Mr. Gordon Brown, Chancellor of the Exchequér of the United Kingdom. 

Sentence 3:
2.   

Sentence 4:
The Committee's deliberations have taken place today against the background of a growing public debate about the directions in which the IMF and the international financial system should evolve to adapt to a rapidly changing economic environment. 

Sentence 5:
The debate also reflects a concern that the benefits the world economy is deriving from freer trade and more integrated and deeper international capital markets are not reaching everyone, especially in the developing countries. 

Sentence 6:
The Committee reaffirms its strong support for the Fund's uniqué role as the cornerstone of the

In [20]:
### collect all token features
# spaCy token attributes recorded for each token, in output column order.
_token_fields = ('orth_', 'lemma_', 'pos_', 'ent_type_', 'prob',
                 'is_stop', 'is_punct', 'is_space', 'like_num', 'is_oov')

# One tuple of attribute values per token of the first document.
token_attributes = [tuple(getattr(token, field) for field in _token_fields)
                    for token in doc_nlp[0]]

# Tabulate the per-token attributes: one row per token, one column per attribute.
df = pd.DataFrame(token_attributes,
                  columns=['text',
                           'lemma',
                           'pos',
                           'entity',
                           'log_probability',
                           'stop?',
                           'punctuation?',
                           'whitespace?',
                           'number?',
                           'out of vocab.?'])
## clean it up
# Show the boolean flag columns as 'Yes' / '' so the table is easier to scan.
# Each flag column is replaced wholesale with a list of strings. This avoids
# DataFrame.applymap (deprecated in recent pandas) and avoids writing strings
# into bool columns through .loc, which relies on an implicit dtype upcast
# that recent pandas versions deprecate.
flag_columns = df.loc[:, 'stop?':'out of vocab.?'].columns
for flag_column in flag_columns:
    df[flag_column] = ['Yes' if flag else '' for flag in df[flag_column]]

In [62]:
# Preview the first 40 rows of the token attribute table.
df.head(40)

Unnamed: 0,text,lemma,pos,entity,log_probability,stop?,punctuation?,whitespace?,number?,out of vocab.?
0,Communiqué,communiqué,PROPN,PERSON,-20.0,,,,,Yes
1,of,of,ADP,PERSON,-4.275874,,,,,
2,the,the,DET,ORG,-3.528767,,,,,
3,International,international,PROPN,ORG,-11.928836,,,,,
4,Monetary,monetary,PROPN,ORG,-15.116435,,,,,
5,and,and,CCONJ,ORG,-4.113108,,,,,
6,Financial,financial,PROPN,ORG,-13.288259,,,,,
7,Committee,committee,PROPN,ORG,-13.318144,,,,,
8,of,of,ADP,ORG,-4.275874,,,,,
9,the,the,DET,ORG,-3.528767,,,,,


In [60]:
# Flatten every token of every parsed document into one list of surface strings.
# NOTE: this counts all spaCy tokens, so punctuation and whitespace tokens
# are included in the total, not only words.
total_words = [token.orth_ for doc_n in doc_nlp for token in doc_n]
print('Total number of words: {}'.format(len(total_words)))

Total number of workds: 74808


In [64]:
# Surface forms of every token tagged as a proper noun or a common noun,
# across all parsed documents.
total_nun = [
    token.orth_
    for doc_n in doc_nlp
    for token in doc_n
    if token.pos_ in ('PROPN', 'NOUN')
]

In [67]:
# Inspect the first 100 collected noun / proper-noun strings.
total_nun[:100]

['Communiqué',
 'International',
 'Monetary',
 'Financial',
 'Committee',
 'Board',
 'Governors',
 'International',
 'Monetary',
 'Fund',
 'International',
 'Monetary',
 'Financial',
 'Committee',
 'meeting',
 'Washington',
 'D.C.',
 'April',
 'Chairmanship',
 'Mr.',
 'Gordon',
 'Brown',
 'Chancellor',
 'Exchequér',
 'United',
 'Kingdom',
 'Committee',
 'deliberations',
 'place',
 'today',
 'background',
 'debate',
 'directions',
 'IMF',
 'system',
 'environment',
 'debate',
 'concern',
 'benefits',
 'world',
 'economy',
 'trade',
 'capital',
 'markets',
 'everyone',
 'countries',
 'Committee',
 'support',
 'Fund',
 'role',
 'cornerstone',
 'system',
 'ability',
 'virtue',
 'character',
 'members',
 'support',
 'members',
 'IMF',
 'change',
 'members',
 'underpinnings',
 'stability',
 'sharing',
 'benefits',
 'opportunities',
 'world',
 'economy',
 'needs',
 'Committee',
 'IMF',
 'Committee',
 'recovery',
 'world',
 'economy',
 'prospect',
 'growth',
 'conditions',
 'year',
 'growth',


In [74]:
# Document index of the `left_edge` token of the first token in the first document
# (in spaCy, the leftmost token of that token's syntactic subtree).
doc_nlp[0][0].left_edge.i

0

In [75]:
# One past the index of the `right_edge` token of the same first token,
# i.e. an exclusive end bound suitable for slicing.
doc_nlp[0][0].right_edge.i+1

21

In [77]:
# Slice the first document from the left edge to the right edge (inclusive)
# of its first token, using the two indices inspected above.
first_doc = doc_nlp[0]
head_token = first_doc[0]
span = first_doc[head_token.left_edge.i:head_token.right_edge.i + 1]

In [78]:
# Collapse `span` into a single token, in place, on its parent Doc.
# Span.merge() was deprecated in spaCy 2.1 and removed in 3.0; Doc.retokenize()
# is the supported replacement. Older installs without retokenize() fall back
# to the original call.
# NOTE: this mutates doc_nlp[0]; re-running the earlier cells afterwards will
# produce a different span.
_merge_doc, _merge_start = span.doc, span.start
if hasattr(_merge_doc, 'retokenize'):
    with _merge_doc.retokenize() as retokenizer:
        retokenizer.merge(span)
else:
    span.merge()
# Display the merged token, as the return value of span.merge() used to.
_merge_doc[_merge_start]

Communiqué of the International Monetary and Financial Committee of the Board of Governors of the International Monetary Fund 1.  

In [79]:
# Print the first 100 tokens of the first document, one per line, to check
# the tokenisation after the merge.
leading_tokens = doc_nlp[0][:100]
for token in leading_tokens:
    print(token.text)

Communiqué of the International Monetary and Financial Committee of the Board of Governors of the International Monetary Fund 1.  
The
International
Monetary
and
Financial
Committee
held
its
inaugural
meeting
in
Washington
,
D.C.
on
April
16
,
2000
,
under
the
Chairmanship
of
Mr.
Gordon
Brown
,
Chancellor
of
the
Exchequér
of
the
United
Kingdom
.
2
.
 
The
Committee
's
deliberations
have
taken
place
today
against
the
background
of
a
growing
public
debate
about
the
directions
in
which
the
IMF
and
the
international
financial
system
should
evolve
to
adapt
to
a
rapidly
changing
economic
environment
.
The
debate
also
reflects
a
concern
that
the
benefits
the
world
economy
is
deriving
from
freer
trade
and
more
integrated


In [82]:
# Materialise the noun chunks of the first document into a list.
nt = [chunk for chunk in doc_nlp[0].noun_chunks]

In [83]:
# Number of noun chunks found in the first document.
len(nt)

977

In [86]:
from spacy.matcher import PhraseMatcher

In [87]:
# Build a phrase matcher over a small list of multi-token terms.
matcher = PhraseMatcher(nlp.vocab)
terminology_list = ['Barack Obama', 'Angela Merkel', 'Washington, D.C.']
patterns = [nlp(text) for text in terminology_list]
# NOTE(review): the positional `None` (on_match callback) is the spaCy v2
# signature; spaCy v3 expects matcher.add('TerminologyList', patterns).
# Confirm against the installed version.
matcher.add('TerminologyList', None, *patterns)

# The sentence literal had lost its opening words ("converse in ..."), so it
# could not yield the 'Angela Merkel' / 'Barack Obama' matches recorded in the
# output. The full sentence is restored from the recorded token listing.
doc = nlp("German Chancellor Angela Merkel and US President Barack Obama "
          "converse in the Oval Office inside the White House in Washington, D.C.")
matches = matcher(doc)
# Each match is (match_id, start, end) with token offsets into `doc`.
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

Angela Merkel
Barack Obama
Washington, D.C.


In [89]:
# Print every token of the matcher example sentence, one per line.
for token in doc:
    print(token)

German
Chancellor
Angela
Merkel
and
US
President
Barack
Obama
converse
in
the
Oval
Office
inside
the
White
House
in
Washington
,
D.C.


In [26]:
# Alias for the first parsed document (same object as doc_nlp[0], not a copy).
test = doc_nlp[0]

In [37]:
# Collect the noun chunks of the first parsed document into a list.
noun_chunks = [chunk for chunk in test.noun_chunks]

In [46]:
from spacy import displacy
# `text` is not defined anywhere in this notebook, so the original
# `nlp(text)` raises NameError on a fresh kernel.
# NOTE(review): the recorded output lists entities from the first document,
# so docs[0] is parsed here; confirm this was the intended input.
doc = nlp(docs[0])
# displacy.serve() starts a web server and blocks the kernel until it is
# interrupted; inside a notebook, render the entity markup inline instead.
displacy.render(doc, style='ent', jupyter=True)

(Communiqué of,
 the International Monetary and Financial Committee of the Board of Governors,
 the International Monetary Fund 1,
 The International Monetary and Financial Committee,
 Washington, D.C.,
 April 16, 2000,
 the Chairmanship of Mr.,
 Gordon Brown,
 Exchequér,
 the United Kingdom,
 2,
 The Committee's,
 today,
 IMF,
 Committee,
 Fund,
 IMF,
 Committee,
 3,
 Committee,
 1999,
 2000,
 the past year,
 Committee,
 the United States,
 Europe,
 Japan,
 4,
 Committee,
 Committee,
 5,
 Committee,
 North America,
 Europe,
 Asia,
 China,
 India,
 Committee,
 Latin America,
 1999,
 Russia,
 1998,
 Russia,
 the Middle East,
 Africa,
 Committee,
 6,
 The Committee,
 The Committee,
 WTO,
 Committee,
 IMF,
 the World Bank,
 WTO,
 Fund,
 Fund,
 7,
 The Committee,
 Fund,
 the Committee,
 Fund,
 Board,
 four,
 the Currency Stabilization Fund,
 Debt and Debt Service Reduction,
 the Buffer Stock Financing Facility,
 Facilityand,
 the Compensatory Financing Facility,
 8,
 The Committee,
 Fund,
