## spaCy tutorial

In [23]:
import spacy
from spacy import displacy
from IPython.core.display import display, HTML

In [15]:
%%cmd
python -m spacy download en

Microsoft Windows [Version 10.0.16299.248]
(c) 2017 Microsoft Corporation. All rights reserved.

C:\Users\john\Programming\Research\central-bank-nlp>python -m spacy download en
Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz (37.4MB)

    Error: Couldn't link model to 'en'
    Creating a symlink in spacy/data failed. Make sure you have the required
    permissions and try re-running the command as admin, or use a
    virtualenv. You can still import the model as a module and call its
    load() method, or create the symlink manually.

    C:\Users\john\AppData\Local\Continuum\anaconda3\lib\site-packages\en_core_web_sm
    -->
    C:\Users\john\AppData\Local\Continuum\anaconda3\lib\site-packages\spacy\data\en


    Download successful
    Creating a shortcut link for 'en' didn'

In [16]:
# Load the English pipeline by its installed package name (no shortcut link needed).
model_name = 'en_core_web_sm'
nlp = spacy.load(model_name)

In [17]:
# Read the two Official Cash Rate statements into memory.
# The `with` blocks close each file handle as soon as its contents are read,
# instead of leaving them open until garbage collection.
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm the encoding of the .txt files.
with open('nz-central-bank/official-cash-rate/2006-01-26.txt', 'r') as statement_file:
    text = statement_file.read()
with open('nz-central-bank/official-cash-rate/2011-12-08.txt', 'r') as statement_file:
    text1 = statement_file.read()

In [18]:
# Run the loaded pipeline over each statement: `text` first, then `text1`.
doc, doc1 = (nlp(raw_text) for raw_text in (text, text1))

### Tokenizing text

In [19]:
# One line per token: the token text followed by its part-of-speech tag.
for tok in doc:
    print(f"{tok} {tok.pos_}")

The DET
Official ADJ
Cash PROPN
Rate PROPN
( PUNCT
OCR PROPN
) PUNCT
will VERB
remain VERB
unchanged ADJ
at ADP
7.25 NUM
percent NOUN
. PUNCT


 SPACE
Reserve PROPN
Bank PROPN
Governor PROPN
Alan PROPN
Bollard PROPN
said VERB
: PUNCT
" PUNCT
The DET
economy NOUN
has VERB
continued VERB
to PART
slow VERB
in ADP
recent ADJ
months NOUN
, PUNCT
broadly ADV
in ADP
line NOUN
with ADP
the DET
outlook NOUN
contained VERB
in ADP
our ADJ
December PROPN
Monetary PROPN
Policy PROPN
Statement PROPN
. PUNCT

 SPACE
GDP PROPN
growth NOUN
slowed VERB
in ADP
the DET
third ADJ
quarter NOUN
of ADP
2005 NUM
, PUNCT
due ADP
to ADP
the DET
impact NOUN
of ADP
the DET
high ADJ
exchange NOUN
rate NOUN
on ADP
the DET
export NOUN
and CCONJ
import NOUN
- PUNCT
competing VERB
sectors NOUN
, PUNCT
and CCONJ
a DET
fall NOUN
in ADP
construction NOUN
. PUNCT

 SPACE
Looking VERB
to ADP
2006 NUM
, PUNCT
while ADP
there ADV
are VERB
some DET
early ADJ
indications NOUN
, PUNCT
we PRON
are VERB
yet ADV
to PART
see VERB
ha

### Splitting text into sentences

In [21]:
# One list per detected sentence, holding (token, part-of-speech tag) pairs.
for sentence in doc.sents:
    tagged_tokens = [(tok, tok.pos_) for tok in sentence]
    print(tagged_tokens)

[(The, 'DET'), (Official, 'ADJ'), (Cash, 'PROPN'), (Rate, 'PROPN'), ((, 'PUNCT'), (OCR, 'PROPN'), (), 'PUNCT'), (will, 'VERB'), (remain, 'VERB'), (unchanged, 'ADJ'), (at, 'ADP'), (7.25, 'NUM'), (percent, 'NOUN'), (., 'PUNCT'), (

, 'SPACE')]
[(Reserve, 'PROPN'), (Bank, 'PROPN'), (Governor, 'PROPN'), (Alan, 'PROPN'), (Bollard, 'PROPN'), (said, 'VERB'), (:, 'PUNCT'), (", 'PUNCT'), (The, 'DET'), (economy, 'NOUN'), (has, 'VERB'), (continued, 'VERB'), (to, 'PART'), (slow, 'VERB'), (in, 'ADP'), (recent, 'ADJ'), (months, 'NOUN'), (,, 'PUNCT'), (broadly, 'ADV'), (in, 'ADP'), (line, 'NOUN'), (with, 'ADP'), (the, 'DET'), (outlook, 'NOUN'), (contained, 'VERB'), (in, 'ADP'), (our, 'ADJ'), (December, 'PROPN'), (Monetary, 'PROPN'), (Policy, 'PROPN'), (Statement, 'PROPN'), (., 'PUNCT'), (
, 'SPACE')]
[(GDP, 'PROPN'), (growth, 'NOUN'), (slowed, 'VERB'), (in, 'ADP'), (the, 'DET'), (third, 'ADJ'), (quarter, 'NOUN'), (of, 'ADP'), (2005, 'NUM'), (,, 'PUNCT'), (due, 'ADP'), (to, 'ADP'), (the, 'DET'), (impa

### Part-of-speech tagging and dependency parsing

In [22]:
# One line per token: text, part-of-speech tag, dependency label, and head token.
for tok in doc:
    fields = (tok, tok.pos_, tok.dep_, tok.head)
    print(*fields)

The DET det Rate
Official ADJ compound Rate
Cash PROPN compound Rate
Rate PROPN nsubj remain
( PUNCT punct OCR
OCR PROPN appos Rate
) PUNCT punct OCR
will VERB aux remain
remain VERB ROOT remain
unchanged ADJ acomp remain
at ADP prep remain
7.25 NUM nummod percent
percent NOUN pobj at
. PUNCT punct remain


 SPACE  .
Reserve PROPN compound Bank
Bank PROPN compound Governor
Governor PROPN compound Bollard
Alan PROPN compound Bollard
Bollard PROPN nsubj said
said VERB ROOT said
: PUNCT punct said
" PUNCT punct said
The DET det economy
economy NOUN nsubj continued
has VERB aux continued
continued VERB ccomp said
to PART aux slow
slow VERB xcomp continued
in ADP prep slow
recent ADJ amod months
months NOUN pobj in
, PUNCT punct slow
broadly ADV advmod in
in ADP prep slow
line NOUN pobj in
with ADP prep line
the DET det outlook
outlook NOUN pobj with
contained VERB acl outlook
in ADP prep contained
our ADJ poss Statement
December PROPN compound Statement
Monetary PROPN compound Policy
Polic

In [25]:
# One line per recognised entity: the entity span followed by its label.
for entity in doc.ents:
    label = entity.label_
    print(entity, label)

OCR ORG
7.25 percent PERCENT
Reserve Bank ORG
Alan Bollard PERSON
recent months DATE
December DATE

 GPE
the third quarter of 2005 DATE

 GPE
2006 DATE

 GPE

 GPE
recent years DATE

 GPE

 GPE
Annual DATE
3.2 percent PERCENT
the next couple of years DATE

 GPE

 GPE
OCR ORG

 GPE
OCR ORG

 GPE


### Named entity recognition is mislabeling the Official Cash Rate (OCR) as an organization (ORG) and incorrectly labeling newline-only spans as GPE (geopolitical entity).

In [27]:
# Render the document inline in the notebook with its entities highlighted.
render_options = dict(style='ent', jupyter=True)
displacy.render(doc, **render_options)