# SpaCy Tutorial

In [126]:
import json
import os
import re
import spacy
from spacy.matcher import Matcher
from spacy.attrs import IS_PUNCT, LOWER
from spacy.symbols import nsubj, NOUN, PUNCT, DET

In [121]:
# Sample paragraph (two sentences) that the following cells analyse.
s = (
    "spaCy features a fast and accurate syntactic "
    "dependency parser, and has a rich API for navigating the tree. "
    "The parser also powers the sentence boundary detection, "
    "and lets you iterate over base noun phrases, or 'chunks'."
)

# Load the 'en' model once, then run it over the sample text.
nlp = spacy.load('en')
doc = nlp(s)

## POS (Part-of-Speech) Tagging

In [127]:
# Collect (POS tag name, token) pairs, skipping punctuation and determiners.
# `token.pos` and the imported PUNCT / DET symbols are integer IDs, so they are
# compared by value (`not in`) rather than by identity (`is not`); identity
# comparison of integers only works when the interpreter happens to cache them.
pairs = [
    (token.pos_, token)
    for token in doc
    if token.pos not in (PUNCT, DET)
]
print(pairs)

[('NOUN', spaCy), ('VERB', features), ('ADJ', fast), ('CCONJ', and), ('ADJ', accurate), ('ADJ', syntactic), ('NOUN', dependency), ('NOUN', parser), ('CCONJ', and), ('VERB', has), ('ADJ', rich), ('NOUN', API), ('ADP', for), ('VERB', navigating), ('NOUN', tree), ('NOUN', parser), ('ADV', also), ('VERB', powers), ('NOUN', sentence), ('ADJ', boundary), ('NOUN', detection), ('CCONJ', and), ('VERB', lets), ('PRON', you), ('VERB', iterate), ('ADP', over), ('NOUN', base), ('NOUN', noun), ('NOUN', phrases), ('CCONJ', or), ('NOUN', chunks)]


## SpaCy Sentence Tokenization

In [123]:
# Each item yielded by `doc.sents` is one sentence; print them in order.
sentence_strings = [sentence.string for sentence in doc.sents]
for sentence_string in sentence_strings:
    print(sentence_string)

spaCy features a fast and accurate syntactic dependency parser, and has a rich API for navigating the tree. 
The parser also powers the sentence boundary detection, and lets you iterate over base noun phrases, or 'chunks'.


## Compare to Naive Regex Sentence Tokenization 

In [124]:
# Naive splitter: a "sentence" is the shortest run that starts at an uppercase
# ASCII letter and ends at the next '.', '!' or '?'.
sentence_pattern = re.compile(r'([A-Z].*?[\.!?])', re.M)
sentences = [found.group(1) for found in sentence_pattern.finditer(s)]
for naive_sentence in sentences:
    print(naive_sentence)

Cy features a fast and accurate syntactic dependency parser, and has a rich API for navigating the tree.
The parser also powers the sentence boundary detection, and lets you iterate over base noun phrases, or 'chunks'.


## Entity Recognition

In [125]:
# Print the label and text of every entity span in the sample document.
for entity in doc.ents:
    label, text = entity.label_, entity.text
    print(label, text)

ORG API


## Load the Arctic Fox wiki article and process the extract

In [187]:
# Read the article file, lower-case its raw text, then parse it as JSON.
# The `with` block closes the file handle as soon as the text has been read,
# and the explicit encoding keeps decoding independent of the system locale.
with open('./wikipedia/arctic-fox.json', encoding='utf-8') as arctic_fox_file:
    arctic_fox = json.loads(arctic_fox_file.read().lower())

In [188]:
# Run the pipeline over the whitespace-stripped article extract.
arctic_fox_extract = arctic_fox['extract'].strip()
arctic_fox_doc = nlp(arctic_fox_extract)

# Group the distinct entity texts by entity label.
entities = {}
for ent in arctic_fox_doc.ents:
    entities.setdefault(ent.label_, set()).add(ent.text)

# One line per label, followed by its entity texts on tab-prefixed lines.
for label, texts in entities.items():
    print(label)
    for text in texts:
        print('\t', text)
        

TIME
	 12
GPE
	 canada
	 lapland
	 iceland
	 russia
	 sweden
	 finland
	 norway
PERCENT
	 more than
	 more than 50%.
DATE
	 many decades
	 1996
	 summer
	 90 years
	 2005
	 the last decade
	 the 10th edition
	 the end of the last ice
	 25 to 30
	 several decades
	 between years due to the large population fluctuations
	 winter
	 april and may
	 9 weeks of age
	 five to eight kits
	 each day
	 46 to 68 cm
	 1997
	 many generations
	 1758
	 the 1920s
	 the late 19th century
	 the years
	 the 20th century
	 the 1970s
	 the last ice
	 about 52 days
	 4 weeks old
CARDINAL
	 6.4
	 about 30
	 as many as 25
	 several hundred thousand
	 140
	 11
	 20
	 two
	 dozens
	 fewer than 200
	 almost eradicated two
	 one
	 3
	 50
	 3.5
	 7.1
	 four
	 1,000
	 4-year
	 7.7
	 3.2 to 9.4
	 20.7
	 90
	 3.1
	 9.8
	 27
	 11.8
	 0-8018-8032-7
	 up to 3,000
	 2.9
	 1.4 to 3.2
	 60
QUANTITY
	 41 to
	 52 cm
	 46 to
	 22 in
	 9,800 ft
	 18 to
	 68 cm (
	 55 cm (
MONEY
	 1,200 sq yd
PRODUCT
	 3-
ORG
	 



## Custom Matchers

Attributes from `spacy/attrs.pxd`:
```
NULL_ATTR, IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE, IS_TITLE, IS_UPPER, 
LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP, IS_OOV, IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, 
IS_RIGHT_PUNCT, ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, LEMMA, 
POS, TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, SPACY, PROB, LANG
```

In [189]:
# Three-token pattern: "hello", then a punctuation token, then "world".
# The two words are matched through the LOWER attribute.
hello_world_pattern = [{LOWER: "hello"}, {IS_PUNCT: True}, {LOWER: "world"}]

matcher = Matcher(nlp.vocab)
matcher.add_pattern("HelloWorld", hello_world_pattern)

doc = nlp(u'Hello. world!')
matches = matcher(doc)

# Copy every match into a plain 4-tuple before printing.
span = []
for ent_id, label, start, end in matches:
    span.append((ent_id, label, start, end))
print(span)

[(776986, 0, 0, 3)]


## Word Vectors

In [190]:
def _load_lowercased_json(path):
    """Return the JSON document stored at `path`, parsed from its lower-cased text.

    The file is opened in a `with` block so the handle is closed as soon as
    the text has been read, and it is decoded as UTF-8 regardless of the
    system locale.
    """
    with open(path, encoding='utf-8') as json_file:
        return json.loads(json_file.read().lower())


ai_article = _load_lowercased_json('./wikipedia/artificial-intelligence.json')
ai_extract = ai_article['extract']
ai_doc = nlp(ai_extract)

wolf_article = _load_lowercased_json('./wikipedia/gray-wolf.json')
wolf_extract = wolf_article['extract']
wolf_doc = nlp(wolf_extract)

# Similarity of each article's document to the Arctic fox document.
print(ai_doc.similarity(arctic_fox_doc))
print(wolf_doc.similarity(arctic_fox_doc))

0.96890856024
0.993009238879


In [191]:
# Show the shape of the gray wolf document's vector.
wolf_vector_shape = wolf_doc.vector.shape
print(wolf_vector_shape)

(300,)
