# SpaCy Tutorial

In [45]:
import json
import os
import random
import re
import spacy

from spacy.gold import GoldParse
from spacy.language import EntityRecognizer
from spacy.matcher import Matcher
from spacy.tagger import Tagger
from spacy.attrs import IS_PUNCT, LOWER
from spacy.symbols import nsubj, NOUN, PUNCT, DET

In [46]:
# Two-sentence sample text analysed by the cells below.
s = (
    "spaCy features a fast and accurate syntactic "
    "dependency parser, and has a rich API for navigating the tree. "
    "The parser also powers the sentence boundary detection, "
    "and lets you iterate over base noun phrases, or 'chunks'."
)
# Load the pipeline registered under the 'en' name and run it over the sample
# text; `nlp` and `doc` are reused by the cells below.
nlp = spacy.load('en')
doc = nlp(s)

## POS (Part of Speech Tagging)

In [47]:
# Collect (coarse POS tag, token) pairs, skipping punctuation and determiners.
# `token.pos` and the `spacy.symbols` constants are integer IDs, so they are
# compared by value (`not in`) rather than by identity (`is not`): identity
# checks on ints only succeed by accident of CPython's small-integer cache.
excluded_pos = (PUNCT, DET)
pairs = [(token.pos_, token) for token in doc if token.pos not in excluded_pos]
print(pairs)

[('NOUN', spaCy), ('VERB', features), ('ADJ', fast), ('CCONJ', and), ('ADJ', accurate), ('ADJ', syntactic), ('NOUN', dependency), ('NOUN', parser), ('CCONJ', and), ('VERB', has), ('ADJ', rich), ('NOUN', API), ('ADP', for), ('VERB', navigating), ('NOUN', tree), ('NOUN', parser), ('ADV', also), ('VERB', powers), ('NOUN', sentence), ('ADJ', boundary), ('NOUN', detection), ('CCONJ', and), ('VERB', lets), ('PRON', you), ('VERB', iterate), ('ADP', over), ('NOUN', base), ('NOUN', noun), ('NOUN', phrases), ('CCONJ', or), ('NOUN', chunks)]


## SpaCy Sentence Tokenization

In [48]:
# `doc.sents` yields one span per detected sentence; print each one verbatim.
for sentence_span in doc.sents:
    print(sentence_span.string)

spaCy features a fast and accurate syntactic dependency parser, and has a rich API for navigating the tree. 
The parser also powers the sentence boundary detection, and lets you iterate over base noun phrases, or 'chunks'.


## Compare to Naive Regex Sentence Tokenization 

In [49]:
# Naive baseline: a "sentence" is the shortest run that starts with a capital
# letter and ends at the first '.', '!' or '?'.
sentence_pattern = re.compile(r'([A-Z].*?[\.!?])', re.M)
sentences = [found.group(1) for found in sentence_pattern.finditer(s)]
for sentence in sentences:
    print(sentence)

Cy features a fast and accurate syntactic dependency parser, and has a rich API for navigating the tree.
The parser also powers the sentence boundary detection, and lets you iterate over base noun phrases, or 'chunks'.


## Entity Recognition

In [50]:
# Print the label and surface text of every entity found in the sample text.
for entity in doc.ents:
    print(entity.label_, entity.text)

ORG API


## Load the Arctic Fox wiki article and process the extract

In [51]:
# Read the article inside a `with` block so the file handle is closed as soon
# as the text has been read, and decode it explicitly as UTF-8 instead of the
# platform default. The raw JSON text is lower-cased before parsing, so every
# key and value in `arctic_fox` is lower-case.
with open('./wikipedia/arctic-fox.json', encoding='utf-8') as article_file:
    arctic_fox = json.loads(article_file.read().lower())

In [52]:
# Run the pipeline over the article extract and bucket the distinct entity
# strings by their entity label.
arctic_fox_extract = arctic_fox['extract'].strip()
arctic_fox_doc = nlp(arctic_fox_extract)

entities = {}
for ent in arctic_fox_doc.ents:
    entities.setdefault(ent.label_, set()).add(ent.text)

# One header line per label, followed by its entity strings.
for label, texts in entities.items():
    print(label)
    for text in texts:
        print('\t', text)
        

TIME
	 12
ORG
	 

PRODUCT
	 3-
MONEY
	 1,200 sq yd
GPE
	 canada
	 norway
	 russia
	 sweden
	 finland
	 lapland
	 iceland
DATE
	 between years due to the large population fluctuations
	 the 1920s
	 the late 19th century
	 46 to 68 cm
	 summer
	 1996
	 the last ice
	 april and may
	 many generations
	 4 weeks old
	 several decades
	 the end of the last ice
	 about 52 days
	 winter
	 2005
	 five to eight kits
	 1997
	 many decades
	 each day
	 25 to 30
	 the 20th century
	 90 years
	 9 weeks of age
	 the 10th edition
	 the last decade
	 the 1970s
	 1758
	 the years
QUANTITY
	 55 cm (
	 52 cm
	 18 to
	 41 to
	 22 in
	 68 cm (
	 46 to
	 9,800 ft
PERCENT
	 more than 50%.
	 more than
CARDINAL
	 3.1
	 3.2 to 9.4
	 50
	 as many as 25
	 27
	 3
	 11
	 fewer than 200
	 3.5
	 7.7
	 about 30
	 20
	 7.1
	 up to 3,000
	 two
	 11.8
	 four
	 dozens
	 9.8
	 20.7
	 one
	 4-year
	 60
	 140
	 almost eradicated two
	 90
	 2.9
	 0-8018-8032-7
	 6.4
	 1.4 to 3.2
	 1,000
	 several hundred thousand


## Custom Matchers

Attributes from `spacy/attrs.pxd`:
```
NULL_ATTR, IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE, IS_TITLE, IS_UPPER, 
LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP, IS_OOV, IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, 
IS_RIGHT_PUNCT, ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, LEMMA, 
POS, TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, SPACY, PROB, LANG
```

In [53]:
# Three-token pattern: "hello", then any punctuation token, then "world"
# (both words are compared on their lower-cased form).
hello_world_pattern = [{LOWER: "hello"}, {IS_PUNCT: True}, {LOWER: "world"}]

matcher = Matcher(nlp.vocab)
matcher.add_pattern("HelloWorld", hello_world_pattern)

doc = nlp(u'Hello. world!')
matches = matcher(doc)

# Each match unpacks into (ent_id, label, start, end).
span = [
    (match_id, match_label, first, last)
    for match_id, match_label, first, last in matches
]
print(span)

[(777061, 0, 0, 3)]


## Word Vectors

In [54]:
# Load and parse two more articles. Each file is read inside a `with` block so
# its handle is closed promptly, and is decoded explicitly as UTF-8 instead of
# the platform default. As with the Arctic fox article, the raw JSON text is
# lower-cased before it is parsed.
with open('./wikipedia/artificial-intelligence.json', encoding='utf-8') as ai_file:
    ai_article = json.loads(ai_file.read().lower())
ai_extract = ai_article['extract']
ai_doc = nlp(ai_extract)

with open('./wikipedia/gray-wolf.json', encoding='utf-8') as wolf_file:
    wolf_article = json.loads(wolf_file.read().lower())
wolf_extract = wolf_article['extract']
wolf_doc = nlp(wolf_extract)

# Document similarity compares the averages of the word vectors that make up
# each document.
print('ai_doc, arctic_fox_doc:', ai_doc.similarity(arctic_fox_doc))
print('wolf_doc, arctic_fox_doc:', wolf_doc.similarity(arctic_fox_doc))

ai_doc, arctic_fox_doc: 0.96890856024
wolf_doc, arctic_fox_doc: 0.993009238879


In [55]:
# Token similarity compares the word vectors of two individual tokens.
# The tokens are picked by position; presumably index 2 / index 5 are the
# words "fox" / "wolf" in the respective articles - verify if the text changes.
fox, wolf = arctic_fox_doc[2], wolf_doc[5]
print('fox, wolf', fox.similarity(wolf))

fox, wolf 0.659473562956


In [56]:
# Pick four tokens from the AI article by position. The variable names state
# the words they are expected to be - verify if the article text changes.
artificial, ai, humans, animals = ai_doc[0], ai_doc[3], ai_doc[19], ai_doc[22]

# Pairwise token similarities, printed in a fixed order.
for caption, left, right in (
    ('artificial, humans:', artificial, humans),
    ('artificial, animals:', artificial, animals),
    ('animals, humans:', animals, humans),
):
    print(caption, left.similarity(right))

artificial, humans: 0.42094545699
artificial, animals: 0.346082360149
animals, humans: 0.733054539968


## Custom Entities
The following seeks to create entity recognition for units of measure. Interestingly, the results from this training can vary quite a bit from run to run. This is most likely due to the random initialization of weights in the linear models used within SpaCy, together with the random shuffling of the training examples on every iteration. It's likely that this variation would be reduced with more training examples.

In [67]:
model_name = 'en'
entity_label = 'UNIT'
output_directory = './spacy/custom-ent-unit-model/'

# Each example is (text, [(start_char, end_char, label), ...]) where
# text[start_char:end_char] is exactly the unit string. Several offsets used to
# be off by one (e.g. " m", "inche", "minute", "M", "T"), so the annotated span
# did not match the unit it was meant to mark; they are corrected here.
train_data = [
    ('The bridge is 2.56 m long', [(19, 20, 'UNIT')]),
    ('The building is 256 ft shorter than the Eifel Tower', [(20, 22, 'UNIT')]),
    ('The desk is 3 ft wide, 2 ft deep and 3.5 ft. tall.', [(14, 16, 'UNIT'), (25, 27, 'UNIT'), (41, 44, 'UNIT')]),
    ('The record has a diameter of 12 inches.', [(32, 38, 'UNIT')]),
    ('It is 40 km to the next town', [(9, 11, 'UNIT')]),
    ('It is 90 km to the next town', [(9, 11, 'UNIT')]),
    ('I\'ll be there in 30 minutes', [(20, 27, 'UNIT')]),
    ('San Fransisco is about 2913.6 mi from New York City', [(30, 32, 'UNIT')]),
    ('It\'s 2,806 km from Austin, TX to New York City', [(11, 13, 'UNIT')]),
    ('There are 1024 KB in an 1 MB', [(15, 17, 'UNIT'), (26, 28, 'UNIT')]),
    ('The floppy disk can store 1024 kilobytes of data', [(31, 40, 'UNIT')]),
    ('The hard drive on this computer can store 1 TB', [(44, 46, 'UNIT')]),
]

# Guard against off-by-one offsets creeping back in: every span must be
# non-empty, free of surrounding whitespace, and must not cut through a word.
for _text, _spans in train_data:
    for _start, _end, _label in _spans:
        _span = _text[_start:_end]
        assert _span and _span == _span.strip(), (_text, _span)
        assert _start == 0 or not _text[_start - 1].isalnum(), (_text, _span)
        assert _end == len(_text) or not _text[_end].isalnum(), (_text, _span)

# Register the new label with the entity recogniser before training on it.
nlp.entity.add_label(entity_label)


def train_ner(nlp, train_data, output_dir, n_iter=100):
    """Update the pipeline's entity recogniser on `train_data` and save the model.

    Args:
        nlp: Loaded pipeline; its `vocab`, `make_doc`, `tagger` and `entity`
            members are used.
        train_data: Sequence of `(text, entity_offsets)` pairs, where
            `entity_offsets` is a list of `(start_char, end_char, label)`.
            The sequence itself is not modified.
        output_dir: Directory handed to `nlp.save_to_directory`.
        n_iter: Number of passes over the training examples (default 100,
            the value that used to be hard-coded).

    Returns:
        The updated entity recogniser (`nlp.entity`).
    """
    # Work on a copy so the per-pass shuffle does not reorder the caller's list.
    examples = list(train_data)

    # Add new words to the vocab: looking a word up is done only for its
    # side effect, so the result is discarded.
    for raw_text, _ in examples:
        for word in nlp.make_doc(raw_text):
            nlp.vocab[word.orth]

    for _ in range(n_iter):
        random.shuffle(examples)
        for raw_text, entity_offsets in examples:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            # Tag the doc before the entity update; presumably the entity
            # model relies on those tags - TODO confirm.
            nlp.tagger(doc)
            nlp.entity.update(doc, gold)
    nlp.end_training()
    nlp.save_to_directory(output_dir)
    return nlp.entity


ner = train_ner(nlp, train_data, output_directory)

In [68]:
# Sentences that do not appear in the training examples, used to inspect what
# the retrained pipeline labels as entities.
inputs = [
    'The channel is 39 km long.',
    'Portland, Oregon is 4400 km. from New York.',
    'The wall is 100 ft tall.',
    'There are 2048 MB in 2 GB.',
    'She is 5 ft tall',
]

# Echo each sentence, then list its entities indented underneath.
for sentence in inputs:
    doc = nlp(sentence)
    print(sentence)
    for ent in doc.ents:
        print('  ', ent.label_, ent.text)

The channel is 39 km long.
   DATE km long
Portland, Oregon is 4400 km. from New York.
   UNIT km
The wall is 100 ft tall.
   UNIT ft
There are 2048 MB in 2 GB.
   ORG MB
   PRODUCT in 2
   ORG GB
She is 5 ft tall
   UNIT ft
