# SpaCy Tutorial

In [74]:
import json
import os
import re
import spacy
from spacy.symbols import nsubj, NOUN, PUNCT, DET

In [107]:
s = "spaCy features a fast and accurate syntactic "\
    "dependency parser, and has a rich API for navigating the tree. " \
    "The parser also powers the sentence boundary detection, "\
    "and lets you iterate over base noun phrases, or 'chunks'"
nlp = spacy.load('en')
doc = nlp(s)

## POS (Part of Speech Tagging)

In [100]:
pairs = []
for token in doc:
    if token.pos is not PUNCT and token.pos is not DET:
        pairs.append((token.pos_, token))
print(pairs)

[('NOUN', spaCy), ('VERB', features), ('ADJ', fast), ('CCONJ', and), ('ADJ', accurate), ('ADJ', syntactic), ('NOUN', dependency), ('NOUN', parser), ('CCONJ', and), ('VERB', has), ('ADJ', rich), ('NOUN', API), ('ADP', for), ('VERB', navigating), ('NOUN', tree), ('NOUN', parser), ('ADV', also), ('VERB', powers), ('NOUN', sentence), ('ADJ', boundary), ('NOUN', detection), ('CCONJ', and), ('VERB', lets), ('PRON', you), ('VERB', iterate), ('ADP', over), ('NOUN', base), ('NOUN', noun), ('NOUN', phrases), ('CCONJ', or), ('NOUN', chunks), ('PROPN', Smithsonian), ('PROPN', Institution)]


## SpaCy Sentence Tokenization

In [101]:
for token in doc.sents:
    print(token.string)

spaCy features a fast and accurate syntactic dependency parser, and has a rich API for navigating the tree. 
The parser also powers the sentence boundary detection, and lets you iterate over base noun phrases, or 'chunks' Smithsonian Institution.


## Regex Sentence Tokenization 

In [102]:
sentence_pattern = re.compile(r'([A-Z].*?[\.!?])', re.M)
sentences = sentence_pattern.findall(s)
for sentence in sentences:
    print(sentence)

Cy features a fast and accurate syntactic dependency parser, and has a rich API for navigating the tree.
The parser also powers the sentence boundary detection, and lets you iterate over base noun phrases, or 'chunks' Smithsonian Institution.


## Entity Recognition

In [108]:
for token in doc.ents:
    print(token.label_, token.text)

ORG API


## Load the Arctic Fox wiki article and process the extract

In [109]:
arctic_fox = json.loads(open('./wikipedia/arctic-fox.json').read())

In [119]:
arctic_fox_extract = arctic_fox['extract'].strip()
arctic_fox_doc = nlp(arctic_fox_extract)

entities = {}
for token in arctic_fox_doc.ents:
    entity_type_list = entities.get(token.label_, set())
    entities[token.label_] = entity_type_list
    entity_type_list.add(token.text)

for k in entities:
    print(k)
    for v in entities[k]:
        print('\t', v)
        

LOC
	 Asia
	 the Barents Sea
	 the North Pole
	 North America
	 Medny Island
	 the Aleutian Islands
	 North Atlantic
	 Europe
	 Pribilof Islands Arctic
	 the Bering Sea
	 the Kenai Peninsula
	 Arctic
	 the Northern Hemisphere
	 Aleutian Islands
ORG
	 ISBN
	 
Photo Gallery
	 Caninae of
	 Iceland Arctic
	 IUCN
	 Ancient Greek
	 World
	 Systema Naturae
	 The Arctic Fox Center
	 Hazardous Substances
	 Greenland Arctic
	 The Arctic fox
	 the Vindelfjällens Nature Reserve
	 Kola Peninsula
	 Arctic
	 Walker
	 the Environment Norway
	 Canis
	 Johns Hopkins Press
TIME
	 12
QUANTITY
	 41 to
	 52 cm
	 46 to
	 22 in
	 9,800 ft
	 18 to
	 68 cm (
	 55 cm (
MONEY
	 1,200 sq yd
PRODUCT
	 3-
GPE
	 Russia
	 Norway
	 Lapland
	 
Bering Islands Arctic
	 Alaska
	 Svalbard
	 Súðavík
	 Fennoscandia
	 Baltimore
	 Finland
	 New Zealand's
	 Aleutian Canada
	 Kola
	 Canada
	 Sweden
	 Vulpes
	 Siberia
	 Iceland
	 Greenland
WORK_OF_ART
	 Vulpes
NORP
	 Fennoscandian
	 Arctic
	 North American
	 Latin
	 Scandinavian
P