# Learning NLP with spaCy

In [4]:
# Set up spaCy: build the English language object. Calling it on a string
# (see `parser(multiSentence)` further down) returns the token sequence that
# the rest of this notebook inspects.
# NOTE(review): no trained model is loaded here, so this is presumably a
# tokenizer-only pipeline -- confirm against the installed spaCy version.
from spacy.lang.en import English
parser = English()

# Test data: four sentences used as tokenizer input throughout the notebook.
# Adjacent string literals are concatenated with nothing inserted between
# them, so every fragment that ends a sentence (or breaks mid-sentence) must
# carry its own trailing space; otherwise the text contains "flying.The" and
# "miss.In" and the sentences run together.
multiSentence = (
    "There is an art, it says, or rather, a knack to flying. "
    "The knack lies in learning how to throw yourself at the ground and miss. "
    "In the beginning the Universe was created. This has made a lot of people "
    "very angry and been widely regarded as a bad move."
)

## Tokenization
### Iterating through Tokens
Each token is an object with many different properties.
A property whose name ends in an underscore (e.g. `orth_`) returns the string representation, while the same property without the underscore (e.g. `orth`) returns the corresponding integer ID (int) in spaCy's vocabulary.
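The probability estimate is based on counts from a 3 billion word corpus, smoothed using the Simple Good-Turing method. Note that in the output below every token reports a log probability of -20.0 and a Brown cluster id of 0, which suggests that no probability or cluster data is loaded for this pipeline and these are placeholder values.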

In [8]:
# Run the pipeline over the sample text to obtain the token sequence.
parsedData = parser(multiSentence)

# (label, attribute) pairs. For each pair the integer form is printed first,
# then the string form, which lives under the same name plus a trailing "_".
_paired_attrs = (
    ("original:", "orth"),
    ("lowercased:", "lower"),
    ("lemma:", "lemma"),
    ("shape:", "shape"),
    ("prefix:", "prefix"),
    ("suffix:", "suffix"),
)

for i, token in enumerate(parsedData):
    print(i, token)
    for _label, _attr in _paired_attrs:
        print(_label, getattr(token, _attr), getattr(token, _attr + "_"))
    # These two attributes are printed as a single value each.
    print("log probability:", token.prob)
    print("Brown cluster id:", token.cluster)
    print("----------------------------------------")
    # Stop once the tokens at positions 0, 1 and 2 have been shown.
    if i >= 2:
        break

0 There
original: 6090035477591592277 There
lowercased: 2112642640949226496 there
lemma: 6090035477591592277 There
shape: 16072095006890171862 Xxxxx
prefix: 5582244037879929967 T
suffix: 18139757808136603089 ere
log probability: -20.0
Brown cluster id: 0
----------------------------------------
1 is
original: 3411606890003347522 is
lowercased: 3411606890003347522 is
lemma: 10382539506755952630 be
shape: 4370460163704169311 xx
prefix: 5097672513440128799 i
suffix: 3411606890003347522 is
log probability: -20.0
Brown cluster id: 0
----------------------------------------
2 an
original: 15099054000809333061 an
lowercased: 15099054000809333061 an
lemma: 11901859001352538922 a
shape: 4370460163704169311 xx
prefix: 11901859001352538922 a
suffix: 15099054000809333061 an
log probability: -20.0
Brown cluster id: 0
----------------------------------------
