### Installation

We follow the instructions from https://spacy.io/docs/usage/

In [None]:
# Install spaCy (or upgrade it, because of -U) with pip for the system python3, as root.
# NOTE(review): the python3 reached through sudo may not be the interpreter this
# kernel runs on - confirm the package lands in the kernel's environment.
!sudo python3 -m pip install -U spacy

In [None]:
# Support for English: run spaCy's English download module with the `all` argument.
# NOTE(review): the `spacy.en.download` module path is version-specific - confirm it
# exists in the spaCy release installed above.
!sudo python3 -m spacy.en.download all

In [None]:
# Support for German: run spaCy's German download module with the `all` argument.
# NOTE(review): the `spacy.de.download` module path is version-specific - confirm it
# exists in the spaCy release installed above.
!sudo python3 -m spacy.de.download all

### Starting with spaCy

We first import the library and create an `nlp` variable, instantiated for English (`'en'`).

In [1]:
import spacy

# Load the spaCy library, instantiated for English
# Note: the first time you run spaCy in a file it takes a little while to load up its modules
nlp = spacy.load('en') 

The example below is taken from https://nicschrading.com/project/Intro-to-NLP-with-spaCy/

In [2]:
# Sample multi-line passage to parse. The last line mixes abbreviations
# ("Prof.", "Ph.D.") with a currency amount ("$12,345.67").
text = """There is an art, it says, or rather, a knack to flying. 
The knack lies in learning how to throw yourself at the ground and miss.
In the beginning the Universe was created. This has made a lot of people
very angry and been widely regarded as a bad move.
This Prof. Panos, Ph.D. costs $12,345.67"""

In [3]:
# All you have to do to parse text is call the `nlp` object on it;
# the result is kept in `doc`.
doc = nlp(text)

In [6]:
# Materialise the document's tokens as a list and preview the first ten.
tokens = list(doc)
tokens[:10]

[There, is, an, art, ,, it, says, ,, or, rather]

In [10]:
# Let's look at the tokens: iterating over the doc yields them one by one.
# Each token is an object with lots of different properties. A property whose
# name ends in an underscore returns the string representation, while the same
# name without the underscore returns an index (int) into spaCy's vocabulary.
# The probability estimate is based on counts from a 3 billion word corpus.
paired_attrs = [
    ("original:", "orth"),
    ("lowercased:", "lower"),
    ("lemma:", "lemma"),
    ("shape:", "shape"),
    ("prefix:", "prefix"),
    ("suffix:", "suffix"),
]
for position, tok in enumerate(doc):
    if position > 6:  # stop once the first seven tokens have been shown
        break
    # Print the integer id next to its string form for every paired attribute
    for label, attr in paired_attrs:
        print(label, getattr(tok, attr), getattr(tok, attr + "_"))
    print("part of speech:", tok.pos_)
    print("log probability:", tok.prob)
    print("Brown cluster id:", tok.cluster)
    print("----------------------------------------")

original: 640 There
lowercased: 530 there
lemma: 530 there
shape: 489815 Xxxxx
prefix: 2907 T
suffix: 48458 ere
part of speech: ADV
log probability: -7.347356796264648
Brown cluster id: 1918
----------------------------------------
original: 474 is
lowercased: 474 is
lemma: 488 be
shape: 21581 xx
prefix: 570 i
suffix: 474 is
part of speech: VERB
log probability: -4.457748889923096
Brown cluster id: 762
----------------------------------------
original: 523 an
lowercased: 523 an
lemma: 523 an
shape: 21581 xx
prefix: 469 a
suffix: 523 an
part of speech: DET
log probability: -6.014852046966553
Brown cluster id: 3
----------------------------------------
original: 1630 art
lowercased: 1630 art
lemma: 1630 art
shape: 28983 xxx
prefix: 469 a
suffix: 1630 art
part of speech: NOUN
log probability: -9.584548950195312
Brown cluster id: 633
----------------------------------------
original: 416 ,
lowercased: 416 ,
lemma: 416 ,
shape: 416 ,
prefix: 416 ,
suffix: 416 ,
part of speech: PUNCT
log pro

#### Get some data

First let's get a few text files, so that we can run our examples.

In [None]:
!mkdir data
!curl -L 'https://raw.githubusercontent.com/cytora/pycon-nlp-in-10-lines/master/data/article.txt' -o data/article.txt
!curl -L 'https://raw.githubusercontent.com/cytora/pycon-nlp-in-10-lines/master/data/pride_and_prejudice.txt' -o data/pride_and_prejudice.txt
!curl -L 'https://raw.githubusercontent.com/cytora/pycon-nlp-in-10-lines/master/data/rand-terrorism-dataset.txt'  -o data/rand-terrorism-dataset.txt

Now we will read the text file and then we will use the `nlp` object from spacy to analyze the text.

In [None]:
filename = "data/article.txt"
# Read the whole file inside a context manager so the handle is closed as soon
# as the text is in memory, and decode explicitly instead of relying on the
# platform's default encoding.
# NOTE(review): assumes the file is UTF-8 encoded - confirm.
with open(filename, 'r', encoding='utf-8') as text_file:
    text = text_file.read()
# Parse the text with spaCy; later cells work on `doc`.
doc = nlp(text)

##### Print tokens

In [None]:
# Print tokens, one token per line
# NOTE: the loop has no limit, so every token of the document is printed.
for token in doc:
    print(token)

##### Print sentences

In [None]:
# Print the first 5 sentences (one sentence per line)
# The enumerate function is just used to add a counter
for i, sent in enumerate(doc.sents):
    # Check the limit before printing so exactly five sentences (i = 0..4) appear;
    # testing `i > 5` after the print let seven through.
    if i >= 5:
        break
    print(i, "==>", sent)

#### Named Entities 

In [63]:
# Distinct lemmatised texts of the named entities found in the document.
entities = {entity.lemma_ for entity in doc.ents}
entities

{'',
 '1',
 '1.e.1',
 '1.e.2',
 '1.e.3',
 '1.e.4',
 '1.e.7',
 '1.e.8',
 '1.f.3',
 '1.f.5',
 '1.f.6',
 '1342',
 '1342.txt',
 '15th october',
 '1998',
 '2',
 '2001',
 '26th',
 '3',
 '30 day',
 '33',
 '4',
 '5,000',
 '50',
 '596',
 '60 day',
 '64 - 6221541',
 '801',
 '809 north 1500 west',
 '90 day',
 'a',
 'a day',
 'a day later',
 'a few day',
 'a few day before',
 'a few hour',
 'a few line',
 'a few minute',
 'a few month ago',
 'a few struggle',
 'a few week',
 'a month',
 'a morning',
 'a thousand',
 'a week',
 'a year ago',
 'a year or',
 'about a fortnight',
 'about a month',
 'about a month ago',
 'about a year ago',
 'about eight year ago',
 'about fifteen or sixteen',
 'about five year ago',
 'about half',
 'about ten',
 'about thirty',
 'about three or four hundred',
 'about three year',
 'all day long',
 'almost a week',
 'almost every day',
 'an hour',
 'anne',
 'annesley',
 'anonymous volunteers',
 'as little as',
 'as little as \n possible',
 'as many as you',
 'ashworth',

In [64]:
# Distinct "<lemma> # <label>" strings, pairing each entity with its type.
entities_with_type = {
    " # ".join((entity.lemma_, entity.label_)) for entity in doc.ents
}
entities_with_type

{' # CARDINAL',
 ' # GPE',
 ' # NORP',
 ' # ORG',
 ' # PERSON',
 '1 # CARDINAL',
 '1.e.1 # CARDINAL',
 '1.e.2 # CARDINAL',
 '1.e.3 # CARDINAL',
 '1.e.4 # CARDINAL',
 '1.e.7 # CARDINAL',
 '1.e.8 # CARDINAL',
 '1.f.3 # CARDINAL',
 '1.f.5 # CARDINAL',
 '1.f.6 # CARDINAL',
 '1342 # MONEY',
 '1342.txt # CARDINAL',
 '15th october # DATE',
 '1998 # DATE',
 '2 # CARDINAL',
 '2001 # DATE',
 '26th # ORDINAL',
 '3 # CARDINAL',
 '30 day # DATE',
 '33 # CARDINAL',
 '4 # CARDINAL',
 '5,000 # MONEY',
 '50 # CARDINAL',
 '596 # CARDINAL',
 '60 day # DATE',
 '64 - 6221541 # CARDINAL',
 '801 # CARDINAL',
 '809 north 1500 west # FAC',
 '90 day # DATE',
 'a # TIME',
 'a day # DATE',
 'a day later # DATE',
 'a few day # DATE',
 'a few day before # DATE',
 'a few hour # TIME',
 'a few line # TIME',
 'a few minute # TIME',
 'a few month ago # DATE',
 'a few struggle # MONEY',
 'a few week # DATE',
 'a month # DATE',
 'a morning # TIME',
 'a thousand # CARDINAL',
 'a week # DATE',
 'a year ago # DATE',
 'a yea

#### Noun chunks

In [70]:
# Collect the lemmatised text of every noun chunk, in document order,
# leaving out any whose lemma is already in `entities`.
chunks = []
for noun_chunk in doc.noun_chunks:
    lemma_text = noun_chunk.lemma_
    if lemma_text not in entities:
        chunks.append(lemma_text)
chunks

['the project gutenberg ebook',
 'this ebook',
 'the use',
 'anyone',
 'no cost',
 'almost no restriction',
 'you',
 'it',
 'it',
 'it',
 'the term',
 'this ebook',
 'title',
 'ebook',
 'august 26 , 2008 [ ebook # 1342 ] \n release date',
 'language',
 'this project gutenberg ebook',
 'chapter',
 'it',
 'a truth',
 'a single man',
 'possession',
 'a good fortune',
 'want',
 'a wife',
 'the feeling',
 'view',
 'such a man',
 'a neighbourhood',
 'this truth',
 'the mind',
 'the surround family',
 'he',
 'their daughter',
 '" my dear mr. bennet',
 'his lady',
 'him',
 'you',
 'mr. bennet',
 'he',
 'it',
 'she',
 'mrs. long',
 'she',
 'me',
 'it',
 'mr. bennet',
 'no answer',
 'you',
 'who',
 'it',
 'his wife',
 'you',
 'me',
 'i',
 'no objection',
 'it',
 'invitation',
 'you',
 'mrs. long',
 'a young man',
 'large fortune',
 'the north',
 'he',
 'a chaise',
 'the place',
 'it',
 'he',
 'mr. morris',
 'he',
 'possession',
 'his servant',
 'the house',
 'the end',
 'what',
 'his name',
 '" 

In [71]:
from collections import Counter

# Count how often each noun chunk occurs, keeping only chunks whose vocabulary
# log-probability is below -8 (the -8 cut-off is an arbitrarily selected threshold).
keywords = Counter(
    phrase for phrase in chunks if nlp.vocab[phrase].prob < -8
)

keywords.most_common(20)

[('-PRON-', 538),
 ('mr. darcy', 194),
 ('herself', 172),
 ('mrs. bennet', 118),
 ('mr. collins', 107),
 ('himself', 91),
 ('whom', 84),
 ('mr. bingley', 79),
 ('her sister', 73),
 ('myself', 72),
 ('the room', 67),
 ('mr. bennet', 58),
 ('town', 54),
 ('the house', 53),
 ('her mother', 53),
 ('the world', 52),
 ('mr. wickham', 51),
 ('the subject', 47),
 ('chapter', 45),
 ('yourself', 43)]

In [None]:
from itertools import combinations

# Report pairs of entity mentions whose similarity exceeds 0.5.
# combinations() visits each unordered pair of distinct mentions exactly once,
# so a mention is never compared with itself and no pair is printed twice
# (the nested loop did both, and also did twice the similarity work).
for ent1, ent2 in combinations(doc.ents, 2):
    similarity = ent1.similarity(ent2)
    if similarity > 0.5:
        print('{} - {} - {}'.format(ent1, ent2, similarity))

In [83]:
from numpy import dot
from numpy.linalg import norm


def cosine(v1, v2):
    """Return the cosine similarity of vectors v1 and v2."""
    return dot(v1, v2) / (norm(v1) * norm(v2))


# Let's see if it can figure out this analogy
# B is to A as C is to ???
a = nlp.vocab['London']
b = nlp.vocab['UK']
c = nlp.vocab['France']

# Vector arithmetic for the analogy: A - B + C
result = a.vector - b.vector + c.vector

# Lowercased forms of the three query words, built once rather than once per
# vocabulary entry, so they can be excluded from the candidates.
query_words = {a.lower_, b.lower_, c.lower_}

# Candidates: vocabulary entries that have a vector, are title-cased, and are
# not one of the query words.
allWords = list({w for w in nlp.vocab
                 if w.has_vector and w.is_title and w.lower_ not in query_words})
# Sort by similarity to the result, most similar first
allWords.sort(key=lambda w: cosine(w.vector, result), reverse=True)
print("\n----------------------------\nTop 3 closest results:")
for word in allWords[:3]:
    print(word.orth_)

# it got it!


----------------------------
Top 3 closest results:
Paris
Italy
Europe


In [None]:
filename = "data/rand-terrorism-dataset.txt"
# Read the whole file inside a context manager so the handle is closed as soon
# as the text is in memory, and decode explicitly instead of relying on the
# platform's default encoding.
# NOTE(review): assumes the file is UTF-8 encoded - confirm.
with open(filename, 'r', encoding='utf-8') as text_file:
    text = text_file.read()
# Parse the text with spaCy; this replaces whatever `doc` held before.
doc = nlp(text)

In [61]:
filename = "data/pride_and_prejudice.txt"
# Read the whole file inside a context manager so the handle is closed as soon
# as the text is in memory, and decode explicitly instead of relying on the
# platform's default encoding.
# NOTE(review): assumes the file is UTF-8 encoded - confirm.
with open(filename, 'r', encoding='utf-8') as text_file:
    text = text_file.read()
# Parse the text with spaCy; this replaces whatever `doc` held before.
doc = nlp(text)