### Installation

We follow the instructions from https://spacy.io/docs/usage/

In [1]:
!sudo -H python3 -m pip install -U spacy

Requirement already up-to-date: spacy in /usr/local/lib/python3.5/dist-packages
Requirement already up-to-date: thinc<6.6.0,>=6.5.0 in /usr/local/lib/python3.5/dist-packages (from spacy)
Requirement already up-to-date: pip<10.0.0,>=9.0.0 in /usr/local/lib/python3.5/dist-packages (from spacy)
Requirement already up-to-date: ftfy<5.0.0,>=4.4.2 in /usr/local/lib/python3.5/dist-packages (from spacy)
Requirement already up-to-date: preshed<2.0.0,>=1.0.0 in /usr/local/lib/python3.5/dist-packages (from spacy)
Requirement already up-to-date: numpy>=1.7 in /usr/local/lib/python3.5/dist-packages (from spacy)
Requirement already up-to-date: plac<1.0.0,>=0.9.6 in /usr/local/lib/python3.5/dist-packages (from spacy)
Requirement already up-to-date: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.5/dist-packages (from spacy)
Requirement already up-to-date: murmurhash<0.27,>=0.26 in /usr/local/lib/python3.5/dist-packages (from spacy)
Requirement already up-to-date: six in /usr/local/lib/python3.5/dis

In [None]:
# Support for english
!sudo python3 -m spacy.en.download all

In [None]:
# Support for german
!sudo python3 -m spacy.de.download all

### Starting with Spacy

We first import the library and create an `nlp` variable, instantiated for English (`'en'`).

In [2]:
import spacy

# Load the space library, instantiated for English
#note: the first time you run spaCy in a file it takes a little while to load up its modules
nlp = spacy.load('en') 

From https://nicschrading.com/project/Intro-to-NLP-with-spaCy/

In [None]:
text = """There is an art, it says, or rather, a knack to flying. 
The knack lies in learning how to throw yourself at the ground and miss.
In the beginning the Universe was created. This has made a lot of people
very angry and been widely regarded as a bad move.
This Prof. Panos, Ph.D. costs $12,345.67"""

In [None]:
# all you have to do to parse text is this:
doc = nlp(text)

In [None]:
tokens = [token for token in doc]
tokens[:10]

In [None]:
# Let's look at the tokens
# All you have to do is iterate through the doc
# Each token is an object with lots of different properties
# A property with an underscore at the end returns the string representation
# while a property without the underscore returns an index (int) into spaCy's vocabulary
# The probability estimate is based on counts from a 3 billion word corpus
for i, token in enumerate(doc):
    print("original:", token.orth, token.orth_)
    print("lowercased:", token.lower, token.lower_)
    print("lemma:", token.lemma, token.lemma_)
    print("shape:", token.shape, token.shape_)
    print("prefix:", token.prefix, token.prefix_)
    print("suffix:", token.suffix, token.suffix_)
    print("part of speech:", token.pos_)
    print("log probability:", token.prob)
    print("Brown cluster id:", token.cluster)
    print("----------------------------------------")
    if i > 5:
        break

#### Get some data

First let's get a few text files, so that we can run our examples.

In [None]:
!mkdir data
!curl -L 'https://raw.githubusercontent.com/cytora/pycon-nlp-in-10-lines/master/data/article.txt' -o data/article.txt
!curl -L 'https://raw.githubusercontent.com/cytora/pycon-nlp-in-10-lines/master/data/pride_and_prejudice.txt' -o data/pride_and_prejudice.txt
!curl -L 'https://raw.githubusercontent.com/cytora/pycon-nlp-in-10-lines/master/data/rand-terrorism-dataset.txt'  -o data/rand-terrorism-dataset.txt

Now we will read the text file and then we will use the `nlp` object from spacy to analyze the text.

In [None]:
filename = "data/article.txt"
text = open(filename, 'r').read()
doc = nlp(text)

##### Print tokens

In [None]:
# Print tokens, one token per line
# The enumerate function is just used to add a counter
for token in doc:
    print(token)

###### Print Sentences

In [None]:
# Print the first 5 sentences (one sentence per line)
# The enumerate function is just used to add a counter
for i, sent in enumerate(doc.sents):
    print(i, "==>", sent)
    if i>5:
        break

#### Named Entities 

In [15]:
entities = set([ent.lemma_ for ent in doc.ents])
# entities

In [24]:
text = '''JPMorgan rose 0.45% to 95.03 in the stock market today. Intraday shares rose as high as 95.37, briefly clearing a 95.32 flat base buy point. Volume was light.
Bank of America, whose large U.S. operations make it sensitive to the Fed's rate hikes, is in a shallow base with a 25.45 buy point. BofA shares edged up 0.4% to 25.16.
Morgan Stanley is in a shallow base with a 49 buy point. Morgan Stanley climbed 0.8% to 48.26.
PNC Financial Services cleared a flat base with a 133.36 buy point for most of the session, but closed up 0.3% to 133.35, a penny below that mark. PNC also topped that entry intraday Wednesday, but closed just below that level.
Citigroup rose 0.4% to 71.76. Citigroup is still in buy range after clearing a 69.96 flat-base entry on Monday.'''
doc = nlp(text)

In [25]:
entities_with_type = set([ent.lemma_+" # "+ent.label_ for ent in doc.ents if ent.label_=='ORG' ])
entities_with_type

{'bank of america # ORG',
 'citigroup # ORG',
 'fed # ORG',
 'morgan stanley # ORG',
 'pnc # ORG',
 'pnc financial services # ORG'}

#### Noun chunks

In [26]:
chunks = [chunk.lemma_ for chunk in doc.noun_chunks if chunk.lemma_ not in entities]
chunks

['jpmorgan',
 'the stock market',
 'intraday share',
 'a 95.32 flat base buy point',
 'volume',
 'bank',
 'america',
 'large u.s. operation',
 '-PRON-',
 "the fed 's rate hike",
 'a shallow base',
 'a 25.45 buy point',
 'bofa share',
 'a shallow base',
 'a 49 buy point',
 'a flat base',
 'a 133.36 buy point',
 'the session',
 'that mark',
 'that entry intraday',
 'that level',
 'range',
 'a 69.96 flat - base entry']

In [19]:
from collections import Counter

keywords = Counter()
for chunk in chunks:
    # print(chunk, nlp.vocab[chunk].prob )
    if nlp.vocab[chunk].prob < -8: # probablity value -8 is arbitrarily selected threshold
        keywords[chunk] += 1

keywords.most_common(20)

[('a shallow base', 2),
 ('-PRON-', 1),
 ('a 49 buy point', 1),
 ('a 95.32 flat base buy point', 1),
 ('that level', 1),
 ('jpmorgan', 1),
 ('a flat base', 1),
 ('volume', 1),
 ("the fed 's rate hike", 1),
 ('a 133.36 buy point', 1),
 ('a 25.45 buy point', 1),
 ('bofa share', 1),
 ('a 69.96 flat - base entry', 1),
 ('intraday share', 1),
 ('bank', 1),
 ('the session', 1),
 ('that mark', 1),
 ('america', 1),
 ('that entry intraday', 1),
 ('range', 1)]

In [20]:
for ent1 in doc.ents:
    for ent2 in doc.ents:
        similarity = ent1.similarity(ent2)
        if similarity > 0.5:
            print('{} - {} - {}' .format(ent1, ent2, similarity))

0.45% to - 0.45% to - 1.0000000021195457
0.45% to - as high as 95.37, briefly - 0.5840479437952512
0.45% to - 0.4% to - 1.0000000021195457
0.45% to - 0.8% to - 1.0000000021195457
0.45% to - 0.3% to - 1.0000000021195457
0.45% to - 0.4% to - 1.0000000021195457
today - today - 0.9999999639922463
today - as high as 95.37, briefly - 0.5182494980459993
as high as 95.37, briefly - 0.45% to - 0.5840479437952512
as high as 95.37, briefly - today - 0.5182494980459993
as high as 95.37, briefly - as high as 95.37, briefly - 0.9999999697445826
as high as 95.37, briefly - 0.4% to - 0.5840479437952512
as high as 95.37, briefly - 0.8% to - 0.5840479437952512
as high as 95.37, briefly - 0.3% to - 0.5840479437952512
as high as 95.37, briefly - 0.4% to - 0.5840479437952512
Bank of America - Bank of America - 1.0000000042989907
Bank of America - PNC Financial Services - 0.58372110136483
Fed - Fed - 0.9999999493994777
0.4% to - 0.45% to - 1.0000000021195457
0.4% to - as high as 95.37, briefly - 0.584047943

In [23]:
from numpy import dot
from numpy.linalg import norm

# cosine similarity
cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))

# Let's see if it can figure out this analogy
# B is to A as C is to ???
a = nlp.vocab['London']
b = nlp.vocab['UK']
c = nlp.vocab['France']

result = a.vector - b.vector + c.vector

# gather all known words, take only the lowercased versions
allWords = list({w for w in nlp.vocab if w.has_vector and w.is_title and w.lower_ not in set({a.lower_,b.lower_,c.lower_})})
# sort by similarity to the result
allWords.sort(key=lambda w: cosine(w.vector, result))
allWords.reverse()
print("\n----------------------------\nTop 3 closest results:")
for word in allWords[:3]:   
    print(word.orth_)
    
# it got it! 


----------------------------
Top 3 closest results:
Recall
Pride
Phase


  """


In [22]:
filename = "data/rand-terrorism-dataset.txt"
text = open(filename, 'r').read()
doc = nlp(text)

In [None]:
filename = "data/pride_and_prejudice.txt"
text = open(filename, 'r').read()
doc = nlp(text)