In [1]:
from pprintpp import pprint
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the small English model; `nlp` is the callable the cells below use to
# turn raw text into an annotated document.
nlp = en_core_web_sm.load()

In [2]:
# Parse a single headline and list every named entity with its label.
doc = nlp(
    'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'
)
entity_pairs = [(ent.text, ent.label_) for ent in doc.ents]
pprint(entity_pairs)

[
    ('European', 'NORP'),
    ('Google', 'ORG'),
    ('$5.1 billion', 'MONEY'),
    ('Wednesday', 'DATE'),
]


In [3]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=30):
    """Download a web page and return its text content as one line of text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The text of the page with all ``<script>``, ``<style>`` and
        ``<aside>`` elements removed, and every run of newlines/tabs replaced
        by a single space.

    Raises:
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
            status; raised so an error page is never parsed as the article.
        requests.exceptions.Timeout: The server did not answer in time.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of extracting text from them.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Remove elements whose content is not part of the readable article.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

# Fetch the article text over the network, run it through the pipeline and
# display how many named entities were found.
ny_bb = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')
article = nlp(ny_bb)
len(article.ents)

155

In [4]:
# Count how often each entity label occurs in the article.
labels = [x.label_ for x in article.ents]
Counter(labels)

Counter({'ORG': 39,
         'PERSON': 77,
         'DATE': 23,
         'GPE': 9,
         'NORP': 2,
         'CARDINAL': 3,
         'LOC': 1,
         'ORDINAL': 1})

In [5]:
# Materialise the article's sentences, then re-parse one of them on its own
# and highlight the entities found in it.
sentences = list(article.sents)
displacy.render(nlp(str(sentences[20])), style='ent', jupyter=True)

In [6]:
# Draw the dependency parse of the same sentence (re-parsed in isolation).
# NOTE(review): 'distance' is a displaCy layout option - presumably the
# spacing between tokens; confirm against the displaCy docs.
displacy.render(nlp(str(sentences[20])), style='dep', jupyter = True, options = {'distance': 120})

In [7]:
# (surface form, coarse POS, lemma) for every token of the sentence that is
# neither a stop word nor punctuation.
[
    (tok.orth_, tok.pos_, tok.lemma_)
    for tok in nlp(str(sentences[20]))
    if not (tok.is_stop or tok.pos_ == 'PUNCT')
]

[('Aitan', 'PROPN', 'Aitan'),
 ('Goelman', 'PROPN', 'Goelman'),
 ('Mr.', 'PROPN', 'Mr.'),
 ('Strzok', 'PROPN', 'Strzok'),
 ('lawyer', 'NOUN', 'lawyer'),
 ('denounced', 'VERB', 'denounce'),
 ('client', 'NOUN', 'client'),
 ('dismissal', 'NOUN', 'dismissal')]

In [8]:
# Map each entity's text to its label for the re-parsed sentence.
{str(ent): ent.label_ for ent in nlp(str(sentences[20])).ents}

{'Aitan Goelman': 'ORG', 'Strzok': 'PERSON'}

In [9]:
# Token-level entity annotation (IOB flag and entity type) for the sentence.
# Unlike the cells above, this iterates the sentence taken from the parsed
# article directly instead of re-parsing its text.
print([(x, x.ent_iob_, x.ent_type_) for x in sentences[20]])

[(Aitan, 'B', 'ORG'), (Goelman, 'I', 'ORG'), (,, 'O', ''), (Mr., 'O', ''), (Strzok, 'B', 'PERSON'), (’s, 'O', ''), (lawyer, 'O', ''), (,, 'O', ''), (denounced, 'O', ''), (his, 'O', ''), (client, 'O', ''), (’s, 'O', ''), (dismissal, 'O', ''), (., 'O', '')]


In [10]:
# Print one tab-separated row of attributes per token.
doc = nlp("Next week I'll   be in Madrid.")
row_template = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}"
for token in doc:
    columns = (
        token.text,      # surface form
        token.idx,       # character offset in the input string
        token.lemma_,
        token.is_punct,
        token.is_space,
        token.shape_,
        token.pos_,      # coarse part of speech
        token.tag_,      # fine-grained tag
    )
    print(row_template.format(*columns))

Next	0	next	False	False	Xxxx	ADJ	JJ
week	5	week	False	False	xxxx	NOUN	NN
I	10	-PRON-	False	False	X	PRON	PRP
'll	11	will	False	False	'xx	VERB	MD
  	15	  	False	True	  	SPACE	_SP
be	17	be	False	False	xx	AUX	VB
in	20	in	False	False	xx	ADP	IN
Madrid	23	Madrid	False	False	Xxxxx	PROPN	NNP
.	29	.	True	False	.	PUNCT	.


In [11]:
# List each noun chunk of the sentence with its label and its root token.
doc = nlp("Wall Street Journal just published an interesting piece on crypto currencies")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.label_, chunk.root.text)
 
# Expected output:
# Wall Street Journal NP Journal
# an interesting piece NP piece
# crypto currencies NP currencies

Wall Street Journal NP Journal
an interesting piece NP piece
crypto currencies NP currencies


In [12]:
# Print every dependency arc as "child/tag <--relation-- head/tag", then
# draw the same parse with displaCy.
doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')

arc_template = "{0}/{1} <--{2}-- {3}/{4}"
for token in doc:
    head = token.head
    print(arc_template.format(token.text, token.tag_, token.dep_, head.text, head.tag_))
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

Wall/NNP <--compound-- Street/NNP
Street/NNP <--compound-- Journal/NNP
Journal/NNP <--nsubj-- published/VBD
just/RB <--advmod-- published/VBD
published/VBD <--ROOT-- published/VBD
an/DT <--det-- piece/NN
interesting/JJ <--amod-- piece/NN
piece/NN <--dobj-- published/VBD
on/IN <--prep-- piece/NN
crypto/NNP <--compound-- currencies/NNS
currencies/NNS <--pobj-- on/IN


In [13]:
# Rebind the global `nlp` to the 'en_core_web_lg' model; every cell that runs
# after this one uses that model until `nlp` is reassigned again.
nlp = spacy.load('en_core_web_lg')
# Print the word vector stored in the vocabulary for 'banana'.
print(nlp.vocab['banana'].vector)

[ 2.0228e-01 -7.6618e-02  3.7032e-01  3.2845e-02 -4.1957e-01  7.2069e-02
 -3.7476e-01  5.7460e-02 -1.2401e-02  5.2949e-01 -5.2380e-01 -1.9771e-01
 -3.4147e-01  5.3317e-01 -2.5331e-02  1.7380e-01  1.6772e-01  8.3984e-01
  5.5107e-02  1.0547e-01  3.7872e-01  2.4275e-01  1.4745e-02  5.5951e-01
  1.2521e-01 -6.7596e-01  3.5842e-01 -4.0028e-02  9.5949e-02 -5.0690e-01
 -8.5318e-02  1.7980e-01  3.3867e-01  1.3230e-01  3.1021e-01  2.1878e-01
  1.6853e-01  1.9874e-01 -5.7385e-01 -1.0649e-01  2.6669e-01  1.2838e-01
 -1.2803e-01 -1.3284e-01  1.2657e-01  8.6723e-01  9.6721e-02  4.8306e-01
  2.1271e-01 -5.4990e-02 -8.2425e-02  2.2408e-01  2.3975e-01 -6.2260e-02
  6.2194e-01 -5.9900e-01  4.3201e-01  2.8143e-01  3.3842e-02 -4.8815e-01
 -2.1359e-01  2.7401e-01  2.4095e-01  4.5950e-01 -1.8605e-01 -1.0497e+00
 -9.7305e-02 -1.8908e-01 -7.0929e-01  4.0195e-01 -1.8768e-01  5.1687e-01
  1.2520e-01  8.4150e-01  1.2097e-01  8.8239e-02 -2.9196e-02  1.2151e-03
  5.6825e-02 -2.7421e-01  2.5564e-01  6.9793e-02 -2

In [14]:
from scipy import spatial
 
def cosine_similarity(x, y):
    """Return the cosine similarity of two vectors (1 minus SciPy's cosine distance)."""
    return 1 - spatial.distance.cosine(x, y)


man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector
queen = nlp.vocab['queen'].vector
king = nlp.vocab['king'].vector

# Word-analogy arithmetic: look for the vocabulary entries closest to
# "man" - "woman" + "queen".
maybe_king = man - woman + queen

# Score every vocabulary entry that actually has a vector.
computed_similarities = [
    (word, cosine_similarity(maybe_king, word.vector))
    for word in nlp.vocab
    if word.has_vector
]

# Highest similarity first; show the ten best matches.
computed_similarities.sort(key=lambda item: -item[1])
print([w[0].text for w in computed_similarities[:10]])

['Queen', 'QUEEN', 'queen', 'King', 'KING', 'king', 'KIng', 'Kings', 'KINGS', 'kings']


In [30]:
# Look up four vocabulary entries and compare them pairwise with the
# built-in similarity() method.
banana = nlp.vocab['banana']
dog = nlp.vocab['dog']
fruit = nlp.vocab['fruit']
animal = nlp.vocab['animal']
 
# First line: dog vs. animal, dog vs. fruit; second line: banana vs. fruit, banana vs. animal.
print(dog.similarity(animal), dog.similarity(fruit))
print(banana.similarity(fruit), banana.similarity(animal))

0.66185343 0.23552851
0.67148364 0.24272855


In [35]:
import spacy
from spacy.tokens import Doc
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
 
# One shared VADER analyzer, reused by every call to polarity_scores().
sentiment_analyzer = SentimentIntensityAnalyzer()


def polarity_scores(doc):
    """Return the VADER polarity scores computed from the text of `doc`.

    `doc` only needs a `.text` attribute holding the string to score; the
    result is whatever `SentimentIntensityAnalyzer.polarity_scores` returns.
    """
    text = doc.text
    return sentiment_analyzer.polarity_scores(text)
 
# Expose the sentiment scores as the computed attribute `doc._.alt_polarity_scores`.
# force=True makes the registration idempotent: without it spaCy raises an
# error when this cell is run a second time in the same kernel, because the
# extension already exists.
Doc.set_extension('alt_polarity_scores', getter=polarity_scores, force=True)

# Rebind the global `nlp` to the small English model and try the attribute
# on a short example text.
nlp = en_core_web_sm.load()
doc = nlp("Really Whaaat event apple nice! it!")
print(doc._.alt_polarity_scores)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/julianhatwell/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


{'neg': 0.0, 'neu': 0.596, 'pos': 0.404, 'compound': 0.5242}


In [36]:
# Show the (name, component) pairs that make up the current pipeline.
print(nlp.pipeline)

[('tagger', <spacy.pipeline.pipes.Tagger object at 0x7f06183af898>), ('parser', <spacy.pipeline.pipes.DependencyParser object at 0x7f06182ac468>), ('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x7f06182ac4c8>)]


In [39]:
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from spacy.tokens import Token
 
 
def penn_to_wn(tag):
    """Convert a Penn Treebank POS tag to the matching WordNet POS letter.

    Only the four open word classes WordNet covers are mapped:

    * ``NN*`` (nouns)      -> ``'n'``
    * ``VB*`` (verbs)      -> ``'v'``
    * ``JJ*`` (adjectives) -> ``'a'``
    * ``RB*`` (adverbs)    -> ``'r'``

    Two-letter prefixes are matched on purpose. Matching on the first letter
    alone would also classify unrelated tags such as ``NFP`` (superfluous
    punctuation) as nouns and ``RP`` (particle) as adverbs.

    Args:
        tag: Penn Treebank tag string, e.g. ``'NNS'`` or ``'VBD'``.

    Returns:
        ``'n'``, ``'v'``, ``'a'`` or ``'r'``, or ``None`` when the tag has no
        WordNet counterpart.
    """
    if tag.startswith('NN'):
        return 'n'

    if tag.startswith('VB'):
        return 'v'

    if tag.startswith('JJ'):
        return 'a'

    if tag.startswith('RB'):
        return 'r'

    return None
 
 
class WordnetPipeline(object):
    """Pipeline component that stores a WordNet synset on each token.

    After the component has run, ``token._.synset`` holds the first synset
    WordNet returns for the token's text and part of speech, or ``None`` when
    the token's tag has no WordNet counterpart or WordNet has no entry for
    the word.
    """

    def __init__(self, nlp):
        """Register the ``synset`` token extension.

        Args:
            nlp: Accepted for symmetry with other components; not used here.
        """
        # force=True keeps construction idempotent: creating a second
        # instance (e.g. re-running the notebook cell) would otherwise raise
        # because the extension is already registered.
        Token.set_extension('synset', default=None, force=True)

    def __call__(self, doc):
        """Annotate every token of `doc` in place and return `doc`."""
        for token in doc:
            wn_tag = penn_to_wn(token.tag_)
            if wn_tag is None:
                continue

            synsets = wn.synsets(token.text, wn_tag)
            if not synsets:
                # WordNet has no entry for this word (e.g. an unusual proper
                # noun); keep the default None instead of raising IndexError.
                continue

            token._.set('synset', synsets[0])

        return doc

# Append the WordNet component to the pipeline. If a component named
# 'wn_synsets' is already registered (this cell was run before), swap it for
# the fresh instance instead of letting add_pipe raise on the duplicate name.
wn_pipeline = WordnetPipeline(nlp)
if nlp.has_pipe('wn_synsets'):
    nlp.replace_pipe('wn_synsets', wn_pipeline)
else:
    nlp.add_pipe(wn_pipeline, name='wn_synsets')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/julianhatwell/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [43]:

# Run the pipeline on a sample sentence and show the synset attached to each token.
doc = nlp("Paris is the awesome capital of France.")
 
for token in doc:
    # `token._.synset` is None for tokens the WordNet component did not annotate.
    print(token.text, "-", token._.synset)

Paris - Synset('paris.n.01')
is - Synset('be.v.01')
the - None
awesome - Synset('amazing.s.02')
capital - Synset('capital.n.01')
of - None
France - Synset('france.n.01')
. - None
