In [58]:
import spacy
from spacy import displacy

In [59]:
# Two-sentence sample text used by the cells below (adjacent literals are concatenated).
sample = (
    "AI has had a few excellent runs. "
    "In the Sixties, it was the great promise of what we would be able to do with the machine."
)

In [69]:
len(sample)

122

In [60]:
# Load the small English pipeline by its full package name, as the rest of this
# notebook does; the 'en' shortcut link is deprecated and cannot be resolved by
# spaCy v3+.
nlp = spacy.load('en_core_web_sm')

In [61]:
doc = nlp(sample)

In [72]:
# Iterating over a Doc yields its tokens one by one; print each of them.
for tok in doc:
    print(tok)

AI
has
had
a
few
excellent
runs
.
In
the
Sixties
,
it
was
the
great
promise
of
what
we
would
be
able
to
do
with
the
machine
.


In [63]:
# Materialise the Doc's tokens into a plain list.
tokens_ = list(doc)

In [64]:
print(tokens_)

[AI, has, had, a, few, excellent, runs, ., In, the, Sixties, ,, it, was, the, great, promise, of, what, we, would, be, able, to, do, with, the, machine, .]


In [65]:
# Identify the stop words. Token.is_stop is already a boolean, so it is tested
# directly instead of being compared against True.
for token in doc:
    if token.is_stop:
        print(token)

has
had
a
few
the
it
was
the
of
what
we
would
be
to
do
with
the


#### TOKENIZATION

During processing, spaCy first tokenizes the text, i.e. segments it into words, punctuation and so on. 
This is done by applying rules specific to each language. For example, punctuation at the end of a sentence should 
be split off – whereas "U.K." should remain one token. Each Doc consists of individual tokens, and we can iterate 
over them, as the cells in this section do.

First, the raw text is split on whitespace characters, similar to text.split(' '). Then, the tokenizer processes 
the text from left to right. On each substring, it performs two checks:

    1. Does the substring match a tokenizer exception rule? For example, "don't" does not contain whitespace, 
    but should be split into two tokens, "do" and "n't", while "U.K." should always remain one token.
    2. Can a prefix, suffix or infix be split off? For example punctuation like commas, periods, hyphens or quotes.

If there's a match, the rule is applied and the tokenizer continues its loop, starting with the newly split substrings.
This way, spaCy can split complex, nested tokens like combinations of abbreviations and multiple punctuation marks.

In [66]:
# Print the verbatim text of every token.
for tok in doc:
    print(tok.text)

AI
has
had
a
few
excellent
runs
.
In
the
Sixties
,
it
was
the
great
promise
of
what
we
would
be
able
to
do
with
the
machine
.


In [67]:
# Print the lemma_ string of every token.
for tok in doc:
    print(tok.lemma_)

ai
have
have
a
few
excellent
run
.
in
the
sixties
,
-PRON-
be
the
great
promise
of
what
-PRON-
would
be
able
to
do
with
the
machine
.


In [68]:
# Print the pos_ string of every token.
for tok in doc:
    print(tok.pos_)

PROPN
VERB
VERB
DET
ADJ
ADJ
NOUN
PUNCT
ADP
DET
PROPN
PUNCT
PRON
VERB
DET
ADJ
NOUN
ADP
NOUN
PRON
VERB
VERB
ADJ
PART
VERB
ADP
DET
NOUN
PUNCT


In [80]:
# Render the dependency parse inline in the notebook. The 'distance' option is set to 70
# (presumably the spacing between words in the drawing — confirm against the displaCy docs).
displacy.render(doc, style='dep', jupyter = True, options={'distance': 70})

#### NAMED ENTITY RECOGNITION

In [78]:
# For every named entity: its text, start/end character offsets and label.
for entity in doc.ents:
    print(
        entity.text,
        entity.start_char,
        entity.end_char,
        entity.label_,
    )

AI 0 2 PERSON
the Sixties 36 47 DATE


In [81]:
displacy.render(doc, style = 'ent', jupyter = True)

In [89]:
# Collect the orth_ string of each token (plain strings rather than Token objects).
tokens_2 = [tok.orth_ for tok in doc]

In [90]:
tokens_2

['AI',
 'has',
 'had',
 'a',
 'few',
 'excellent',
 'runs',
 '.',
 'In',
 'the',
 'Sixties',
 ',',
 'it',
 'was',
 'the',
 'great',
 'promise',
 'of',
 'what',
 'we',
 'would',
 'be',
 'able',
 'to',
 'do',
 'with',
 'the',
 'machine',
 '.']

In [85]:
# Collect the Token objects themselves into a list.
tokens_3 = list(doc)

In [86]:
tokens_3

[AI,
 has,
 had,
 a,
 few,
 excellent,
 runs,
 .,
 In,
 the,
 Sixties,
 ,,
 it,
 was,
 the,
 great,
 promise,
 of,
 what,
 we,
 would,
 be,
 able,
 to,
 do,
 with,
 the,
 machine,
 .]

#### PART OF SPEECH TAGGING

In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [10]:
# NOTE(review): the leading "'" and the misspellings "thorugh" / "humougous" are part of the
# input text and show up in the tagged output below — confirm they are intentional.
doc = nlp("'Apple is looking at buying U.K. startup for $1 billion. The deal if went thorugh is going to be humougous!")

In [16]:
# One row per token: text / lemma_ / pos_ / tag_ / is_punct / is_stop / dep_.
row_template = "{0} / {1} / {2} / {3} / {4} /{5} / {6}"
for tok in doc:
    fields = (tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.is_punct, tok.is_stop, tok.dep_)
    print(row_template.format(*fields))

' / ' / PUNCT / `` / True /False / punct
Apple / apple / PROPN / NNP / False /False / nsubj
is / be / VERB / VBZ / False /True / aux
looking / look / VERB / VBG / False /False / ROOT
at / at / ADP / IN / False /True / prep
buying / buy / VERB / VBG / False /False / pcomp
U.K. / u.k. / PROPN / NNP / False /False / compound
startup / startup / NOUN / NN / False /False / dobj
for / for / ADP / IN / False /True / prep
$ / $ / SYM / $ / False /False / quantmod
1 / 1 / NUM / CD / False /False / compound
billion / billion / NUM / CD / False /False / pobj
. / . / PUNCT / . / True /False / punct
The / the / DET / DT / False /False / det
deal / deal / NOUN / NN / False /False / nsubj
if / if / ADP / IN / False /True / mark
went / go / VERB / VBD / False /False / advcl
thorugh / thorugh / NOUN / NN / False /False / acomp
is / be / VERB / VBZ / False /True / aux
going / go / VERB / VBG / False /False / ROOT
to / to / PART / TO / False /True / aux
be / be / VERB / VB / False /True / xcomp
humougous

In [18]:
from spacy import displacy

# With jupyter=True, displacy.render displays the visualisation itself and returns None,
# so wrapping the call in print() only added a stray "None" line to the output.
displacy.render(doc, jupyter=True, style='ent')

None


#### DEPENDENCY PARSING

NOUN CHUNKS

In [19]:
nlp = spacy.load('en_core_web_sm')

In [21]:
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")

In [29]:
# For each noun chunk: chunk text / root token text / head of the root / dependency label of the root.
for noun_chunk in doc.noun_chunks:
    root = noun_chunk.root
    print(("{0} / {1} / {2} / {3}").format(noun_chunk.text, root.text, root.head, root.dep_))

Autonomous cars / cars / shift / nsubj
insurance liability / liability / shift / dobj
manufacturers / manufacturers / toward / pobj


#### NAVIGATING THE PARSE TREE

spaCy uses the terms head and child to describe the words connected by a single arc in the dependency tree. 
The term dep is used for the arc label, which describes the type of syntactic relation that connects the child to 
the head. As with other attributes, the value of .dep is a hash value. You can get the string value with .dep_.

In [30]:
nlp = spacy.load("en_core_web_sm")

In [31]:
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")

In [35]:
# For each token: its text, its head's text and pos_, its dependency label, and its children.
for tok in doc:
    children = list(tok.children)
    print(tok.text, tok.head.text, tok.head.pos_, tok.dep_, children)

Autonomous cars NOUN amod []
cars shift VERB nsubj [Autonomous]
shift shift VERB ROOT [cars, liability, toward]
insurance liability NOUN compound []
liability shift VERB dobj [insurance]
toward shift VERB prep [manufacturers]
manufacturers toward ADP pobj []


In [36]:
displacy.render(doc, jupyter = True, style = 'dep')

#### ITERATING AROUND THE LOCAL TREE

In [41]:
import spacy

# Load the German pipeline by its full package name; the 'de' shortcut link is
# deprecated and cannot be resolved by spaCy v3+.
nlp = spacy.load('de_core_news_sm')
doc = nlp("schöne rote Äpfel auf dem Baum")
# doc[2] is "Äpfel": list the tokens yielded by its .lefts and .rights.
print([token.text for token in doc[2].lefts])  # ['schöne', 'rote']
print([token.text for token in doc[2].rights])  # ['auf']

['schöne', 'rote']
['auf']


In [42]:
import spacy

In [43]:
nlp = spacy.load('en_core_web_sm')

In [44]:
doc = nlp("bright red apples on the tree")

In [49]:
print([token.text for token in doc[2].lefts])

['bright', 'red']


In [50]:
print([token.text for token in doc[2].rights])

['on']


#### WORD VECTORS AND SIMILARITY

spaCy is able to compare two objects, and make a prediction of how similar they are. 
Predicting similarity is useful for building recommendation systems or flagging duplicates. 
For example, you can suggest content to a user that's similar to what they're currently looking at, 
or label a support ticket as a duplicate if it's very similar to an already existing one.

Each Doc, Span and Token comes with a .similarity() method that lets you compare it with another object, 
and determine the similarity. Of course similarity is always subjective – whether "dog" and "cat" are similar 
really depends on how you're looking at it. spaCy's similarity model usually assumes a pretty general-purpose 
definition of similarity.

In [52]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [53]:
doc = nlp("dog cat banana")

In [54]:
# Compare every token with every token (itself included) and print the similarity score.
for left in doc:
    for right in doc:
        score = left.similarity(right)
        print(left.text, right.text, score)

dog dog 1.0
dog cat 0.53907
dog banana 0.28761
cat dog 0.53907
cat cat 1.0
cat banana 0.487522
banana dog 0.28761
banana cat 0.487522
banana banana 1.0


In [57]:
# Show each token's vector next to its text.
for tok in doc:
    print(tok.text, tok.vector)

dog [  8.70676517e-01   2.25968695e+00  -6.31433487e-01   5.31767309e-01
   2.93477565e-01   1.71422577e+00  -3.22116423e+00   8.72991681e-01
   1.93997455e+00   2.87031174e+00   2.26860499e+00  -1.70549417e+00
   1.75721872e+00   1.00328207e-01  -1.84722948e+00   8.10127974e-01
  -2.36641645e+00  -6.74954057e-01  -2.57085705e+00   1.77607059e+00
  -1.85285002e-01   1.23867345e+00  -9.75624323e-01   1.93797350e+00
   2.00332999e-02  -1.60199916e+00  -5.47065139e-01   1.61460507e+00
  -2.17767835e-01  -2.21958613e+00  -7.38540709e-01  -6.19614363e-01
  -3.90585244e-01   1.81051493e-02  -5.94446249e-02  -1.39486265e+00
   1.60074520e+00  -5.22987545e-01   2.30428672e+00  -1.77107120e+00
  -1.88900483e+00  -2.22219467e+00   1.16671133e+00  -2.62894034e+00
  -2.51317692e+00  -6.32965326e-01  -2.00493360e+00  -7.31566608e-01
  -1.48643404e-01  -1.04217589e+00   3.28766537e+00  -2.02324677e+00
  -2.19653702e+00   3.25622582e+00   3.02244604e-01  -2.88570762e+00
   3.49860716e+00   3.15575504

   5.49880266e-02   2.16840312e-01  -1.06817119e-01   3.27902555e-01]
banana [ -1.41948950e+00   5.58564067e-01   1.00896370e+00   9.34259415e-01
   1.15513587e+00   3.79428953e-01  -3.80002856e+00  -2.19876575e+00
   7.66154289e-01   1.85205436e+00   1.49350238e+00  -3.63348961e-01
   9.93571162e-01  -1.87211156e-01  -2.40033650e+00   1.19870663e+00
  -2.99316025e+00  -2.29795551e+00  -1.68042183e+00  -5.24839997e-01
   1.56570005e+00   2.40306044e+00   9.31506872e-01   2.49481010e+00
  -2.66593516e-01  -3.92615080e-01  -1.16136670e+00   1.16194636e-01
  -2.53938168e-01  -1.15748477e+00  -3.12320799e-01   1.84694076e+00
   2.14433932e+00   6.78922892e-01  -2.72198617e-01  -2.13090634e+00
  -1.37582466e-01   8.65555704e-01  -8.46146524e-01  -9.20764685e-01
  -2.96591711e+00   4.35806561e+00   2.49332428e+00  -1.94552362e+00
  -5.19501328e-01   4.18750572e+00  -3.46043825e-01  -2.68519306e+00
   2.11666584e+00   1.56255913e+00  -6.16478622e-01  -2.74263763e+00
  -2.25735211e+00  -1.5611

In [59]:
import spacy

nlp = spacy.load('en_core_web_sm')
tokens = nlp(u'dog cat banana afskfsd')

# Per token: text, has_vector, vector_norm and is_oov.
# NOTE(review): the recorded output reports is_oov=True for every token, even "dog" —
# presumably because the small model has no static vector table; confirm, and consider
# a md/lg model for this demonstration.
for tok in tokens:
    print(tok.text, tok.has_vector, tok.vector_norm, tok.is_oov)

dog True 23.8584 True
cat True 24.3287 True
banana True 25.8637 True
afskfsd True 26.5983 True


In [1]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'dog cat banana afskfsd')

#### VOCAB, HASHES, LEXEME

In [10]:
import spacy

sample = "I love coffee"
nlp = spacy.load('en_core_web_sm')
doc = nlp(sample)

In [11]:
# One row per token: text / tag_ / pos_ / is_stop / lemma_.
row_template = "{0} / {1} / {2} / {3} / {4}"
for tok in doc:
    print(row_template.format(tok.text, tok.tag_, tok.pos_, tok.is_stop, tok.lemma_))

I / PRP / PRON / False / -PRON-
love / VBP / VERB / False / love
coffee / NN / NOUN / False / coffee


In [19]:
# Look up the string 'I' in the vocab's string store; the printed result is an integer hash.
print(doc.vocab.strings['I'])

4690420944186131903


In [30]:
# LEXEME = each word in a vocabulary
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('I Love Coffee')

for token in doc:
    lexeme = doc.vocab[token.text]
    print(lexeme.text, lexeme.orth, lexeme.prefix_, lexeme.suffix_, lexeme.is_alpha, lexeme.is_stop, lexeme.lang_)

I 4690420944186131903 I I True False en
Love 13599639812707930908 L ove True False en
Coffee 3474706295102377020 C fee True False en


In [1]:
import spacy
from spacy import displacy

In [3]:
nlp = spacy.load('en_core_web_sm')
sample = 'Apple is looking at buying U.K. startup for $1 billion'
doc = nlp(sample)

In [4]:
# For every named entity: its text, start/end character offsets and label.
for entity in doc.ents:
    print(
        entity.text,
        entity.start_char,
        entity.end_char,
        entity.label_,
    )

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [5]:
displacy.render(doc, style = 'ent', jupyter = True)

#### LIGHTNING TOUR

In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')
doc = nlp("Peach emoji is where it has always been. Peach is the superior emoji. It's outranking eggplant 🍑")

In [3]:
# Plain-string text of every token in the Doc.
words = [tok.text for tok in doc]

In [6]:
words[0], words[2], words[-1]

('Peach', 'is', '🍑')

In [16]:
# Materialise the noun-chunk iterator into a list.
noun_chunks_ = list(doc.noun_chunks)

In [17]:
noun_chunks_

[Peach emoji, it, Peach, the superior emoji, It, eggplant 🍑]

In [18]:
# Materialise the sentence iterator into a list.
sentences = list(doc.sents)

In [19]:
sentences

[Peach emoji is where it has always been.,
 Peach is the superior emoji.,
 It's outranking eggplant 🍑]

In [29]:
# Sanity check: the sample text was split into three sentences above, so fail loudly
# if the segmenter ever yields a different count.
assert len(sentences) == 3, 'the length of the sentences is not correct'
print(sentences)

[Peach emoji is where it has always been., Peach is the superior emoji., It's outranking eggplant 🍑]


In [30]:
import spacy

In [31]:
nlp = spacy.load('en_core_web_sm')

In [32]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

In [33]:
apple = doc[0]

In [35]:
# pos_ / pos hold the coarse-grained part of speech (e.g. PROPN); tag_ / tag hold the
# fine-grained tag (e.g. NNP). The original labels had these two swapped.
print('coarse-grained POS:', apple.pos_, apple.pos)
print("fine-grained POS:", apple.tag_, apple.tag)
print("Shape of the token:", apple.shape_, apple.shape)
# is_alpha tests for alphabetic characters, not alphanumeric ones.
print("Alphabetic characters only:", apple.is_alpha)
print('Punctuation? :', apple.is_punct)

fine-grained POS: PROPN 95
coarse-grained POS: NNP 15794550382381185553
Shape of the token: Xxxxx 16072095006890171862
Alphanumeric characters: True
Punctuation? : False


#### USING THE 'LIKE_NUM' AND 'IS_DIGIT' ATTRIBUTES

In [58]:
sample = "two, three, hundred, thousand, million, billion, 10, 20, 50 is like a number"
doc_1 = nlp(sample)

In [59]:
# Collect the Token objects of doc_1 into a list.
words = list(doc_1)

In [60]:
words

[two,
 ,,
 three,
 ,,
 hundred,
 ,,
 thousand,
 ,,
 million,
 ,,
 billion,
 ,,
 10,
 ,,
 20,
 ,,
 50,
 is,
 like,
 a,
 number]

In [61]:
print("is it like a number? ", words[-1].like_num)

is it like a number?  False


In [62]:
# Print every token that resembles a number. like_num is already a boolean, so it is
# tested directly instead of being compared against True.
for word in words:
    if word.like_num:
        print(word)

two
three
hundred
thousand
million
billion
10
20
50


In [63]:
# Print every token that consists of digits. is_digit is already a boolean, so it is
# tested directly instead of being compared against True.
for word in words:
    if word.is_digit:
        print(word)

10
20
50


In [None]:
# Read a comma-separated list of numbers and split it into its entries.
# The original split on an apostrophe, which leaves typical input such as "1, 2, 3"
# in one piece, and wrapped the result in a redundant list() (str.split already
# returns a list). Surrounding whitespace is stripped and empty entries are dropped.
num = input('Input the numbers (comma-separated): ')
list_1 = [part.strip() for part in num.split(",") if part.strip()]

#### RECOGNISE AND UPDATE NAMED ENTITIES

In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')
doc = nlp("San Francisco considers banning sidewalk delivery robots")

In [3]:
# For every named entity: its text, start/end character offsets and label.
for entity in doc.ents:
    print(
        entity.text,
        entity.start_char,
        entity.end_char,
        entity.label_,
    )

San Francisco 0 13 GPE


In [4]:
spacy.explain('GPE')

'Countries, cities, states'

In [5]:
from spacy.tokens import Span

In [7]:
# Run the pipeline on a sentence and list the entities it predicts.
doc_1 = nlp("FB is hiring a new VP of global policy")
for entity in doc_1.ents:
    print(
        entity.text,
        entity.start_char,
        entity.end_char,
        entity.label_,
    )

VP 19 21 ORG


In [15]:
# Replace the document's entities with a single hand-built ORG span over token 0 ("FB").
org_label = doc_1.vocab.strings['ORG']
fb_ent = Span(doc_1, 0, 1, label=org_label)
doc_1.ents = [fb_ent]
for entity in doc_1.ents:
    print(entity.text, entity.label_)

FB ORG


#### TRAIN AND UPDATE A NEURAL NETWORK MODEL