In [4]:
# Based on a Udemy lecture, with custom modifications

In [1]:
import spacy

In [None]:
# The commands below are only needed the first time the environment is set up.
# Their outputs are included here for reference.

In [3]:
# !python -m spacy info


    [93mInfo about spaCy[0m

    spaCy version      2.0.16         
    Location           /opt/anaconda3/envs/nlp_course/lib/python3.7/site-packages/spacy
    Platform           Darwin-19.6.0-x86_64-i386-64bit
    Python version     3.7.13         
    Models                            



In [5]:
# !python -m spacy download en

Collecting en_core_web_sm==2.0.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz#egg=en_core_web_sm==2.0.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz (37.4MB)
[K    100% |████████████████████████████████| 37.4MB 89.0MB/s ta 0:00:011  14% |████▌                           | 5.3MB 21.9MB/s eta 0:00:02██████████▏ | 35.2MB 12.1MB/s eta 0:00:01�███████████ | 36.2MB 13.7MB/s eta 0:00:01
[?25hInstalling collected packages: en-core-web-sm
  Running setup.py install for en-core-web-sm ... [?25ldone
[?25hSuccessfully installed en-core-web-sm-2.0.0

[93m    Linking successful[0m
    /opt/anaconda3/envs/nlp_course/lib/python3.7/site-packages/en_core_web_sm
    -->
    /opt/anaconda3/envs/nlp_course/lib/python3.7/site-packages/spacy/data/en

    You can now load the model via spacy.load('en')



In [2]:
# Load spaCy's small English model; the resulting `nlp` pipeline is reused by the cells below.
nlp = spacy.load('en_core_web_sm')

In [5]:
# Run the pipeline on a short sentence; the result is a Doc of annotated tokens.
doc = nlp("Musk is looking at buying U.S. Twitter")

In [8]:
# For every token show its text, the integer ID of its part-of-speech tag,
# and the human-readable tag name (the trailing-underscore attribute).
for token in doc:
    print(f"{token.text} {token.pos} {token.pos_}")

Musk 91 NOUN
is 99 VERB
looking 99 VERB
at 84 ADP
buying 99 VERB
U.S. 95 PROPN
Twitter 91 NOUN


In [9]:
# Print the syntactic dependency label assigned to each token.
for token in doc:
    print(token.dep_)

nsubj
aux
ROOT
prep
pcomp
compound
dobj


In [10]:
# List the pipeline's components as (name, component object) pairs.
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x7fc6ba6c8850>),
 ('parser', <spacy.pipeline.DependencyParser at 0x7fc6ba79a350>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x7fc6ba79a8f0>)]

In [11]:
# Same pipeline, component names only.
nlp.pipe_names

['tagger', 'parser', 'ner']

## Tokenization

In [12]:
# Second example sentence, kept in its own Doc for the indexing demos that follow.
doc2 = nlp("Startups are evaluating hiring more carefully.")

In [13]:
# Text, part-of-speech tag and dependency label for each token of doc2.
for token in doc2:
    print(f"{token.text} {token.pos_} {token.dep_}")

Startups NOUN nsubj
are VERB aux
evaluating VERB ROOT
hiring VERB xcomp
more ADV advmod
carefully ADV advmod
. PUNCT punct


In [14]:
# A Doc can be indexed by token position; index 0 is the first token.
doc2[0]

Startups

In [15]:
# Annotations are available on the individual token as attributes.
doc2[0].pos_

'NOUN'

In [16]:
# Two sentences taken from a TechCrunch article, used to demonstrate
# slicing (Span) and sentence segmentation.
article = nlp(
    "Following a preview in April, Microsoft this morning announced the general "
    "availability of virtual machines (VMs) on Azure featuring the Ampere Altra, "
    "a processor based on the Arm architecture. The first Azure VMs powered by "
    "Arm chips, Microsoft says that they’re accessible in 10 Azure regions today "
    "and can be included in Kubernetes clusters managed using Azure Kubernetes "
    "Service beginning on September 1."
)

In [17]:
# Slice by token position (not by character); the slice may cross a sentence boundary.
article[20:40]

Azure featuring the Ampere Altra, a processor based on the Arm architecture. The first Azure VMs powered by

In [20]:
# Slicing a Doc yields a Span ("a slice from a Doc object"), not another Doc.
# Reference: https://spacy.io/api/span
type(article[20:40])

spacy.tokens.span.Span

In [21]:
# For comparison: the full object returned by nlp(...) is a Doc.
type(article)

spacy.tokens.doc.Doc

In [22]:
# Iterate over the sentences the pipeline segmented from the article.
for sentence in article.sents:
    print(sentence)

Following a preview in April, Microsoft this morning announced the general availability of virtual machines (VMs) on Azure featuring the Ampere Altra, a processor based on the Arm architecture.
The first Azure VMs powered by Arm chips, Microsoft says that they’re accessible in 10 Azure regions today and can be included in Kubernetes clusters managed using Azure Kubernetes Service beginning on September 1.


In [26]:
# Sample text whose surrounding double quotes are part of the string itself.
mystring = "\"We're moving to L.A.!\""

In [27]:
# Show the raw text exactly as the tokenizer will receive it.
print(mystring)

"We're moving to L.A.!"


In [28]:
# NOTE: this rebinds `doc`, replacing the Doc built from the earlier example sentence.
doc = nlp(mystring)

In [29]:
# The quotes, the contraction 're and the final '!' come out as separate tokens,
# while the abbreviation 'L.A.' stays in one piece.
for token in doc:
    print(token.text)

"
We
're
moving
to
L.A.
!
"


In [31]:
# Text containing an e-mail address and a URL. Despite the name, `new_string`
# holds the processed Doc, not a plain str.
new_string = nlp("We're here to help! Our email is username@company.com! Or visit www.companywebsite.info!")

In [32]:
# The e-mail address and the URL are each kept as a single token.
for t in new_string:
    print(t)

We
're
here
to
help
!
Our
email
is
username@company.com
!
Or
visit
www.companywebsite.info
!


In [33]:
# Length of a Doc is its number of tokens.
len(new_string)

15

In [34]:
# Each Doc carries a reference to a Vocab object.
new_string.vocab

<spacy.vocab.Vocab at 0x7fc6b83c9440>

In [35]:
# Size of that vocabulary; far larger than the handful of tokens in this Doc.
len(new_string.vocab)

57852

In [36]:
# Same tokens on one line: `end='~'` writes a tilde after each token instead of a newline.
for t in new_string:
    print(t.text,end='~')

We~'re~here~to~help~!~Our~email~is~username@company.com~!~Or~visit~www.companywebsite.info~!~

In [37]:
# Named entities recognised in the Doc. In the recorded run this loop printed
# nothing, i.e. no entities were found in this text.
for entity in new_string.ents:
    print(entity)

In [40]:
# A second example that mentions people, companies and a money amount.
# As with `new_string`, the variable holds a Doc rather than a str.
car_string = nlp("Johnny wanted to buy Ford for $10,000 but his girlfriend Claire said he should buy Volvo.")

In [43]:
# For each entity: its text, its label, and spaCy's plain-English
# description of that label.
for entity in car_string.ents:
    print(f"{entity} {entity.label_} {spacy.explain(entity.label_)}")

Johnny PERSON People, including fictional
Ford ORG Companies, agencies, institutions, etc.
10,000 MONEY Monetary values, including unit
Claire PERSON People, including fictional
Volvo ORG Companies, agencies, institutions, etc.


In [45]:
# Noun chunks: noun phrases found in the Doc (e.g. "his girlfriend").
for chunk in car_string.noun_chunks:
    print(chunk)

Johnny
Ford
his girlfriend
Claire
he
Volvo


In [46]:
from spacy import displacy

In [47]:
# Sentence for the dependency visualisation below (rebinds `doc` once more).
doc = nlp("Samsung invested in a new U.S. factory in TX")

In [53]:
# Draw the dependency parse inline in the notebook; 'distance' is passed
# through to displaCy as a layout option.
displacy.render(
    doc,
    style='dep',
    jupyter=True,
    options={'distance': 90},
)

In [56]:
# Sentence for the entity visualisation below (rebinds `doc` again).
doc = nlp("Facebook is considering to expand into Metaverse in next several years.")

In [57]:
# Render the Doc with the entity style, inline in the notebook.
displacy.render(doc,style='ent',jupyter=True)

## Stemming

In [58]:
import nltk

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


In [61]:
from nltk.stem.porter import PorterStemmer

In [62]:
# Porter stemmer instance used for the first stemming comparison.
p_stemmer = PorterStemmer()

In [72]:
# Word forms used to compare the stemmers: variants of "run", plus a few
# inflected adjective/adverb forms.
words = [
    'run',
    'runner',
    'ran',
    'running',
    'hardened',
    'easier',
    'easily',
]

In [73]:
# Show each word next to the stem produced by the Porter stemmer.
for word in words:
    print(f"{word}---->{p_stemmer.stem(word)}")

run---->run
runner---->runner
ran---->ran
running---->run
hardened---->harden
easier---->easier
easily---->easili


In [69]:
from nltk.stem.snowball import SnowballStemmer

In [70]:
# Snowball stemmer configured for English, for comparison with the Porter stemmer.
s_stemmer = SnowballStemmer(language='english')

In [74]:
# Same word list, this time stemmed with the Snowball stemmer.
for word in words:
    print(f"{word} ---->{s_stemmer.stem(word)}")

run ---->run
runner ---->runner
ran ---->ran
running ---->run
hardened ---->harden
easier ---->easier
easily ---->easili


In [78]:
# A second word list: two families of related words ("gener-" and "thank-")
# to see which forms the stemmer collapses together.
word2 = [
    'generous',
    'generation',
    'generously',
    'thankful',
    'thank',
    'thankless',
]

In [79]:
# Snowball stems for the second word list.
for word in word2:
    print(f"{word} ---->{s_stemmer.stem(word)}")

generous ---->generous
generation ---->generat
generously ---->generous
thankful ---->thank
thank ---->thank
thankless ---->thankless


## Lemmatization

In [80]:
# Sentence that repeats several forms of "swim", used to inspect lemmas.
doc1 = nlp("I'm a swimmer in the swim race because I enjoy swimming and have been swimming for a long time.")

In [81]:
# Per token: text, part-of-speech tag, the lemma's integer hash, and the
# lemma as readable text.
for token in doc1:
    print(f"{token.text} \t {token.pos_} \t {token.lemma} {token.lemma_}")

I 	 PRON 	 561228191312463089 -PRON-
'm 	 VERB 	 10382539506755952630 be
a 	 DET 	 11901859001352538922 a
swimmer 	 NOUN 	 8984364056738817612 swimmer
in 	 ADP 	 3002984154512732771 in
the 	 DET 	 7425985699627899538 the
swim 	 NOUN 	 13054409096476681252 swim
race 	 NOUN 	 8048469955494714898 race
because 	 ADP 	 16950148841647037698 because
I 	 PRON 	 561228191312463089 -PRON-
enjoy 	 VERB 	 13716726989081948958 enjoy
swimming 	 VERB 	 13054409096476681252 swim
and 	 CCONJ 	 2283656566040971221 and
have 	 VERB 	 14692702688101715474 have
been 	 VERB 	 10382539506755952630 be
swimming 	 VERB 	 13054409096476681252 swim
for 	 ADP 	 16037325823156266367 for
a 	 DET 	 11901859001352538922 a
long 	 ADJ 	 12965068231793614765 long
time 	 NOUN 	 8885804376230376864 time
. 	 PUNCT 	 12646065887601541794 .


## Stop Words

In [83]:
# Dump the default stop-word set of the loaded language (large output).
print(nlp.Defaults.stop_words)

{'enough', 'not', 'sixty', 'wherein', 'even', 'across', 'fifty', 'hereby', 'make', 'due', 'once', 'others', 'our', 'because', 'whereupon', 'about', 'hence', 'see', 'call', 'these', 'throughout', 'twelve', 'we', 'when', 'had', 'using', 'whose', 'afterwards', 'been', 'herself', 'until', 'to', 'in', 'under', 'via', 'side', 'below', 'former', 'already', 'therefore', 'amongst', 'else', 'down', 'himself', 'other', 'unless', 'yet', 'what', 'anything', 'ca', 're', 'hundred', 'yourself', 'another', 'why', 'out', 'otherwise', 'whatever', 'least', 'now', 'seemed', 'themselves', 'three', 'thus', 'both', 'take', 'amount', 'each', 'eight', 'thereupon', 'since', 'nevertheless', 'elsewhere', 'alone', 'with', 'thru', 'more', 'within', 'anywhere', 'somehow', 'become', 'anyone', 'last', 'can', 'onto', 'really', 'six', 'should', 'too', 'name', 'never', 'noone', 'his', 'seems', 'if', 'whence', 'eleven', 'yours', 'cannot', 'whereby', 'which', 'also', 'someone', 'none', 'have', 'am', 'two', 'was', 'put', 'se

In [84]:
# Look up a word in the vocab and check its stop-word flag.
nlp.vocab['is'].is_stop

True

In [85]:
# A word that is not on the stop list reports False.
nlp.vocab['Awesome'].is_stop

False

In [86]:
# Register 'rly' as a custom stop word.
# Adding to the defaults set alone may not update a lexeme that already exists
# in the vocab, so the flag is also set on the vocab entry explicitly. This
# mirrors the two-step removal (set + flag) performed further down.
nlp.Defaults.stop_words.add('rly')
nlp.vocab['rly'].is_stop = True

In [87]:
# Confirm that 'rly' is now reported as a stop word.
nlp.vocab['rly'].is_stop

True

In [88]:
# Take 'rly' back out of the default stop-word set.
# discard() is used instead of remove() so that re-running this cell (when the
# word is already gone) does not raise KeyError.
nlp.Defaults.stop_words.discard('rly')

In [90]:
# Clear the stop-word flag on the vocab entry itself.
# NOTE(review): presumably the flag is stored on the lexeme independently of
# Defaults.stop_words, which is why it is reset separately — confirm for the
# spaCy version in use.
nlp.vocab['rly'].is_stop = False

In [92]:
# Confirm that 'rly' is no longer reported as a stop word.
nlp.vocab['rly'].is_stop

False