In [2]:
# Adapted from a Udemy lecture, with my own modifications made while following along.

In [3]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [4]:
doc = nlp(u"We ran fast in time to make to the bday party")

In [5]:
doc.text

'We ran fast in time to make to the bday party'

In [16]:
# One line per token: text, coarse part-of-speech and fine-grained tag.
# The format width of 2 is only a minimum, so longer values are printed in full.
for word in doc:
    print(f"{word.text:2}, {word.pos_:2}, {word.tag_:2}")

We, PRON, PRP
ran, VERB, VBD
fast, ADV, RB
in, ADP, IN
time, NOUN, NN
to, PART, TO
make, VERB, VB
to, ADP, IN
the, DET, DT
bday, NOUN, NN
party, NOUN, NN


In [26]:
doc = nlp(u"I read books on NLP.")

In [27]:
word = doc[1]
word.text

'read'

In [28]:
f"{word.text:{2}}, {word.pos_:{2}}, {word.tag_:{2}}, {spacy.explain(word.tag_)}"

'read, VERB, VBP, verb, non-3rd person singular present'

In [29]:
doc = nlp(u"I read a book on NLP.")

In [30]:
word = doc[1]
f"{word.text:{2}}, {word.pos_:{2}}, {word.tag_:{2}}, {spacy.explain(word.tag_)}"

'read, VERB, VBD, verb, past tense'

In [31]:
doc.vocab[83].text

'ADJ'

In [32]:
# Count tokens per coarse part-of-speech. The keys of the returned dict are
# integer attribute IDs; look a key up with doc.vocab[<id>].text to get its name.
doc.count_by(spacy.attrs.POS)

{96: 1, 99: 1, 84: 1, 89: 1, 91: 1, 94: 1, 95: 1}

In [33]:
doc.vocab[84].text

'ADP'

## NER (Named Entity Recognition)

In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
def show_ents(doc):
    """Print the named entities of ``doc``, one per line.

    Each line has the form ``<text> - <label> - <explanation>``, where the
    explanation is the result of ``spacy.explain`` for the label passed
    through ``str`` (so a missing explanation is shown as ``None``).
    When ``doc.ents`` is empty, ``No entities found.`` is printed instead.
    """
    entities = doc.ents
    if not entities:
        print("No entities found.")
        return
    for entity in entities:
        fields = (entity.text, entity.label_, str(spacy.explain(entity.label_)))
        print(' - '.join(fields))

In [4]:
doc = nlp(u'Hi how are you?')

In [5]:
show_ents(doc)

No entities found.


In [6]:
doc = nlp(u'Best seafood at Boston by the seashore after flying from NYC')

In [7]:
show_ents(doc)

Boston - GPE - Countries, cities, states
NYC - GPE - Countries, cities, states


In [8]:
doc = nlp(u"Can I buy the latest Tesla stocks on NYSE?")

In [9]:
show_ents(doc)

Tesla - PRODUCT - Objects, vehicles, foods, etc. (not services)
NYSE - ORG - Companies, agencies, institutions, etc.


In [10]:
doc = nlp(u"Tesla to build a U.K. factory for $6M")

In [11]:
show_ents(doc)

U.K. - GPE - Countries, cities, states
$6M - MONEY - Monetary values, including unit


In [12]:
from spacy.tokens import Span
ORG = doc.vocab.strings[u"ORG"]

In [13]:
ORG

381

In [14]:
# Span covering tokens 0 up to (not including) 1, i.e. just the first token
# ("Tesla"), labelled with the ORG id looked up above.
new_ent = Span(doc,0,1,label=ORG)

In [15]:
# Rebuild the entity list with the new span added after the existing entities.
doc.ents = [*doc.ents, new_ent]

In [16]:
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6M - MONEY - Monetary values, including unit


In [42]:
# Adjacent string literals are concatenated exactly as written, so the first
# literal needs a trailing space; without it the text reads "cleaner.This".
doc = nlp(u"Our company created a brand new vacuum cleaner. "
          u"This new vacuum-cleaner from Amazon rocks!")

In [43]:
show_ents(doc)

Amazon - ORG - Companies, agencies, institutions, etc.


In [44]:
from spacy.matcher import PhraseMatcher

In [45]:
matcher = PhraseMatcher(nlp.vocab)

In [46]:
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']

In [47]:
# Run every phrase through the pipeline so each pattern is a processed doc
# rather than a raw string.
phrase_patterns = [nlp(phrase) for phrase in phrase_list]

In [48]:
matcher.add('newproduct',None,*phrase_patterns)

In [49]:
found_matches = matcher(doc)

In [50]:
found_matches

[(2689272359382549672, 6, 8), (2689272359382549672, 11, 14)]

In [51]:
from spacy.tokens import Span

In [52]:
PROD = doc.vocab.strings[u"PRODUCT"]

In [55]:
# Each match is a (match_id, start, end) triple; only the token offsets are
# needed to build a PRODUCT-labelled span for it.
new_ents = [Span(doc, start, end, label=PROD) for _match_id, start, end in found_matches]

In [58]:
# Existing entities first, then the spans built from the phrase matches.
doc.ents = [*doc.ents, *new_ents]

In [59]:
show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum-cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
Amazon - ORG - Companies, agencies, institutions, etc.


In [63]:
doc = nlp(u"Originally I paid $30.71 for this Italian dish but now, there is a discount going for lunch specials on Monday through Friday.")

In [64]:
[ent for ent in doc.ents if ent.label_ == "MONEY"]

[30.71]

In [65]:
# All entities of the doc, as a list, for comparison with the MONEY-only list above.
list(doc.ents)

[30.71, Italian, Monday through Friday]

In [66]:
from spacy import displacy

In [67]:
displacy.render(doc,style='ent',jupyter=True)

In [79]:
# Options handed to displacy.render in the next cell: 'ents' lists entity
# labels and 'colors' maps an entity label to a colour name.
colors = dict(MONEY='lightgreen')
options = dict(ents=['MONEY', 'NORP', 'DATE'], colors=colors)

In [80]:
displacy.render(doc,style='ent',jupyter=True,options=options)

## Sentence Segmentation

In [81]:
from spacy.pipeline import SentenceSegmenter

In [83]:
# article from TechCrunch
doc = nlp(u"Just over two years after its launch, Facebook is shutting down the Facebook Gaming app on October 28, 2022. Now, when you open the app, you’ll see a banner stating that the app will no longer be available on iOS and Android after that date. The app also won’t be available on the Google Play Store or the Apple App Store.")

In [84]:
def split_on_newlines(doc):
    """Yield consecutive slices of ``doc``, cut after newline tokens.

    A token whose text starts with ``'\n'`` marks a pending break: the slice
    running from the current start up to (not including) the *next* token is
    yielded, so the newline token stays at the end of the slice it closes.
    The token that triggers the yield is not itself checked for a newline.
    Whatever remains after the loop is always yielded as the final slice.
    """
    segment_start = 0
    break_pending = False

    for token in doc:
        if not break_pending:
            # Only look for a newline while no break is waiting to be emitted.
            break_pending = token.text.startswith('\n')
            continue
        yield doc[segment_start:token.i]
        segment_start = token.i
        break_pending = False

    yield doc[segment_start:]

In [85]:
sbd = SentenceSegmenter(nlp.vocab,strategy=split_on_newlines)

In [87]:
nlp.add_pipe(sbd)

In [88]:
# NOTE: `doc` was created before nlp.add_pipe(sbd) ran and its text contains
# no "\n", so the sentences printed here presumably come from the pipeline's
# default segmentation rather than from split_on_newlines.
for sentence in doc.sents:
    print(sentence)

Just over two years after its launch, Facebook is shutting down the Facebook Gaming app on October 28, 2022.
Now, when you open the app, you’ll see a banner stating that the app will no longer be available on iOS and Android after that date.
The app also won’t be available on the Google Play Store or the Apple App Store.


In [89]:
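# Same text, now with an explicit \n inserted after "when you open the app," so the newline-based segmenter has something to split on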
doc = nlp(u"Just over two years after its launch, Facebook is shutting down the Facebook Gaming app on October 28, 2022. Now, when you open the app, \n you’ll see a banner stating that the app will no longer be available on iOS and Android after that date. The app also won’t be available on the Google Play Store or the Apple App Store.")

In [90]:
for sentence in doc.sents:
    print(sentence)

Just over two years after its launch, Facebook is shutting down the Facebook Gaming app on October 28, 2022. Now, when you open the app, 
 
you’ll see a banner stating that the app will no longer be available on iOS and Android after that date. The app also won’t be available on the Google Play Store or the Apple App Store.
