In [1]:
!pip install spacy nltk



In [2]:
import spacy

# Trained small English pipeline (unlike a blank pipeline, it sets sentence boundaries)
nlp = spacy.load("en_core_web_sm")
doc = nlp("Dr. strange loves apples. Dr. treat mr. brown")

# Show every sentence, then each of its tokens on a separate line
for sentence in doc.sents:
    print(sentence, *sentence, sep="\n")

Dr. strange loves apples.
Dr.
strange
loves
apples
.
Dr. treat mr. brown
Dr.
treat
mr
.
brown


In [3]:
import nltk
# Fetch the 'punkt' data package into the local nltk_data directory
# (reported as already up to date when present); presumably required by the
# NLTK tokenizers used in the following cells.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lior9\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
from nltk.tokenize import sent_tokenize

# Sentence splitting with NLTK: the periods in "Dr." and "mr." are not treated
# as sentence ends for this input, so two sentences are returned.
sent_tokenize("Dr. strange loves apples. Dr. treat mr. brown")

['Dr. strange loves apples.', 'Dr. treat mr. brown']

In [5]:
from nltk.tokenize import word_tokenize
# Word-level tokenization with NLTK: "Dr." and "mr." keep their trailing period,
# while the sentence-final period becomes its own token.
word_tokenize("Dr. strange loves apples. Dr. treat mr. brown")

['Dr.', 'strange', 'loves', 'apples', '.', 'Dr.', 'treat', 'mr.', 'brown']

In [6]:
# Blank English pipeline: tokenization only, no trained components
nlp = spacy.blank("en")
doc = nlp("john gave two $ to Lior")
# A Doc is indexable; position 0 is the first token
doc[0]

john

In [7]:
# One token per line
print(*doc, sep="\n")

john
gave
two
$
to
Lior


In [8]:
# like_num flags number-like tokens; doc[2] is the spelled-out "two"
doc[2].like_num

True

In [9]:
# is_currency flags currency symbols; doc[3] is "$"
doc[3].is_currency

True

In [10]:
# Read the roster file into a list of lines (each line keeps its trailing newline)
with open("students.txt") as students_file:
    txt = list(students_file)
txt

['Dayton high school, 8th grade students information\n',
 '\n',
 'Name\tbirth day   \temail\n',
 '-----\t------------\t------\n',
 'Virat   5 June, 1882    virat@kohli.com\n',
 'Maria\t12 April, 2001  maria@sharapova.com\n',
 'Serena  24 June, 1998   serena@williams.com \n',
 'Joe      1 May, 1997    joe@root.com\n',
 '\n',
 '\n',
 '\n']

In [11]:
# Concatenate the lines into one string, separated by a single space
# (each line still carries its own trailing newline from readlines()).
joined_txt = ' '.join(txt)
joined_txt



In [12]:
tokens = nlp(joined_txt)
# Keep only the tokens that spaCy flags as e-mail addresses
eml_lst = [token for token in tokens if token.like_email]
eml_lst

[virat@kohli.com, maria@sharapova.com, serena@williams.com, joe@root.com]

In [13]:
from spacy.symbols import ORTH
# Baseline: with the default tokenizer rules "cheeseburger" stays a single token
sen = nlp("give me a double cheeseburger")
tokens = [token.text for token in sen]
tokens

['give', 'me', 'a', 'double', 'cheeseburger']

In [14]:
# Register a tokenizer exception that splits "cheeseburger" into two tokens,
# then tokenize the same sentence again to see the effect.
split_rule = [{ORTH: "cheese"}, {ORTH: "burger"}]
nlp.tokenizer.add_special_case("cheeseburger", split_rule)
sen = nlp("give me a double cheeseburger")
tokens = [tok.text for tok in sen]
tokens

['give', 'me', 'a', 'double', 'cheese', 'burger']

In [16]:
doc = nlp("Dr. strange loves apples. Dr. treat mr. brown")

# The blank pipeline sets no sentence boundaries, so iterating doc.sents raises
# ValueError (E030). Catch and print it so the error is still demonstrated
# without aborting a "Restart & Run All".
try:
    for sen in doc.sents:
        print(sen)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: [E030] Sentence boundaries unset. You can add the 'sentencizer' component to the pipeline with: `nlp.add_pipe('sentencizer')`. Alternatively, add the dependency parser or sentence recognizer, or set sentence boundaries by setting `doc[i].is_sent_start`.

In [17]:
# Inspect the pipeline components (empty for the blank pipeline)
nlp.pipeline

[]

In [18]:
# Add the rule-based sentencizer. add_pipe raises if a component with this name
# already exists, so reuse the existing one when the cell is re-run.
sentencizer = (
    nlp.get_pipe("sentencizer")
    if nlp.has_pipe("sentencizer")
    else nlp.add_pipe("sentencizer")
)
sentencizer

<spacy.pipeline.sentencizer.Sentencizer at 0x17aa421edc8>

In [19]:
# Confirm the sentencizer is now part of the pipeline
nlp.pipeline

[('sentencizer', <spacy.pipeline.sentencizer.Sentencizer at 0x17aa421edc8>)]

In [20]:
doc = nlp("Dr. strange loves apples. Dr. treat mr. brown")
# One sentence per line; note the sentencizer also splits after "mr."
print(*doc.sents, sep="\n")

Dr. strange loves apples.
Dr. treat mr.
brown


Exercise
(1) Think Stats is a free book for studying statistics (https://greenteapress.com/thinkstats2/thinkstats2.pdf).

This book references many websites from which you can download free datasets. You are an NLP engineer working for a company, and you want to collect all the dataset websites from this book. To keep the exercise simple, you are given a paragraph from the book, and you want to grab all the URLs from that paragraph using spaCy.

In [21]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

exe_doc1 = nlp(text)
# Print every token flagged as URL-like. Tokens are printed verbatim, so a
# sentence-final "." attached to a URL is kept, and a URL broken across two
# lines in the text is only matched up to the line break.
for url_token in (tok for tok in exe_doc1 if tok.like_url):
    print(url_token)

http://www.data.gov/
http://www.science
http://data.gov.uk/.
http://www3.norc.org/gss+website/
http://www.europeansocialsurvey.org/.


(2) Extract all money transactions from the sentence below, along with their currency. The output should be:

two $

500 €

In [22]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"

exe_doc2 = nlp(transactions)
# Walk over adjacent (token, next token) pairs. Pairing via zip stops at the
# second-to-last token, so a number-like final token can no longer trigger an
# IndexError the way exe_doc2[i + 1] did.
for amount, symbol in zip(exe_doc2, exe_doc2[1:]):
    if amount.like_num and symbol.is_currency:
        print(amount, symbol)

two $
500 €


In [23]:
import spacy

# Start over with a blank English pipeline (tokenizer only)
nlp = spacy.blank("en")
doc = nlp("Captain america ate 100$ of sushi. Then he said I can do this all day.")

# One token per line; "100$" is split into "100" and "$"
print(*doc, sep="\n")

Captain
america
ate
100
$
of
sushi
.
Then
he
said
I
can
do
this
all
day
.


In [24]:
# A blank pipeline has no named components
nlp.pipe_names

[]

In [25]:
# Load the pretrained small English pipeline and list its components
nlp = spacy.load("en_core_web_sm")
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [26]:
doc = nlp("Captain america ate 100$ of sushi. Then he said I can do this all day.")

# Columns: token | part of speech (from the tagger) | lemma (from the lemmatizer)
for tok in doc:
    pos_description = spacy.explain(tok.pos_)
    print(tok, pos_description, tok.lemma_, sep="  |  ")

Captain  |  proper noun  |  Captain
america  |  proper noun  |  america
ate  |  verb  |  eat
100  |  numeral  |  100
$  |  numeral  |  $
of  |  adposition  |  of
sushi  |  noun  |  sushi
.  |  punctuation  |  .
Then  |  adverb  |  then
he  |  pronoun  |  he
said  |  verb  |  say
I  |  pronoun  |  I
can  |  auxiliary  |  can
do  |  verb  |  do
this  |  pronoun  |  this
all  |  determiner  |  all
day  |  noun  |  day
.  |  punctuation  |  .


In [27]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

# Named entities produced by the "ner" component, each with its label
for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")

Tesla Inc ORG
$45 billion MONEY


In [28]:
from spacy import displacy

# Visualize the entities of `doc` using displaCy's "ent" style
displacy.render(doc, style="ent")

In [29]:
# Trained pipeline used only as the source of the NER component
source_nlp = spacy.load("en_core_web_sm")

# Build a blank pipeline and copy just the "ner" component into it
nlp = spacy.blank("en")
nlp.add_pipe("ner", source=source_nlp)
nlp.pipe_names

['ner']

In [30]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")
# Entities found by the pipeline that contains only the sourced "ner" component
for entity in doc.ents:
    print(" ".join((entity.text, entity.label_)))

Tesla Inc ORG
$45 billion MONEY


In [31]:
from nltk.stem import PorterStemmer
# Stemmer instance reused by the stemming demo below
stemmer = PorterStemmer()

In [32]:
words = ["eating", "eats", "eat", "ate", "adjustable", "rafting", "ability", "meeting"]

# Show each word next to its Porter stem
for word in words:
    stem = stemmer.stem(word)
    print(f"{word} | {stem}")

eating | eat
eats | eat
eat | eat
ate | ate
adjustable | adjust
rafting | raft
ability | abil
meeting | meet


In [33]:
# Reload the full trained pipeline (the previous `nlp` only contained "ner")
nlp = spacy.load("en_core_web_sm")

# Same word list as the stemming demo, plus "better", to contrast stems with lemmas.
# (A previous, immediately overwritten `doc = nlp(...)` assignment was dropped:
# its result was never used.)
doc = nlp("eating eats eat ate adjustable rafting ability meeting better")
for token in doc:
    print(token, " | ", token.lemma_)

eating  |  eat
eats  |  eat
eat  |  eat
ate  |  eat
adjustable  |  adjustable
rafting  |  raft
ability  |  ability
meeting  |  meeting
better  |  well


In [35]:
# Customize lemmas through the pipeline's attribute ruler: map the slang
# tokens "Bro" and "Brah" to the lemma "Brother".
ar = nlp.get_pipe('attribute_ruler')
slang_patterns = [[{"TEXT": "Bro"}], [{"TEXT": "Brah"}]]
ar.add(slang_patterns, {"LEMMA": "Brother"})   # extension

doc = nlp("Bro, you wanna go? Brah, don't say no! I am exhausted")
for tok in doc:
    print(f"{tok.text} | {tok.lemma_}")

Bro | Brother
, | ,
you | you
wanna | wanna
go | go
? | ?
Brah | Brother
, | ,
do | do
n't | not
say | say
no | no
! | !
I | I
am | be
exhausted | exhaust
