### Tonenization 

In [1]:
import spacy

In [2]:
nlp = spacy.load(name='en_core_web_sm')

In [3]:
# Define a string
text = ('We are looking to buy an profitable startup upto $1 billion valuation !')
print(text)

We are looking to buy an profitable startup upto $1 billion valuation !


In [4]:
# Create a doc object and explore tokens
doc = nlp(text)

In [5]:
# Print each tokens
for token in doc:
    print(token.text)

We
are
looking
to
buy
an
profitable
startup
upto
$
1
billion
valuation
!


In [6]:
len(doc)

14

In [8]:
# Indexing and slicing tokens
doc[0]

We

In [9]:
doc[0:5]

We are looking to buy

In [10]:
doc[-3]

billion

In [11]:
# Named entities
for ent in doc.ents:
    print(ent)

$1 billion


In [16]:
doc_1 = nlp('India is progressing at steady rate, its GDP expected to cross $1 Trillion by 2025')
for ent in doc_1.ents:
    print(ent)

India
$1 Trillion
2025


In [17]:
for ent in doc_1.ents:
    # Print the entity
    print(ent)
    # Print the entity label
    print(ent.label_)
    # Print entity label details
    print(str(spacy.explain(ent.label_)))
    print('\n')

India
GPE
Countries, cities, states


$1 Trillion
MONEY
Monetary values, including unit


2025
DATE
Absolute or relative dates or periods




In [18]:
# Noun chunks
for chunk in doc_1.noun_chunks:
    print(chunk.text)

India
steady rate
its GDP


#### Built-in Visualizers 

In [19]:
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")

doc = nlp("India is progressing at steady rate, its GDP expected to cross $1 Trillion by 2025")

In [21]:
# Distance between tokens
displacy.render(docs=doc,style='dep',jupyter=True,options={'distance':50})

In [22]:
# Visualizing the entity recognizer
displacy.render(docs=doc, style='ent', jupyter=True)

### Stemming and Lemmatization 

#### Porter Stemmer 

In [1]:
# Spacy does not have stemming, it supports lemmatization
import nltk
from nltk.stem.porter import PorterStemmer

In [2]:
#PorterStemmer
p_stemmer = PorterStemmer()

In [7]:
words = ['run','runner','running','ran','runs','done','helped','easily','fairly']

In [8]:
for word in words:
    print(word + ' ====> ' + p_stemmer.stem(word))

run ====> run
runner ====> runner
running ====> run
ran ====> ran
runs ====> run
done ====> done
helped ====> help
easily ====> easili
fairly ====> fairli


#### Snowball Stemmer 

In [9]:
from nltk.stem.snowball import SnowballStemmer
s_stemmer = SnowballStemmer(language='english')

In [10]:
words = ['run','runner','running','ran','runs','done','helped','easily','fairly']

In [11]:
for word in words:
    print(word + ' ====> ' + s_stemmer.stem(word))

run ====> run
runner ====> runner
running ====> run
ran ====> ran
runs ====> run
done ====> done
helped ====> help
easily ====> easili
fairly ====> fair


#### Lemmatization 

In [12]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [13]:
doc = nlp("We should come forward and try our best to save wild forest.")
for token in doc:
    print(token.text, '\t', token.pos_, '\t',token.lemma_)

We 	 PRON 	 -PRON-
should 	 VERB 	 should
come 	 VERB 	 come
forward 	 ADV 	 forward
and 	 CCONJ 	 and
try 	 VERB 	 try
our 	 DET 	 -PRON-
best 	 ADJ 	 good
to 	 PART 	 to
save 	 VERB 	 save
wild 	 ADJ 	 wild
forest 	 NOUN 	 forest
. 	 PUNCT 	 .


### Stop Words 

In [15]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [16]:
print(nlp.Defaults.stop_words)

{'across', 'quite', 'how', 'every', 'make', 'only', "n't", 'of', 'rather', 'bottom', 'own', '‘m', 'yet', 'himself', 'are', 'since', 'amongst', 'side', 'two', 'under', 'anyhow', 'could', 'and', 'during', 'an', 'amount', 'however', 'put', 'myself', 'these', 'nevertheless', 'ten', 'towards', 'him', 'very', 'through', 'say', 'call', 'otherwise', 'such', 're', 'except', 'me', 'by', 'you', 'yourself', 'her', 'too', 'might', 'their', 'herein', 'may', 'mine', 'latterly', 'from', 'several', 'if', 'until', 'behind', 'throughout', 'each', 'after', 'any', 'often', 'was', "'ve", 'being', 'become', 'hereby', 'made', 'its', '’ve', 'within', 'fifty', 'keep', 'toward', 'became', 'thereupon', 'with', 'seems', 'but', 'anyway', 'do', 'else', 'sometimes', 'first', 'so', 'alone', 'this', 'hereupon', 'nothing', 'she', '’s', 'thereby', 'been', 'twelve', 'done', 'n’t', 'further', 'everyone', 'or', '‘d', 'few', 'around', 'afterwards', 'some', 'wherever', 'move', "'ll", 'another', 'none', 'then', 'latter', 'abov

In [17]:
# Total number of stop words in spacy
len(nlp.Defaults.stop_words)

326

In [18]:
# To check whether it is a stopword or not
nlp.vocab['helping'].is_stop

False

In [19]:
nlp.vocab['always'].is_stop

True

In [20]:
nlp.vocab['where'].is_stop

True

#### Add stopword 

In [36]:
nlp.vocab['test'].is_stop

False

In [37]:
nlp.Defaults.stop_words.add('test')
# Set stop word
nlp.vocab['test'].is_stop = True

In [38]:
nlp.vocab['test'].is_stop

True

#### Remove stopword 

In [39]:
nlp.Defaults.stop_words.remove('test')
nlp.vocab['test'].is_stop = False

In [40]:
nlp.vocab['test'].is_stop

False

### Vocabulary and Matching

In [41]:
import spacy
nlp = spacy.load('en_core_web_sm')
# Import matcher library
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

#### Creating Patterns 

In [42]:
pattern_1 = [{'LOWER': 'hello'}, {'LOWER': 'world'}]
pattern_2 = [{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}]

In [43]:
# Add patterns to matcher object
matcher.add('Hello World', None, pattern_1, pattern_2)

In [44]:
# Create a document
doc = nlp(" 'Hello World' are the first two printed words for most of the programmers, printing 'Hello-World' is most common for beginners")

In [45]:
doc

 'Hello World' are the first two printed words for most of the programmers, printing 'Hello-World' is most common for beginners

In [46]:
# Finding matches
find_matches = matcher(doc)

In [47]:
print(find_matches)

[(8585552006568828647, 2, 4), (8585552006568828647, 19, 22)]


#### Setting Pattern options and quantifiers 

In [48]:
pattern_3 = [{'LOWER': 'hello'}, {'LOWER': 'world'}]
pattern_4 = [{'LOWER': 'hello'}, {'IS_PUNCT': True, 'OP':'*'}, {'LOWER': 'world'}]

In [49]:
matcher.add('Hello World', None, pattern_3, pattern_4)

In [50]:
doc_1 = nlp("You can print Hello World or hello world or Hello-World")

In [51]:
find_matches = matcher(doc_1)
print(find_matches)

[(8585552006568828647, 3, 5), (8585552006568828647, 6, 8), (8585552006568828647, 9, 12)]


#### Phrase Mapping 

In [52]:
# Import PhraseMatcher
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [53]:
phrase_list = ["Hello World", "Alien Speaks", "How is going"]
# Convert each phrase in document object
phrase_patterns = [nlp(text) for text in phrase_list]

In [54]:
phrase_patterns

[Hello World, Alien Speaks, How is going]

In [55]:
type(phrase_patterns[0]) # Spacy docs

spacy.tokens.doc.Doc

In [56]:
# Pass each phrase into spacyobject
matcher.add("TerminologyList", None, *phrase_patterns)

In [58]:
doc_2 = nlp("Hello World, this is Alien Speaks from enterprise"
          "How is going now, we are taking over")

In [59]:
find_matches = matcher(doc_2) 
print(find_matches)

[(3766102292120407359, 0, 2), (3766102292120407359, 5, 7)]


In [60]:
for match_id, start, end in find_matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc_3[start:end]
    print(match_id, string_id, start, end, span.text)

3766102292120407359 TerminologyList 0 2 Hello World
3766102292120407359 TerminologyList 5 7 Alien Speaks


### Parts of Speech Tagging 

In [61]:
import spacy
nlp = spacy.load(name='en_core_web_sm')

In [64]:
doc = nlp('We are looking to buy an profitable startup upto $1 billion valuation !')

In [65]:
print(doc.text)

We are looking to buy an profitable startup upto $1 billion valuation !


In [66]:
print(doc[0])

We


In [67]:
# Get the POS tag
print(doc[0].pos_)

PRON


In [68]:
print(doc[4].pos_)

VERB


In [69]:
# Fine-grained POS tag
print(doc[4].tag_)

VB


In [70]:
for token in doc:
    print(f'{token.text:{10}} {token.lemma_:{8}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')

We         -PRON-   PRON     PRP    pronoun, personal
are        be       AUX      VBP    verb, non-3rd person singular present
looking    look     VERB     VBG    verb, gerund or present participle
to         to       PART     TO     infinitival "to"
buy        buy      VERB     VB     verb, base form
an         an       DET      DT     determiner
profitable profitable ADJ      JJ     adjective
startup    startup  NOUN     NN     noun, singular or mass
upto       upto     VERB     VB     verb, base form
$          $        SYM      $      symbol, currency
1          1        NUM      CD     cardinal number
billion    billion  NUM      CD     cardinal number
valuation  valuation NOUN     NN     noun, singular or mass
!          !        PUNCT    .      punctuation mark, sentence closer


#### Counting POS tags 

In [71]:
POS_counts = doc.count_by(spacy.attrs.POS)

In [72]:
print(POS_counts)

{95: 1, 87: 1, 100: 3, 94: 1, 90: 1, 84: 1, 92: 2, 99: 1, 93: 2, 97: 1}


In [73]:
# Decode POS code
doc.vocab[95].text

'PRON'

In [74]:
doc.vocab[100].text

'VERB'

#### Visualizing POS  

In [75]:
from spacy import displacy

In [79]:
displacy.render(doc, style='dep', jupyter=True, options={'distance': 80})

### Named Entity Recognition 

In [83]:
doc = nlp('Google is looking to buy an profitable startup upto $1 billion valuation in India, based on Mumbai!')

In [84]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_, str(spacy.explain(ent.label_)))

Google 0 6 ORG Companies, agencies, institutions, etc.
$1 billion 52 62 MONEY Monetary values, including unit
India 76 81 GPE Countries, cities, states
Mumbai 92 98 GPE Countries, cities, states


#### Adding named entity to span 

In [91]:
doc = nlp('hello is looking to buy an profitable startup upto $1 billion valuation in India, based on Mumbai!')

In [92]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_, str(spacy.explain(ent.label_)))

$1 billion 51 61 MONEY Monetary values, including unit
India 75 80 GPE Countries, cities, states
Mumbai 91 97 GPE Countries, cities, states


In [93]:
from spacy.tokens import Span

In [94]:
# Get the Hash value of ORG
ORG = doc.vocab.strings['ORG']
print(ORG)

383


In [95]:
# Create a new span
new_ent = Span(doc, 0, 1, label=ORG)

In [96]:
# Add the entity to the existing ent object
doc.ents = list(doc.ents) + [new_ent]

In [97]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_, str(spacy.explain(ent.label_)))

hello 0 5 ORG Companies, agencies, institutions, etc.
$1 billion 51 61 MONEY Monetary values, including unit
India 75 80 GPE Countries, cities, states
Mumbai 91 97 GPE Countries, cities, states


#### Visualizing named entity 

In [98]:
displacy.render(docs=doc,style='ent',jupyter=True)

In [99]:
# Visualizing specific entities
options = {'ents': ['ORG', 'GPE']}
displacy.render(docs=doc,style='ent',jupyter=True,options=options)

### Sentence Segmentation 