### Tokenization

In [1]:
import spacy

In [2]:
# Load the en_core_web_sm model; calling nlp on a string returns a processed doc
nlp = spacy.load('en_core_web_sm')

In [3]:
# Sample sentence to tokenize; note the space before the final '!'
text = 'We are looking to buy an profitable startup upto $1 billion valuation !'
print(text)

We are looking to buy an profitable startup upto $1 billion valuation !


In [4]:
# Create a doc object by calling the loaded model on the text;
# its tokens are explored in the cells that follow
doc = nlp(text)

In [5]:
# Print each token on its own line; iterating over the doc yields its tokens in order
for token in doc:
    print(token.text)

We
are
looking
to
buy
an
profitable
startup
upto
$
1
billion
valuation
!


In [6]:
# Number of tokens in the doc ('$' and '!' each count as a token)
len(doc)

14

In [8]:
# Indexing and slicing tokens
# A doc can be indexed by position; index 0 is the first token
doc[0]

We

In [9]:
# Slice of the tokens at positions 0-4 (the stop index is exclusive)
doc[0:5]

We are looking to buy

In [10]:
# Negative indices count from the end: -3 is the third-to-last token
doc[-3]

billion

In [11]:
# Named entities: doc.ents holds the entities the model found in the text
for ent in doc.ents:
    print(ent)

$1 billion


In [16]:
# A second sentence that contains several entities; print each one found
doc_1 = nlp('India is progressing at steady rate, its GDP expected to cross $1 Trillion by 2025')
for ent in doc_1.ents:
    print(ent)

India
$1 Trillion
2025


In [17]:
for ent in doc_1.ents:
    # Entity text and its label code, one per line
    print(ent, ent.label_, sep='\n')
    # Human-readable description of the label (print() does the str() conversion itself)
    print(spacy.explain(ent.label_))
    # Two blank lines between entities: the '\n' plus print's own trailing newline
    print('\n')

India
GPE
Countries, cities, states


$1 Trillion
MONEY
Monetary values, including unit


2025
DATE
Absolute or relative dates or periods




In [18]:
# Noun chunks: print the text of each noun phrase spaCy identifies in the sentence
for chunk in doc_1.noun_chunks:
    print(chunk.text)

India
steady rate
its GDP


#### Built-in Visualizers 

In [19]:
import spacy
from spacy import displacy
# Same model name as loaded earlier in the notebook; the reload is redundant if the
# earlier cells were run (presumably kept so this section can run on its own)
nlp = spacy.load("en_core_web_sm")

# Sentence to visualize (same text as doc_1); note this rebinds the name `doc`
doc = nlp("India is progressing at steady rate, its GDP expected to cross $1 Trillion by 2025")

In [21]:
# Render the dependency parse inline in the notebook;
# the 'distance' option sets the spacing between tokens in the drawing
displacy.render(
    docs=doc,
    style='dep',
    jupyter=True,
    options={'distance': 50},
)

In [22]:
# Visualize the named entities recognized in the doc, rendered inline in the notebook
displacy.render(docs=doc, style='ent', jupyter=True)

### Stemming and Lemmatization 

#### Porter Stemmer 

In [1]:
# Spacy does not have stemming, it supports lemmatization
import nltk
from nltk.stem.porter import PorterStemmer

In [2]:
# Create a Porter stemmer instance
p_stemmer = PorterStemmer()

In [7]:
# Words to stem: several forms of "run" plus a few unrelated words
words = 'run runner running ran runs done helped easily fairly'.split()

In [8]:
# Show each word next to the stem the Porter stemmer produces for it
for word in words:
    print('{} ====> {}'.format(word, p_stemmer.stem(word)))

run ====> run
runner ====> runner
running ====> run
ran ====> ran
runs ====> run
done ====> done
helped ====> help
easily ====> easili
fairly ====> fairli


#### Snowball Stemmer 

In [9]:
from nltk.stem.snowball import SnowballStemmer
# Create a Snowball stemmer configured for English
s_stemmer = SnowballStemmer(language='english')

In [10]:
# Same word list as in the Porter section, so the two stemmers can be compared line by line
words = [
    'run', 'runner', 'running', 'ran', 'runs',
    'done', 'helped', 'easily', 'fairly',
]

In [11]:
# Show each word next to the stem the Snowball stemmer produces for it
for word in words:
    print('{} ====> {}'.format(word, s_stemmer.stem(word)))

run ====> run
runner ====> runner
running ====> run
ran ====> ran
runs ====> run
done ====> done
helped ====> help
easily ====> easili
fairly ====> fair


#### Lemmatization 

In [12]:
import spacy
# Load the en_core_web_sm model for the lemmatization example below
nlp = spacy.load('en_core_web_sm')

In [13]:
# Show each token with its part-of-speech tag and its lemma, separated by tabs
doc = nlp("We should come forward and try our best to save wild forest.")
for token in doc:
    print(token.text, token.pos_, token.lemma_, sep=' \t ')

We 	 PRON 	 -PRON-
should 	 VERB 	 should
come 	 VERB 	 come
forward 	 ADV 	 forward
and 	 CCONJ 	 and
try 	 VERB 	 try
our 	 DET 	 -PRON-
best 	 ADJ 	 good
to 	 PART 	 to
save 	 VERB 	 save
wild 	 ADJ 	 wild
forest 	 NOUN 	 forest
. 	 PUNCT 	 .


### Stop Words 

### Vocabulary and Matching

### Parts of Speech Tagging 

### Named Entity Recognition 

### Sentence Segmentation 