**1.Tokenization of text**

In [11]:
import nltk

nltk.download('punkt_tab')
nltk.download('punkt')

corpus = """This is an exciting time to be working in speech and language processing.
Historically distinct fields (natural language processing, speech recognition,
computational linguistics, computational psycholinguistics) have begun to merge."""
tokens = nltk.word_tokenize(corpus)

print("Original corpus:\n",corpus,"\n")
print("Tokenized words : \n", tokens)


Original corpus:
 This is an exciting time to be working in speech and language processing.
Historically distinct fields (natural language processing, speech recognition,
computational linguistics, computational psycholinguistics) have begun to merge. 

Tokenized words : 
 ['This', 'is', 'an', 'exciting', 'time', 'to', 'be', 'working', 'in', 'speech', 'and', 'language', 'processing', '.', 'Historically', 'distinct', 'fields', '(', 'natural', 'language', 'processing', ',', 'speech', 'recognition', ',', 'computational', 'linguistics', ',', 'computational', 'psycholinguistics', ')', 'have', 'begun', 'to', 'merge', '.']


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**2.Stop word Removal**

In [15]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
corpus = """This is an exciting time to be working in speech and language processing.
Historically distinct fields (natural language processing, speech recognition, computational
linguistics, computational psycholinguistics) have begun to merge."""
tokens = nltk.word_tokenize(corpus)
print("Original corpus :\n",corpus,"\n")
print("Tokenized words : \n", tokens)
stop_words = set(stopwords.words("english"))
rel_words = [rel for rel in tokens if not rel in stop_words]
print("\nTokens without stop words :\n",rel_words)


Original corpus :
 This is an exciting time to be working in speech and language processing.
Historically distinct fields (natural language processing, speech recognition, computational
linguistics, computational psycholinguistics) have begun to merge. 

Tokenized words : 
 ['This', 'is', 'an', 'exciting', 'time', 'to', 'be', 'working', 'in', 'speech', 'and', 'language', 'processing', '.', 'Historically', 'distinct', 'fields', '(', 'natural', 'language', 'processing', ',', 'speech', 'recognition', ',', 'computational', 'linguistics', ',', 'computational', 'psycholinguistics', ')', 'have', 'begun', 'to', 'merge', '.']

Tokens without stop words :
 ['This', 'exciting', 'time', 'working', 'speech', 'language', 'processing', '.', 'Historically', 'distinct', 'fields', '(', 'natural', 'language', 'processing', ',', 'speech', 'recognition', ',', 'computational', 'linguistics', ',', 'computational', 'psycholinguistics', ')', 'begun', 'merge', '.']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**3.Stemming of text**

In [16]:
import nltk
from nltk.stem.porter import PorterStemmer
corpus = """This is an exciting time to be working in speech and language processing.
Historically distinct fields (natural language processing, speech recognition, computational
linguistics, computational psycholinguistics) have begun to merge."""
tokens = nltk.word_tokenize(corpus)
print("Original corpus:\n",corpus,"\n")
print("Tokenized words : \n", tokens)
porter = PorterStemmer()
stem_words = [porter.stem(stem) for stem in tokens]
print("\nStemmed words :\n",stem_words)


Original corpus:
 This is an exciting time to be working in speech and language processing.
Historically distinct fields (natural language processing, speech recognition, computational
linguistics, computational psycholinguistics) have begun to merge. 

Tokenized words : 
 ['This', 'is', 'an', 'exciting', 'time', 'to', 'be', 'working', 'in', 'speech', 'and', 'language', 'processing', '.', 'Historically', 'distinct', 'fields', '(', 'natural', 'language', 'processing', ',', 'speech', 'recognition', ',', 'computational', 'linguistics', ',', 'computational', 'psycholinguistics', ')', 'have', 'begun', 'to', 'merge', '.']

Stemmed words :
 ['thi', 'is', 'an', 'excit', 'time', 'to', 'be', 'work', 'in', 'speech', 'and', 'languag', 'process', '.', 'histor', 'distinct', 'field', '(', 'natur', 'languag', 'process', ',', 'speech', 'recognit', ',', 'comput', 'linguist', ',', 'comput', 'psycholinguist', ')', 'have', 'begun', 'to', 'merg', '.']


**4.Lemmatization**

In [19]:
import nltk
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
corpus = "studies studying cries cry"
tokens = nltk.word_tokenize(corpus)
print("Original corpus:\n",corpus,"\n")
print("Tokenized words : \n", tokens)
lemma = WordNetLemmatizer()
lem_words = [lemma.lemmatize(lem) for lem in tokens]
print("\nLemmatized words :\n",lem_words)


Original corpus:
 studies studying cries cry 

Tokenized words : 
 ['studies', 'studying', 'cries', 'cry']

Lemmatized words :
 ['study', 'studying', 'cry', 'cry']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**5.N-gram model**

In [23]:
import nltk
nltk.download('reuters')
nltk.download('punkt')
from collections import defaultdict
from nltk.corpus import reuters
from nltk.util import trigrams

model = defaultdict(lambda: defaultdict(lambda: 0))
for sentence in reuters.sents():
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        model[(w1, w2)][w3] += 1
for w1_w2 in model:
    total_count = float(sum(model[w1_w2].values()))
    for w3 in model[w1_w2]:
        model[w1_w2][w3] /= total_count
print(dict(model[('the','bank')]))

[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


{'was': 0.033707865168539325, ',': 0.018726591760299626, "'": 0.29213483146067415, 'holding': 0.018726591760299626, '.': 0.026217228464419477, 'credit': 0.003745318352059925, 'said': 0.15355805243445692, 'has': 0.026217228464419477, 'bought': 0.00749063670411985, 'reported': 0.003745318352059925, 'would': 0.0299625468164794, 'regards': 0.00749063670411985, 'operating': 0.003745318352059925, 'as': 0.003745318352059925, 'will': 0.018726591760299626, 'is': 0.011235955056179775, 'loaned': 0.003745318352059925, 'had': 0.0299625468164794, 'to': 0.0449438202247191, 'lending': 0.011235955056179775, 'by': 0.00749063670411985, '"': 0.003745318352059925, 'of': 0.003745318352059925, 'when': 0.003745318352059925, 'were': 0.003745318352059925, 'dropped': 0.003745318352059925, 'must': 0.003745318352059925, 'research': 0.003745318352059925, 'group': 0.00749063670411985, 'card': 0.003745318352059925, 'did': 0.003745318352059925, 'still': 0.011235955056179775, 'and': 0.00749063670411985, 'in': 0.0037453

**6.POS tagging.**

In [30]:
import nltk
# Ensure the specific English tagger resource is downloaded
nltk.download('averaged_perceptron_tagger_eng')
from nltk import pos_tag

text = """This is an exciting time to be working in speech and language processing. Historically
distinct fields (natural language processing, speech recognition, computational linguistics,
computational psycholinguistics) have begun to merge."""

tokens = text.split(); print("Tokenized words :\n",tokens)
tagged_words = pos_tag(tokens)
print("\nPOS tagged words : \n",tagged_words)
# Removed the 'error' at the end as it was causing a SyntaxError

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


Tokenized words :
 ['This', 'is', 'an', 'exciting', 'time', 'to', 'be', 'working', 'in', 'speech', 'and', 'language', 'processing.', 'Historically', 'distinct', 'fields', '(natural', 'language', 'processing,', 'speech', 'recognition,', 'computational', 'linguistics,', 'computational', 'psycholinguistics)', 'have', 'begun', 'to', 'merge.']

POS tagged words : 
 [('This', 'DT'), ('is', 'VBZ'), ('an', 'DT'), ('exciting', 'JJ'), ('time', 'NN'), ('to', 'TO'), ('be', 'VB'), ('working', 'VBG'), ('in', 'IN'), ('speech', 'NN'), ('and', 'CC'), ('language', 'NN'), ('processing.', 'NN'), ('Historically', 'NNP'), ('distinct', 'JJ'), ('fields', 'NNS'), ('(natural', 'JJ'), ('language', 'NN'), ('processing,', 'NN'), ('speech', 'NN'), ('recognition,', 'VBP'), ('computational', 'JJ'), ('linguistics,', 'JJ'), ('computational', 'JJ'), ('psycholinguistics)', 'NN'), ('have', 'VBP'), ('begun', 'VBN'), ('to', 'TO'), ('merge.', 'VB')]


**7.Chunking**

In [32]:
from nltk import pos_tag
from nltk import RegexpParser
corpus =""""This is an exciting time to be working in speech and language processing. Historically
distinct fields (natural language processing, speech recognition, computational linguistics,
computational psycholinguistics) have begun to merge."""
tokens = corpus.split()
print("Original corpus :\n",corpus)
print("\nSplit Text :\n",tokens)
import nltk
nltk.download('averaged_perceptron_tagger')
tokens_tag = pos_tag(tokens)
print("\nPOS tagging :\n",tokens_tag)
patterns= """mychunk:{<NN.?>*<VBD.?>*<JJ.?>*<CC>?}"""
chunker = RegexpParser(patterns)
print("\nAfter Regex :\n",chunker)
output = chunker.parse(tokens_tag)
print("\nChunked Text \n",output)


Original corpus :
 "This is an exciting time to be working in speech and language processing. Historically
distinct fields (natural language processing, speech recognition, computational linguistics,
computational psycholinguistics) have begun to merge.

Split Text :
 ['"This', 'is', 'an', 'exciting', 'time', 'to', 'be', 'working', 'in', 'speech', 'and', 'language', 'processing.', 'Historically', 'distinct', 'fields', '(natural', 'language', 'processing,', 'speech', 'recognition,', 'computational', 'linguistics,', 'computational', 'psycholinguistics)', 'have', 'begun', 'to', 'merge.']

POS tagging :
 [('"This', 'NN'), ('is', 'VBZ'), ('an', 'DT'), ('exciting', 'JJ'), ('time', 'NN'), ('to', 'TO'), ('be', 'VB'), ('working', 'VBG'), ('in', 'IN'), ('speech', 'NN'), ('and', 'CC'), ('language', 'NN'), ('processing.', 'NN'), ('Historically', 'NNP'), ('distinct', 'JJ'), ('fields', 'NNS'), ('(natural', 'JJ'), ('language', 'NN'), ('processing,', 'NN'), ('speech', 'NN'), ('recognition,', 'VBP'), (

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


**8.Named Entity Recognition**

In [34]:
import nltk
import spacy

nlp = spacy.load('en_core_web_sm')

text = """Sundar Pichai is the CEO of Google. Its headquarter is in Mountain View.Yuvraj Singh revealed
the key advice Sachin Tendulkar gave during the 2011 World Cup which helped the team immensely.
Tendulkar was India's highest run-getter during the"""
doc = nlp(text)

for ent in doc.ents:
    print(ent.text, ent.label_)


Sundar Pichai PERSON
Google ORG
Mountain View GPE
Yuvraj Singh PERSON
Sachin Tendulkar PERSON
the 2011 World Cup EVENT
Tendulkar PERSON
India GPE


**9.Bag of Words**

In [36]:
import nltk
import re
import numpy as np
import heapq
nltk.download('punkt')

text="""Beans. I was trying to explain to somebody as we were flying in, that’s corn. That’s beans. And
they were very impressed at my agricultural knowledge. Please give it up for Amaury once again for that
outstanding introduction. I have a bunch of good friends here today, including somebody who I served
with, who is one of the finest senators in the country, and we’re lucky to have him, your Senator, Dick
Durbin is here. I also noticed, by the way, former Governor Edgar here, who I haven’t seen in a long time,
and somehow he has not aged and I have. And it’s great to see you, Governor. I want to thank President
Killeen and everybody at the U of I System for making it possible for me to be here today. And I am
deeply honored at the Paul Douglas Award that is being given to me. He is somebody who set the path for
so much outstanding public service here in Illinois. Now, I want to start by addressing the elephant in the
room. I know people are still wondering why I didn’t speak at the commencement."""

dataset = nltk.sent_tokenize(text)

for i in range(len(dataset)):
    dataset[i] = dataset[i].lower()
    dataset[i] = re.sub(r'\W', ' ', dataset[i])
    dataset[i] = re.sub(r'\s+', ' ', dataset[i])

word2count = {}

for data in dataset:
    words = nltk.word_tokenize(data)
    for word in words:
        if word not in word2count.keys():
            word2count[word] = 1
        else:
            word2count[word] += 1

freq_words = heapq.nlargest(100, word2count, key=word2count.get)
print(freq_words)


['i', 'the', 'to', 'and', 'in', 'for', 'here', 'that', 'at', 'who', 'is', 'somebody', 's', 'it', 'have', 'of', 'beans', 'we', 'were', 'outstanding', 'a', 'today', 'by', 'governor', 't', 'he', 'want', 'me', 'was', 'trying', 'explain', 'as', 'flying', 'corn', 'they', 'very', 'impressed', 'my', 'agricultural', 'knowledge', 'please', 'give', 'up', 'amaury', 'once', 'again', 'introduction', 'bunch', 'good', 'friends', 'including', 'served', 'with', 'one', 'finest', 'senators', 'country', 're', 'lucky', 'him', 'your', 'senator', 'dick', 'durbin', 'also', 'noticed', 'way', 'former', 'edgar', 'haven', 'seen', 'long', 'time', 'somehow', 'has', 'not', 'aged', 'great', 'see', 'you', 'thank', 'president', 'killeen', 'everybody', 'u', 'system', 'making', 'possible', 'be', 'am', 'deeply', 'honored', 'paul', 'douglas', 'award', 'being', 'given', 'set', 'path', 'so']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**10.N-Gram Models.**

In [39]:
import nltk
nltk.download('punkt')
word_data = "The best performance can bring in sky high success."
nltk_tokens = nltk.word_tokenize(word_data)
print(list(nltk.bigrams(nltk_tokens)))
print(list(nltk.trigrams(nltk_tokens)))


[('The', 'best'), ('best', 'performance'), ('performance', 'can'), ('can', 'bring'), ('bring', 'in'), ('in', 'sky'), ('sky', 'high'), ('high', 'success'), ('success', '.')]
[('The', 'best', 'performance'), ('best', 'performance', 'can'), ('performance', 'can', 'bring'), ('can', 'bring', 'in'), ('bring', 'in', 'sky'), ('in', 'sky', 'high'), ('sky', 'high', 'success'), ('high', 'success', '.')]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
