In [1]:
import os 
import nltk

## Getting the required Corpus

In [2]:
# Fetch every NLTK resource the cells below read, so the notebook survives
# Restart Kernel -> Run All on a machine whose nltk_data folder is empty.
nltk.download("gutenberg")  # corpus behind nltk.corpus.gutenberg
# NOTE(review): newer NLTK releases may look for "punkt_tab" instead of
# "punkt" when tokenizing -- confirm against the installed version.
nltk.download("punkt")  # tokenizer models needed for word tokenization

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hardi\anaconda3\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# List the file ids of the texts available in the Gutenberg corpus.
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [4]:
# Load Shakespeare's Hamlet from the Gutenberg corpus as a sequence of word tokens.
hamlet = nltk.corpus.gutenberg.words("shakespeare-hamlet.txt")

In [5]:
# Preview the tokens; the displayed repr is abbreviated after the first few items.
hamlet

['[', 'The', 'Tragedie', 'of', 'Hamlet', 'by', ...]

In [6]:
# Sample text used by the rest of the notebook: three paragraphs separated by
# blank lines (the blank lines matter for the paragraph tokenizer later on).
DS = """Data science is the domain of study that deals with vast volumes of data using modern tools and techniques to find unseen patterns, derive meaningful information, and make business decisions. Data science uses complex machine learning algorithms to build predictive models.

The data used for analysis can come from many different sources and presented in various formats.

Now that you know what data science is, let’s see why data science is essential to today’s IT landscape."""



## Tokenization

In [7]:
from nltk.tokenize import word_tokenize

In [8]:
# Split the sample text into word and punctuation tokens.
DS_tokens = word_tokenize(DS)

In [9]:
from nltk.probability import FreqDist
# Start from an empty frequency distribution; keys not seen yet count as 0.
fdist = FreqDist()


In [10]:
# Build the case-insensitive token frequency table in a single assignment.
# Rebinding `fdist` here (instead of incrementing a table created in an
# earlier cell) keeps this cell idempotent: re-running it no longer doubles
# every count.
fdist = FreqDist(word.lower() for word in DS_tokens)

fdist

FreqDist({'data': 6, 'science': 4, '.': 4, 'is': 3, 'and': 3, 'to': 3, ',': 3, 'the': 2, 'of': 2, 'that': 2, ...})

In [11]:
# blankline_tokenize splits a text into paragraphs wherever a blank line occurs.

from nltk.tokenize import blankline_tokenize

# Split the sample text into paragraphs and show the first one.
blnk = blankline_tokenize(DS)
blnk[0]

'Data science is the domain of study that deals with vast volumes of data using modern tools and techniques to find unseen patterns, derive meaningful information, and make business decisions. Data science uses complex machine learning algorithms to build predictive models.'

#### Tokenizing into sequences of n words

Bigram, trigram, and general n-gram tokenization

In [12]:
from nltk.util import bigrams, trigrams, ngrams

In [13]:
# Pair every token with its successor and materialise the result as a list.
# Uses the `bigrams` name imported from nltk.util in the previous cell.
bigram_toks = list(bigrams(DS_tokens))

In [14]:
# Display every bigram.
# NOTE(review): for longer texts consider showing a slice (e.g. bigram_toks[:10]) to keep the output short.
bigram_toks

[('Data', 'science'),
 ('science', 'is'),
 ('is', 'the'),
 ('the', 'domain'),
 ('domain', 'of'),
 ('of', 'study'),
 ('study', 'that'),
 ('that', 'deals'),
 ('deals', 'with'),
 ('with', 'vast'),
 ('vast', 'volumes'),
 ('volumes', 'of'),
 ('of', 'data'),
 ('data', 'using'),
 ('using', 'modern'),
 ('modern', 'tools'),
 ('tools', 'and'),
 ('and', 'techniques'),
 ('techniques', 'to'),
 ('to', 'find'),
 ('find', 'unseen'),
 ('unseen', 'patterns'),
 ('patterns', ','),
 (',', 'derive'),
 ('derive', 'meaningful'),
 ('meaningful', 'information'),
 ('information', ','),
 (',', 'and'),
 ('and', 'make'),
 ('make', 'business'),
 ('business', 'decisions'),
 ('decisions', '.'),
 ('.', 'Data'),
 ('Data', 'science'),
 ('science', 'uses'),
 ('uses', 'complex'),
 ('complex', 'machine'),
 ('machine', 'learning'),
 ('learning', 'algorithms'),
 ('algorithms', 'to'),
 ('to', 'build'),
 ('build', 'predictive'),
 ('predictive', 'models'),
 ('models', '.'),
 ('.', 'The'),
 ('The', 'data'),
 ('data', 'used'),
 ('

In [15]:
# Same idea as the bigrams, with a window of three consecutive tokens.

trigrams_toks = list(trigrams(DS_tokens))

In [16]:
# General form: sliding windows of 5 consecutive tokens.
ngram_toks = list(ngrams(DS_tokens, 5))

In [17]:
# Display every 5-gram.
# NOTE(review): for longer texts consider showing a slice (e.g. ngram_toks[:10]) to keep the output short.
ngram_toks

[('Data', 'science', 'is', 'the', 'domain'),
 ('science', 'is', 'the', 'domain', 'of'),
 ('is', 'the', 'domain', 'of', 'study'),
 ('the', 'domain', 'of', 'study', 'that'),
 ('domain', 'of', 'study', 'that', 'deals'),
 ('of', 'study', 'that', 'deals', 'with'),
 ('study', 'that', 'deals', 'with', 'vast'),
 ('that', 'deals', 'with', 'vast', 'volumes'),
 ('deals', 'with', 'vast', 'volumes', 'of'),
 ('with', 'vast', 'volumes', 'of', 'data'),
 ('vast', 'volumes', 'of', 'data', 'using'),
 ('volumes', 'of', 'data', 'using', 'modern'),
 ('of', 'data', 'using', 'modern', 'tools'),
 ('data', 'using', 'modern', 'tools', 'and'),
 ('using', 'modern', 'tools', 'and', 'techniques'),
 ('modern', 'tools', 'and', 'techniques', 'to'),
 ('tools', 'and', 'techniques', 'to', 'find'),
 ('and', 'techniques', 'to', 'find', 'unseen'),
 ('techniques', 'to', 'find', 'unseen', 'patterns'),
 ('to', 'find', 'unseen', 'patterns', ','),
 ('find', 'unseen', 'patterns', ',', 'derive'),
 ('unseen', 'patterns', ',', 'deriv

## Stemming

Stemming reduces words to a common root form, so that words derived from the same root (e.g. "uses", "used" and "using" all become "use") are treated as the same term.

In [18]:
from nltk.stem import PorterStemmer

In [19]:
# Instantiate NLTK's Porter stemmer.
st = PorterStemmer()

In [20]:
# Print the Porter stem of each token, one per line.
for word in DS_tokens:
    stemmed = st.stem(word)
    print(stemmed)

data
scienc
is
the
domain
of
studi
that
deal
with
vast
volum
of
data
use
modern
tool
and
techniqu
to
find
unseen
pattern
,
deriv
meaning
inform
,
and
make
busi
decis
.
data
scienc
use
complex
machin
learn
algorithm
to
build
predict
model
.
the
data
use
for
analysi
can
come
from
mani
differ
sourc
and
present
in
variou
format
.
now
that
you
know
what
data
scienc
is
,
let
’
s
see
whi
data
scienc
is
essenti
to
today
’
s
it
landscap
.


## Lemmatization

In [30]:
# The WordNet lemmatizer used below reads the "wordnet" corpus, so fetch it
# here as well; otherwise a fresh kernel on a clean nltk_data folder fails
# with a LookupError at the first lemmatize() call.
nltk.download("wordnet")
nltk.download("omw-1.4")  # Open Multilingual WordNet data

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hardi\anaconda3\nltk_data...


True

In [24]:
from nltk.stem import wordnet
from nltk.stem import WordNetLemmatizer

In [25]:
# Instantiate the WordNet-based lemmatizer.
lemma = WordNetLemmatizer()


In [36]:
# The lemmatizer hands "noth" back unchanged (see the output below) --
# presumably because the string has no WordNet entry; TODO confirm.
lemma.lemmatize("noth")

'noth'