In [1]:
import nltk
import nltk.corpus
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
# Sentence tokenization: split a paragraph into its individual sentences.
text = (
    "Backgammon is one of the oldest known board games. "
    "Its history can be traced back nearly 5,000 years to archeological discoveries in the Middle East. "
    "It is a two player game where each player has fifteen checkers which move between twenty-four points according to the roll of two dice."
)
sentences = sent_tokenize(text)

# One sentence per output line.
print(*sentences, sep="\n")

Backgammon is one of the oldest known board games.
Its history can be traced back nearly 5,000 years to archeological discoveries in the Middle East.
It is a two player game where each player has fifteen checkers which move between twenty-four points according to the roll of two dice.


In [3]:
# Word tokenization: split `text` (the backgammon paragraph defined in the
# sentence-tokenization cell) into word and punctuation tokens.
words = word_tokenize(text)

# One token per output line.
print(*words, sep="\n")

Backgammon
is
one
of
the
oldest
known
board
games
.
Its
history
can
be
traced
back
nearly
5,000
years
to
archeological
discoveries
in
the
Middle
East
.
It
is
a
two
player
game
where
each
player
has
fifteen
checkers
which
move
between
twenty-four
points
according
to
the
roll
of
two
dice
.


In [4]:
# Token frequencies: how many times each token occurs in `words`.
from nltk.probability import FreqDist

# FreqDist counts an iterable directly, so no manual increment loop is needed;
# tokens are still recorded in first-seen order, and no loop variable is left
# behind in the notebook namespace.
fdist = FreqDist(words)
fdist

FreqDist({'the': 3, '.': 3, 'is': 2, 'of': 2, 'to': 2, 'two': 2, 'player': 2, 'Backgammon': 1, 'one': 1, 'oldest': 1, ...})

In [7]:
# Most frequently occurring tokens: the ten highest counts in `fdist`,
# as (token, count) pairs ordered from most to least frequent.
top_10 = fdist.most_common(10)
top_10

[('the', 3),
 ('.', 3),
 ('is', 2),
 ('of', 2),
 ('to', 2),
 ('two', 2),
 ('player', 2),
 ('Backgammon', 1),
 ('one', 1),
 ('oldest', 1)]

In [9]:
# A longer, comma-heavy sentence used as input for the n-gram cells.
sent = (
    "a set of words that is complete in itself, "
    "typically containing a subject and predicate, "
    "conveying a statement, question, exclamation, or command, "
    "and consisting of a main clause and sometimes one or more subordinate clauses"
)

# NOTE: this rebinds `words`, replacing the token list built from `text`.
words = word_tokenize(sent)
words

['a',
 'set',
 'of',
 'words',
 'that',
 'is',
 'complete',
 'in',
 'itself',
 ',',
 'typically',
 'containing',
 'a',
 'subject',
 'and',
 'predicate',
 ',',
 'conveying',
 'a',
 'statement',
 ',',
 'question',
 ',',
 'exclamation',
 ',',
 'or',
 'command',
 ',',
 'and',
 'consisting',
 'of',
 'a',
 'main',
 'clause',
 'and',
 'sometimes',
 'one',
 'or',
 'more',
 'subordinate',
 'clauses']

In [10]:
# Bigrams: every pair of adjacent tokens in `words` (punctuation tokens included).
# list() materializes the result so the notebook displays the pairs.
list(nltk.bigrams(words))

[('a', 'set'),
 ('set', 'of'),
 ('of', 'words'),
 ('words', 'that'),
 ('that', 'is'),
 ('is', 'complete'),
 ('complete', 'in'),
 ('in', 'itself'),
 ('itself', ','),
 (',', 'typically'),
 ('typically', 'containing'),
 ('containing', 'a'),
 ('a', 'subject'),
 ('subject', 'and'),
 ('and', 'predicate'),
 ('predicate', ','),
 (',', 'conveying'),
 ('conveying', 'a'),
 ('a', 'statement'),
 ('statement', ','),
 (',', 'question'),
 ('question', ','),
 (',', 'exclamation'),
 ('exclamation', ','),
 (',', 'or'),
 ('or', 'command'),
 ('command', ','),
 (',', 'and'),
 ('and', 'consisting'),
 ('consisting', 'of'),
 ('of', 'a'),
 ('a', 'main'),
 ('main', 'clause'),
 ('clause', 'and'),
 ('and', 'sometimes'),
 ('sometimes', 'one'),
 ('one', 'or'),
 ('or', 'more'),
 ('more', 'subordinate'),
 ('subordinate', 'clauses')]

In [11]:
# Trigrams: every run of three consecutive tokens in `words`.
# list() materializes the result so the notebook displays the triples.
list(nltk.trigrams(words))

[('a', 'set', 'of'),
 ('set', 'of', 'words'),
 ('of', 'words', 'that'),
 ('words', 'that', 'is'),
 ('that', 'is', 'complete'),
 ('is', 'complete', 'in'),
 ('complete', 'in', 'itself'),
 ('in', 'itself', ','),
 ('itself', ',', 'typically'),
 (',', 'typically', 'containing'),
 ('typically', 'containing', 'a'),
 ('containing', 'a', 'subject'),
 ('a', 'subject', 'and'),
 ('subject', 'and', 'predicate'),
 ('and', 'predicate', ','),
 ('predicate', ',', 'conveying'),
 (',', 'conveying', 'a'),
 ('conveying', 'a', 'statement'),
 ('a', 'statement', ','),
 ('statement', ',', 'question'),
 (',', 'question', ','),
 ('question', ',', 'exclamation'),
 (',', 'exclamation', ','),
 ('exclamation', ',', 'or'),
 (',', 'or', 'command'),
 ('or', 'command', ','),
 ('command', ',', 'and'),
 (',', 'and', 'consisting'),
 ('and', 'consisting', 'of'),
 ('consisting', 'of', 'a'),
 ('of', 'a', 'main'),
 ('a', 'main', 'clause'),
 ('main', 'clause', 'and'),
 ('clause', 'and', 'sometimes'),
 ('and', 'sometimes', 'one'

In [12]:
# General n-grams: a sliding window of n consecutive tokens over `words` (here n = 5).
list(nltk.ngrams(words, n=5))

[('a', 'set', 'of', 'words', 'that'),
 ('set', 'of', 'words', 'that', 'is'),
 ('of', 'words', 'that', 'is', 'complete'),
 ('words', 'that', 'is', 'complete', 'in'),
 ('that', 'is', 'complete', 'in', 'itself'),
 ('is', 'complete', 'in', 'itself', ','),
 ('complete', 'in', 'itself', ',', 'typically'),
 ('in', 'itself', ',', 'typically', 'containing'),
 ('itself', ',', 'typically', 'containing', 'a'),
 (',', 'typically', 'containing', 'a', 'subject'),
 ('typically', 'containing', 'a', 'subject', 'and'),
 ('containing', 'a', 'subject', 'and', 'predicate'),
 ('a', 'subject', 'and', 'predicate', ','),
 ('subject', 'and', 'predicate', ',', 'conveying'),
 ('and', 'predicate', ',', 'conveying', 'a'),
 ('predicate', ',', 'conveying', 'a', 'statement'),
 (',', 'conveying', 'a', 'statement', ','),
 ('conveying', 'a', 'statement', ',', 'question'),
 ('a', 'statement', ',', 'question', ','),
 ('statement', ',', 'question', ',', 'exclamation'),
 (',', 'question', ',', 'exclamation', ','),
 ('question

In [13]:
# 7-grams over `words`, collected into a tuple rather than a list.
tuple(nltk.ngrams(words, n=7))

(('a', 'set', 'of', 'words', 'that', 'is', 'complete'),
 ('set', 'of', 'words', 'that', 'is', 'complete', 'in'),
 ('of', 'words', 'that', 'is', 'complete', 'in', 'itself'),
 ('words', 'that', 'is', 'complete', 'in', 'itself', ','),
 ('that', 'is', 'complete', 'in', 'itself', ',', 'typically'),
 ('is', 'complete', 'in', 'itself', ',', 'typically', 'containing'),
 ('complete', 'in', 'itself', ',', 'typically', 'containing', 'a'),
 ('in', 'itself', ',', 'typically', 'containing', 'a', 'subject'),
 ('itself', ',', 'typically', 'containing', 'a', 'subject', 'and'),
 (',', 'typically', 'containing', 'a', 'subject', 'and', 'predicate'),
 ('typically', 'containing', 'a', 'subject', 'and', 'predicate', ','),
 ('containing', 'a', 'subject', 'and', 'predicate', ',', 'conveying'),
 ('a', 'subject', 'and', 'predicate', ',', 'conveying', 'a'),
 ('subject', 'and', 'predicate', ',', 'conveying', 'a', 'statement'),
 ('and', 'predicate', ',', 'conveying', 'a', 'statement', ','),
 ('predicate', ',', 'con

In [17]:
# Stemming: reduce each word to its Porter stem. A stem is not necessarily a
# dictionary word (e.g. 'sometimes' -> 'sometim').
from nltk.stem import PorterStemmer

stm = PorterStemmer()

# NOTE(review): 'eductional' looks like a typo for 'educational'; it is kept
# as-is because the recorded output below depends on it.
tuple(stm.stem(token) for token in ('buying', 'eductional', 'sometimes'))

('buy', 'educt', 'sometim')

In [25]:
# Lemmatization: map each word to its dictionary form (lemma) via WordNet.
from nltk.stem import WordNetLemmatizer, wordnet

lemmatizer = WordNetLemmatizer()

# Irregular plurals resolve to their singular lemma.
tuple(lemmatizer.lemmatize(token) for token in ('geese', 'cacti'))

('goose', 'cactus')

In [8]:
# Part-of-speech tagging of a sentence typed in by the user.
# NOTE: input() blocks until the user responds, so this cell cannot run unattended.
sent = input('Enter sentence: ')
sent_tok = nltk.word_tokenize(sent)

# Tag the whole token list in a single call: the tagger uses neighbouring
# words as context, so tagging each token on its own (as a one-word
# "sentence") can yield different, less accurate tags and re-runs the tagger
# once per token.
for tagged_token in nltk.pos_tag(sent_tok):
    # Wrap in a list to keep the one-pair-per-line output format, e.g. [('my', 'PRP$')].
    print([tagged_token])

Enter sentence: my name is komal
[('my', 'PRP$')]
[('name', 'NN')]
[('is', 'VBZ')]
[('komal', 'NN')]


In [9]:
# Named-entity recognition: tokenize -> POS-tag -> chunk tagged tokens into entities.
from nltk import ne_chunk

sent1 = "John lives in New York"
sent1_tok = word_tokenize(sent1)

# ne_chunk takes POS-tagged tokens as input, so tag first and show the tags.
sent1_pos = nltk.pos_tag(sent1_tok)
print(sent1_pos)

# The result prints as a bracketed tree in which recognised entities are
# labelled sub-trees (e.g. PERSON, GPE).
sent1_ner = ne_chunk(sent1_pos)
print(sent1_ner)

[('John', 'NNP'), ('lives', 'VBZ'), ('in', 'IN'), ('New', 'NNP'), ('York', 'NNP')]
(S (PERSON John/NNP) lives/VBZ in/IN (GPE New/NNP York/NNP))
