# Part 1 - Tokenizers

In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
EXAMPLE_TEXT = "Hello Mr. Smith, how are you doing today? The weather is great, and Python is awesome. The sky is pinkish-blue. You shouldn't eat cardboard."

# Sentence tokenizer breaks a paragraph into sentences i.e stores sentences in array
print(sent_tokenize(EXAMPLE_TEXT))

['Hello Mr. Smith, how are you doing today?', 'The weather is great, and Python is awesome.', 'The sky is pinkish-blue.', "You shouldn't eat cardboard."]


In [3]:
# Breaks by words
print(word_tokenize(EXAMPLE_TEXT))

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'Python', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', "n't", 'eat', 'cardboard', '.']


# Part 2 - Stop Words

In [4]:
# Stop words are words which do not care about like a,an,the

In [5]:
from nltk.corpus import stopwords

In [6]:
stopwords

<WordListCorpusReader in 'C:\\Users\\Ishan Bhargava\\AppData\\Roaming\\nltk_data\\corpora\\stopwords'>

In [7]:
sorted(stopwords.words('english'))

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [8]:
example_sent = "This is a sample sentence, showing off the stop words filtration."

stop_words = set(stopwords.words('english'))

word_tokens = word_tokenize(example_sent)

filtered_sentence = []

for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)

# can also be written as:       
# filtered_sentence = [w for w in word_tokens if not w in stop_words]

print(word_tokens)
print(filtered_sentence)

['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


# Part 3 - Stemming

In [9]:
# Stemming is converting words to root word

In [10]:
from nltk.stem import PorterStemmer
# We have multiple stemmer but this is the best

In [11]:
ps = PorterStemmer()

In [12]:
example_words = ["python","pythoner","pythoning","pythoned","pythonly"]
for w in example_words:
    print(ps.stem(w))

python
python
python
python
pythonli


In [13]:
new_text = "It is important to be very pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."
words = word_tokenize(new_text)

for w in words:
    print(ps.stem(w))

It
is
import
to
be
veri
pythonli
while
you
are
python
with
python
.
all
python
have
python
poorli
at
least
onc
.


# Part 4 - Part of Speech Tagging

In [14]:
# This means labeling words in a sentence as nouns, adjectives, verbs...etc. 

In [15]:
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [16]:
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

In [17]:
train_text

'PRESIDENT GEORGE W. BUSH\'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION\n \nFebruary 2, 2005\n\n\n9:10 P.M. EST \n\nTHE PRESIDENT: Mr. Speaker, Vice President Cheney, members of Congress, fellow citizens: \n\nAs a new Congress gathers, all of us in the elected branches of government share a great privilege: We\'ve been placed in office by the votes of the people we serve. And tonight that is a privilege we share with newly-elected leaders of Afghanistan, the Palestinian Territories, Ukraine, and a free and sovereign Iraq. (Applause.) \n\nTwo weeks ago, I stood on the steps of this Capitol and renewed the commitment of our nation to the guiding ideal of liberty for all. This evening I will set forth policies to advance that ideal at home and around the world. \n\nTonight, with a healthy, growing economy, with more Americans going back to work, with our nation an active force for good in the world -- the state of our union is confident and strong. (Applause.

In [18]:
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

In [19]:
tokenized = custom_sent_tokenizer.tokenize(sample_text)
tokenized

["PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION\n \nJanuary 31, 2006\n\nTHE PRESIDENT: Thank you all.",
 'Mr. Speaker, Vice President Cheney, members of Congress, members of the Supreme Court and diplomatic corps, distinguished guests, and fellow citizens: Today our nation lost a beloved, graceful, courageous woman who called America to its founding ideals and carried on a noble dream.',
 'Tonight we are comforted by the hope of a glad reunion with the husband who was taken so long ago, and we are grateful for the good life of Coretta Scott King.',
 '(Applause.)',
 'President George W. Bush reacts to applause during his State of the Union Address at the Capitol, Tuesday, Jan.',
 '31, 2006.',
 "White House photo by Eric DraperEvery time I'm invited to this rostrum, I'm humbled by the privilege, and mindful of the history we've seen together.",
 'We have gathered under this Capitol dome in moments of national mourning and national ach

In [20]:
def process_content():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)

    except Exception as e:
        print(str(e))

In [21]:
process_content()

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

# Part 8 - Lemmatizing

In [22]:
# Stemming and Lemmatizing are almost same but stemming can create us non-existing words
# Also if two words have same meaning then in this they will have same root word
# This is not in the case of stemming

In [23]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("rocks"))
print(lemmatizer.lemmatize("python"))
print(lemmatizer.lemmatize("better", pos="a"))
# pos(part of speech) pos="a" means consider it as adjective
print(lemmatizer.lemmatize("best", pos="a"))
print(lemmatizer.lemmatize("run"))
print(lemmatizer.lemmatize("running",pos='v'))
# default pos is "n" i.e. noun

cat
cactus
goose
rock
python
good
best
run
run


# Part 9 - NLTK Corpora

In [24]:
# It consist of a file which has text about something in it

In [25]:
from nltk.corpus import gutenberg
# we are importing gutenberg corpora

In [26]:
# sample text
sample = gutenberg.raw("bible-kjv.txt")

tok = sent_tokenize(sample)

for x in range(5):
    print(tok[x])

[The King James Bible]

The Old Testament of the King James Bible

The First Book of Moses:  Called Genesis


1:1 In the beginning God created the heaven and the earth.
1:2 And the earth was without form, and void; and darkness was upon
the face of the deep.
And the Spirit of God moved upon the face of the
waters.
1:3 And God said, Let there be light: and there was light.
1:4 And God saw the light, that it was good: and God divided the light
from the darkness.


# Part 10 - WordNet

In [27]:
# With WordNet we can find synonyms of word, defination and various other things

In [28]:
from nltk.corpus import wordnet

In [29]:
syns = wordnet.synsets("program")
syns
# Gives synonyms for the word "program"

[Synset('plan.n.01'),
 Synset('program.n.02'),
 Synset('broadcast.n.02'),
 Synset('platform.n.02'),
 Synset('program.n.05'),
 Synset('course_of_study.n.01'),
 Synset('program.n.07'),
 Synset('program.n.08'),
 Synset('program.v.01'),
 Synset('program.v.02')]

In [30]:
print(syns[0].name())
# Gives the name of the object for this Synset
print(syns[0].lemmas()[0].name())
# Gives the name of the 1st synonym
print(syns[0].definition())
# Gives the defination of the 1st synonym
print(syns[0].examples())
# Gives an examples of the word in use

plan.n.01
plan
a series of steps to be carried out or goals to be accomplished
['they drew up a six-step plan', 'they discussed plans for a new bond issue']


In [31]:
for sym in wordnet.synsets("good"):
    print(sym)            
    print(sym.lemmas())

Synset('good.n.01')
[Lemma('good.n.01.good')]
Synset('good.n.02')
[Lemma('good.n.02.good'), Lemma('good.n.02.goodness')]
Synset('good.n.03')
[Lemma('good.n.03.good'), Lemma('good.n.03.goodness')]
Synset('commodity.n.01')
[Lemma('commodity.n.01.commodity'), Lemma('commodity.n.01.trade_good'), Lemma('commodity.n.01.good')]
Synset('good.a.01')
[Lemma('good.a.01.good')]
Synset('full.s.06')
[Lemma('full.s.06.full'), Lemma('full.s.06.good')]
Synset('good.a.03')
[Lemma('good.a.03.good')]
Synset('estimable.s.02')
[Lemma('estimable.s.02.estimable'), Lemma('estimable.s.02.good'), Lemma('estimable.s.02.honorable'), Lemma('estimable.s.02.respectable')]
Synset('beneficial.s.01')
[Lemma('beneficial.s.01.beneficial'), Lemma('beneficial.s.01.good')]
Synset('good.s.06')
[Lemma('good.s.06.good')]
Synset('good.s.07')
[Lemma('good.s.07.good'), Lemma('good.s.07.just'), Lemma('good.s.07.upright')]
Synset('adept.s.01')
[Lemma('adept.s.01.adept'), Lemma('adept.s.01.expert'), Lemma('adept.s.01.good'), Lemma('a

In [32]:
synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())
#             This will
print(set(synonyms))
print(set(antonyms))

{'secure', 'safe', 'well', 'just', 'adept', 'right', 'unspoiled', 'goodness', 'undecomposed', 'honest', 'salutary', 'skilful', 'practiced', 'in_effect', 'expert', 'sound', 'unspoilt', 'effective', 'dependable', 'commodity', 'skillful', 'thoroughly', 'full', 'upright', 'beneficial', 'proficient', 'dear', 'good', 'ripe', 'trade_good', 'in_force', 'near', 'soundly', 'estimable', 'honorable', 'respectable', 'serious'}
{'evil', 'bad', 'evilness', 'ill', 'badness'}


In [33]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('boat.n.01')
print(w1.wup_similarity(w2))
# This will find the similarity between the two words

0.9090909090909091


In [34]:
w1 = wordnet.synset('man.n.01')
w2 = wordnet.synset('table.n.01')
print(w1.wup_similarity(w2))

0.16666666666666666


# Part 11 - Text Classification

In [35]:
# We are going to classify the text as positive or negative

In [36]:
import random
from nltk.corpus import movie_reviews

In [86]:
documents=[]
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)),category))

In [87]:
# One liner for the above loop 
# documents = [(list(movie_reviews.words(fileid)), category)
#              for category in movie_reviews.categories()
#              for fileid in movie_reviews.fileids(category)]

In [88]:
random.shuffle(documents)

In [89]:
movie_reviews.categories()

['neg', 'pos']

In [90]:
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        print(movie_reviews.words(fileid))

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]
['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...]
['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...]
['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...]
['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...]
['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]
['so', 'ask', 'yourself', 'what', '"', '8mm', '"', '(', ...]
['that', "'", 's', 'exactly', 'how', 'long', 'the', ...]
['call', 'it', 'a', 'road', 'trip', 'for', 'the', ...]
['plot', ':', 'a', 'young', 'french', 'boy', 'sees', ...]
['best', 'remembered', 'for', 'his', 'understated', ...]
['janeane', 'garofalo', 'in', 'a', 'romantic', ...]
['and', 'now', 'the', 'high', '-', 'flying', 'hong', ...]
['a', 'movie', 'like', 'mortal', 'kombat', ':', ...]
['she', 'was', 'the', 'femme', 'in', '"', 'la', ...]
['john', 'carpenter', 'makes', 'b', '-', 'movies', '.', ...]
['i', "'", 'm', 'really', 'starting', 'to', 'wonder', ...]
['so', 'what', 'do',

['there', 'are', 'some', 'pretty', 'impressive', ...]
['so', ',', 'it', "'", 's', 'thirty', 'years', 'later', ...]
['except', 'for', 'a', 'few', 'bright', 'moments', ...]
['maybe', 'this', 'mission', 'should', 'have', 'been', ...]
['mulholland', 'drive', 'did', 'very', 'well', 'at', ...]
['ingredients', ':', 'possessed', 'plastic', 'dolls', ...]
['watching', 'the', 'movie', ',', 'i', 'vowed', 'to', ...]
['i', 'was', 'going', 'to', 'see', 'ram', 'shrasta', ...]
['starship', 'troopers', 'is', 'a', 'bad', 'movie', ...]
['capsule', ':', 'one', 'of', 'the', 'ten', 'worst', ...]
['hello', 'kids', '.', 'today', 'the', 'movie', ...]
['deserves', 'recognition', 'for', ':', 'achieving', ...]
['starring', 'ben', 'stiller', ',', 'elizabeth', ...]
['the', 'camera', 'zooms', 'in', 'incredibly', 'close', ...]
['"', 'goodbye', ',', 'lover', '"', 'sat', 'on', 'the', ...]
['as', 'with', 'any', 'gen', '-', 'x', 'mtv', 'movie', ...]
['nicolas', 'cage', 'comes', 'up', 'with', 'an', ...]
['a', 'big', 'house

['`', 'the', 'bachelor', "'", 'is', 'one', 'of', 'the', ...]
['topless', 'women', 'talk', 'about', 'their', 'lives', ...]
['beware', 'of', 'movies', 'with', 'the', 'director', ...]
['august', 'and', 'september', 'are', 'a', 'wasteland', ...]
['the', 'event', 'horizon', 'is', 'the', 'boundary', ...]
['there', 'is', 'a', 'rule', 'when', 'it', 'comes', ...]
['as', 'a', 'hot', '-', 'shot', 'defense', 'attorney', ...]
['i', "'", 'm', 'not', 'sure', 'i', 'should', 'be', ...]
['the', 'most', 'interesting', 'thing', 'about', ...]
['an', 'american', 'werewolf', 'in', 'paris', 'is', ...]
['depending', 'on', 'who', 'you', 'ask', ',', 'the', ...]
['wizards', 'is', 'an', 'animated', 'feature', 'that', ...]
['inspired', 'by', 'the', '1958', 'film', 'house', ...]
['capsule', ':', 'liebes', 'meets', 'tod', '.', 'this', ...]
['you', 'know', 'that', 'a', 'movie', 'has', 'issues', ...]
['adam', 'sandler', 'vehicles', 'are', 'never', ...]
['deserves', 'recognition', 'for', ':', 'making', ...]
['any', 'mov

['midway', 'through', '"', 'anaconda', '"', ',', ...]
['if', 'the', '70', "'", 's', 'nostalgia', 'didn', "'", ...]
['synopsis', ':', 'nice', 'girl', 'susanne', 'has', ...]
['well', 'lets', 'see', '.', '.', '.', 'i', "'", 'm', ...]
['14', 'years', 'ago', ',', 'national', 'lampoon', ...]
['plunkett', '&', 'macleane', 'marks', 'the', ...]
['capsule', ':', 'silly', 'and', 'inane', 'adaptation', ...]
['i', 'didn', "'", 't', 'hate', 'the', 'big', 'hit', ...]
['when', 'the', 'film', 'features', 'richard', 'lynch', ...]
['you', 'don', "'", 't', 'look', 'at', 'a', 'ren', '?', ...]
['well', ',', 'as', 'i', 'check', 'my', 'score', ...]
['terrence', 'malick', 'made', 'an', 'excellent', '90', ...]
['i', 'cried', 'during', '_babe_', '.', 'i', 'admit', ...]
['saw', 'an', 'advanced', 'screening', 'of', 'the', ...]
['an', '18', '-', 'foot', '-', 'high', ',', '43', '-', ...]
['1989', "'", 's', '"', 'major', 'league', '"', 'was', ...]
['*', '*', '*', 'the', 'following', 'review', ...]
['one', 'of', 'the'

['this', 'is', 'the', 'movie', 'that', 'could', ...]
['attention', 'moviegoers', ':', 'you', 'are', 'about', ...]
['i', 'have', 'nothing', 'against', 'unabashedly', ...]
['if', 'you', "'", 've', 'seen', 'the', 'trailers', ...]
['house', 'on', 'haunted', 'hill', '(', '1999', ')', ...]
['fit', 'for', 'a', 'ghoul', "'", 's', 'night', 'out', ...]
['"', 'marie', 'couldn', "'", 't', 'talk', ',', '"', ...]
['well', ',', 'here', "'", 's', 'a', 'distasteful', ...]
['okay', ',', 'i', 'just', 'don', "'", 't', 'know', ...]
['"', 'the', 'beach', '"', 'is', 'a', 'structurally', ...]
['the', 'most', 'absurd', 'remake', 'of', '1998', '?', ...]
['in', '1990', ',', 'the', 'surprise', 'success', 'an', ...]
['these', 'days', ',', 'we', 'are', 'witnessing', ...]
['after', '1993', "'", 's', '"', 'falling', 'down', ...]
['i', 'think', 'of', 'i', 'know', 'what', 'you', 'did', ...]
['"', 'nothing', 'more', 'than', 'a', 'high', 'budget', ...]
['one', 'of', 'the', '90s', "'", 'most', 'unwelcome', ...]
['mugshot'

['aspiring', 'broadway', 'composer', 'robert', '(', ...]
['"', 'easely', 'one', 'of', 'the', 'worst', 'films', ...]
['these', 'days', ',', 'people', 'have', 'rather', ...]
['paul', 'verhoeven', ',', 'the', 'dutch', 'auteur', ...]
['fact', 'that', 'charles', 'bronson', 'represents', ...]
['overblown', 'remake', 'of', 'the', '1963', 'robert', ...]
['"', 'tarzan', 'and', 'the', 'lost', 'city', '"', ...]
['everything', 'in', 'the', 'phantom', 'you', 'have', ...]
['a', 'life', 'less', 'ordinary', '(', 'r', ')', ...]
['how', 'could', 'a', 'g', '-', 'rated', 'disney', ...]
['the', 'corruptor', 'is', 'a', 'big', 'silly', 'mess', ...]
['"', 'return', 'to', 'horror', 'high', ',', '"', ...]
['the', 'art', 'of', 'woo', 'attempts', 'to', 'be', ...]
['capsule', ':', 'a', 'science', 'fiction', 'allegory', ...]
['seen', 'december', '2', ',', '1997', 'at', '6', ':', ...]
['plot', ':', 'a', 'down', '-', 'and', '-', 'out', ...]
['for', 'those', 'interested', 'in', 'the', 'true', ...]
['director', 'doug',

['as', 'african', 'american', 'detective', 'vergil', ...]
['in', 'wonder', 'boys', 'michael', 'douglas', 'plays', ...]
['plot', ':', 'derek', 'zoolander', 'is', 'a', 'male', ...]
['unzipped', 'is', 'a', 'cinematic', 'portrait', 'of', ...]
['dora', '(', 'fernanda', 'montenegro', ')', 'sits', ...]
['the', 'event', 'of', 'events', 'is', 'upon', 'us', ...]
['sometimes', 'a', 'movie', 'comes', 'along', 'that', ...]
['that', 'thing', 'you', 'do', '!', '(', 'r', ')', ...]
['"', 'he', "'", 's', 'back', ',', 'and', 'it', "'", ...]
['dreamworks', 'pictures', 'presents', 'a', 'jinks', ...]
['will', 'hunting', '(', 'matt', 'damon', ')', 'is', ...]
['if', 'there', "'", 's', 'one', 'thing', 'in', ...]
['what', 'do', 'you', 'get', 'when', 'you', 'slap', ...]
['`', 'we', 'run', 'tings', '.', 'tings', 'don', "'", ...]
['at', 'first', 'glance', ',', 'it', 'appears', 'that', ...]
['--', 'comedy', ',', 'rated', 'pg', ',', 'runs', ...]
['when', '_star', 'wars_', 'came', 'out', 'some', ...]
['national', 'la

['on', 'the', 'basis', 'of', 'this', 'film', 'alone', ...]
['it', "'", 's', 'a', 'fact', 'that', 'a', 'good', ...]
['there', 'exists', 'a', 'litany', 'of', 'differences', ...]
['jay', 'and', 'silent', 'bob', 'strike', 'back', ',', ...]
['damn', 'those', 'trailers', '.', 'had', 'it', 'not', ...]
['bob', 'the', 'happy', 'bastard', "'", 's', 'quickie', ...]
['this', 'sunday', 'afternoon', 'i', 'had', 'the', ...]
['a', 'common', 'complaint', 'amongst', 'film', ...]
['whew', '.', 'this', 'film', 'oozes', 'energy', ',', ...]
['steven', 'spielberg', "'", 's', '"', 'amistad', ',', ...]
['he', 'has', 'spent', 'his', 'entire', 'life', 'in', ...]
['being', 'that', 'it', 'is', 'a', 'foreign', ...]
['when', 'i', 'first', 'heard', 'of', 'contact', ',', ...]
['you', "'", 've', 'got', 'to', 'love', 'disney', '.', ...]
['with', 'three', 'pre', '-', 'to', 'mid', '-', 'teen', ...]
['i', 'hate', 'to', 'burst', 'your', 'bubble', ',', ...]
['susan', 'granger', "'", 's', 'review', 'of', '"', ...]
['i', 'must

['okay', ',', 'let', 'me', 'first', 'say', ',', 'this', ...]
['taking', 'a', 'few', 'tips', 'from', 'the', 'pulp', ...]
['garry', 'shandling', 'makes', 'his', 'long', ...]
['there', "'", 's', 'a', 'moment', 'in', 'schindler', ...]
['i', 'recall', 'the', 'trials', 'and', 'tribulations', ...]
['full', 'metal', 'jacket', ',', 'very', 'much', ...]
['oliver', 'stone', "'", 's', 'latest', 'feature', ...]
['in', 'tim', 'burton', "'", 's', '`', 'sleepy', ...]
['ok', ',', 'i', 'admit', 'i', 'had', 'a', 'bad', ...]
['bowfinger', 'is', 'a', 'good', 'movie', 'about', ...]
['pulp', 'fiction', ',', 'quentin', 'tarantino', "'", ...]
['i', 'think', 'the', 'first', 'thing', 'this', ...]
['i', 'know', 'it', 'already', 'opened', 'in', ...]
['capsule', ':', 'this', 'is', 'a', 'film', 'that', ...]
['"', 'gattaca', '"', 'represents', 'a', 'solid', ...]
['there', 'was', 'a', 'huge', 'crowd', '-', 'so', ...]
['available', 'in', 'an', 'all', 'new', 'video', ...]
['when', 'i', 'left', 'the', 'theater', 'after',

['capsule', ':', 'this', 'is', 'a', '1950s', 'or', ...]
['don', "'", 't', 'let', 'the', 'following', 'quirks', ...]
['getting', 'it', 'right', 'is', 'a', 'far', 'far', ...]
['no', 'matter', 'what', 'you', 'suspect', ',', 'this', ...]
['"', 'dangerous', 'beauty', '"', 'is', 'a', 'really', ...]
['imagine', 'this', 'scenario', ':', 'you', 'and', ...]
['steve', 'soderbergh', "'", 's', '"', 'sex', ',', ...]
['synopsis', ':', 'private', 'detective', 'tom', ...]
['to', 'me', ',', 'nicolas', 'cage', 'sounds', 'like', ...]
['at', 'first', 'glance', ',', 'daylight', 'would', ...]
['after', 'the', 'simple', 'looking', 'little', ...]
['note', ':', 'some', 'may', 'consider', 'portions', ...]
['bruce', 'willis', 'and', 'sixth', 'sense', ...]
['there', "'", 's', 'good', 'news', 'and', 'bad', ...]
['harmless', ',', 'silly', 'and', 'fun', 'comedy', ...]
['steven', 'spielberg', 'is', 'now', 'considered', ...]
['with', 'the', 'exception', 'of', 'their', ...]
['"', 'the', 'endurance', ':', 'shackleton', "

['let', 'me', 'first', 'say', 'that', 'the', ...]
['alchemy', 'is', 'steeped', 'in', 'shades', 'of', ...]
['who', 'would', 'have', 'thought', '?', 'jim', ...]
['capsule', ':', 'this', 'is', 'a', 'harrowing', ...]
['another', "'", 'independent', 'film', "'", ',', ...]
['a', 'frequent', 'error', 'is', 'the', ...]
['very', 'few', 'people', 'would', 'be', 'unaware', ...]
['let', 'me', 'start', 'off', 'by', 'saying', 'that', ...]
['seen', 'september', '5', ',', '1998', 'at', '10', ...]
['the', 'characters', 'in', '"', 'palmetto', '"', ...]
['"', 'you', "'", 've', 'got', 'mail', '"', 'is', ...]
['with', 'the', 'sudden', 'liberal', 'emergence', 'of', ...]
['mary', 'norton', "'", 's', 'children', "'", 's', ...]
['touchstone', 'pictures', 'and', 'spyglass', ...]
['the', 'reunion', 'film', 'is', 'not', 'an', ...]
['once', 'upon', 'a', 'time', 'a', 'solitary', 'ogre', ...]
['some', 'movies', 'have', 'such', 'an', 'impact', ...]
['`', 'run', 'lola', 'run', "'", ',', 'a', 'german', ...]
['in', '"',

['so', 'many', 'students', 'strive', 'to', 'get', ...]
['available', 'for', 'rental', '-', 'october', '12', ...]
['my', 'filmcritic', '.', 'com', 'colleague', 'norm', ...]
['what', 'i', 'look', 'for', 'in', 'a', 'movie', 'is', ...]
['mike', 'myers', ',', 'you', 'certainly', 'did', ...]
['it', 'is', 'with', 'hesitance', 'that', 'i', 'call', ...]
['the', 'verdict', ':', 'spine', '-', 'chilling', ...]
['an', 'indian', 'runner', 'was', 'more', 'than', 'a', ...]
['seen', 'july', '8', ',', '1998', 'at', 'the', ...]
['not', 'since', 'attending', 'an', 'ingmar', ...]
['after', 'a', 'successful', 'run', 'in', 'australia', ...]
['every', 'once', 'in', 'a', 'while', ',', 'a', 'film', ...]
['"', 'through', 'a', 'spyglass', ',', 'i', 'could', ...]
['wong', 'kar', '-', 'wei', "'", 's', '"', 'fallen', ...]
['city', 'of', 'angels', 'is', 'the', 'kind', 'of', ...]
['have', 'you', 'ever', 'wondered', 'if', 'death', ...]
['originally', 'entitled', 'dancing', 'about', ...]
['"', 'well', 'this', 'is', 'not

In [91]:
print(documents[0])
# These are our features with 1st part having the movie review and 2nd label ie pos or neg

(['wyatt', 'earp', 'has', 'a', 'lot', 'to', 'tell', 'and', 'little', 'to', 'say', '.', 'this', 'story', 'of', 'the', 'legendary', 'lawman', 'runs', 'three', 'hours', 'and', 'nine', 'minutes', 'and', 'that', "'", 's', 'too', 'long', 'for', 'this', 'epic', '-', 'wannabe', 'that', 'plays', 'more', 'like', 'a', 'tv', 'mini', '-', 'series', 'than', 'a', 'movie', '.', 'the', 'story', 'opens', 'on', 'the', 'earp', 'family', 'farm', ',', 'with', 'young', 'wyatt', 'ready', 'to', 'run', 'away', 'from', 'home', '.', 'he', 'wants', 'to', 'fight', 'in', 'the', 'civil', 'war', ',', 'but', 'his', 'dad', '(', 'hackman', ')', 'has', 'different', 'ideas', 'and', 'wants', 'his', 'son', 'to', 'study', 'law', '.', 'time', 'passes', 'and', 'wyatt', 'heads', 'west', ',', 'returns', 'home', 'to', 'marry', ',', 'and', ',', 'after', 'a', 'spell', ',', 'begins', 'his', 'career', 'in', 'law', 'enforcement', '.', 'by', 'the', 'time', 'he', '(', 'costner', ')', 'and', 'his', 'brothers', '(', 'madsen', ',', 'david',

In [103]:
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())
#     Convert everything to lower

all_words = nltk.FreqDist(all_words)
# it forms a tuple of words and their frequency
print(all_words.most_common(15))
# Print most common 15 words

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]


In [104]:
all_words

FreqDist({',': 77717, 'the': 76529, '.': 65876, 'a': 38106, 'and': 35576, 'of': 34123, 'to': 31937, "'": 30585, 'is': 25195, 'in': 21822, ...})

In [105]:
print(all_words["stupid"])

253


In [106]:
# Now what we do is that we take all the words and compile them and find the most popular words
# and then classify the words as pos or neg and for a new review we will just check the word in the category

# Part 12 - Words as features

In [170]:
word_features = []
for words in all_words.most_common(3000):
    word_features.append(words[0])
# It will take the words(not count) of the most common 3000 words

In [171]:
word_features

[',',
 'the',
 '.',
 'a',
 'and',
 'of',
 'to',
 "'",
 'is',
 'in',
 's',
 '"',
 'it',
 'that',
 '-',
 ')',
 '(',
 'as',
 'with',
 'for',
 'his',
 'this',
 'film',
 'i',
 'he',
 'but',
 'on',
 'are',
 't',
 'by',
 'be',
 'one',
 'movie',
 'an',
 'who',
 'not',
 'you',
 'from',
 'at',
 'was',
 'have',
 'they',
 'has',
 'her',
 'all',
 '?',
 'there',
 'like',
 'so',
 'out',
 'about',
 'up',
 'more',
 'what',
 'when',
 'which',
 'or',
 'she',
 'their',
 ':',
 'some',
 'just',
 'can',
 'if',
 'we',
 'him',
 'into',
 'even',
 'only',
 'than',
 'no',
 'good',
 'time',
 'most',
 'its',
 'will',
 'story',
 'would',
 'been',
 'much',
 'character',
 'also',
 'get',
 'other',
 'do',
 'two',
 'well',
 'them',
 'very',
 'characters',
 ';',
 'first',
 '--',
 'after',
 'see',
 '!',
 'way',
 'because',
 'make',
 'life',
 'off',
 'too',
 'any',
 'does',
 'really',
 'had',
 'while',
 'films',
 'how',
 'plot',
 'little',
 'where',
 'people',
 'over',
 'could',
 'then',
 'me',
 'scene',
 'man',
 'bad',
 '

In [172]:
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features
# What this function does it that for each word in most common 3000 words it check that
# if the word in document has that word or not and mark it true or false accordingly

In [173]:
# To test with one file
print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))



In [174]:
# What this does it that for each review it marks the most common words as true or false if they come or not
featuresets = [(find_features(rev), category) for (rev, category) in documents]
# featuresets = [({words(most common):True|False},category)]

In [175]:
featuresets[0]
# This is for one movie review

({',': True,
  'the': True,
  '.': True,
  'a': True,
  'and': True,
  'of': True,
  'to': True,
  "'": True,
  'is': True,
  'in': True,
  's': True,
  '"': True,
  'it': False,
  'that': True,
  '-': True,
  ')': True,
  '(': True,
  'as': True,
  'with': True,
  'for': True,
  'his': True,
  'this': True,
  'film': True,
  'i': False,
  'he': True,
  'but': True,
  'on': True,
  'are': True,
  't': True,
  'by': True,
  'be': True,
  'one': True,
  'movie': True,
  'an': True,
  'who': True,
  'not': True,
  'you': False,
  'from': True,
  'at': True,
  'was': True,
  'have': False,
  'they': False,
  'has': True,
  'her': False,
  'all': True,
  '?': True,
  'there': False,
  'like': True,
  'so': False,
  'out': False,
  'about': False,
  'up': False,
  'more': True,
  'what': True,
  'when': False,
  'which': False,
  'or': False,
  'she': False,
  'their': False,
  ':': False,
  'some': True,
  'just': True,
  'can': True,
  'if': False,
  'we': False,
  'him': False,
  'into': 

# Part 13 - Naive Bayes Classifier

In [176]:
count_set=0
for i in featuresets:
    count_set+=1

In [177]:
count_set
# Total reviews

2000

In [178]:
# set that we'll train our classifier with
training_set = featuresets[:1900]

# set that we'll test against.
testing_set = featuresets[1900:]

# 1900 as training and remanining 1000 as testing 

In [188]:
classifier = nltk.NaiveBayesClassifier.train(training_set)
# It is the most basic algo
# It is suitable for classifying btw two types 

In [189]:
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)

Classifier accuracy percent: 84.0


In [None]:
classifier.show_most_informative_features(15)
# Print most popular words on one side and not on other
# ie the words with hightest pos:neg ratio or vice versa

# Part 14 - Saving the Model

In [191]:
import pickle

In [192]:
# To save our classifier in a file
save_classifier = open("naivebayes.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [193]:
# To load the classifier
classifier_f = open("naivebayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

# Part 15 - Sklearn Classifier

In [199]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [198]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MultinomialNB accuracy percent:",nltk.classify.accuracy(MNB_classifier, testing_set))

BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)
print("BernoulliNB accuracy percent:",nltk.classify.accuracy(BNB_classifier, testing_set))

MultinomialNB accuracy percent: 0.85
BernoulliNB accuracy percent: 0.84


In [200]:
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

Original Naive Bayes Algo accuracy percent: 84.0
Most Informative Features
             outstanding = True              pos : neg    =     11.1 : 1.0
                   mulan = True              pos : neg    =      9.1 : 1.0
                  finest = True              pos : neg    =      8.0 : 1.0
                  seagal = True              neg : pos    =      7.7 : 1.0
                 idiotic = True              neg : pos    =      7.2 : 1.0
                  prinze = True              neg : pos    =      6.5 : 1.0
             wonderfully = True              pos : neg    =      6.4 : 1.0
                   damon = True              pos : neg    =      6.2 : 1.0
              schumacher = True              neg : pos    =      6.1 : 1.0
                   inept = True              neg : pos    =      6.0 : 1.0
                  wasted = True              neg : pos    =      5.9 : 1.0
                   jolie = True              neg : pos    =      5.8 : 1.0
                   flynt 



SGDClassifier_classifier accuracy percent: 85.0
SVC_classifier accuracy percent: 82.0
LinearSVC_classifier accuracy percent: 81.0
NuSVC_classifier accuracy percent: 87.0


# Part 16 - Combining algos with a vote

In [202]:
from nltk.classify import ClassifierI
from statistics import mode

In [207]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
        
    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)
    
    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

In [216]:
voted_classifier = VoteClassifier(classifier,
                                  NuSVC_classifier,
                                  SVC_classifier,
                                  SGDClassifier_classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier)

print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)
# We have printed accuracy of the model formed by voting of multiple models

print("Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:",voted_classifier.confidence(testing_set[0][0])*100)
print("Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:",voted_classifier.confidence(testing_set[1][0])*100)
print("Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:",voted_classifier.confidence(testing_set[2][0])*100)
print("Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:",voted_classifier.confidence(testing_set[3][0])*100)
print("Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:",voted_classifier.confidence(testing_set[4][0])*100)
print("Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:",voted_classifier.confidence(testing_set[5][0])*100)
# We have printed the confidence of the 6 testing data

voted_classifier accuracy percent: 90.0
Classification: pos Confidence %: 100.0
Classification: neg Confidence %: 100.0
Classification: pos Confidence %: 100.0
Classification: pos Confidence %: 57.14285714285714
Classification: pos Confidence %: 100.0
Classification: neg Confidence %: 100.0


# Part 19 - Sentiment Analysis Module