In [1]:
import nltk
nltk.download()
# to download nltk and using pip install nltk and then download all the packages

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

### NLTK with python

Next steps

1. Tokenizing - Splitting sentences and words from the body of text.
2. Part of Speech tagging
3. Machine Learning with the Naive Bayes classifier
4. How to tie in Scikit-learn (sklearn) with NLTK
5. Training classifiers with datasets
6. Performing live, streaming, sentiment analysis with Twitter.

### 1. Tokenization

In [2]:
from nltk import tokenize
example_txt= """Hello Mr. Smith, how are you doing today? 
The weather is great, and Python is awesome. The sky is pinkish-blue. You shouldn't eat cardboard."""
print(tokenize.sent_tokenize(example_txt)) # for tokenizing the given para to sentences and words
print(tokenize.word_tokenize(example_txt))

['Hello Mr. Smith, how are you doing today?', 'The weather is great, and Python is awesome.', 'The sky is pinkish-blue.', "You shouldn't eat cardboard."]
['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'Python', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', "n't", 'eat', 'cardboard', '.']


### 2. Stop words

Stop words are those which are not having any meaning and which are frequently used

-> from nltk.corpus import stopwords

list of stop words in NLTK 

-> set(stopwords.words('english'))

We will remove the stop from the tokenized list of given data

In [3]:
from nltk.corpus import stopwords
stopwords.words('english')
example_sent = "This is a sample sentence, showing off the stop words filtration."
stop_words = set(stopwords.words('english'))
words_tokens=tokenize.word_tokenize(example_sent)
#filtered_sentence=[w for w in words_tokens if not w in stopwords]
filtered_sentence = []

for w in words_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)

print(words_tokens)
print(filtered_sentence)

['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


### 3. Stemming 

Stemming is the method of normalzation, tense of words are involved 

there are different types of stemming methods used most popular stemming algorithms is the Porter stemmer


In [15]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
example_words = ["python","pythoner","pythoning","pythoned","pythonly"]

for w in example_words:
    print(ps.stem(w))

python
python
python
python
pythonli


In [18]:
new_text = """It is important to by very pythonly while you are pythoning 
with python. All pythoners have pythoned poorly at least once."""
words = tokenize.word_tokenize(new_text)
for w in words:
    print(ps.stem(w))

It
is
import
to
by
veri
pythonli
while
you
are
python
with
python
.
all
python
have
python
poorli
at
least
onc
.


### 4. Part of Speech Tagging 

Labeling the words in a sentence as nouns, adjectives, verbs... etc..

from nltk.tokenize import PunktSentenceTokenizer

nltk.pos_tag(words)

In [36]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [31]:
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

In [32]:
type(sample_text)

# Next, we can train the Punkt tokenizer like:

str

In [33]:
custom_sent_tokenizer=PunktSentenceTokenizer(train_text)

In [34]:
# Then we can actually tokenize, using
tokenized =custom_sent_tokenizer.tokenize(sample_text)

In [38]:
# Now we can finish up this part of speech tagging script by creating a function
#that will run through and tag all of the parts of speech per sentence like so:
def process_content():
    try:
        for i in tokenized[:5]:
            words= nltk.tokenize.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))
        
process_content()

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

### 5. Chunking

we group words, based on their parts of speech

One of the main goals of chunking is to group into what are known as "noun phrases." These are phrases of one or more words that contain a noun, maybe some descriptive words, maybe a verb, and maybe something like an adverb. The idea is to group nouns with the words that are in relation to them.

1. '+' = match 1 or more
2. '?' = match 0 or 1 repetitions.
3. '*' = match 0 or MORE repetitions	  
4. '.' = Any character except a new line

part of speech tags are denoted with the "<" and ">"

In [4]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            #chunked.draw()     
            print(chunked)
            for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
                print(subtree)
    except Exception as e:
        print(str(e))

process_content()


(S
  (Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
  'S/POS
  (Chunk ADDRESS/NNP)
  BEFORE/IN
  (Chunk A/NNP JOINT/NNP SESSION/NNP)
  OF/IN
  (Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
  OF/IN
  (Chunk THE/NNP UNION/NNP January/NNP)
  31/CD
  ,/,
  2006/CD
  (Chunk THE/NNP PRESIDENT/NNP)
  :/:
  (Chunk Thank/NNP)
  you/PRP
  all/DT
  ./.)
(Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
(Chunk ADDRESS/NNP)
(Chunk A/NNP JOINT/NNP SESSION/NNP)
(Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
(Chunk THE/NNP UNION/NNP January/NNP)
(Chunk THE/NNP PRESIDENT/NNP)
(Chunk Thank/NNP)
(S
  (Chunk Mr./NNP Speaker/NNP)
  ,/,
  (Chunk Vice/NNP President/NNP Cheney/NNP)
  ,/,
  members/NNS
  of/IN
  (Chunk Congress/NNP)
  ,/,
  members/NNS
  of/IN
  the/DT
  (Chunk Supreme/NNP Court/NNP)
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/NNS
  :/:
  Today/VB
  our/PRP$
  nation/NN
  lost/VBD
  a/DT
  beloved/VBN
  ,/,
  g

(S
  When/WRB
  they/PRP
  murder/VBP
  children/NNS
  at/IN
  a/DT
  school/NN
  in/IN
  (Chunk Beslan/NNP)
  ,/,
  or/CC
  blow/VB
  up/RP
  commuters/NNS
  in/IN
  (Chunk London/NNP)
  ,/,
  or/CC
  behead/VB
  a/DT
  bound/NN
  captive/NN
  ,/,
  the/DT
  terrorists/NNS
  hope/VBP
  these/DT
  horrors/NNS
  will/MD
  break/VB
  our/PRP$
  will/MD
  ,/,
  allowing/VBG
  the/DT
  violent/NN
  to/TO
  inherit/VB
  the/DT
  (Chunk Earth/NNP)
  ./.)
(Chunk Beslan/NNP)
(Chunk London/NNP)
(Chunk Earth/NNP)
(S
  But/CC
  they/PRP
  have/VBP
  miscalculated/VBN
  :/:
  We/PRP
  love/VBP
  our/PRP$
  freedom/NN
  ,/,
  and/CC
  we/PRP
  will/MD
  fight/VB
  to/TO
  keep/VB
  it/PRP
  ./.)
(S (/( (Chunk Applause/NNP) ./. )/))
(Chunk Applause/NNP)
(S
  In/IN
  a/DT
  time/NN
  of/IN
  testing/VBG
  ,/,
  we/PRP
  can/MD
  not/RB
  find/VB
  security/NN
  by/IN
  abandoning/VBG
  our/PRP$
  commitments/NNS
  and/CC
  retreating/VBG
  within/IN
  our/PRP$
  borders/NNS
  ./.)
(S
  If/IN
  we/PRP

(S (/( (Chunk Applause/NNP) ./. )/))
(Chunk Applause/NNP)
(S
  (Chunk President/NNP George/NNP W./NNP Bush/NNP)
  waves/VBZ
  toward/IN
  the/DT
  upper/JJ
  visitors/NNS
  gallery/NN
  of/IN
  the/DT
  (Chunk House/NNP Chamber/NNP)
  following/VBG
  his/PRP$
  State/NN
  of/IN
  the/DT
  (Chunk Union/NNP)
  remarks/NNS
  (Chunk Tuesday/NNP)
  ,/,
  (Chunk Jan/NNP)
  ./.)
(Chunk President/NNP George/NNP W./NNP Bush/NNP)
(Chunk House/NNP Chamber/NNP)
(Chunk Union/NNP)
(Chunk Tuesday/NNP)
(Chunk Jan/NNP)
(S
  31/CD
  ,/,
  2006/CD
  at/IN
  the/DT
  (Chunk United/NNP)
  States/NNPS
  (Chunk Capitol/NNP)
  ./.)
(Chunk United/NNP)
(Chunk Capitol/NNP)
(S
  (Chunk White/NNP House/NNP photo/NN)
  by/IN
  (Chunk Eric/NNP Draper/NNP)
  The/DT
  same/JJ
  is/VBZ
  true/JJ
  of/IN
  (Chunk Iran/NNP)
  ,/,
  a/DT
  nation/NN
  now/RB
  held/VBN
  hostage/NN
  by/IN
  a/DT
  small/JJ
  clerical/JJ
  elite/NN
  that/WDT
  is/VBZ
  isolating/VBG
  and/CC
  repressing/VBG
  its/PRP$
  people/NNS
  ./.

(S
  To/TO
  draw/VB
  that/DT
  support/NN
  ,/,
  we/PRP
  must/MD
  always/RB
  be/VB
  clear/JJ
  in/IN
  our/PRP$
  principles/NNS
  and/CC
  willing/JJ
  to/TO
  act/VB
  ./.)
(S
  The/DT
  only/JJ
  alternative/NN
  to/TO
  American/JJ
  leadership/NN
  is/VBZ
  a/DT
  dramatically/RB
  more/RBR
  dangerous/JJ
  and/CC
  anxious/JJ
  world/NN
  ./.)
(S
  Yet/CC
  we/PRP
  also/RB
  choose/VBP
  to/TO
  lead/VB
  because/IN
  it/PRP
  is/VBZ
  a/DT
  privilege/NN
  to/TO
  serve/VB
  the/DT
  values/NNS
  that/WDT
  gave/VBD
  us/PRP
  birth/NN
  ./.)
(S
  American/JJ
  leaders/NNS
  --/:
  from/IN
  (Chunk Roosevelt/NNP)
  to/TO
  (Chunk Truman/NNP)
  to/TO
  (Chunk Kennedy/NNP)
  to/TO
  (Chunk Reagan/NNP)
  --/:
  rejected/VBD
  isolation/NN
  and/CC
  retreat/NN
  ,/,
  because/IN
  they/PRP
  knew/VBD
  that/IN
  (Chunk America/NNP)
  is/VBZ
  always/RB
  more/RBR
  secure/JJ
  when/WRB
  freedom/NN
  is/VBZ
  on/IN
  the/DT
  march/NN
  ./.)
(Chunk Roosevelt/NNP)
(Chunk Tru

(S
  The/DT
  retirement/NN
  of/IN
  the/DT
  baby/NN
  boom/NN
  generation/NN
  will/MD
  put/VB
  unprecedented/JJ
  strains/NNS
  on/IN
  the/DT
  federal/JJ
  government/NN
  ./.)
(S
  By/IN
  2030/CD
  ,/,
  spending/VBG
  for/IN
  (Chunk Social/NNP Security/NNP)
  ,/,
  (Chunk Medicare/NNP)
  and/CC
  (Chunk Medicaid/NNP)
  alone/RB
  will/MD
  be/VB
  almost/RB
  60/CD
  percent/NN
  of/IN
  the/DT
  entire/JJ
  federal/JJ
  budget/NN
  ./.)
(Chunk Social/NNP Security/NNP)
(Chunk Medicare/NNP)
(Chunk Medicaid/NNP)
(S
  And/CC
  that/DT
  will/MD
  present/VB
  future/JJ
  Congresses/NNS
  with/IN
  impossible/JJ
  choices/NNS
  --/:
  staggering/VBG
  tax/NN
  increases/NNS
  ,/,
  immense/JJ
  deficits/NNS
  ,/,
  or/CC
  deep/JJ
  cuts/NNS
  in/IN
  every/DT
  category/NN
  of/IN
  spending/NN
  ./.)
(S
  (Chunk Congress/NNP)
  did/VBD
  not/RB
  act/VB
  last/JJ
  year/NN
  on/IN
  my/PRP$
  proposal/NN
  to/TO
  (Chunk save/VB Social/NNP Security/NNP)
  --/:
  (/(
  applau

(S
  We/PRP
  'll/MD
  also/RB
  fund/VB
  additional/JJ
  research/NN
  in/IN
  cutting-edge/JJ
  methods/NNS
  of/IN
  producing/VBG
  ethanol/NN
  ,/,
  not/RB
  just/RB
  from/IN
  corn/NN
  ,/,
  but/CC
  from/IN
  wood/NN
  chips/NNS
  and/CC
  stalks/NNS
  ,/,
  or/CC
  switch/VB
  grass/NN
  ./.)
(S
  Our/PRP$
  goal/NN
  is/VBZ
  to/TO
  make/VB
  this/DT
  new/JJ
  kind/NN
  of/IN
  ethanol/JJ
  practical/JJ
  and/CC
  competitive/JJ
  within/IN
  six/CD
  years/NNS
  ./.)
(S (/( (Chunk Applause/NNP) ./. )/))
(Chunk Applause/NNP)
(S
  Breakthroughs/NNS
  on/IN
  this/DT
  and/CC
  other/JJ
  new/JJ
  technologies/NNS
  will/MD
  help/VB
  us/PRP
  reach/VB
  another/DT
  great/JJ
  goal/NN
  :/:
  to/TO
  replace/VB
  more/JJR
  than/IN
  75/CD
  percent/NN
  of/IN
  our/PRP$
  oil/NN
  imports/NNS
  from/IN
  the/DT
  (Chunk Middle/NNP East/NNP)
  by/IN
  2025/CD
  ./.)
(Chunk Middle/NNP East/NNP)
(S (/( (Chunk Applause/NNP) ./. )/))
(Chunk Applause/NNP)
(S
  By/IN
  applyin

(S
  Through/IN
  the/DT
  (Chunk Helping/NNP America/NNP)
  's/POS
  (Chunk Youth/NNP Initiative/NNP)
  ,/,
  we/PRP
  are/VBP
  encouraging/VBG
  caring/VBG
  adults/NNS
  to/TO
  get/VB
  involved/VBN
  in/IN
  the/DT
  life/NN
  of/IN
  a/DT
  child/NN
  --/:
  and/CC
  this/DT
  good/JJ
  work/NN
  is/VBZ
  being/VBG
  led/VBN
  by/IN
  our/PRP$
  (Chunk First/NNP Lady/NNP)
  ,/,
  (Chunk Laura/NNP Bush/NNP)
  ./.)
(Chunk Helping/NNP America/NNP)
(Chunk Youth/NNP Initiative/NNP)
(Chunk First/NNP Lady/NNP)
(Chunk Laura/NNP Bush/NNP)
(S (/( (Chunk Applause/NNP) ./. )/))
(Chunk Applause/NNP)
(S
  This/DT
  year/NN
  we/PRP
  will/MD
  add/VB
  resources/NNS
  to/TO
  encourage/VB
  young/JJ
  people/NNS
  to/TO
  stay/VB
  in/IN
  school/NN
  ,/,
  so/RB
  more/JJR
  of/IN
  (Chunk America/NNP)
  's/POS
  youth/NN
  can/MD
  raise/VB
  their/PRP$
  sights/NNS
  and/CC
  achieve/VBP
  their/PRP$
  dreams/NNS
  ./.)
(Chunk America/NNP)
(S
  A/DT
  hopeful/JJ
  society/NN
  comes/VBZ
  

This line, broken down:

1. <RB.?>* = "0 or more of any tense of adverb," followed by:

2. <VB.?>* = "0 or more of any tense of verb," followed by:

3. '<NNP>+' = "One or more proper nouns," followed by

4. '<NN>?' = "zero or one singular noun."

### Chinking 

after a lot of chunking, you have some words in your chunk you still do not want, but you have no idea how to get rid of them by chunking. You may find that chinking is your solution.

it is basically a way for you to remove a chunk from a chunk. The chunk that you remove from your chunk is your chink.

In [45]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized[5:]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            chunkGram = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT|TO>+{"""

            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)

            #chunked.draw()

    except Exception as e:
        print(str(e))

process_content()

In [46]:
# main difference is }<VB.?|IN|DT|TO>+{
#This means we're removing from the chink one or more verbs, prepositions, determiners, or the word 'to'.

### 6. Named Entity Recognition.

The idea is to have the machine immediately be able to pull out "entities" like people, places, things, locations, monetary figures, and more.

When Binary is False, it picked up the same things, but wound up splitting up terms like White House into "White" and "House" as if they were different, whereas we could see in the binary = True option, the named entity recognition was correct to say White House was part of the same named entity.

In [53]:
import nltk
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import state_union

train_text= state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    for i in tokenized[:3]:
        words= nltk.word_tokenize(i)
        tagged= nltk.pos_tag(words)
        namedEnt = nltk.ne_chunk(tagged,binary=True) # binary can be True or false
        namedEnt.draw()
        
process_content()

### 7. Lemmatizing with NLTK

Lemmitization is the stemming the word to the root word which is in the dictionary

The only major thing to note is that lemmatize takes a part of speech parameter, "pos." If not supplied, the default is "noun." This means that an attempt will be made to find the closest noun, which can create trouble for you.

In [60]:
from nltk import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("cats")) # considered as  noun
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("rocks"))
print(lemmatizer.lemmatize("python"))
print(lemmatizer.lemmatize("better", pos="a")) # when POS is passed then this is considered as adverb
print(lemmatizer.lemmatize("best", pos="a"))
print(lemmatizer.lemmatize("running",pos="v")) # pos is passed as verb

print(lemmatizer.lemmatize("run",'v')) # pos is passed as verb

cat
cactus
goose
rock
python
good
best
run
run


In [5]:
import sys
import nltk
print(nltk.__file__)


C:\Users\hemanth\Anaconda3\lib\site-packages\nltk\__init__.py


### Wordnet with NLTK

WordNet is a lexical database for the English language, which was created by Princeton, and is part of the NLTK corpus.

You can use WordNet alongside the NLTK module to find the meanings of words, synonyms, antonyms, and more. Let's cover some examples.



In [6]:
from nltk.corpus import wordnet

In [7]:
syns = wordnet.synsets("program")

In [8]:
print(syns[0].name())

plan.n.01


In [11]:
type(syns),syns

(list,
 [Synset('plan.n.01'),
  Synset('program.n.02'),
  Synset('broadcast.n.02'),
  Synset('platform.n.02'),
  Synset('program.n.05'),
  Synset('course_of_study.n.01'),
  Synset('program.n.07'),
  Synset('program.n.08'),
  Synset('program.v.01'),
  Synset('program.v.02')])

In [22]:
syns[0].lemmas()[0].name()

'plan'

In [23]:
syns[0].definition()

'a series of steps to be carried out or goals to be accomplished'

In [24]:
print(syns[0].examples())

['they drew up a six-step plan', 'they discussed plans for a new bond issue']


Next, how might we discern synonyms and antonyms to a word? The lemmas will be synonyms, and then you can use .antonyms to find the antonyms to the lemmas. As such, we can populate some lists like:

In [26]:
synonyms=[]
antonyms=[]
for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())

print(set(synonyms))
print(set(antonyms))

set()
{'bad', 'ill', 'evil', 'evilness', 'badness'}


In [27]:
#  WordNet to compare the similarity of two words

In [28]:
w1= wordnet.synset('ship.n.01')
w2= wordnet.synset('boat.n.01')
print(w1.wup_similarity(w2))

0.9090909090909091


In [29]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('car.n.01')
print(w1.wup_similarity(w2))

0.6956521739130435


In [30]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('cat.n.01')
print(w1.wup_similarity(w2))

0.32


In [31]:
type(w1)

nltk.corpus.reader.wordnet.Synset

## Text Classification with NLTK

we're going to start by trying to use the movie reviews database that is part of the NLTK corpus. From there we'll try to use words as "features" which are a part of either a positive or negative movie review. The NLTK corpus movie_reviews data set has the reviews, and they are labeled already as positive or negative. This means we can train and test with this data.

In [32]:
import nltk
import random
from nltk.corpus import movie_reviews

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

print(documents[1])

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)
print(all_words.most_common(15))
print(all_words["stupid"])

(['"', 'virus', '"', 'is', 'a', 'monster', 'movie', 'without', 'a', 'monster', '.', 'any', 'movie', 'with', 'a', 'hurdle', 'that', 'large', 'to', 'overcome', 'had', 'better', 'be', 'pretty', 'damn', 'good', 'otherwise', '.', 'sadly', ',', '"', 'virus', '"', 'does', 'not', 'deliver', ',', 'on', 'any', 'level', '.', 'the', 'movie', 'opens', 'with', 'the', 'russian', 'space', 'station', 'mir', 'about', 'to', 'transmit', 'something', '(', 'we', 'never', 'find', 'out', 'what', ')', 'to', 'a', 'big', 'boat', 'with', 'lots', 'of', 'satellites', 'on', 'it', '.', 'sudddenly', ',', 'a', 'wave', 'of', 'colorful', 'lightning', 'comes', 'flying', 'through', 'space', ',', 'and', 'winds', 'up', 'destroying', 'mir', 'and', 'using', 'it', 'to', 'transmit', 'itself', 'to', 'the', 'aforementioned', 'big', 'boat', '.', 'cut', 'to', 'seven', 'days', 'later', ',', 'we', 'meet', 'donald', 'sutherland', 'and', 'his', 'band', 'of', 'seafaring', 'vultures', '.', 'see', ',', 'they', 'spend', 'all', 'their', 'tim

In [33]:
print(documents)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [54]:
from matplotlib import pyplot as plt

In [None]:
all_words

In [60]:
all_words.most_common(15)

[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822),
 ('s', 18513),
 ('"', 17612),
 ('it', 16107),
 ('that', 15924),
 ('-', 15595)]

In [63]:
print(all_words["stupid"])

253


In [65]:
# converting words to features
import nltk
import random
from nltk.corpus import movie_reviews

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]

function that will find these top 3,000 words in our positive and negative documents, marking their presence as either positive or negative:

In [66]:
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

In [67]:
print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))



In [68]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]

In [70]:
# we have extracted all the features from the document

## Naive Bayes Classifier with NLTK

In [71]:
# set that we'll train our classifier with
training_set = featuresets[:1900]

# set that we'll test against.
testing_set = featuresets[1900:]

In [72]:
classifier= nltk.NaiveBayesClassifier.train(training_set)

In [74]:
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)

Classifier accuracy percent: 78.0


In [75]:
classifier.show_most_informative_features(15)

Most Informative Features
               atrocious = True              neg : pos    =     11.7 : 1.0
                   sucks = True              neg : pos    =     10.6 : 1.0
                bothered = True              neg : pos    =      9.7 : 1.0
                     ugh = True              neg : pos    =      9.0 : 1.0
                 frances = True              pos : neg    =      8.3 : 1.0
                  annual = True              pos : neg    =      8.3 : 1.0
                 idiotic = True              neg : pos    =      7.2 : 1.0
           unimaginative = True              neg : pos    =      7.0 : 1.0
              schumacher = True              neg : pos    =      7.0 : 1.0
                  regard = True              pos : neg    =      7.0 : 1.0
                  turkey = True              neg : pos    =      6.6 : 1.0
                  shoddy = True              neg : pos    =      6.3 : 1.0
                    mena = True              neg : pos    =      6.3 : 1.0

## Using pickle method to save the classifier 

Pickle module to go ahead and serialize our classifier object, so that all we need to do is load that file in real quick.

In [77]:
import pickle
save_classifier = open("naivebayes.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [78]:
classifier_f = open("naivebayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

## Scikit-Learn Sklearn with NLTK

In [79]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB

In [80]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MultinomialNB accuracy percent:",nltk.classify.accuracy(MNB_classifier, testing_set))

BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)
print("BernoulliNB accuracy percent:",nltk.classify.accuracy(BNB_classifier, testing_set))

MultinomialNB accuracy percent: 0.8
BernoulliNB accuracy percent: 0.78


In [81]:
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [83]:

print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

Original Naive Bayes Algo accuracy percent: 78.0
Most Informative Features
               atrocious = True              neg : pos    =     11.7 : 1.0
                   sucks = True              neg : pos    =     10.6 : 1.0
                bothered = True              neg : pos    =      9.7 : 1.0
                     ugh = True              neg : pos    =      9.0 : 1.0
                 frances = True              pos : neg    =      8.3 : 1.0
                  annual = True              pos : neg    =      8.3 : 1.0
                 idiotic = True              neg : pos    =      7.2 : 1.0
           unimaginative = True              neg : pos    =      7.0 : 1.0
              schumacher = True              neg : pos    =      7.0 : 1.0
                  regard = True              pos : neg    =      7.0 : 1.0
                  turkey = True              neg : pos    =      6.6 : 1.0
                  shoddy = True              neg : pos    =      6.3 : 1.0
                    mena 



SGDClassifier_classifier accuracy percent: 82.0
SVC_classifier accuracy percent: 74.0
LinearSVC_classifier accuracy percent: 80.0
NuSVC_classifier accuracy percent: 78.0


Combining classifier algorithms is is a common technique, done by creating a sort of voting system, where each algorithm gets one vote, and the classification that has the votes votes is the chosen one.

In [None]:
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from nltk.classify import ClassifierI
from statistics import mode


class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

#print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))

featuresets = [(find_features(rev), category) for (rev, category) in documents]
        
training_set = featuresets[:1900]
testing_set =  featuresets[1900:]

#classifier = nltk.NaiveBayesClassifier.train(training_set)

classifier_f = open("naivebayes.pickle","rb")
classifier = pickle.load(classifier_f)
classifier_f.close()




print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

##SVC_classifier = SklearnClassifier(SVC())
##SVC_classifier.train(training_set)
##print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)


voted_classifier = VoteClassifier(classifier,
                                  NuSVC_classifier,
                                  LinearSVC_classifier,
                                  SGDClassifier_classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier)

print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

print("Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:",voted_classifier.confidence(testing_set[0][0])*100)
print("Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:",voted_classifier.confidence(testing_set[1][0])*100)
print("Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:",voted_classifier.confidence(testing_set[2][0])*100)
print("Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:",voted_classifier.confidence(testing_set[3][0])*100)
print("Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:",voted_classifier.confidence(testing_set[4][0])*100)
print("Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:",voted_classifier.confidence(testing_set[5][0])*100)

Original Naive Bayes Algo accuracy percent: 90.0
Most Informative Features
               atrocious = True              neg : pos    =     11.7 : 1.0
                   sucks = True              neg : pos    =     10.6 : 1.0
                bothered = True              neg : pos    =      9.7 : 1.0
                     ugh = True              neg : pos    =      9.0 : 1.0
                 frances = True              pos : neg    =      8.3 : 1.0
                  annual = True              pos : neg    =      8.3 : 1.0
                 idiotic = True              neg : pos    =      7.2 : 1.0
           unimaginative = True              neg : pos    =      7.0 : 1.0
              schumacher = True              neg : pos    =      7.0 : 1.0
                  regard = True              pos : neg    =      7.0 : 1.0
                  turkey = True              neg : pos    =      6.6 : 1.0
                  shoddy = True              neg : pos    =      6.3 : 1.0
                    mena 



SGDClassifier_classifier accuracy percent: 86.0
