In [54]:
import nltk

In [55]:
import sys
import sklearn

In [56]:
print('Python : {}'.format(sys.version))
print('nltk : {}'.format(nltk.__version__))
print('sklearn : {}'.format(sklearn.__version__))

Python : 3.5.6 |Anaconda custom (64-bit)| (default, Aug 26 2018, 16:05:27) [MSC v.1900 64 bit (AMD64)]
nltk : 3.3
sklearn : 0.20.0


## Sentence and Word tokenizing

In [57]:
from nltk.tokenize import sent_tokenize, word_tokenize
text = 'Hii there ! this is ISHANK TYAGI and I am Awesome. PE PA PA PARAPPA  '
print(sent_tokenize(text))

['Hii there !', 'this is ISHANK TYAGI and I am Awesome.', 'PE PA PA PARAPPA']


In [58]:
print(word_tokenize(text))

['Hii', 'there', '!', 'this', 'is', 'ISHANK', 'TYAGI', 'and', 'I', 'am', 'Awesome', '.', 'PE', 'PA', 'PA', 'PARAPPA']


## Removing stopwords

In [59]:
# removing stop words -useless data
from nltk.corpus import stopwords
print(set(stopwords.words('english')))

{'be', 'doesn', 'was', 'those', 'below', 'haven', 'but', 'not', 'needn', "isn't", 'him', 'few', "mightn't", 'she', 'their', 'doing', 'mustn', 'aren', "you've", 'because', 'there', "hasn't", 'than', 'won', 'why', 'should', 'o', 'down', 'so', 'my', 'is', 'they', 'mightn', 'them', 'having', 'above', 'after', 'does', "don't", 'have', 'once', 'and', 'at', 'an', 'now', 'ma', 'shouldn', "should've", "shouldn't", 'very', 'whom', 'while', 'ours', 'which', 'on', "you'll", 'it', 'has', 'here', 'his', 'more', 'from', "weren't", 'themselves', 'isn', 'a', 'of', 'll', 'am', "she's", "hadn't", 'shan', 'some', 'in', 'her', "couldn't", "wasn't", 'this', 'been', 'i', 'me', 'do', 'hadn', 'himself', 'by', "aren't", 'most', 'can', 'wasn', 'we', 'during', 'only', "needn't", 'don', 'these', 'with', "haven't", 'each', 'about', 'yourself', 'what', "it's", 'he', "mustn't", "wouldn't", 'against', 'into', 'no', "that'll", 'when', 'between', 'd', 'you', 'under', 'if', 'ourselves', 'too', 'herself', 'who', 'itself',

In [60]:
example = 'This is some sample text showing stopword filtration'
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(example)
filtered_tokens = [w for w in word_tokens if not w in stop_words]

In [61]:
print(filtered_tokens)

['This', 'sample', 'text', 'showing', 'stopword', 'filtration']


## Stemming

In [62]:
# Stemming words with NLTK -- normslizing
from nltk.stem import PorterStemmer
ps = PorterStemmer()
example_words = ['ride', 'rider', 'riding', 'rides']
stems = [ps.stem(w) for w in example_words ]

In [63]:
print(stems)

['ride', 'rider', 'ride', 'ride']


In [64]:
# Stemming an entire sentence
new_text = 'When riders are riding their horses, they often think of how cowboys rode horses'
words = word_tokenize(new_text)
stem_words = [ps.stem(w) for w in words]
print(stem_words)


['when', 'rider', 'are', 'ride', 'their', 'hors', ',', 'they', 'often', 'think', 'of', 'how', 'cowboy', 'rode', 'hors']


## POS

In [65]:
# Part of Speech Tagging
from nltk.corpus import udhr
print(udhr.raw('English-Latin1'))

Universal Declaration of Human Rights
Preamble
Whereas recognition of the inherent dignity and of the equal and inalienable rights of all members of the human family is the foundation of freedom, justice and peace in the world, 

Whereas disregard and contempt for human rights have resulted in barbarous acts which have outraged the conscience of mankind, and the advent of a world in which human beings shall enjoy freedom of speech and belief and freedom from fear and want has been proclaimed as the highest aspiration of the common people, 

Whereas it is essential, if man is not to be compelled to have recourse, as a last resort, to rebellion against tyranny and oppression, that human rights should be protected by the rule of law, 

Whereas it is essential to promote the development of friendly relations between nations, 

Whereas the peoples of the United Nations have in the Charter reaffirmed their faith in fundamental human rights, in the dignity and worth of the human person and in

## Loading text from corpora

In [66]:
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

## Training Senctence Tokenizer 

In [67]:
# training of PunktSentenceTokenizer
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

In [68]:
# using custom tokenizer to tokenize the text
tokenized = custom_sent_tokenizer.tokenize(sample_text)

In [69]:
print(tokenized)

["PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION\n \nJanuary 31, 2006\n\nTHE PRESIDENT: Thank you all.", 'Mr. Speaker, Vice President Cheney, members of Congress, members of the Supreme Court and diplomatic corps, distinguished guests, and fellow citizens: Today our nation lost a beloved, graceful, courageous woman who called America to its founding ideals and carried on a noble dream.', 'Tonight we are comforted by the hope of a glad reunion with the husband who was taken so long ago, and we are grateful for the good life of Coretta Scott King.', '(Applause.)', 'President George W. Bush reacts to applause during his State of the Union Address at the Capitol, Tuesday, Jan.', '31, 2006.', "White House photo by Eric DraperEvery time I'm invited to this rostrum, I'm humbled by the privilege, and mindful of the history we've seen together.", 'We have gathered under this Capitol dome in moments of national mourning and national achievemen

In [70]:
#define a function tht will tag each tokenized word with a part of speech
def process_content():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
            print('\n');
    except Exception as e:
        print(str(e))

In [71]:
process_content()

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]


[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('n

## Various Tag of POS in NLTK

In [72]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

## Chunking 

In [73]:
# chunking with NLTK using Regexp


train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)


def process_content():
    try:
        for i in tokenized[:10]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            # combine the part of speech tag with a regular expressio
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
                print(subtree)
            
#             chunked.draw()
    except Exception as e:
        print(str(e))

In [74]:
process_content()

(Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
(Chunk ADDRESS/NNP)
(Chunk A/NNP JOINT/NNP SESSION/NNP)
(Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
(Chunk THE/NNP UNION/NNP January/NNP)
(Chunk THE/NNP PRESIDENT/NNP)
(Chunk Thank/NNP)
(Chunk Mr./NNP Speaker/NNP)
(Chunk Vice/NNP President/NNP Cheney/NNP)
(Chunk Congress/NNP)
(Chunk Supreme/NNP Court/NNP)
(Chunk called/VBD America/NNP)
(Chunk Coretta/NNP Scott/NNP King/NNP)
(Chunk Applause/NNP)
(Chunk President/NNP George/NNP W./NNP Bush/NNP)
(Chunk State/NNP)
(Chunk Union/NNP Address/NNP)
(Chunk Capitol/NNP)
(Chunk Tuesday/NNP)
(Chunk Jan/NNP)
(Chunk White/NNP House/NNP photo/NN)
(Chunk Eric/NNP DraperEvery/NNP time/NN)
(Chunk Capitol/NNP dome/NN)
(Chunk have/VBP served/VBN America/NNP)


## Chinking 

In [75]:
#chinking  with nltk

train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)


def process_content():
    try:
        for i in tokenized[:10]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            # combine the part of speech tag with a regular expressio
            chunkGram = r"""Chunk: {<.*>+}
                                        }<VB.?|IN|DT|TO>+{"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
                print(subtree)
            
#             chunked.draw()
    except Exception as e:
        print(str(e))

In [76]:
process_content()

(Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP 'S/POS ADDRESS/NNP)
(Chunk A/NNP JOINT/NNP SESSION/NNP)
(Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
(Chunk
  THE/NNP
  UNION/NNP
  January/NNP
  31/CD
  ,/,
  2006/CD
  THE/NNP
  PRESIDENT/NNP
  :/:
  Thank/NNP
  you/PRP)
(Chunk ./.)
(Chunk
  Mr./NNP
  Speaker/NNP
  ,/,
  Vice/NNP
  President/NNP
  Cheney/NNP
  ,/,
  members/NNS)
(Chunk Congress/NNP ,/, members/NNS)
(Chunk
  Supreme/NNP
  Court/NNP
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/NNS
  :/:)
(Chunk our/PRP$ nation/NN)
(Chunk ,/, graceful/JJ ,/, courageous/JJ woman/NN who/WP)
(Chunk America/NNP)
(Chunk its/PRP$ founding/NN ideals/NNS and/CC)
(Chunk noble/JJ dream/NN ./.)
(Chunk Tonight/NN we/PRP)
(Chunk hope/NN)
(Chunk glad/JJ reunion/NN)
(Chunk husband/NN who/WP)
(Chunk so/RB long/RB ago/RB ,/, and/CC we/PRP)
(Chunk grateful/JJ)
(Chunk good/JJ life/NN)
(Chunk Coretta/NNP Scott/NNP King/NNP ./.)
(Chunk

## Named Entity Recognition

In [77]:
# Named word entity recognition like people, place, location, things

def process_content():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt = nltk.ne_chunk(tagged, binary=False)
            namedEnt.draw();
#             chunked.draw()
    except Exception as e:
        print(str(e))

In [78]:
process_content()

## Lemmatizing


In [79]:
# Lemmatizing
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("rocks"))
print(lemmatizer.lemmatize("python"))
print(lemmatizer.lemmatize("better", pos="a"))
print(lemmatizer.lemmatize("best", pos="a"))
print(lemmatizer.lemmatize("run"))
print(lemmatizer.lemmatize("run",'v'))

cat
cactus
goose
rock
python
good
best
run
run


## Wordnet 

In [80]:
# wordnet
from nltk.corpus import wordnet
syns = wordnet.synsets("program")

print(syns)

[Synset('plan.n.01'), Synset('program.n.02'), Synset('broadcast.n.02'), Synset('platform.n.02'), Synset('program.n.05'), Synset('course_of_study.n.01'), Synset('program.n.07'), Synset('program.n.08'), Synset('program.v.01'), Synset('program.v.02')]


In [81]:
print(syns[0])
print(syns[0].name())
print(syns[0].lemmas()[0])
print(syns[0].lemmas()[0].name())
print(syns[0].definition())
print(syns[0].examples())


Synset('plan.n.01')
plan.n.01
Lemma('plan.n.01.plan')
plan
a series of steps to be carried out or goals to be accomplished
['they drew up a six-step plan', 'they discussed plans for a new bond issue']


In [82]:
synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())

In [83]:
print(set(synonyms))

{'just', 'expert', 'trade_good', 'unspoiled', 'soundly', 'commodity', 'skilful', 'sound', 'good', 'right', 'honorable', 'full', 'near', 'safe', 'respectable', 'effective', 'unspoilt', 'well', 'adept', 'estimable', 'skillful', 'thoroughly', 'in_force', 'goodness', 'dear', 'serious', 'ripe', 'practiced', 'honest', 'secure', 'in_effect', 'undecomposed', 'upright', 'dependable', 'salutary', 'proficient', 'beneficial'}


In [84]:
print(set(antonyms))

{'evil', 'ill', 'bad', 'badness', 'evilness'}


In [85]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('boat.n.01')
print(w1.wup_similarity(w2))

0.9090909090909091


In [86]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('car.n.01')
print(w1.wup_similarity(w2))

0.6956521739130435


In [87]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('cat.n.01')
print(w1.wup_similarity(w2))

0.32


# Text Classification with NLTK

In [88]:
import random
import nltk
from nltk.corpus import movie_reviews

In [89]:
# build a list of documents
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)
            ]
# shuffle the documents
random.shuffle(documents)
print('Number of documents: {}'.format(len(documents)))
print('First Review: {}'.format(documents[0]))

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

print('Most common words: {}'.format(all_words.most_common(15)))
print('the word happy: {}'.format(all_words["happy"]))

Number of documents: 2000
First Review: (['mike', 'myers', ',', 'you', 'certainly', 'did', 'throw', 'us', 'a', '?', 'frickin', "'", 'bone', 'here', 'in', 'what', 'you', 'call', '?', 'the', 'biggest', 'austin', 'powers', 'adventure', 'yet', '.', "'", 'austin', 'powers', ':', 'the', 'spy', 'who', 'shagged', 'me', 'is', 'the', 'sequel', 'to', 'the', '1997', 'smash', 'comedy', 'austin', 'powers', ':', 'international', 'man', 'of', 'mystery', '.', 'many', 'are', 'skeptical', 'about', 'sequels', ',', 'saying', 'that', 'the', 'sequel', 'is', 'never', 'better', 'or', 'as', 'good', 'as', 'the', 'original', ',', 'but', 'austin', 'powers', ':', 'tswsm', 'goes', 'beyond', 'the', 'first', 'film', '.', 'austin', 'powers', ':', 'the', 'spy', 'who', 'shagged', 'me', 'stars', 'mike', 'myers', 'in', 'three', 'different', 'roles', '.', 'he', 'reprises', 'his', 'role', 'as', 'the', 'title', 'character', ',', 'austin', 'powers', ',', 'the', 'shagadelic', 'spy', 'whose', 'body', 'was', 'frozen', 'in', '1967

Most common words: [(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]
the word happy: 215


In [90]:
print(len(all_words))

39768


In [91]:
# using the first 4000 most common words as features
word_features = list(all_words.keys())[:4000]

In [92]:
# a function to find features that will determine which of the 4000 word features are contained in review
def find_features(document):
    words = set(document)
    features = {}
    
    for w in word_features:
        features[w] = (w in words)
    return features

#using an example from a negative review
features = find_features(movie_reviews.words('neg/cv000_29416.txt'))
for key, value in features.items():
    if value == True:
        print(key)

away
completely
deal
given
would
here
4
to
be
so
start
girlfriend
offering
hide
fed
find
entire
happen
when
world
my
they
with
visions
life
also
bit
street
sense
entertaining
showing
neighborhood
since
don
figured
your
sitting
break


In [93]:
print(features)

{'loot': False, 'singularly': False, 'polemic': False, 'reinvents': False, 'secondhand': False, 'replaceable': False, 'forlorn': False, 'richard': False, 'inspite': False, 'fleshing': False, 'defends': False, 'dormitory': False, 'attentive': False, 'somersets': False, 'kennif': False, 'huckleberry': False, 'pronged': False, 'mammals': False, 'hierarchy': False, 'tsi': False, 'kinda': False, 'brinks': False, 'gunton': False, 'misfits': False, 'exhibits': False, 'prefecture': False, 'rumba': False, 'ventured': False, 'hazy': False, 'lowest': False, 'vat': False, 'intersperesed': False, 'punctuated': False, 'authors': False, 'tenderness': False, 'supreme': False, 'selfishly': False, 'cower': False, 'stations': False, 'dalton': False, 'gittin': False, 'comepete': False, 'openings': False, 'superbowl': False, 'ireland': False, 'trimming': False, 'spirit': False, 'awfully': False, 'gallantry': False, 'experiment': False, 'caters': False, 'awakes': False, 'babe': False, 'dreyer': False, 'flav

In [94]:
# for all documents
featuresets = [(find_features(rev), category) for (rev, category) in documents]


In [95]:
# train-test split of featureset using sklearn
from sklearn import model_selection
# define a seed for reproducibility
seed = 1

# spliting dataset
train, test = model_selection.train_test_split(featuresets, test_size = 0.4, random_state=seed)

In [96]:
print(len(train))
print(len(test))

1200
800


In [97]:
# using sklearn algorithms in NLTK
from nltk.classify.scikitlearn import  SklearnClassifier
from sklearn.svm import SVC

In [98]:
model = SklearnClassifier(SVC(kernel = 'linear'))

In [99]:
# training 
model.train(train)

<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))>

In [100]:
# testing 
accuracy = nltk.classify.accuracy(model, test)
print('SVC Accuracy: {}'.format(accuracy))

SVC Accuracy: 0.62625
