In [69]:
import pandas as pd
from nltk.tokenize import word_tokenize
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.corpus import webtext
from autocorrect import Speller
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures
from nltk.collections import *
from autocorrect import Speller
from pattern.en import suggest

In [6]:
syns = wordnet.synsets('Jump')
print(syns)

[Synset('jump.n.01'), Synset('leap.n.02'), Synset('jump.n.03'), Synset('startle.n.01'), Synset('jump.n.05'), Synset('jump.n.06'), Synset('jump.v.01'), Synset('startle.v.02'), Synset('jump.v.03'), Synset('jump.v.04'), Synset('leap_out.v.01'), Synset('jump.v.06'), Synset('rise.v.11'), Synset('jump.v.08'), Synset('derail.v.02'), Synset('chute.v.01'), Synset('jump.v.11'), Synset('jumpstart.v.01'), Synset('jump.v.13'), Synset('leap.v.02'), Synset('alternate.v.01')]


In [13]:
print(syns[0].name())
print(syns[0].lemmas()[0].name())

jump.n.01
jump


In [16]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('brain.n.01')

print(w1.wup_similarity(w2))


0.21052631578947367


### **Spell Checker Techniques**

#### **Important steps involved**
##### ***1. fixing word lengthening***
##### ***2. perform text preprocessing***
##### ***3. perform spell fixing***

***regex pre-trained complier***

In [40]:
def remove_length(text):        
    patt = re.compile(r"(.)\1{2,}")
    return patt.sub(r"\1\1",text)

inputString = ['booook','bboookkk','cooool','ccooolll']  
for i in inputString:
    print(i,':',remove_length(text=i))


booook : book
bboookkk : bbookk
cooool : cool
ccooolll : ccooll


***spell pre-trained machine model***

In [68]:
def fixSpellingSpell(text):
    return spell(text)

spell = Speller(lang='en')    
inputString = ['mussage','survice','hte','caaaar']  
for i in inputString:
    print(i,':',fixSpellingSpell(text=i))

mussage : message
survice : service
hte : the
caaaar : aaaaaa


***suggest pre-trained machine model***

In [73]:
def fixSpellingSuggest(text):
    return suggest(text)
  
inputString = ['mussage','survice','hte','caaaaaaar']  
for i in inputString:
    print(i,':',fixSpellingSuggest(text=i))

mussage : [('message', 0.6216216216216216), ('massage', 0.3783783783783784)]
survice : [('service', 0.9253112033195021), ('survive', 0.07468879668049792)]
hte : [('the', 0.8653201565642368), ('he', 0.13408515883485067), ('ate', 0.00022706139307570876), ('hate', 0.0002162489457863893), ('hue', 0.00012974936747183358), ('te', 1.0812447289319465e-05), ('htm', 1.0812447289319465e-05)]
caaaaaaar : [('caaaaaaar', 0.0)]


In [78]:
print(suggest('ths neww abot eerth'))
print(suggest('ths'))
print(suggest('neww'))
print(suggest('abot'))
print(suggest('eerth'))


[('ths neww abot eerth', 0.0)]
[('the', 0.9481216457960644), ('this', 0.048134677581774456), ('thus', 0.002511580518665071), ('th', 0.0006042009738298049), ('thy', 0.000556812662156879), ('tis', 5.923538959115734e-05), ('tss', 1.1847077918231468e-05)]
[('new', 0.8439024390243902), ('news', 0.15609756097560976)]
[('about', 0.9966711051930759), ('cabot', 0.0019973368841544607), ('abort', 0.0006657789613848203), ('abbot', 0.0006657789613848203)]
[('earth', 0.9831932773109243), ('berth', 0.01680672268907563)]


## Text Classification Model

In [103]:
from nltk.corpus import movie_reviews

In [118]:
print('length of words: ', len(movie_reviews.words()))
print('Review Categories: ', movie_reviews.categories())
print('File Ids:',movie_reviews.fileids()[:5])

length of words:  1583820
Review Categories:  ['neg', 'pos']
File Ids: ['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt']


#### Removing punctutations

In [105]:
text = " ".join(movie_reviews.words())

import string
text_filtered = text.translate(str.maketrans('','',string.punctuation))
print('Text length before removing punctuations : ', len(text))
print('Text length after removing punctuations : ', len(text_filtered))

Text length before removing punctuations :  7810519
Text length after removing punctuations :  7559896


#### Removing stopwords

In [107]:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

In [108]:
tokens = word_tokenize(text_filtered)

word_filtered = [w.lower() for w in tokens if w not in stopwords]

print('Text length before removing stopwords : ', len(tokens))
print('Text length after removing stopwords : ', len(word_filtered))

Text length before removing stopwords :  1337085
Text length after removing stopwords :  708475


#### Text Preprocessing
1. Dictionary of frequency

In [110]:
counter_dict = nltk.FreqDist(word_filtered)
print(counter_dict.most_common(20))
print(len(counter_dict))

[('film', 9519), ('one', 5853), ('movie', 5774), ('like', 3690), ('even', 2565), ('good', 2411), ('time', 2411), ('story', 2170), ('would', 2110), ('much', 2050), ('character', 2020), ('also', 1967), ('get', 1949), ('two', 1912), ('well', 1906), ('characters', 1859), ('first', 1836), ('see', 1749), ('way', 1693), ('make', 1642)]
39295


In [112]:
docs = [(list(movie_reviews.words(fileid)), category) 
        for category in movie_reviews.categories() 
        for fileid in movie_reviews.fileids(category)
        ]

In [115]:
print(docs[1])

(['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', 'review', 'damn', 'that', 'y2k', 'bug', '.', 'it', "'", 's', 'got', 'a', 'head', 'start', 'in', 'this', 'movie', 'starring', 'jamie', 'lee', 'curtis', 'and', 'another', 'baldwin', 'brother', '(', 'william', 'this', 'time', ')', 'in', 'a', 'story', 'regarding', 'a', 'crew', 'of', 'a', 'tugboat', 'that', 'comes', 'across', 'a', 'deserted', 'russian', 'tech', 'ship', 'that', 'has', 'a', 'strangeness', 'to', 'it', 'when', 'they', 'kick', 'the', 'power', 'back', 'on', '.', 'little', 'do', 'they', 'know', 'the', 'power', 'within', '.', '.', '.', 'going', 'for', 'the', 'gore', 'and', 'bringing', 'on', 'a', 'few', 'action', 'sequences', 'here', 'and', 'there', ',', 'virus', 'still', 'feels', 'very', 'empty', ',', 'like', 'a', 'movie', 'going', 'for', 'all', 'flash', 'and', 'no', 'substance', '.', 'we', 'don', "'", 't', 'know', 'why', 'the', 'crew', 'was', 'really', 'out', 'in', 'the', 'middle', 'of', 'nowhere', ',', 'we', 'don', "'", 't'

In [116]:
print(len(docs))

2000


2. Feature Extraction

In [119]:
word_features = [
    w[0] for w in counter_dict.most_common(3000)
]

In [120]:
def search_features(doc):
    words = set(doc)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

In [121]:
search_features(word_features)

{'film': True,
 'one': True,
 'movie': True,
 'like': True,
 'even': True,
 'good': True,
 'time': True,
 'story': True,
 'would': True,
 'much': True,
 'character': True,
 'also': True,
 'get': True,
 'two': True,
 'well': True,
 'characters': True,
 'first': True,
 'see': True,
 'way': True,
 'make': True,
 'life': True,
 'really': True,
 'films': True,
 'plot': True,
 'little': True,
 'people': True,
 'could': True,
 'scene': True,
 'bad': True,
 'man': True,
 'never': True,
 'best': True,
 'new': True,
 'scenes': True,
 'many': True,
 'director': True,
 'know': True,
 'movies': True,
 'action': True,
 'great': True,
 'another': True,
 'love': True,
 'go': True,
 'made': True,
 'us': True,
 'big': True,
 'end': True,
 'something': True,
 'back': True,
 'still': True,
 'world': True,
 'seems': True,
 'work': True,
 'makes': True,
 'however': True,
 'every': True,
 'though': True,
 'better': True,
 'real': True,
 'audience': True,
 'enough': True,
 'seen': True,
 'take': True,
 'aroun

In [123]:
featureset = [
    (search_features(docs), category)
    for (docs, category) in docs
]

In [124]:
featureset[0]

({'film': True,
  'one': True,
  'movie': True,
  'like': True,
  'even': True,
  'good': True,
  'time': False,
  'story': False,
  'would': True,
  'much': False,
  'character': True,
  'also': True,
  'get': True,
  'two': True,
  'well': True,
  'characters': True,
  'first': False,
  'see': True,
  'way': True,
  'make': True,
  'life': True,
  'really': True,
  'films': True,
  'plot': True,
  'little': True,
  'people': True,
  'could': False,
  'scene': False,
  'bad': True,
  'man': False,
  'never': False,
  'best': False,
  'new': True,
  'scenes': True,
  'many': False,
  'director': True,
  'know': True,
  'movies': True,
  'action': False,
  'great': False,
  'another': False,
  'love': False,
  'go': True,
  'made': False,
  'us': True,
  'big': True,
  'end': False,
  'something': False,
  'back': True,
  'still': True,
  'world': True,
  'seems': True,
  'work': False,
  'makes': True,
  'however': False,
  'every': True,
  'though': False,
  'better': False,
  'real':

In [125]:
len(search_features(docs[0][0]))

3000

3. Training and Test split

In [126]:
training_set = featureset[:1600]
testing_set = featureset[1600:]

In [154]:
negcount = 0
poscount = 0
for i in range(1600):
    if training_set[i][1] == 'neg':
        negcount +=1
    else:
        poscount +=1

In [155]:
print('Total percentage of Negative reviews', (negcount/1600)*100)
print('Total percentage of Positive reviews', (poscount/1600)*100)

Total percentage of Negative reviews 62.5
Total percentage of Positive reviews 37.5


In [156]:
negcount = 0
poscount = 0
for i in range(400):
    if testing_set[i][1] == 'neg':
        negcount +=1
    else:
        poscount +=1

In [157]:
print('Total percentage of Negative reviews', (negcount/1600)*100)
print('Total percentage of Positive reviews', (poscount/1600)*100)

Total percentage of Negative reviews 0.0
Total percentage of Positive reviews 25.0


4. Training model

In [165]:
classifier1 = nltk.NaiveBayesClassifier.train(training_set)
print("Classifier's accuracy on testing is : {}".format(nltk.classify.accuracy(classifier1, testing_set)*100))
print("Classifier's accuracy on training is : {}".format(nltk.classify.accuracy(classifier1, training_set)*100))

Classifier's accuracy on testing is : 73.75
Classifier's accuracy on training is : 90.25


In [164]:
classifier2 = nltk.NaiveBayesClassifier.train(testing_set)
print("Classifier's accuracy is : {}".format(nltk.classify.accuracy(classifier2, training_set)*100))

Classifier's accuracy is : 37.5
