In [8]:
import pandas as pd
from nltk.tokenize import word_tokenize
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.corpus import webtext
from autocorrect import Speller
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures
from nltk.collections import *
from autocorrect import Speller
from pattern.en import suggest

In [9]:
syns = wordnet.synsets('Jump')
print(syns)

[Synset('jump.n.01'), Synset('leap.n.02'), Synset('jump.n.03'), Synset('startle.n.01'), Synset('jump.n.05'), Synset('jump.n.06'), Synset('jump.v.01'), Synset('startle.v.02'), Synset('jump.v.03'), Synset('jump.v.04'), Synset('leap_out.v.01'), Synset('jump.v.06'), Synset('rise.v.11'), Synset('jump.v.08'), Synset('derail.v.02'), Synset('chute.v.01'), Synset('jump.v.11'), Synset('jumpstart.v.01'), Synset('jump.v.13'), Synset('leap.v.02'), Synset('alternate.v.01')]


In [10]:
print(syns[0].name())
print(syns[0].lemmas()[0].name())

jump.n.01
jump


In [11]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('brain.n.01')

print(w1.wup_similarity(w2))


0.21052631578947367


### **Spell Checker Techniques**

#### **Important steps involved**
##### ***1. fixing word lengthening***
##### ***2. perform text preprocessing***
##### ***3. perform spell fixing***

***regex pre-trained complier***

In [12]:
def remove_length(text):        
    patt = re.compile(r"(.)\1{2,}")
    return patt.sub(r"\1\1",text)

inputString = ['booook','bboookkk','cooool','ccooolll']  
for i in inputString:
    print(i,':',remove_length(text=i))


booook : book
bboookkk : bbookk
cooool : cool
ccooolll : ccooll


***spell pre-trained machine model***

In [13]:
def fixSpellingSpell(text):
    return spell(text)

spell = Speller(lang='en')    
inputString = ['mussage','survice','hte','caaaar']  
for i in inputString:
    print(i,':',fixSpellingSpell(text=i))

mussage : message
survice : service
hte : the
caaaar : aaaaaa


***suggest pre-trained machine model***

In [14]:
def fixSpellingSuggest(text):
    return suggest(text)
  
inputString = ['mussage','survice','hte','caaaaaaar']  
for i in inputString:
    print(i,':',fixSpellingSuggest(text=i))

mussage : [('message', 0.6216216216216216), ('massage', 0.3783783783783784)]
survice : [('service', 0.9253112033195021), ('survive', 0.07468879668049792)]
hte : [('the', 0.8653201565642368), ('he', 0.13408515883485067), ('ate', 0.00022706139307570876), ('hate', 0.0002162489457863893), ('hue', 0.00012974936747183358), ('te', 1.0812447289319465e-05), ('htm', 1.0812447289319465e-05)]
caaaaaaar : [('caaaaaaar', 0.0)]


In [15]:
print(suggest('ths neww abot eerth'))
print(suggest('ths'))
print(suggest('neww'))
print(suggest('abot'))
print(suggest('eerth'))


[('ths neww abot eerth', 0.0)]
[('the', 0.9481216457960644), ('this', 0.048134677581774456), ('thus', 0.002511580518665071), ('th', 0.0006042009738298049), ('thy', 0.000556812662156879), ('tis', 5.923538959115734e-05), ('tss', 1.1847077918231468e-05)]
[('new', 0.8439024390243902), ('news', 0.15609756097560976)]
[('about', 0.9966711051930759), ('cabot', 0.0019973368841544607), ('abort', 0.0006657789613848203), ('abbot', 0.0006657789613848203)]
[('earth', 0.9831932773109243), ('berth', 0.01680672268907563)]


## Text Classification Model

In [16]:
from nltk.corpus import movie_reviews

In [17]:
print('length of words: ', len(movie_reviews.words()))
print('Review Categories: ', movie_reviews.categories())
print('File Ids:',movie_reviews.fileids()[:5])

length of words:  1583820
Review Categories:  ['neg', 'pos']
File Ids: ['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt']


In [53]:
docs1 = [(list(movie_reviews.words(fileid)), category) 
        for category in movie_reviews.categories() 
        for fileid in movie_reviews.fileids(category)
        ]
df1 = pd.DataFrame(docs1)
df1[1].value_counts()

neg    1000
pos    1000
Name: 1, dtype: int64

In [56]:
print("Number of negative reviews:", len(movie_reviews.fileids(categories=['neg'])))
print("Number of positive reviews:", len(movie_reviews.fileids(categories=['pos'])))

Number of negative reviews: 1000
Number of positive reviews: 1000


#### Removing punctutations

In [18]:
text = " ".join(movie_reviews.words())

import string
text_filtered = text.translate(str.maketrans('','',string.punctuation))
print('Text length before removing punctuations : ', len(text))
print('Text length after removing punctuations : ', len(text_filtered))

Text length before removing punctuations :  7810519
Text length after removing punctuations :  7559896


#### Removing stopwords

In [19]:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

In [57]:
tokens = word_tokenize(text_filtered)

word_filtered = [w.lower() for w in tokens if w.lower() not in stopwords]

print('Text length before removing stopwords : ', len(tokens))
print('Text length after removing stopwords : ', len(word_filtered))

Text length before removing stopwords :  1337085
Text length after removing stopwords :  708475


#### Text Preprocessing
1. Dictionary of frequency

In [21]:
counter_dict = nltk.FreqDist(word_filtered)
print(counter_dict.most_common(20))
print(len(counter_dict))

[('film', 9519), ('one', 5853), ('movie', 5774), ('like', 3690), ('even', 2565), ('good', 2411), ('time', 2411), ('story', 2170), ('would', 2110), ('much', 2050), ('character', 2020), ('also', 1967), ('get', 1949), ('two', 1912), ('well', 1906), ('characters', 1859), ('first', 1836), ('see', 1749), ('way', 1693), ('make', 1642)]
39295


In [22]:
docs = [(list(movie_reviews.words(fileid)), category) 
        for category in movie_reviews.categories() 
        for fileid in movie_reviews.fileids(category)
        ]

In [23]:
print(docs[1])

(['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', 'review', 'damn', 'that', 'y2k', 'bug', '.', 'it', "'", 's', 'got', 'a', 'head', 'start', 'in', 'this', 'movie', 'starring', 'jamie', 'lee', 'curtis', 'and', 'another', 'baldwin', 'brother', '(', 'william', 'this', 'time', ')', 'in', 'a', 'story', 'regarding', 'a', 'crew', 'of', 'a', 'tugboat', 'that', 'comes', 'across', 'a', 'deserted', 'russian', 'tech', 'ship', 'that', 'has', 'a', 'strangeness', 'to', 'it', 'when', 'they', 'kick', 'the', 'power', 'back', 'on', '.', 'little', 'do', 'they', 'know', 'the', 'power', 'within', '.', '.', '.', 'going', 'for', 'the', 'gore', 'and', 'bringing', 'on', 'a', 'few', 'action', 'sequences', 'here', 'and', 'there', ',', 'virus', 'still', 'feels', 'very', 'empty', ',', 'like', 'a', 'movie', 'going', 'for', 'all', 'flash', 'and', 'no', 'substance', '.', 'we', 'don', "'", 't', 'know', 'why', 'the', 'crew', 'was', 'really', 'out', 'in', 'the', 'middle', 'of', 'nowhere', ',', 'we', 'don', "'", 't'

In [24]:
print(len(docs))

2000


In [61]:
docs[200][0]

['capsule',
 ':',
 'the',
 'much',
 'anticipated',
 're',
 '-',
 'adaptation',
 'of',
 'the',
 'pierre',
 'boulle',
 'novel',
 'comes',
 'to',
 'the',
 'screen',
 'as',
 'a',
 'dark',
 'and',
 'a',
 'little',
 'dreary',
 'film',
 'with',
 'lots',
 'of',
 'chases',
 'and',
 'fighting',
 ',',
 'but',
 'very',
 'little',
 'intelligence',
 '.',
 'visually',
 'there',
 'is',
 'much',
 'to',
 'like',
 'about',
 'this',
 'version',
 ',',
 'but',
 'the',
 'approach',
 'is',
 'to',
 'take',
 'an',
 'adventure',
 'after',
 'the',
 'style',
 'of',
 'gulliver',
 "'",
 's',
 'travels',
 'and',
 'treat',
 'it',
 'as',
 'an',
 'action',
 'film',
 '.',
 'that',
 'makes',
 'it',
 'a',
 'film',
 'without',
 'much',
 'center',
 '.',
 ',',
 '0',
 '(',
 '-',
 '4',
 'to',
 '+',
 '4',
 ')',
 'pierre',
 'boulle',
 ',',
 'author',
 'of',
 'the',
 'bridge',
 'on',
 'the',
 'river',
 'kwai',
 ',',
 'wrote',
 'planet',
 'of',
 'the',
 'apes',
 '(',
 'a',
 '.',
 'k',
 '.',
 'a',
 '.',
 'monkey',
 'planet',
 ')',
 

2. Feature Extraction

In [95]:
# word_features = [
#     w[0] for w in counter_dict.most_common(3000)
# ]
word_features = classifier1.show_most_informative_features(100)


Most Informative Features
             outstanding = True              pos : neg    =     13.4 : 1.0
               ludicrous = True              neg : pos    =     12.4 : 1.0
              refreshing = True              pos : neg    =      8.9 : 1.0
                   jolie = True              neg : pos    =      7.8 : 1.0
                   mulan = True              pos : neg    =      7.6 : 1.0
                religion = True              pos : neg    =      6.9 : 1.0
                  finest = True              pos : neg    =      6.7 : 1.0
                 idiotic = True              neg : pos    =      6.7 : 1.0
             beautifully = True              pos : neg    =      6.5 : 1.0
            breathtaking = True              pos : neg    =      6.5 : 1.0
                  welles = True              neg : pos    =      6.4 : 1.0
              schumacher = True              neg : pos    =      6.3 : 1.0
             wonderfully = True              pos : neg    =      6.2 : 1.0

In [111]:
word_features = ['ludicrous','idiotic','seagal','welles','schumacher','inept','lame','whatsoever','wasted','bland','unfunny','alicia',
       'sandler','ridiculous','random','jolie','poorly','waste','lifeless','garbage','awful','sat','laughable','bomb',
       'stupid','painfully','embarrassing','worst','pointless','bore','mess', 'anger','ecades','finest','damon','anding',
       'rfully','mulan','ifully','jesus','nation','flynt','ndship','obi','ligion','tucker','dinary','allows','tastic',
       'hanks','bowski','ripley','era','taking','eshing','soners','listic','tiller','lonely','german','gon']

len(word_features)

61

In [132]:
# Read the text file
with open('MostInformative.txt', 'r') as file:
    lines = file.readlines()
    
data = []
for line in lines:
    line = line.strip()
    if line:
#         print(line.split())
        word, _ , _ , *sentiment = line.split()
        sentiment = ' '.join(sentiment)
        sentiment, ratio = sentiment.split('=')
        sentiment = sentiment.strip()
        data.append({
            'Word': word,
            'Positive/Negative': sentiment,
            'Ratio': ratio})
    

data[0]

{'Word': 'outstanding',
 'Positive/Negative': 'pos : neg',
 'Ratio': ' 13.4 : 1.0'}

In [133]:
pos = []
neg = []
for i in data:

    print(i['Word'])

outstanding
ludicrous
refreshing
jolie
mulan
religion
finest
idiotic
beautifully
breathtaking
welles
schumacher
wonderfully
ordinary
alicia
seagal
jedi
inept
hanks
unfunny
damon
wasted
lame
ridiculous
bland
bore
sat
waste
anger
gon
awful
painfully
luckily
mature
worst
poorly
laughable
garbage
extraordinary
lebowski
memorable
badly
designer
lifeless
boring
era
mess
scorsese
random
portrayal
harris
dull
snow
terrific
nomination
pointless
notch
embarrassing
gabriel
excellent
lucas
freddie
paxton
flynt
skip
deserves
stupid
na
obi
spacey
virus
decades
snipes
court
flawed
realistic
whatsoever
tucker
touches
ripley
prinze
anywhere
subtle
breasts
eve
contemporary
endearing
masterpiece
damme
henstridge
godzilla
porn
fantastic
satisfying
allows
natural
crafted
freedom
german
tedious


In [97]:
def search_features(doc):
    words = set(doc)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

In [98]:
search_features(docs[0][0])

{'ludicrous': False,
 'idiotic': False,
 'seagal': False,
 'welles': False,
 'schumacher': False,
 'inept': False,
 'lame': False,
 'whatsoever': False,
 'wasted': False,
 'bland': False,
 'unfunny': False,
 'alicia': False,
 'sandler': False,
 'ridiculous': False,
 'random': False,
 'jolie': False,
 'poorly': False,
 'waste': False,
 'lifeless': False,
 'garbage': False,
 'awful': False,
 'sat': False,
 'laughable': False,
 'bomb': False,
 'stupid': False,
 'painfully': False,
 'embarrassing': False,
 'worst': False,
 'pointless': False,
 'bore': False,
 'mess': True,
 'anger': False,
 'ecades': False,
 'finest': False,
 'damon': False,
 'anding': False,
 'rfully': False,
 'mulan': False,
 'ifully': False,
 'jesus': False,
 'nation': False,
 'flynt': False,
 'ndship': False,
 'obi': False,
 'ligion': False,
 'tucker': False,
 'dinary': False,
 'allows': False,
 'tastic': False,
 'hanks': False,
 'bowski': False,
 'ripley': False,
 'era': False,
 'taking': False,
 'eshing': False,
 'so

In [99]:
featureset = [
    (search_features(docs), category)
    for (docs, category) in docs
]

In [100]:
featureset[0]

({'ludicrous': False,
  'idiotic': False,
  'seagal': False,
  'welles': False,
  'schumacher': False,
  'inept': False,
  'lame': False,
  'whatsoever': False,
  'wasted': False,
  'bland': False,
  'unfunny': False,
  'alicia': False,
  'sandler': False,
  'ridiculous': False,
  'random': False,
  'jolie': False,
  'poorly': False,
  'waste': False,
  'lifeless': False,
  'garbage': False,
  'awful': False,
  'sat': False,
  'laughable': False,
  'bomb': False,
  'stupid': False,
  'painfully': False,
  'embarrassing': False,
  'worst': False,
  'pointless': False,
  'bore': False,
  'mess': True,
  'anger': False,
  'ecades': False,
  'finest': False,
  'damon': False,
  'anding': False,
  'rfully': False,
  'mulan': False,
  'ifully': False,
  'jesus': False,
  'nation': False,
  'flynt': False,
  'ndship': False,
  'obi': False,
  'ligion': False,
  'tucker': False,
  'dinary': False,
  'allows': False,
  'tastic': False,
  'hanks': False,
  'bowski': False,
  'ripley': False,
  '

In [101]:
len(featureset)

2000

In [102]:
len(search_features(docs[0][0]))

61

3. Training and Test split

In [103]:
training_set = featureset[:1600]
testing_set = featureset[1600:]

In [104]:
import random

# Assuming 'my_list' is the list you want to shuffle
shuffleList = random.sample(featureset, len(featureset))

training_set1 = shuffleList[:1600]
testing_set1 = shuffleList[1600:] 

In [105]:
negcount = 0
poscount = 0
for i in range(1600):
    if training_set[i][1] == 'neg':
        negcount +=1
    else:
        poscount +=1

In [106]:
print('Total percentage of Negative reviews', (negcount/1600)*100)
print('Total percentage of Positive reviews', (poscount/1600)*100)

Total percentage of Negative reviews 62.5
Total percentage of Positive reviews 37.5


In [107]:
negcount = 0
poscount = 0
for i in range(400):
    if testing_set[i][1] == 'neg':
        negcount +=1
    else:
        poscount +=1

In [108]:
print('Total percentage of Negative reviews', (negcount/1600)*100)
print('Total percentage of Positive reviews', (poscount/1600)*100)

Total percentage of Negative reviews 0.0
Total percentage of Positive reviews 25.0


4. Training model

In [109]:
classifier1 = nltk.NaiveBayesClassifier.train(training_set)
print("Classifier's accuracy on testing is : {}".format(nltk.classify.accuracy(classifier1, testing_set)*100))
print("Classifier's accuracy on training is : {}".format(nltk.classify.accuracy(classifier1, training_set)*100))

Classifier's accuracy on testing is : 77.0
Classifier's accuracy on training is : 75.5


In [110]:
classifier1 = nltk.NaiveBayesClassifier.train(training_set1)
print("Classifier's accuracy on testing is : {}".format(nltk.classify.accuracy(classifier1, testing_set1)*100))
print("Classifier's accuracy on training is : {}".format(nltk.classify.accuracy(classifier1, training_set1)*100))

Classifier's accuracy on testing is : 75.75
Classifier's accuracy on training is : 76.3125


In [37]:
classifier2 = nltk.NaiveBayesClassifier.train(testing_set)
print("Classifier's accuracy is : {}".format(nltk.classify.accuracy(classifier2, training_set)*100))

Classifier's accuracy is : 37.5


Improving the model accuracy

Most Informative words

In [78]:
classifier1.show_most_informative_features(100)

Most Informative Features
             outstanding = True              pos : neg    =     13.4 : 1.0
               ludicrous = True              neg : pos    =     12.4 : 1.0
              refreshing = True              pos : neg    =      8.9 : 1.0
                   jolie = True              neg : pos    =      7.8 : 1.0
                   mulan = True              pos : neg    =      7.6 : 1.0
                religion = True              pos : neg    =      6.9 : 1.0
                  finest = True              pos : neg    =      6.7 : 1.0
                 idiotic = True              neg : pos    =      6.7 : 1.0
             beautifully = True              pos : neg    =      6.5 : 1.0
            breathtaking = True              pos : neg    =      6.5 : 1.0
                  welles = True              neg : pos    =      6.4 : 1.0
              schumacher = True              neg : pos    =      6.3 : 1.0
             wonderfully = True              pos : neg    =      6.2 : 1.0

Saving Model 

In [135]:
import pickle
save_classifier = open('naive_bayes_model.pkl','wb')
pickle.dump(classifier1, save_classifier)
save_classifier.close()

In [136]:
classifier_f = open('naive_bayes_model.pkl','rb')
classifier = pickle.load(classifier_f)
classifier_f.close()

In [154]:
custom_review = 'disgusting  movie'

text_filtered = custom_review.translate(str.maketrans('','',string.punctuation))
custom_review_tokens = word_tokenize(text_filtered)
word_filtered = [w.lower() for w in custom_review_tokens if w.lower() not in stopwords]
custom_review_set = search_features(word_filtered)




print(classifier.classify(custom_review_set))

pos


Vader Sentiment Analysis

In [148]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

In [171]:
sentence = 'The car is cool.'
sentence1 = 'The car is cool!!'
sentence2 = 'The car is cool!!!!!'
sentence3 = 'The car is COOL!!!!!'
sentence4 = 'THE CAR IS COOL!!'
sentence5 = 'THE CAR IS  A BIT COOL!!'
sentence6 = 'the car is very cool!!'
sentence7 = 'I enjoyed your hotel food.'
sentence8 = "I enjoyed your hotel food, but I didn't like the way waiters offer their services."

print(analyser.polarity_scores(sentence))
print(analyser.polarity_scores(sentence1))
print(analyser.polarity_scores(sentence2))
print(analyser.polarity_scores(sentence3))
print(analyser.polarity_scores(sentence4))
print(analyser.polarity_scores(sentence5))
print(analyser.polarity_scores(sentence6))
print(analyser.polarity_scores(sentence7))
print(analyser.polarity_scores(sentence8))

{'neg': 0.0, 'neu': 0.566, 'pos': 0.434, 'compound': 0.3182}
{'neg': 0.0, 'neu': 0.51, 'pos': 0.49, 'compound': 0.4374}
{'neg': 0.0, 'neu': 0.464, 'pos': 0.536, 'compound': 0.5374}
{'neg': 0.0, 'neu': 0.417, 'pos': 0.583, 'compound': 0.6371}
{'neg': 0.0, 'neu': 0.51, 'pos': 0.49, 'compound': 0.4374}
{'neg': 0.0, 'neu': 0.634, 'pos': 0.366, 'compound': 0.4374}
{'neg': 0.0, 'neu': 0.557, 'pos': 0.443, 'compound': 0.49}
{'neg': 0.0, 'neu': 0.548, 'pos': 0.452, 'compound': 0.5106}
{'neg': 0.15, 'neu': 0.73, 'pos': 0.121, 'compound': -0.1318}


Preceding Tri-gram

In [173]:
sen = 'your hotel sevice is great!'
sen1 = 'your hotel service is not that great'

print(analyser.polarity_scores(sen))
print(analyser.polarity_scores(sen1))

{'neg': 0.0, 'neu': 0.477, 'pos': 0.523, 'compound': 0.6588}
{'neg': 0.354, 'neu': 0.646, 'pos': 0.0, 'compound': -0.5096}
