In [64]:
import nltk

#### 8. Lemmatization:
Lemmatization is similar to stemming - but result in a real word which is similar/synonym to original word. Which could be very different from original word but can be looked up in a real dictionary.

In [1]:
from nltk.stem  import WordNetLemmatizer

In [4]:
lemmatizer = WordNetLemmatizer()

In [7]:
print(lemmatizer.lemmatize('cat'))
print(lemmatizer.lemmatize('better', pos='a'))#pos define the 'part of speech of text' default is n for noun
print(lemmatizer.lemmatize('best',pos='a'))
print(lemmatizer.lemmatize('run'))
print(lemmatizer.lemmatize('run',pos='v'))

cat
good
best
run
run


#### 9. Coropora

In [8]:
from nltk.corpus import gutenberg
sample = gutenberg.raw('bible-kjv.txt')

In [13]:
from nltk.tokenize import word_tokenize, sent_tokenize
sample_tok = word_tokenize(sample)
print(len(sample_tok))
print(sample_tok[5:15])

sent_tok = sent_tokenize(sample)
print(len(sent_tok))
print(sent_tok[5:15])

29812
['1:5 And God called the light Day, and the darkness he called Night.', 'And the evening and the morning were the first day.', '1:6 And God said, Let there be a firmament in the midst of the waters,\nand let it divide the waters from the waters.', '1:7 And God made the firmament, and divided the waters which were\nunder the firmament from the waters which were above the firmament:\nand it was so.', '1:8 And God called the firmament Heaven.', 'And the evening and the\nmorning were the second day.', '1:9 And God said, Let the waters under the heaven be gathered together\nunto one place, and let the dry land appear: and it was so.', '1:10 And God called the dry land Earth; and the gathering together of\nthe waters called he Seas: and God saw that it was good.', '1:11 And God said, Let the earth bring forth grass, the herb yielding\nseed, and the fruit tree yielding fruit after his kind, whose seed is\nin itself, upon the earth: and it was so.', '1:12 And the earth brought forth gras

#### 10. wordnet

In [21]:
from nltk.corpus import wordnet
syns = wordnet.synsets("bird")

In [23]:
print(syns)

[Synset('bird.n.01'), Synset('bird.n.02'), Synset('dame.n.01'), Synset('boo.n.01'), Synset('shuttlecock.n.01'), Synset('bird.v.01')]


In [35]:
print(syns[0])
print(syns[0].lemmas())
print(syns[0].lemmas()[0])
print(syns[0].lemmas()[0].name())
print(syns[0].name())
print(syns[0].definition())
print(syns[0].examples())

Synset('bird.n.01')
[Lemma('bird.n.01.bird')]
Lemma('bird.n.01.bird')
bird
bird.n.01
warm-blooded egg-laying vertebrates characterized by feathers and forelimbs modified as wings
[]


In [40]:
synonmys = []
antonyms = []
for syn in wordnet.synsets('good'):
    for l in syn.lemmas():
#         print('l: ', l)
        synonmys.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())
            
print(set(synonmys))
print(set(antonyms))
        

{'undecomposed', 'dear', 'upright', 'honest', 'soundly', 'near', 'in_force', 'unspoilt', 'effective', 'full', 'just', 'respectable', 'skillful', 'serious', 'estimable', 'honorable', 'secure', 'ripe', 'expert', 'in_effect', 'trade_good', 'goodness', 'thoroughly', 'commodity', 'safe', 'sound', 'salutary', 'unspoiled', 'beneficial', 'dependable', 'proficient', 'right', 'good', 'well', 'skilful', 'adept', 'practiced'}
{'evil', 'badness', 'bad', 'ill', 'evilness'}


#### 10.2 - to check the word similarity

In [140]:
w1 = wordnet.synset('ship.n.01') #n is for noun and 01 is for first
w2 = wordnet.synset('boat.n.01')

# comparing the semantic similarity between these words
print(w1.wup_similarity(w2))# wup - is wooh and pulmar -the people who wrote semantic similarity paper

0.9090909090909091


In [46]:
w1 = wordnet.synset('ship.n.01') #n is for noun and 01 is for first
w2 = wordnet.synset('cart.n.01')

# comparing the semantic similarity between these words
print(w1.wup_similarity(w2))# wup - is wooh and pulmar -the people who wrote semantic similarity paper

0.7272727272727273


In [47]:
w1 = wordnet.synset('ship.n.01') #n is for noun and 01 is for first
w2 = wordnet.synset('cactus.n.01')

# comparing the semantic similarity between these words
print(w1.wup_similarity(w2))# wup - is wooh and pulmar -the people who wrote semantic similarity paper

0.38095238095238093


In [45]:
w1 = wordnet.synset('ship.n.01') #n is for noun and 01 is for first
w2 = wordnet.synset('cat.n.01')

# comparing the semantic similarity between these words
print(w1.wup_similarity(w2))# wup - is wooh and pulmar -the people who wrote semantic similarity paper

0.32


#### 11 Text Classification

In [48]:
from nltk.corpus import movie_reviews


In [141]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)),category))

In [50]:
print(documents[0])

(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', 'what', "'", 's', 'the', 'deal', '?', 'watch', 'the', 'movie', 'and', '"', 'sorta', '"', 'find', 'out', '.', '.', '.', 'critique', ':', 'a', 'mind', '-', 'fuck', 'movie', 'for', 'the', 'teen', 'generation', 'that', 'touches', 'on', 'a', 'very', 'cool', 'idea', ',', 'but', 'presents', 'it', 'in', 'a', 'very', 'bad', 'package', '.', 'which', 'is', 'what', 'makes', 'this', 'review', 'an', 'even', 'harder', 'one', 'to', 'write', ',', 'since', 'i', 'generally', 'applaud', 'films', 'which', 'attempt', 'to', 'break', 'the', 'mold', ',', 'mess', 'with', 'your', 'head', 'and', 'such', '(', 'lost', 'highway', '&', 'memento', ')', ',', 'but', 'there', 'are', 'good', 'and', 'b

In [142]:
import random
random.shuffle(documents)

In [143]:
#  getting all the word in the corpora
all_words = list(map(str.lower, movie_reviews.words()))

print(type(all_words), len(all_words))

<class 'list'> 1583820


In [144]:
freq_words = nltk.FreqDist(all_words)
print(type(freq_words), len(freq_words))

<class 'nltk.probability.FreqDist'> 39768


In [145]:
print(freq_words.most_common(15))

# observe it has go punctation ans wards like and to etc in it for this case stopwords removal is REQUIRED

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]


In [146]:
print(freq_words['stupid'])
# how many times the word stupid pop up in complete corpus.

253


In [147]:
word_features = list(freq_words.keys())[:3000]

In [173]:
#  buildig the features
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

print(find_features(movie_reviews.words('neg/cv000_29416.txt')))
featuresets = [(find_features(rcv), category) for(rcv, category) in documents]



In [211]:
# divind into training and test fest

# hIGLY POSITIVE DATA - IF YOU COMMENT THE random(Documents) ie. don't shuffle the dataase
training_set = featuresets[:1900]
test_set = featuresets[1900:]

# hIGLY NEGATIVE DATA - IF YOU COMMENT THE random(Documents) ie. don't shuffle the dataase
# test_set = featuresets[:100]
# training_set = featuresets[100:]



In [212]:
# buildign the classifier
classifer = nltk.NaiveBayesClassifier.train(training_set)

In [213]:
print("Naive Bayes alog accuracy : ", nltk.classify.accuracy(classifer, test_set))

Naive Bayes alog accuracy :  0.79


In [214]:
classifer.show_most_informative_features(15)

Most Informative Features
                   sucks = True              neg : pos    =     11.3 : 1.0
                  regard = True              pos : neg    =     10.5 : 1.0
              recognizes = True              pos : neg    =      8.1 : 1.0
              schumacher = True              neg : pos    =      7.8 : 1.0
             silverstone = True              neg : pos    =      7.8 : 1.0
           unimaginative = True              neg : pos    =      7.8 : 1.0
                 idiotic = True              neg : pos    =      7.0 : 1.0
                  turkey = True              neg : pos    =      6.5 : 1.0
               atrocious = True              neg : pos    =      6.4 : 1.0
                    mena = True              neg : pos    =      6.3 : 1.0
                  shoddy = True              neg : pos    =      6.3 : 1.0
                  suvari = True              neg : pos    =      6.3 : 1.0
                   kudos = True              pos : neg    =      5.9 : 1.0

#### 14. saving  with pickling

In [215]:
#  teck of pickleing
import pickle
save_classifier = open('naviebayes.pickle','wb')
pickle.dump(classifer,save_classifier)
save_classifier.close()

In [216]:
#  loading the saved classifer
classifier_f = open('naviebayes.pickle','rb')
classifer = pickle.load(classifier_f)
classifier_f.close()

#### 15 sklearn - incorporation

In [217]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,GaussianNB, BernoulliNB
# from sklearn

In [218]:
mn = SklearnClassifier(MultinomialNB()).train(training_set)
print("Multinomial algo accuracy : ", nltk.classify.accuracy(mn, test_set))

Multinomial algo accuracy :  0.8


In [219]:
# gaussian = SklearnClassifier(GaussianNB()).train(training_set)
# print("Gaussian algo accuracy : ", nltk.classify.accuracy(gaussian, test_set))

In [220]:
bernouli = SklearnClassifier(BernoulliNB()).train(training_set)
print("Bernoulli algo accuracy : ", nltk.classify.accuracy(bernouli, test_set))

Bernoulli algo accuracy :  0.8


In [221]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC, SVC, NuSVC

In [222]:
logistic_regrs = SklearnClassifier(LogisticRegression()).train(training_set)
print("LogisticRegression algo accuracy : ", nltk.classify.accuracy(logistic_regrs, test_set))

LogisticRegression algo accuracy :  0.73


In [223]:
stocastic_gradien_descent = SklearnClassifier(SGDClassifier()).train(training_set)
print("SGDClassifier algo accuracy : ", nltk.classify.accuracy(stocastic_gradien_descent, test_set))



SGDClassifier algo accuracy :  0.72


In [224]:
linear_support_vector = SklearnClassifier(LinearSVC()).train(training_set)
print("LinearSVC algo accuracy : ", nltk.classify.accuracy(linear_support_vector, test_set))

LinearSVC algo accuracy :  0.72


In [225]:
suport_vector = SklearnClassifier(SVC()).train(training_set)
print("SVC algo accuracy : ", nltk.classify.accuracy(suport_vector, test_set))

SVC algo accuracy :  0.58


In [226]:
#  Nu you can specify the number of support vectors
nuain_support_vector = SklearnClassifier(NuSVC(nu=0.9)).train(training_set)
print("NuSVC algo accuracy : ", nltk.classify.accuracy(nuain_support_vector, test_set))

NuSVC algo accuracy :  0.66


####  16. Combining algos with a vote

In [227]:
from nltk.classify import ClassifierI
from statistics import  mode

In [228]:
class VoteClassifer(ClassifierI):
    def __init__(self, *classifiers):
        self.classifiers = classifiers
        
    def __getvotes(self, features):
        votes = []
        for classifier in self.classifiers:
            votes.append(classifier.classify(features))
        return votes
    def classify(self, features):
        votes = self.__getvotes(features)
        return mode(votes)
        
    def confidence(self, features):
        votes = self.__getvotes(features)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes/ len(votes)
        return conf
        
        

In [229]:
vote_classifier = VoteClassifer(classifer, 
                                mn, 
                                bernouli, 
                                logistic_regrs, 
                                stocastic_gradien_descent, 
                                linear_support_vector,
#                                 suport_vector, 
                                nuain_support_vector)
print("Vote classifier algo accuracy : ", nltk.classify.accuracy(vote_classifier, test_set))

Vote classifier algo accuracy :  0.79


In [230]:
print("Classification : ", vote_classifier.classify(test_set[0][0]))
print("Confidence : ", vote_classifier.confidence(test_set[0][0]))
print('\n')
print("Classification : ", vote_classifier.classify(test_set[1][0]))
print("Confidence : ", vote_classifier.confidence(test_set[1][0]))
print('\n')
print("Classification : ", vote_classifier.classify(test_set[2][0]))
print("Confidence : ", vote_classifier.confidence(test_set[2][0]))
print('\n')
print("Classification : ", vote_classifier.classify(test_set[3][0]))
print("Confidence : ", vote_classifier.confidence(test_set[3][0]))
print('\n')
print("Classification : ", vote_classifier.classify(test_set[4][0]))
print("Confidence : ", vote_classifier.confidence(test_set[4][0]))
print('\n')
print("Classification : ", vote_classifier.classify(test_set[5][0]))
print("Confidence : ", vote_classifier.confidence(test_set[5][0]))
print('\n')
print("Classification : ", vote_classifier.classify(test_set[6][0]))
print("Confidence : ", vote_classifier.confidence(test_set[6][0]))
print('\n')
print("Classification : ", vote_classifier.classify(test_set[7][0]))
print("Confidence : ", vote_classifier.confidence(test_set[7][0]))
print('\n')


Classification :  pos
Confidence :  0.7142857142857143


Classification :  neg
Confidence :  1.0


Classification :  neg
Confidence :  1.0


Classification :  neg
Confidence :  0.5714285714285714


Classification :  pos
Confidence :  0.5714285714285714


Classification :  pos
Confidence :  0.7142857142857143


Classification :  neg
Confidence :  0.8571428571428571


Classification :  neg
Confidence :  1.0




- what is a synset?
- why is lammatization useful?
- why to use lemmatization?
- what are different types of tokenizer and lemmatizers?