In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk

In [2]:
sample_text = "Does it really work? Lets see."

In [3]:
sent_tokenize(sample_text)

['Does it really work?', 'Lets see.']

In [4]:
words = word_tokenize(sample_text)
print(words)
from nltk import pos_tag
pos_tag(words)
#pos means part of speech
#VB means verb, NN means noun, etc.

['Does', 'it', 'really', 'work', '?', 'Lets', 'see', '.']


[('Does', 'VBZ'),
 ('it', 'PRP'),
 ('really', 'RB'),
 ('work', 'VB'),
 ('?', '.'),
 ('Lets', 'NNS'),
 ('see', 'VBP'),
 ('.', '.')]

In [5]:
from nltk.corpus import stopwords #we will use it to remove stop words from our text

In [6]:
stop = stopwords.words('english') # Language specify krni padegi because isme bhaut sari languages hoti hai
stop[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [7]:
# removing stopwords
clean_words = [w for w in words if not w.lower() in stop]
clean_words

['really', 'work', '?', 'Lets', 'see', '.']

In [8]:
#words like playing, played etc should be converted to their root words i.e. happy, play
#we use porter stemmer in nltk.stem in order to do that
stem_words = ['play','played','playing','player','happier']
from nltk.stem import PorterStemmer
ps = PorterStemmer()
for w in stem_words:
    print(ps.stem(w))
#ye itna bhi smart nhi hai. we will see a better option

play
play
play
player
happier


In [9]:
# from nltk.test import state_union

In [10]:
# better option
from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()
lem.lemmatize('playing', pos = 'v') #pos means part of speech #v means verb
# similarly n means noun, a means adjective

'play'

In [11]:
from nltk.corpus import wordnet

In [12]:
wordnet.synsets('respect')
#ye mujhe synonyms laa kr de deda

[Synset('respect.n.01'),
 Synset('esteem.n.01'),
 Synset('respect.n.03'),
 Synset('deference.n.01'),
 Synset('obedience.n.03'),
 Synset('regard.n.06'),
 Synset('deference.n.02'),
 Synset('respect.v.01'),
 Synset('respect.v.02')]

In [13]:
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("boat.n.01")
w1.wup_similarity(w2)

0.9090909090909091

# Working with Movie reviews dataset

In [14]:
from nltk.corpus import movie_reviews

In [15]:
len(movie_reviews.fileids())
#len(movie_reviews.fileids('pos')) #this will give all the positive reviews
# movie_reviews.words(movie_reviews.fileids()[5])
# movie_reviews.fileids('pos') # will give the file ids of all the files of positive category
# movie_reviews.fileids('neg') # will give the file ids of all the file of negative category

2000

In [16]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))

documents[:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [17]:
import random
random.seed(0) #har baar same result aana chahiye warna result alag alag aaega
random.shuffle(documents) 
#this will shuffle the entire document so now we can take the first 1500 as the training data and keep the rest as testing data

In [18]:
training_documents = documents[:1500]
testing_documents = documents[1500:len(documents)]

In [19]:
import string
a = '1234'
punctuation = list(string.punctuation)
all_words = []
stop = stop+punctuation
#this will add punctuation in our stopwords list\

In [20]:
for doc in training_documents:
    for w in doc[0]:
        if w.lower() not in stop:
            all_words.append(w.lower())

In [21]:
# len(pos)

In [22]:
dist = nltk.FreqDist(all_words)
features = dist.most_common(3000)#taking the first most common 3000 words
feature_words = [i[0] for i in features]
feature_words[:10]

['film',
 'one',
 'movie',
 'like',
 'even',
 'time',
 'good',
 'story',
 'would',
 'much']

In [23]:
def get_features(document):
    words = set(document) #creates a set of words and a set will not contain duplicate elements
    features = {}
    distribution = nltk.FreqDist(words)
    for w in feature_words:
        if w in words:
            features[w] = distribution[w]
    return features

In [24]:
training_data = [(get_features(a), b) for (a, b) in training_documents]
testing_data = [(get_features(a), b) for (a, b) in testing_documents]

In [25]:
classifier = nltk.NaiveBayesClassifier.train(training_data)

In [26]:
classifier.show_most_informative_features(10)

Most Informative Features
             outstanding = 1                 pos : neg    =     21.7 : 1.0
                  castle = 1                 pos : neg    =     11.1 : 1.0
                  alicia = 1                 neg : pos    =     10.9 : 1.0
                   anger = 1                 pos : neg    =      8.3 : 1.0
                  prinze = 1                 neg : pos    =      8.2 : 1.0
                   waste = 1                 neg : pos    =      7.9 : 1.0
                   mulan = 1                 pos : neg    =      7.8 : 1.0
                     era = 1                 pos : neg    =      7.6 : 1.0
                   inept = 1                 neg : pos    =      7.6 : 1.0
              schumacher = 1                 neg : pos    =      7.6 : 1.0


In [27]:
nltk.classify.accuracy(classifier, testing_data)

0.618

In [28]:
#agar hum ye data sklearn ke kisi classifier ko dena chahe then hume data ka form change krna padega
#nltk mein ek tareeka hai jisse hume ye nhi krna padega
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

In [29]:
classifier_sklearn = SklearnClassifier(SVC())
classifier_sklearn.train(training_data)

<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))>

In [30]:
nltk.classify.accuracy(classifier_sklearn, testing_data)

0.752

In [31]:
from sklearn.ensemble import RandomForestClassifier
classifier_rf = SklearnClassifier(RandomForestClassifier())
classifier_rf.train(training_data)

<SklearnClassifier(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))>

In [32]:
nltk.classify.accuracy(classifier_rf, testing_data)

0.676

In [33]:
from sklearn.naive_bayes import MultinomialNB
classifier_mnb = SklearnClassifier(MultinomialNB())
classifier_mnb.train(training_data)

<SklearnClassifier(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))>

In [34]:
nltk.classify.accuracy(classifier_mnb, testing_data)

0.83

# Making some impovements

In [50]:
def get_mapping(tag):
    if tag.startswith('NN'):
        return wordnet.NOUN
    elif tag.startswith('JJ'):
        return wordnet.ADJ
    elif tag.startswith('RB'):
        return wordnet.ADV
    elif tag.startswith('VB'):
        return wordnet.VERB
    else:
        return 'n'

In [51]:
def clean_data(document):
    cleaned_document = []
    lem = WordNetLemmatizer()
    tags = pos_tag(document)
    for i in range(len(document)):
        word = document[i]
        tag = get_mapping(tags[i][1])
        if word not in stop:
            cleaned_document.append(lem.lemmatize(word, tag))
    return cleaned_document

In [52]:
cleaned_documents = []
for i in range(len(documents)):
    cleaned_documents.append((clean_data(documents[i][0]), documents[i][1]))

In [53]:
training_documents = cleaned_documents[:1500]
testing_documents = cleaned_documents[1500:len(cleaned_documents)]

In [54]:
all_words = []
for doc in training_documents:
    for w in doc[0]:
        if w.lower() not in stop:
            all_words.append(w.lower())

In [55]:
dist = nltk.FreqDist(all_words)
features = dist.most_common(3000)#taking the first most common 3000 words
feature_words = [i[0] for i in features]
feature_words[:10]

['film',
 'movie',
 'one',
 'make',
 'like',
 'character',
 'get',
 'see',
 'go',
 'time']

In [56]:
def get_features(document):
    words = set(document) #creates a set of words and a set will not contain duplicate elements
    features = {}
    distribution = nltk.FreqDist(words)
    for w in feature_words:
        if w in words:
            features[w] = distribution[w]
        else: 
            features[w] = 0
    return features

In [57]:
training_data = [(get_features(a), b) for (a, b) in training_documents]
testing_data = [(get_features(a), b) for (a, b) in testing_documents]

In [58]:
classifier = nltk.NaiveBayesClassifier.train(training_data)

In [59]:
classifier.show_most_informative_features(10)

Most Informative Features
             outstanding = 1                 pos : neg    =     21.7 : 1.0
                  castle = 1                 pos : neg    =     13.8 : 1.0
               ludicrous = 1                 neg : pos    =     11.2 : 1.0
                  alicia = 1                 neg : pos    =     10.9 : 1.0
               stupidity = 1                 neg : pos    =     10.1 : 1.0
                  prinze = 1                 neg : pos    =      8.2 : 1.0
                   mulan = 1                 pos : neg    =      7.8 : 1.0
                     era = 1                 pos : neg    =      7.6 : 1.0
                   inept = 1                 neg : pos    =      7.6 : 1.0
              schumacher = 1                 neg : pos    =      7.6 : 1.0


In [60]:
nltk.classify.accuracy(classifier, testing_data)

0.788

In [61]:
classifier_mnb = SklearnClassifier(MultinomialNB())
classifier_mnb.train(training_data)

<SklearnClassifier(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))>

In [62]:
nltk.classify.accuracy(classifier_mnb, testing_data)

0.804

In [63]:
classifier_rf = SklearnClassifier(RandomForestClassifier())
classifier_rf.train(training_data)

<SklearnClassifier(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))>

In [64]:
nltk.classify.accuracy(classifier_rf, testing_data)

0.672