# Load the data

In [1]:
from nltk.corpus import movie_reviews
movie_reviews.categories()  #different categories are +ve and -ve.

['neg', 'pos']

In [2]:
movie_reviews.fileids()   #this returns the file ids for +ve and -ve class

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt',
 'neg/cv010_29063.txt',
 'neg/cv011_13044.txt',
 'neg/cv012_29411.txt',
 'neg/cv013_10494.txt',
 'neg/cv014_15600.txt',
 'neg/cv015_29356.txt',
 'neg/cv016_4348.txt',
 'neg/cv017_23487.txt',
 'neg/cv018_21672.txt',
 'neg/cv019_16117.txt',
 'neg/cv020_9234.txt',
 'neg/cv021_17313.txt',
 'neg/cv022_14227.txt',
 'neg/cv023_13847.txt',
 'neg/cv024_7033.txt',
 'neg/cv025_29825.txt',
 'neg/cv026_29229.txt',
 'neg/cv027_26270.txt',
 'neg/cv028_26964.txt',
 'neg/cv029_19943.txt',
 'neg/cv030_22893.txt',
 'neg/cv031_19540.txt',
 'neg/cv032_23718.txt',
 'neg/cv033_25680.txt',
 'neg/cv034_29446.txt',
 'neg/cv035_3343.txt',
 'neg/cv036_18385.txt',
 'neg/cv037_19798.txt',
 'neg/cv038_9781.txt',
 'neg/cv039_5963.txt',
 'neg/cv040_8829.txt',
 'neg/cv041_22364.txt',


In [3]:
#To get the fileids only for one class- say neg -ve
movie_reviews.fileids('neg')

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt',
 'neg/cv010_29063.txt',
 'neg/cv011_13044.txt',
 'neg/cv012_29411.txt',
 'neg/cv013_10494.txt',
 'neg/cv014_15600.txt',
 'neg/cv015_29356.txt',
 'neg/cv016_4348.txt',
 'neg/cv017_23487.txt',
 'neg/cv018_21672.txt',
 'neg/cv019_16117.txt',
 'neg/cv020_9234.txt',
 'neg/cv021_17313.txt',
 'neg/cv022_14227.txt',
 'neg/cv023_13847.txt',
 'neg/cv024_7033.txt',
 'neg/cv025_29825.txt',
 'neg/cv026_29229.txt',
 'neg/cv027_26270.txt',
 'neg/cv028_26964.txt',
 'neg/cv029_19943.txt',
 'neg/cv030_22893.txt',
 'neg/cv031_19540.txt',
 'neg/cv032_23718.txt',
 'neg/cv033_25680.txt',
 'neg/cv034_29446.txt',
 'neg/cv035_3343.txt',
 'neg/cv036_18385.txt',
 'neg/cv037_19798.txt',
 'neg/cv038_9781.txt',
 'neg/cv039_5963.txt',
 'neg/cv040_8829.txt',
 'neg/cv041_22364.txt',


In [4]:
#To get the words present inside a review file
movie_reviews.words(movie_reviews.fileids()[5])  #for 5th file

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

# Data Cleaning

### Load the data
We have the array in which we keep a tuple of words and the category for each document.

In [5]:
documents = []
for category in movie_reviews.categories():  #for each category +ve and -ve, we are going to the fileid and from this fileid, I am adding the words.
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))
        
documents[0: 5]    

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

### Shuffle the data as first we have 1000 -ve docs and then 1000 +ve docs


In [6]:
import random
random.shuffle(documents)
documents[0: 5]

[(['as', 'much', 'as', 'i', 'wanted', 'to', 'like', ...], 'pos'),
 (['reflecting', 'on', '"', 'bedazzled', ',', '"', 'a', ...], 'pos'),
 (['robocop', 'is', 'an', 'intelligent', 'science', ...], 'pos'),
 (['when', 'casting', 'the', 'key', 'part', 'of', 'the', ...], 'pos'),
 (['plot', ':', 'a', 'bunch', 'of', 'bad', 'guys', ...], 'pos')]

### Cleaning of each of these file -
Remove stopwords and do lemmatization.

In [7]:
#Changing the format of pos_tag : 

from nltk.corpus import wordnet
def get_simple_pos(tag):  #To get the format of the pos_tag taken by the lemmatizer and that returned by pos should be same.
    
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('R'):
        return wordnet.ADV
    elif tag.startswith('N'):
        return wordnet.NOUN
    
    else:
        return wordnet.NOUN

In [8]:
#Sample example

from nltk import pos_tag

w = 'better'
print(pos_tag(w))   #If we don't pass w as array, then it gets as individual characters. POS tag expects to get an array.
print(pos_tag([w]))

[('b', 'NN'), ('e', 'NN'), ('t', 'NN'), ('t', 'NN'), ('e', 'NN'), ('r', 'NN')]
[('better', 'RBR')]


In [9]:
#Creating list of stopwords
import string
from nltk.corpus import stopwords
stops = list(stopwords.words('english'))
punctuations = list(string.punctuation)  #Add punctuations to the list of stopwords.
stops += punctuations 
stops

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [10]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])   #For getting the pos, we pass the word in array, otherwise it will consider every letter of the letter individually.
            clean_words = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_words.lower())
    return output_words

In [11]:
#Creating the documents and adding a tupple with cleaned words and category of each document

documents = [(clean_review(document), category) for document, category in documents]
documents[0]

(['much',
  'want',
  'like',
  'film',
  'go',
  'pleased',
  'left',
  'first',
  'get',
  'usual',
  'line',
  'newspaper',
  'glitz',
  'like',
  'scream',
  'love',
  'wild',
  'thing',
  'campbell',
  'give',
  'stun',
  'performance',
  'twisty',
  'plot',
  'dazzle',
  'day',
  'oh',
  'yeah',
  'day',
  'newspaper',
  'us',
  'day',
  'neve',
  'campbell',
  'fall',
  'love',
  'onto',
  'serious',
  'stuff',
  'film',
  'mean',
  'nearly',
  'sarcastic',
  'way',
  'wild',
  'thing',
  'single',
  'handedly',
  'broken',
  'rule',
  'surround',
  'r',
  'rating',
  'mpaa',
  'say',
  'bad',
  'way',
  'see',
  'film',
  'may',
  'shock',
  'see',
  'however',
  'perhaps',
  'mpaa',
  'pleased',
  'film',
  'charm',
  'cast',
  'tricky',
  'plot',
  'let',
  'film',
  'slide',
  'r',
  'rating',
  'mormon',
  'bound',
  'hear',
  'uproar',
  'religious',
  'zealot',
  'warn',
  'see',
  'film',
  'trouble',
  'already',
  'see',
  'like',
  'wild',
  'thing',
  'film',
  'noir

# Building Feature Set
In nltk we want to provide the data to classifier in format of tuple containing a dictionary of all the features and the category. We use top words with max frequency in the whole dataset as our features.

In [17]:
#Make the features only for the training data, not for test data.
training_documents = documents[0: 1500]  #Taking first 1500 documents which are randomly shuffled
testing_documents = documents[1500: ]  

In [18]:
all_words = []
for doc in training_documents:
    all_words += doc[0]  #as doc is a tuple with 0th entry as the words and the first category to be the category. 

### Finding frequency distribution

In [19]:
import nltk

freq = nltk.FreqDist(all_words)  #It is a freq dist object, not an array or list. It has all the words and the freq of the words
common = freq.most_common(3000)  #returns an array of tuples gives the top 3000 words in freq.
#We only need the features/words not the freq.
features = [i[0] for i in common]  #common contains an array of tupples.

### Creating feature dictionary

In [20]:
def get_feature_dict(words):
    current_features = {}
    words_set = set(words)
    for w in features:
        current_features[w] = w in words_set   #stores true and false for every word if the word exists.
    return current_features

In [21]:
training_data = [(get_feature_dict(doc), category) for doc, category in training_documents]

In [28]:
testing_data = [(get_feature_dict(doc), category) for doc, category in testing_documents]

In [23]:
training_data[0]

({'film': True,
  'movie': True,
  'one': True,
  'make': True,
  'like': True,
  'character': True,
  'get': True,
  'see': True,
  'go': True,
  'time': True,
  'well': True,
  'scene': True,
  'even': True,
  'good': True,
  'story': True,
  'take': True,
  'would': True,
  'much': True,
  'come': True,
  'bad': True,
  'also': True,
  'life': False,
  'give': True,
  'look': True,
  'two': True,
  'first': True,
  'know': True,
  'way': True,
  'end': True,
  '--': True,
  'year': False,
  'seem': True,
  'work': True,
  'thing': True,
  'plot': True,
  'really': True,
  'say': True,
  'play': True,
  'show': True,
  'people': True,
  'little': False,
  'could': False,
  'star': False,
  'man': False,
  'love': True,
  'never': False,
  'great': False,
  'try': False,
  'performance': True,
  'director': True,
  'new': False,
  'best': True,
  'many': True,
  'actor': True,
  'big': True,
  'want': True,
  'action': False,
  'watch': True,
  'u': True,
  'find': True,
  'act': Fals

# Training the data using Naive Bayes in nltk implementation

In [29]:
from nltk import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(training_data)

nltk.classify.accuracy(classifier, testing_data)  #returns accuracy

0.752

In [30]:
classifier.show_most_informative_features()  #shows the most informative features

Most Informative Features
               ludicrous = True              neg : pos    =     19.9 : 1.0
             outstanding = True              pos : neg    =     11.8 : 1.0
                 freddie = True              neg : pos    =     11.1 : 1.0
                  prinze = True              neg : pos    =     10.4 : 1.0
                 idiotic = True              neg : pos    =      9.9 : 1.0
                  turkey = True              neg : pos    =      9.9 : 1.0
                 winslet = True              pos : neg    =      8.9 : 1.0
                   jolie = True              neg : pos    =      8.4 : 1.0
            breathtaking = True              pos : neg    =      7.8 : 1.0
                   mulan = True              pos : neg    =      7.6 : 1.0


# Using the SVC (implemented in sklearn) for the feature set created above using nltk

In [32]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier

In [33]:
svc = SVC()
classifier_sklearn = SklearnClassifier(svc)  #Pass the sklearn implemented classifier ie svc into the nltk's dummy SklearnClassifier

In [34]:
#Train the model
classifier_sklearn.train(training_data)



<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))>

In [35]:
#Predict the accuracy
nltk.classify.accuracy(classifier_sklearn, testing_data)

0.752

# Using the RandomForestClassifier (implemented in sklearn) for the feature set created above using nltk

In [36]:
from sklearn.ensemble import RandomForestClassifier

In [38]:
rfc = RandomForestClassifier()
classifier_sklearn1 = SklearnClassifier(rfc)

In [39]:
#Train the model
classifier_sklearn1.train(training_data)



<SklearnClassifier(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))>

In [40]:
#Predict the accuracy
nltk.classify.accuracy(classifier_sklearn1, testing_data)

0.674

### So we can implement any classifier created in sklearn using the feature set we created in nltk.

# Count Vectorizer 
Used to get the data in the format that sklearn requires in X and y format. Count Vectorizer does not implement cleaning, so we just need to clean our data using nltk, then we can get the required format.

In [43]:
from sklearn.feature_extraction.text import CountVectorizer

In [44]:
#Sample example - to show how CountVectorizer works
train_set = {'the sky is blue', 'the sun is very bright'}
count_vec = CountVectorizer(max_features = 3)
a = count_vec.fit_transform(train_set)
a

<2x3 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [45]:
a.todense()  #this gives the freq of the 3 words that are selected as features

matrix([[0, 1, 1],
        [1, 1, 1]], dtype=int64)

In [46]:
#to know the features selected
count_vec.get_feature_names()

['blue', 'is', 'the']

# In our movie_reviews dataset

In [47]:
documents[0]  #tuple words and the category.

(['much',
  'want',
  'like',
  'film',
  'go',
  'pleased',
  'left',
  'first',
  'get',
  'usual',
  'line',
  'newspaper',
  'glitz',
  'like',
  'scream',
  'love',
  'wild',
  'thing',
  'campbell',
  'give',
  'stun',
  'performance',
  'twisty',
  'plot',
  'dazzle',
  'day',
  'oh',
  'yeah',
  'day',
  'newspaper',
  'us',
  'day',
  'neve',
  'campbell',
  'fall',
  'love',
  'onto',
  'serious',
  'stuff',
  'film',
  'mean',
  'nearly',
  'sarcastic',
  'way',
  'wild',
  'thing',
  'single',
  'handedly',
  'broken',
  'rule',
  'surround',
  'r',
  'rating',
  'mpaa',
  'say',
  'bad',
  'way',
  'see',
  'film',
  'may',
  'shock',
  'see',
  'however',
  'perhaps',
  'mpaa',
  'pleased',
  'film',
  'charm',
  'cast',
  'tricky',
  'plot',
  'let',
  'film',
  'slide',
  'r',
  'rating',
  'mormon',
  'bound',
  'hear',
  'uproar',
  'religious',
  'zealot',
  'warn',
  'see',
  'film',
  'trouble',
  'already',
  'see',
  'like',
  'wild',
  'thing',
  'film',
  'noir

In [49]:
categories = [category for document, category in documents]
categories  #this is the y in our data

['pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',


In [52]:
#Since in the CountVectorizer we pass whole long text as array, not words as array, so we join the words inside the individual documents
text_documents = [' '.join(document) for document, category in documents]
text_documents[0]  #A very huge text

'much want like film go pleased left first get usual line newspaper glitz like scream love wild thing campbell give stun performance twisty plot dazzle day oh yeah day newspaper us day neve campbell fall love onto serious stuff film mean nearly sarcastic way wild thing single handedly broken rule surround r rating mpaa say bad way see film may shock see however perhaps mpaa pleased film charm cast tricky plot let film slide r rating mormon bound hear uproar religious zealot warn see film trouble already see like wild thing film noir 90 fill 40 sensibility yet break rule modern day taboo 90 culture style film rich color actor gorgeous people seemingly drawn gene pool place film sure good true fun watch movie like come along sort well version basic instinct bound people either love hate people want admit like see tell anyone see like wild thing begin blue bay high school one high school world teenager would love get almost student put entrance exam admittance totally base look alone cour

In [56]:
#Train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(text_documents, categories, test_size=0.33, random_state=42)

In [61]:
#NOTE : Apply the CountVectorizer only on the training data

count_vec = CountVectorizer(max_features = 2000, ngram_range = (1, 3), max_df = 0.8)  #n_grams can be 1, 2,3 and max_df means the word appearing in documents more than 80% can be ignored.
X_train_features = count_vec.fit_transform(X_train)
X_train_features.todense()  #This is the X matrix of training data

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [62]:
count_vec.get_feature_names()  #all the 2000 features

['000',
 '10',
 '100',
 '13',
 '15',
 '1996',
 '1997',
 '1998',
 '1999',
 '20',
 '30',
 '50',
 '60',
 '70',
 '80',
 '90',
 'abandon',
 'ability',
 'able',
 'absolutely',
 'academy',
 'accent',
 'accept',
 'accident',
 'accomplish',
 'achieve',
 'across',
 'act',
 'action',
 'action film',
 'action movie',
 'action scene',
 'action sequence',
 'actor',
 'actress',
 'actual',
 'actually',
 'ad',
 'adam',
 'adaptation',
 'add',
 'addition',
 'admit',
 'adult',
 'adventure',
 'affair',
 'affleck',
 'afraid',
 'african',
 'age',
 'agent',
 'ago',
 'agree',
 'agrees',
 'ahead',
 'aid',
 'aim',
 'air',
 'al',
 'alan',
 'alex',
 'alien',
 'alive',
 'allen',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'along way',
 'already',
 'also',
 'although',
 'always',
 'amaze',
 'america',
 'american',
 'among',
 'amount',
 'amuse',
 'anderson',
 'andrew',
 'angel',
 'angle',
 'angry',
 'animal',
 'animate',
 'animation',
 'annie',
 'annoy',
 'another',
 'answer',
 'anti',
 'anyone',
 'anything'

In [63]:
X_test_features = count_vec.transform(X_test) #only do transform, dont fit
X_test_features

<660x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 110713 stored elements in Compressed Sparse Row format>

# Apply the Sklearn Classifiers to above prepared dataset

In [64]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(X_train_features, y_train)
svc.score(X_test_features, y_test)



0.8151515151515152

### Remember : We have to clean the data before passing it to CountVectorizer.