In [1]:
import nltk
nltk.download('movie_reviews')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

import random
import string

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [0]:
data = []
for category in movie_reviews.categories():
    for review in movie_reviews.fileids(category):
        data.append((movie_reviews.words(review), category))

random.shuffle(data)

In [0]:
stop_words = set(stopwords.words('english'))
stop_words.update(list(string.punctuation))

In [0]:
lemmatizer = WordNetLemmatizer()

def pos_for_lemmatizer(pos):
    if pos.startswith('J'):
        return wordnet.ADJ
    elif pos.startswith('V'):
        return wordnet.VERB
    elif pos.startswith('N'):
        return wordnet.NOUN
    elif pos.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [0]:
def clean_reviews(review_words):
    clean_review_words = []
    for word in review_words:
        if word.lower() not in stop_words:
            pos = pos_for_lemmatizer(pos_tag([word])[0][1])
            clean_word = lemmatizer.lemmatize(word, pos=pos)
            clean_review_words.append(clean_word.lower())
    return clean_review_words

In [0]:
data = [(clean_reviews(review_words), category) for review_words, category in data]

In [0]:
reviews = [" ".join(review_words) for review_words, category in data]
categories = [category for review_words, category in data]

In [0]:
x_train, x_test, y_train, y_test = train_test_split(reviews, categories)

In [0]:
count_vec = CountVectorizer(max_features=2000, ngram_range=(1,2), max_df=0.75)
x_train = count_vec.fit_transform(x_train)
x_test = count_vec.transform(x_test)

In [0]:
# Uncomment to see 'feature names' and 'X'
# print(count_vec.get_feature_names(), end="\n\n")
# print(x_train.todense(), end="\n\n")
# print(x_test.todense())

In [12]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
print("Score on training data: " + str(rfc.score(x_train, y_train)))
print("Score on testing data: " + str(rfc.score(x_test, y_test)))

Score on training data: 1.0
Score on testing data: 0.838


In [13]:
from sklearn.naive_bayes import MultinomialNB
gnb = MultinomialNB()
gnb.fit(x_train, y_train)
print("Score on training data: " + str(gnb.score(x_train, y_train)))
print("Score on testing data: " + str(gnb.score(x_test, y_test)))

Score on training data: 0.884
Score on testing data: 0.834


In [0]:
from sklearn.svm import SVC
svc = SVC(C=100, gamma=0.001)

# Used for getting max_depth
# grid = {'C' : [1e2, 5e2, 1e3, 5e3, 1e4, 5e4],
#         'gamma' : [5e-3, 1e-3, 5e-4, 1e-4]}
# grid_search = GridSearchCV(svc, grid)
# grid_search.fit(x_train, y_train)
# grid_search.best_estimator_

In [15]:
svc.fit(x_train, y_train)
print("Score on training data: " + str(svc.score(x_train, y_train)))
print("Score on testing data: " + str(svc.score(x_test, y_test)))

Score on training data: 1.0
Score on testing data: 0.83


> Accuracy of **0.838** on **RandomForestClassifier**

> Accuracy of **0.834** on **MultinomialNB**

> Accuracy of **0.83** on **SVC**