In [1]:
import random, nltk, pickle
from scipy.stats import mode
from statistics import mean
from nltk.tokenize import word_tokenize, sent_tokenize, WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify import ClassifierI
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC
import xgboost
%matplotlib inline

In [2]:
positive_doc = open("../Data/rt-polarity.pos", "r").read()
negative_doc = open("../Data/rt-polarity.neg", "r").read()

In [3]:
len(positive_doc)

626168

In [4]:
len(negative_doc)

612290

In [5]:
stop_words = set(stopwords.words("English"))

In [6]:
document = []
all_words = []

In [7]:
allowed_word_types = ["JJ", "JJR", "JJS", "NN", "NNS", "RB", "RBR", "RBS", "VB", "VBD", "VBG", "VBN", "VBP"]

In [8]:
tokenize = WhitespaceTokenizer()

In [9]:
for p in positive_doc.split("\n"):
    document.append((p, "pos"))

In [10]:
for n in negative_doc.split("\n"):
    document.append((n, "neg"))

In [11]:
short_pos_words = tokenize.tokenize(positive_doc)
short_neg_words = tokenize.tokenize(negative_doc)

In [12]:
for p in nltk.pos_tag(short_pos_words):
    if p[1] in allowed_word_types and p[0] not in stop_words:
        all_words.append(p[0].lower())

In [13]:
for n in nltk.pos_tag(short_neg_words):
    if n[1] in allowed_word_types and n[0] not in stop_words:
        all_words.append(n[0].lower())

In [14]:
len(all_words)

102872

In [15]:
len(document)

10664

In [16]:
all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:15000]

In [17]:
def find_features(documents):
    words = word_tokenize(documents)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

In [18]:
featuresets = [(find_features(rev), category) for (rev, category) in document]

In [19]:
for i in range(0,5,1):
    random.shuffle(featuresets)
print(len(featuresets))

10664


In [20]:
training_set = featuresets[:7000]
testing_set = featuresets[7000:]

In [21]:
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Naive Bayes Algo accuracy[%]: ", (nltk.classify.accuracy(classifier, testing_set))*100)
#x1 = (nltk.classify.accuracy(classifier, testing_set))

Naive Bayes Algo accuracy[%]:  73.08951965065502


In [22]:
BNB = SklearnClassifier(BernoulliNB())
BNB.train(training_set)
print("BernoulliNB Naive Bayes Algo accuracy[%]: ", (nltk.classify.accuracy(BNB, testing_set))*100)
#x3 = (nltk.classify.accuracy(BNB, testing_set))

BernoulliNB Naive Bayes Algo accuracy[%]:  72.8438864628821


In [23]:
LR = SklearnClassifier(LogisticRegression())
LR.train(training_set)
print("LogisticRegression Algo accuracy[%]: ", (nltk.classify.accuracy(LR, testing_set))*100)
#x4 = (nltk.classify.accuracy(LR, testing_set))



LogisticRegression Algo accuracy[%]:  73.66266375545851


In [24]:
#LinearSVC, NuSVC, SVC

In [25]:
Ada = SklearnClassifier(AdaBoostClassifier())
Ada.train(training_set)
print("Ada Boost Classifier Algo accuracy[%]: ", (nltk.classify.accuracy(Ada, testing_set))*100)
#x3 = (nltk.classify.accuracy(Ada, testing_set))

Ada Boost Classifier Algo accuracy[%]:  58.67903930131004


In [26]:
Gboost = SklearnClassifier(GradientBoostingClassifier())
Gboost.train(training_set)
print("Gradient Boosting Classifier Algo accuracy[%]: ", (nltk.classify.accuracy(Gboost, testing_set))*100)
#x3 = (nltk.classify.accuracy(Ada, testing_set))

Gradient Boosting Classifier Algo accuracy[%]:  61.98144104803494


In [None]:
Xgb = SklearnClassifier(xgboost)
Xgb.train(training_set)
print("Gradient Boosting Classifier Algo accuracy[%]: ", (nltk.classify.accuracy(Xgb, testing_set))*100)
#x3 = (nltk.classify.accuracy(Ada, testing_set))