In [1]:
import sklearn as sk
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
import pandas as pd
import numpy as np
import seaborn as sns
import os

# Methods for pre-processing

In [2]:
def computeTrainScore(filename,dict):
    result = []
    with open('/Users/kdassharma1/Documents/GitHub/aclImdb/train/neg/'+filename, 'r') as file:
        data = file.read().replace('\n', '')
        data = data.replace('<br />','')
        data = data.replace('.',' ')
        data = data.replace(',',' ')
        result = data.split(" ")
    score = 0
    for word in result:
        if word.lower() in dict:
            score += dict[word.lower()]
    return score

In [3]:
def filterAndCombine(directoryNeg, directoryPos):
    outputX = []
    outputY = []
    for filename in os.listdir(directoryNeg):
        with open(''+directoryNeg+'/'+filename, 'r') as file:
            data = file.read().replace('\n', '')
            data = data.replace('<br />','')
            data = data.replace('.',' ')
            data = data.replace(',',' ')
            #result = data.split(" ")
            outputX.append(data)
            outputY.append(findRating(filename))
    for filename in os.listdir(directoryPos):
        with open(''+directoryPos+'/'+filename, 'r') as file:
            data = file.read().replace('\n', '')
            data = data.replace('<br />','')
            data = data.replace('.',' ')
            data = data.replace(',',' ')
            #result = data.split(" ")
            outputX.append(data)
            outputY.append(findRating(filename))
            
    return outputX,outputY
        

In [4]:
def findRating(filename):
    start = filename.find('_')
    end = filename.find('.')
#   return filename[start+1:end]
    if (int)(filename[start+1:end]) <= 4:
        return 0
    else:
        return 1

In [5]:
vectorizer = CountVectorizer(strip_accents= 'ascii', lowercase = True, stop_words = 'english')
# vectorizer = CountVectorizer()

list_of_reviews , list_of_ratings = filterAndCombine('/Users/kdassharma1/Documents/GitHub/aclImdb/train/neg',
                                                     '/Users/kdassharma1/Documents/GitHub/aclImdb/train/pos')
X_train = vectorizer.fit_transform(list_of_reviews)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train)
Y_train = np.asarray(list_of_ratings, dtype=np.int32)

list_of_reviews_test , list_of_ratings_test = filterAndCombine('/Users/kdassharma1/Documents/GitHub/aclImdb/test/neg',
                                                              '/Users/kdassharma1/Documents/GitHub/aclImdb/test/pos')
X_test = vectorizer.transform(list_of_reviews_test)

tfidf_transformer_test = TfidfTransformer()
X_test_tfidf = tfidf_transformer_test.fit_transform(X_test)
Y_test = np.asarray(list_of_ratings_test, dtype=np.int32)

# Multinomial Naive Bayes

In [None]:
clf_NB = MultinomialNB().fit(X_train_tfidf, Y_train)

In [None]:
predicted_NB = clf_NB.predict(X_test_tfidf)
np.mean(predicted == Y_test)

# Logistic Regression

In [6]:
clf_LR = LogisticRegression().fit(X_train_tfidf, Y_train)



In [8]:
predicted_LR = clf_LR.predict(X_test_tfidf)
np.mean(predicted_LR == Y_test)

0.88004

# Decision Tree 

In [9]:
clf_DT = DecisionTreeClassifier().fit(X_train_tfidf, Y_train)

In [10]:
predicted_DT = clf_DT.predict(X_test_tfidf)
np.mean(predicted_DT == Y_test)

0.70816

# Support Vector Machine

In [11]:
clf_SVM = LinearSVC().fit(X_train_tfidf, Y_train)

In [12]:
predicted_SVM = clf_SVM.predict(X_test_tfidf)
np.mean(predicted_SVM == Y_test)

0.86512

# Ada Boost

In [13]:
clf_AB = AdaBoostClassifier().fit(X_train_tfidf, Y_train)

In [14]:
predicted_AB = clf_AB.predict(X_test_tfidf)
np.mean(predicted_AB == Y_test)

0.80088

# Random Forest

In [None]:
clf_RF = RandomForestClassifier()().fit(X_train_tfidf, Y_train)

In [None]:
predicted_RF = clf_RF.predict(X_test_tfidf)
np.mean(predicted_RF == Y_test)