In [None]:
import sklearn as sk
from sklearn.naive_bayes import MultinomialNB
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
import pandas as pd
import numpy as np
import seaborn as sns
import os
import nltk
from nltk.stem.snowball import SnowballStemmer
# Fetch the NLTK 'words' and 'stopwords' resources at import time.
# NOTE(review): presumably these back the optional (currently commented-out)
# word-filtering and SnowballStemmer(ignore_stopwords=True) experiments — confirm.
nltk.download('words')
nltk.download('stopwords')

# Methods for pre-processing

In [None]:
def filterAndCombine(directoryNeg, directoryPos, stopping_words):
    """Load, normalise and label the reviews stored in two directories.

    Every regular file in ``directoryNeg`` is read as a negative review
    (label 0) and every regular file in ``directoryPos`` as a positive review
    (label 1). Negative reviews come first in the output, and files inside
    each directory are visited in sorted name order so the result is
    reproducible across runs and platforms.

    Normalisation applied to each review:
      * newlines and ``<br />`` markers become a single space, so the words
        on either side of a line break are not fused into one token;
      * every character other than an ASCII letter, digit or space is removed.

    Args:
        directoryNeg: Path of the directory holding negative reviews.
        directoryPos: Path of the directory holding positive reviews.
        stopping_words: Currently unused; kept so existing callers keep
            working.

    Returns:
        A ``(outputX, outputY)`` tuple: the list of cleaned review strings
        and the parallel list of integer labels (0 = negative, 1 = positive).

    Raises:
        OSError: If a directory cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a review file is not valid UTF-8.
    """
    outputX = []
    outputY = []
    for directory, label in ((directoryNeg, 0), (directoryPos, 1)):
        # os.listdir() returns entries in arbitrary order; sort for determinism.
        for filename in sorted(os.listdir(directory)):
            path = os.path.join(directory, filename)
            # Skip sub-directories and other non-file entries instead of crashing.
            if not os.path.isfile(path):
                continue
            # Explicit encoding: the platform default is not UTF-8 everywhere.
            with open(path, 'r', encoding='utf-8') as file:
                data = file.read()
            # Replace breaks with a space (not '') so "end.<br />Next" does not
            # collapse into the single token "endNext".
            data = data.replace('\n', ' ').replace('<br />', ' ')
            data = re.sub(r'[^A-Za-z0-9 ]+', '', data)
            outputX.append(data)
            outputY.append(label)

    return outputX, outputY
    

In [None]:
def filterAndCombineTest(directoryNeg, directoryPos):
    """Load, normalise and label the held-out reviews stored in two directories.

    Every regular file in ``directoryNeg`` is read as a negative review
    (label 0) and every regular file in ``directoryPos`` as a positive review
    (label 1). Negative reviews come first, and files inside each directory
    are visited in sorted name order so the result is reproducible.

    The text is normalised rather than returned raw, so that a vectorizer
    fitted on cleaned training text tokenises the evaluation text the same
    way: newlines and ``<br />`` markers become a space, and every character
    that is not an ASCII letter, digit or space is removed (the same
    character filter ``filterAndCombine`` applies to the training reviews).

    Args:
        directoryNeg: Path of the directory holding negative reviews.
        directoryPos: Path of the directory holding positive reviews.

    Returns:
        A ``(outputX, outputY)`` tuple: the list of cleaned review strings
        and the parallel list of integer labels (0 = negative, 1 = positive).

    Raises:
        OSError: If a directory cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a review file is not valid UTF-8.
    """
    outputX = []
    outputY = []
    for directory, label in ((directoryNeg, 0), (directoryPos, 1)):
        # os.listdir() returns entries in arbitrary order; sort for determinism.
        for filename in sorted(os.listdir(directory)):
            path = os.path.join(directory, filename)
            # Skip sub-directories and other non-file entries instead of crashing.
            if not os.path.isfile(path):
                continue
            # Explicit encoding: the platform default is not UTF-8 everywhere.
            with open(path, 'r', encoding='utf-8') as file:
                data = file.read()
            # Breaks become a space so adjacent words stay separate tokens.
            data = data.replace('\n', ' ').replace('<br />', ' ')
            data = re.sub(r'[^A-Za-z0-9 ]+', '', data)
            outputX.append(data)
            outputY.append(label)

    return outputX, outputY
        

In [None]:
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer additionally stems every token.

    Tokenisation, lower-casing, stop-word removal and n-gram generation are
    delegated to the parent class; each resulting token is then reduced to
    its stem with NLTK's English Snowball stemmer.
    """

    def build_analyzer(self):
        """Return a callable that maps a document to its list of stemmed tokens.

        The stemmer is created here instead of being read from a module-level
        ``stemmer`` variable, so the class works even when no such global has
        been defined (previously that raised ``NameError`` on first use).
        ``ignore_stopwords=True`` relies on the NLTK ``stopwords`` corpus
        being installed.
        """
        analyzer = super().build_analyzer()
        english_stemmer = SnowballStemmer("english", ignore_stopwords=True)
        return lambda doc: [english_stemmer.stem(w) for w in analyzer(doc)]

# Feature extraction

In [None]:
# Bag-of-words counts on lower-cased text with scikit-learn's English stop words removed.
vectorizer = CountVectorizer(lowercase = True, stop_words = 'english')
# vectorizer = CountVectorizer()
# stemmer = SnowballStemmer("english", ignore_stopwords=True)
# vectorizer = StemmedCountVectorizer(stop_words='english')

# Root of the extracted aclImdb dataset. Set the ACLIMDB_DIR environment
# variable to run on another machine; the default keeps the original location.
ACLIMDB_DIR = os.environ.get('ACLIMDB_DIR', '/Users/kdassharma1/Documents/GitHub/aclImdb')

# Function-word list: drop the digits on each line, then order longest-first.
stopping_words = []
with open('./function_words.txt', 'r') as f:
    for line in f:
        # Raw string: '\d' in a normal string literal is an invalid escape sequence.
        no_numbers = re.sub(r'\d', '', line)
        stopping_words.append(no_numbers.strip())
stopping_words = sorted(stopping_words, key=len, reverse=True)

list_of_reviews, list_of_ratings = filterAndCombine(os.path.join(ACLIMDB_DIR, 'train', 'neg'),
                                                    os.path.join(ACLIMDB_DIR, 'train', 'pos'),
                                                    stopping_words)

# Learn the vocabulary and the IDF weights from the training reviews only.
X_train = vectorizer.fit_transform(list_of_reviews)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train)
Y_train = np.asarray(list_of_ratings, dtype=np.int32)

list_of_reviews_test, list_of_ratings_test = filterAndCombineTest(os.path.join(ACLIMDB_DIR, 'test', 'neg'),
                                                                  os.path.join(ACLIMDB_DIR, 'test', 'pos'))
X_test = vectorizer.transform(list_of_reviews_test)

# The test matrix must be weighted with the IDF values learned on the training
# set. Fitting a second transformer on the test data leaks test statistics and
# puts the test features on a different scale from the one the models saw.
# The old name is kept as an alias of the fitted training transformer.
tfidf_transformer_test = tfidf_transformer
X_test_tfidf = tfidf_transformer.transform(X_test)
Y_test = np.asarray(list_of_ratings_test, dtype=np.int32)

In [None]:
# print(stopping_words)
# print(list_of_reviews[0])

# Multinomial Naive Bayes

In [None]:
# text_clf = Pipeline([('vect', CountVectorizer()),
#                       ('tfidf', TfidfTransformer()),
#                       ('clf', MultinomialNB()),
#                     ])
# text_clf = text_clf.fit(list_of_reviews, list_of_ratings)
# parameters = {
#               'vect__ngram_range': [(1, 1), (1, 2),(2,2)],
#               'tfidf__use_idf': (True,False),
#               'clf__alpha': (1e-1,1e-2, 1e-3,1e-4,1),
#               'clf__fit_prior': (True,False)
#              }

# gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
# gs_clf = gs_clf.fit(list_of_reviews, list_of_ratings)
# print(gs_clf.best_score_)
# print(gs_clf.best_params_)

In [None]:
# Tunable parameter names can be listed with MultinomialNB().get_params().keys().
# Cross-validated search over the smoothing strength and the use of class priors.
parameters = dict(
    alpha=(1e-1, 1e-2, 1e-3, 1e-4, 1),
    fit_prior=(True, False),
)
gs_clf = GridSearchCV(MultinomialNB(), parameters, n_jobs=-1).fit(X_train_tfidf, Y_train)
# Best mean cross-validation score, then the parameter combination that achieved it.
print(gs_clf.best_score_, gs_clf.best_params_, sep='\n')

In [None]:
# Train Multinomial Naive Bayes on the TF-IDF training matrix.
clf_NB = MultinomialNB(alpha=1, fit_prior=True)
clf_NB = clf_NB.fit(X_train_tfidf, Y_train)
predicted_NB = clf_NB.predict(X_test_tfidf)
# Test accuracy: fraction of predictions equal to the true labels.
(predicted_NB == Y_test).mean()

# Logistic Regression

In [None]:
# List the parameter names a default SGDClassifier exposes (useful when building a grid).
SGDClassifier().get_params(deep=True).keys()

In [None]:
# Cross-validated search over regularisation strength, iteration budget and
# penalty type for logistic regression trained with stochastic gradient descent.
parameters = {
              'alpha': (1e-1,1e-2, 1e-3,1e-4,1),
              'max_iter': (10,100,1000,10000),
              'penalty': ('l2', 'l1', 'elasticnet')
             }
# SGDClassifier defaults to loss='hinge', which fits a linear SVM, not logistic
# regression. loss='log_loss' selects the logistic loss (spelled 'log' on
# scikit-learn < 1.1). random_state pins the data shuffling so runs are repeatable.
gs_clf = GridSearchCV(SGDClassifier(loss='log_loss', random_state=10), parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train_tfidf, Y_train)
print(gs_clf.best_score_)
print(gs_clf.best_params_)

In [None]:
# Logistic regression trained with SGD. Stored under its own names so the
# Naive Bayes model and predictions (clf_NB / predicted_NB) are not overwritten.
# loss='log_loss' is required for logistic regression: the default 'hinge' is a
# linear SVM (the value is spelled 'log' on scikit-learn < 1.1).
# random_state matches the seed used by the other classifiers in this notebook.
clf_LR = SGDClassifier(loss='log_loss', alpha = 0.0001, penalty = 'l2',
                       random_state=10).fit(X_train_tfidf, Y_train)
predicted_LR = clf_LR.predict(X_test_tfidf)
# Test accuracy: fraction of predictions equal to the true labels.
np.mean(predicted_LR == Y_test)

# Decision Tree 

In [None]:
# Cross-validated search over split criterion, splitter strategy and depth limit.
parameters = {
              'criterion': ('gini', 'entropy'),
              'splitter': ('best', 'random'),
              'max_depth' : (10,20,30,40,50),
#               'max_depth' : (10,20,30,40,50,100,150,200,250,300,350,400,450,500)
#               'min_samples_split' : (2,4,6,8,10),
#               'min_samples_leaf' : (1,2,3,4,5),
#               'ccp_alpha' : ()
             }
# Tree construction is randomised (always so with splitter='random'), so the
# seed is fixed to the value the other tree/ensemble models in this notebook use;
# otherwise the reported best score and parameters change from run to run.
gs_clf = GridSearchCV(DecisionTreeClassifier(random_state=10), parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train_tfidf, Y_train)
print(gs_clf.best_score_)
print(gs_clf.best_params_)

In [None]:
# Train a decision tree (fixed seed) on the TF-IDF training matrix.
clf_DT = DecisionTreeClassifier(random_state=10)
clf_DT = clf_DT.fit(X_train_tfidf, Y_train)
predicted_DT = clf_DT.predict(X_test_tfidf)
# Test accuracy: fraction of predictions equal to the true labels.
(predicted_DT == Y_test).mean()

# Support Vector Machine

In [None]:
# Train a linear support vector classifier (fixed seed) on the TF-IDF training matrix.
clf_SVM = LinearSVC(random_state=10)
clf_SVM = clf_SVM.fit(X_train_tfidf, Y_train)

In [None]:
# Predict the test set, then report accuracy as the fraction of matching labels.
predicted_SVM = clf_SVM.predict(X_test_tfidf)
(predicted_SVM == Y_test).mean()

# AdaBoost

In [None]:
# Train an AdaBoost ensemble (fixed seed) on the TF-IDF training matrix.
clf_AB = AdaBoostClassifier(random_state=10)
clf_AB = clf_AB.fit(X_train_tfidf, Y_train)

In [None]:
# Predict the test set, then report accuracy as the fraction of matching labels.
predicted_AB = clf_AB.predict(X_test_tfidf)
(predicted_AB == Y_test).mean()

# Random Forest

In [None]:
# Train a random forest (fixed seed) on the TF-IDF training matrix.
clf_RF = RandomForestClassifier(random_state=10)
clf_RF = clf_RF.fit(X_train_tfidf, Y_train)

In [None]:
# Predict the test set, then report accuracy as the fraction of matching labels.
predicted_RF = clf_RF.predict(X_test_tfidf)
(predicted_RF == Y_test).mean()

In [None]:
# stopping_words = []
# with open('./function_words.txt', 'r') as f:
#     for line in f:
#         no_numbers = ''+re.sub('\d', '', line) 
#         stopping_words.append(no_numbers.strip())
# stopping_words = sorted(stopping_words, key=len, reverse=True)

In [None]:
# data = "Once again Mr. Costner has dragged out a movie for far longer than necessary. Aside from the terrific sea rescue sequences, of which there are very few I just did not care about any of the characters. Most of us have ghosts in the closet, and Costner's character are realized early on, and then forgotten until much later, by which time I did not care. The character we should really care about is a very cocky, overconfident Ashton Kutcher. The problem is he comes off as kid who thinks he's better than anyone else around him and shows no signs of a cluttered closet. His only obstacle appears to be winning over Costner. Finally when we are well past the half way point of this stinker, Costner tells us all about Kutcher's ghosts. We are told why Kutcher is driven to be the best with no prior inkling or foreshadowing. No magic here, it was all I could do to keep from turning it off an hour in."
# data = data.replace('<br />','')
# #             data = data.replace('.',' ')
# #             data = data.replace(',',' ')
# data = re.sub(r'[^A-Za-z0-9 ]+', ' ', data)

# # d.check("Helo") # returns true or false
# # d.suggest("Helo") # returns a suggested spell check

# for word in stopping_words:
#     if len(word) < 3:
#         continue
#     if word in data:
#         data = data.replace(word,'')
# print(data)