In [None]:
import sklearn as sk
from sklearn.naive_bayes import MultinomialNB
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
import pandas as pd
import numpy as np
import seaborn as sns
import os
import nltk
nltk.download('words')
nltk.download('stopwords')
from nltk.stem.snowball import SnowballStemmer

# Methods for pre-processing

In [None]:
def filterAndCombine(directoryNeg, directoryPos, stopping_words):
    """Load, clean and label the reviews stored in two directories.

    Every file in ``directoryNeg`` is labelled 0 and every file in
    ``directoryPos`` is labelled 1. Each review is cleaned as follows:

    1. newlines and ``<br />`` tags become spaces, so the words on either
       side of them are not glued together;
    2. every character that is not a letter, a digit or a space is dropped;
    3. stop words of five or more characters are removed, matched
       case-sensitively and as whole words only, so a stop word is never
       cut out of the middle of a longer word;
    4. runs of whitespace are collapsed to single spaces.

    Parameters
    ----------
    directoryNeg : str
        Directory holding the negative reviews (one review per file).
    directoryPos : str
        Directory holding the positive reviews (one review per file).
    stopping_words : iterable of str
        Candidate stop words; entries shorter than five characters are ignored.

    Returns
    -------
    (list of str, list of int)
        The cleaned reviews and their labels, negatives first. Within each
        directory the files are read in sorted filename order.
    """
    # Only stop words with 5+ characters are removed; shorter ones stay in the text.
    long_stop_words = sorted(
        {word for word in stopping_words if len(word) >= 5},
        key=lambda word: (-len(word), word),  # longest alternative first
    )
    stop_pattern = None
    if long_stop_words:
        # \b anchors make this a whole-word match instead of a substring match.
        stop_pattern = re.compile(
            r'\b(?:' + '|'.join(re.escape(word) for word in long_stop_words) + r')\b'
        )

    def clean(text):
        # Replace separators by a space (not '') to preserve word boundaries.
        text = text.replace('\n', ' ').replace('<br />', ' ')
        text = re.sub(r'[^A-Za-z0-9 ]+', '', text)  # keep alphanumerics and spaces only
        if stop_pattern is not None:
            text = stop_pattern.sub('', text)
        return ' '.join(text.split())  # collapse the gaps left by the removals

    outputX = []  # cleaned review texts
    outputY = []  # matching labels (0 = negative, 1 = positive)
    for directory, label in ((directoryNeg, 0), (directoryPos, 1)):
        # sorted() makes the sample order reproducible; os.listdir order is arbitrary.
        for filename in sorted(os.listdir(directory)):
            # Explicit encoding so reading does not depend on the platform default.
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                outputX.append(clean(file.read()))
            outputY.append(label)

    return outputX, outputY
    

In [None]:
def filterAndCombineTest(directoryNeg, directoryPos):
    """Load and label the reviews of two directories without cleaning them.

    Counterpart of ``filterAndCombine`` for the test data: the only change made
    to each review is that newlines are replaced by spaces, which yields a
    single-line string while keeping the words on adjacent lines separate.

    Parameters
    ----------
    directoryNeg : str
        Directory holding the negative reviews (one review per file).
    directoryPos : str
        Directory holding the positive reviews (one review per file).

    Returns
    -------
    (list of str, list of int)
        The review texts and their labels (0 = negative, 1 = positive),
        negatives first. Within each directory the files are read in sorted
        filename order.
    """
    outputX = []
    outputY = []
    for directory, label in ((directoryNeg, 0), (directoryPos, 1)):
        # sorted() makes the sample order reproducible; os.listdir order is arbitrary.
        for filename in sorted(os.listdir(directory)):
            # Explicit encoding so reading does not depend on the platform default.
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                outputX.append(file.read().replace('\n', ' '))
            outputY.append(label)

    return outputX, outputY
        

In [None]:
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems every token it produces.

    This was tried as a preprocessing step but yielded lower accuracies, so it
    is not used by the rest of the notebook.
    """

    def build_analyzer(self):
        """Return a callable mapping a document to its list of stemmed tokens."""
        analyzer = super().build_analyzer()
        # Build the stemmer here: relying on a global `stemmer` raises NameError
        # at analysis time unless one happens to have been defined elsewhere.
        # NOTE(review): 'english' assumes the corpus is English - confirm.
        stemmer = SnowballStemmer('english')
        return lambda doc: [stemmer.stem(w) for w in analyzer(doc)]

# Feature selection

In [None]:
vectorizer = CountVectorizer()  # Bag-of-words vectoriser; its vocabulary is learned from the training reviews

# Read the function (stop) words, one per line, stripping any digits from each line.
stopping_words = []
with open('./function_words.txt', 'r') as f:
    for line in f:
        no_numbers = re.sub(r'\d', '', line)  # raw string: '\d' is an invalid escape in a plain literal
        stopping_words.append(no_numbers.strip())
# Longest words first, so a longer stop word is handled before any shorter one it contains
stopping_words = sorted(stopping_words, key=len, reverse=True)

list_of_reviews, list_of_ratings = filterAndCombine('./aclImdb/train/neg',
                                                    './aclImdb/train/pos',
                                                    stopping_words)

# Learn the vocabulary and turn the training reviews into term-count vectors
X_train = vectorizer.fit_transform(list_of_reviews)

# Learn the IDF weights on the training counts and re-weight them to TF-IDF features
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train)

Y_train = np.asarray(list_of_ratings, dtype=np.int32)  # Labels as a numpy array

# Test set: it must go through the transformations that were FITTED ON THE TRAINING SET.
# NOTE(review): the test reviews are not cleaned the way the training reviews are
# (no tag/punctuation/stop-word removal) - confirm this asymmetry is intended.
list_of_reviews_test, list_of_ratings_test = filterAndCombineTest('./aclImdb/test/neg',
                                                                  './aclImdb/test/pos')
X_test = vectorizer.transform(list_of_reviews_test)

# Re-use the training IDF weights. Fitting a second TfidfTransformer on the test
# counts would scale the test features with different weights from the ones the
# models were trained on, and would leak test-set statistics into the features.
tfidf_transformer_test = tfidf_transformer  # alias kept so the old name stays defined
X_test_tfidf = tfidf_transformer.transform(X_test)
Y_test = np.asarray(list_of_ratings_test, dtype=np.int32)

# Model approach
- The same process is used for every model.
- First, grid search with 5-fold cross-validation is run over the relevant hyperparameters of each model to find the best values.
- Using these values, the model is then evaluated on the held-out test set.

# Multinomial Naive Bayes

In [None]:
# Grid for Multinomial Naive Bayes: smoothing strength and whether class priors are learned
parameters = dict(
    alpha=(1e-1, 1e-2, 1e-3, 1e-4, 1),
    fit_prior=(True, False),
)
gs_clf = GridSearchCV(MultinomialNB(), parameters, n_jobs=-1)
gs_clf.fit(X_train_tfidf, Y_train)  # fit() returns the search object itself
# Best mean cross-validated score, followed by the parameters that produced it
print(gs_clf.best_score_, gs_clf.best_params_, sep='\n')

In [None]:
# Train Multinomial Naive Bayes with fixed hyperparameters on the whole training set,
# then show the accuracy on the held-out test set.
clf_NB = MultinomialNB(alpha=1, fit_prior=True)
clf_NB.fit(X_train_tfidf, Y_train)
predicted_NB = clf_NB.predict(X_test_tfidf)
(predicted_NB == Y_test).mean()

# Logistic Regression

In [None]:
# Grid for logistic regression trained with SGD: regularisation strength,
# iteration budget and penalty type.
parameters = {
              'alpha': (1e-1,1e-2, 1e-3,1e-4,1),
              'max_iter': (10,100,1000,10000),
              'penalty': ('l2', 'l1', 'elasticnet')
             }
# SGDClassifier defaults to hinge loss (a linear SVM). The logistic loss must be
# requested explicitly for this to be logistic regression.
# NOTE: scikit-learn < 1.1 spells this loss 'log' instead of 'log_loss'.
gs_clf = GridSearchCV(SGDClassifier(loss='log_loss'), parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train_tfidf, Y_train)
print(gs_clf.best_score_)   # best mean cross-validated score
print(gs_clf.best_params_)  # parameters that produced it

In [None]:
# Logistic regression trained with SGD, evaluated on the held-out test set.
# - loss='log_loss' is required: the SGDClassifier default (hinge) is a linear SVM.
#   NOTE: scikit-learn < 1.1 spells this loss 'log'.
# - Dedicated *_LR names are used so the Naive Bayes model and predictions
#   (clf_NB / predicted_NB) are no longer overwritten by this cell.
clf_LR = SGDClassifier(loss='log_loss', alpha=0.0001, penalty='l2', max_iter=10).fit(X_train_tfidf, Y_train)
predicted_LR = clf_LR.predict(X_test_tfidf)
np.mean(predicted_LR == Y_test)

# Decision Tree 

In [None]:
# Grid for the decision tree: cost-complexity pruning strength
parameters = {
              'ccp_alpha' : (0.1,0.0,1.0)
             }

# n_jobs=-1 (all available cores) instead of a hard-coded worker count, which is
# machine-specific; this also matches the other grid searches in this notebook.
gs_clf = GridSearchCV(DecisionTreeClassifier(), parameters, n_jobs=-1,verbose=10)
gs_clf = gs_clf.fit(X_train_tfidf, Y_train)
print(gs_clf.best_score_)   # best mean cross-validated score
print(gs_clf.best_params_)  # parameters that produced it

In [None]:
# Decision tree with default settings: fit on the training set,
# then show the accuracy on the held-out test set.
clf_DT = DecisionTreeClassifier()
clf_DT.fit(X_train_tfidf, Y_train)
predicted_DT = clf_DT.predict(X_test_tfidf)
(predicted_DT == Y_test).mean()

# Support Vector Machine

In [None]:
# Grid for the linear SVM: only the iteration budget is searched
parameters = dict(max_iter=(100, 1000))
gs_clf = GridSearchCV(LinearSVC(), parameters, n_jobs=-1, verbose=10).fit(X_train_tfidf, Y_train)
# Best mean cross-validated score, followed by the parameters that produced it
print(gs_clf.best_score_, gs_clf.best_params_, sep='\n')

In [None]:
# Linear SVM with a fixed seed and iteration budget: fit on the training set,
# then show the accuracy on the held-out test set.
clf_SVM = LinearSVC(random_state=10, max_iter=100)
clf_SVM.fit(X_train_tfidf, Y_train)
predicted_SVM = clf_SVM.predict(X_test_tfidf)
(predicted_SVM == Y_test).mean()

# Ada Boost

In [None]:
# Grid for AdaBoost: number of boosting rounds and learning rate
parameters = dict(
    n_estimators=(100, 200, 300, 400, 500),
    learning_rate=(5e-1, 6e-1, 7e-1, 8e-1, 9e-1, 1),
)
gs_clf = GridSearchCV(AdaBoostClassifier(), parameters, n_jobs=-1, verbose=10)
gs_clf.fit(X_train_tfidf, Y_train)  # fit() returns the search object itself
# Best mean cross-validated score, followed by the parameters that produced it
print(gs_clf.best_score_, gs_clf.best_params_, sep='\n')

In [None]:
# AdaBoost with fixed hyperparameters: fit on the training set,
# then show the accuracy on the held-out test set.
clf_AB = AdaBoostClassifier(n_estimators=300, learning_rate=0.7)
clf_AB.fit(X_train_tfidf, Y_train)
predicted_AB = clf_AB.predict(X_test_tfidf)
(predicted_AB == Y_test).mean()

# Random Forest

In [None]:
# Grid for the random forest: number of trees
parameters = dict(n_estimators=(10, 100))
gs_clf = GridSearchCV(RandomForestClassifier(), parameters, n_jobs=-1, verbose=10).fit(X_train_tfidf, Y_train)
# Best mean cross-validated score, followed by the parameters that produced it
print(gs_clf.best_score_, gs_clf.best_params_, sep='\n')

In [None]:
# Random forest with 100 trees: fit on the training set,
# then show the accuracy on the held-out test set.
clf_RF = RandomForestClassifier(n_estimators=100)
clf_RF.fit(X_train_tfidf, Y_train)
predicted_RF = clf_RF.predict(X_test_tfidf)
(predicted_RF == Y_test).mean()