### Parsing to get the set of adjective


In [1]:
import time
time_start = time.clock()
import re
import pickle
import string
import csv

def splitString(line):
	# removing punctuation, making all lowercase, and spliting line into a list of words, called 'wordsInReview'
	line.translate(None, string.punctuation)
	myLine = line.translate(string.maketrans("",""), string.punctuation)
	lineLowerCase = myLine.lower()
	wordsInReview = lineLowerCase.split()	# split string into array

	return wordsInReview

def parseAdj(wordsInReview):
	# finding adjectives within review
	adjectives = [x for x in allWordsList if x in wordsInReview]

	# adding adjectives to list without repeats
	for element in adjectives:
		if element not in allAdjList:
			allAdjList.append(element)
	return allAdjList

## Open the file with read only permit
fallComments = open('AllComments.csv','r')
posWordArr = [posWord.rstrip('\n') for posWord in open('2006_positiveWords.txt')]
negWordArr = [negWord.rstrip('\n') for negWord in open('4783_negativeWords.txt')]
allWordsList = posWordArr + negWordArr

allAdjList = []

## Read the first line
line = fallComments.readline()
while line:
	wordsInReview = splitString(line)
	allAdjList = parseAdj(wordsInReview)
	line = fallComments.readline()

print allAdjList
    

fallComments.close()
time_elapsed = (time.clock() - time_start)
print ("The computational time is %f") % time_elapsed

The computational time is 14.071993


### Data preprocessing

In [2]:
time_start = time.clock()
import pandas as pd       
data = pd.read_csv('D:\ML\mixData.csv')
from bs4 import BeautifulSoup
import re
import nltk
from nltk.stem.snowball import EnglishStemmer
st = EnglishStemmer()
#nltk.download()
from nltk.corpus import stopwords # Import the stop word list
def review_to_words( raw_review ):
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review, "lxml").get_text() 
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    # 4. Convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 5. Remove stop words and tranform the words to their stem 
    meaningful_words = [w for w in words if not w in stops]
    stem_words = []
    num_meanWords = len(meaningful_words)
    for i in xrange( 0, num_meanWords):
        w = st.stem(meaningful_words[i])
        stem_words.append(w)
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( stem_words )) 
    
num_reviews = data["review"].size


clean_train_reviews = []
for i in xrange( 0, num_reviews ):
    clean_train_reviews.append( review_to_words( data["review"][i] ) )
    
time_elapsed = (time.clock() - time_start)
print ("The computational time is %f") % time_elapsed

The computational time is 4.255637


### Create the feature by using only adjective

In [3]:
time_start = time.clock()
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(vocabulary = allAdjList)
train_data_features = vectorizer.fit_transform(clean_train_reviews)

train_data_features = train_data_features.toarray()
print train_data_features.shape

time_elapsed = (time.clock() - time_start)
print ("The computational time is %f") % time_elapsed

(790L, 884L)
The computational time is 0.113517


In [4]:
vocab = vectorizer.get_feature_names()
print vocab



In [5]:
from collections import Counter
y = data.values[:, -1]
print y.shape, Counter(y.tolist())
x = train_data_features
print x.shape

(790L,) Counter({0L: 720, 1L: 70})
(790L, 884L)


### Logistic Regression (use Sigmoid function)

In [6]:
time_start = time.clock()
import theano
import theano.tensor as T
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.cross_validation import KFold

training_step = 100
X = train_data_features
Xnew = X.astype(None).reshape(X.shape)   #Change to non-type
Y = np.array(data['class'])
n_sample = len(Y)
n_features = X.shape[1]


x = T.matrix('x')
y = T.vector('y')
w = theano.shared(np.zeros(n_features),name = 'w')    #Start at zero vector
b = theano.shared(0.,name = 'b')

#print('Initial model:')
#print(w.get_value(), b.get_value())

prob_y_x = 1/(1+T.exp(-T.dot(x,w)-b))
prediction = prob_y_x > 0.5
logli = T.sum(y*T.log(prob_y_x) + (1-y)*T.log(1-prob_y_x))
cost_function = -(logli)
gw , gb = T.grad(cost_function,[w,b])


train_model = theano.function(
                        inputs = [x,y], 
                        updates = [(w, w-0.1*gw), (b, b-0.1*gb)])
predict = theano.function(inputs = [x], outputs = prediction)


testing_errors =[]
training_errors =[]
kf = KFold(790, n_folds=10)
for train, test in kf:
    ytrain = Y[train]
    xtrain = Xnew[train]
    ytest = Y[test]
    xtest = Xnew[test]
    for i in range(training_step):
        train_model(xtrain,ytrain)
    ytrain_predict = np.array(predict(xtrain))
    ytest_predict = np.array(predict(xtest))
    testing_errors.append(1 - metrics.accuracy_score(ytest,ytest_predict))
    training_errors.append(1 - metrics.accuracy_score(ytrain,ytrain_predict))
    
time_elapsed = (time.clock() - time_start)
print ("The computational time is %f") % time_elapsed

print ('The training error is %f'  % np.average(training_errors))

print ('The testing error is %f' % np.average(testing_errors))

The computational time is 8.487592
The training error is 0.038537
The testing error is 0.113924


### Bernoulli Naïve Bayes

In [7]:
time_start = time.clock()
from sklearn.naive_bayes import BernoulliNB 
import numpy as np

testing_errors = []
training_errors = []
kf = KFold(790, n_folds=10, shuffle=True)
for train, test in kf:
    ytrain = data['class'][train]
    xtrain = train_data_features[train]
    ytest = data['class'][test]
    xtest = train_data_features[test]
    model = BernoulliNB()
    model.fit(xtrain, ytrain)
    ytrain_predict = np.array(model.predict(xtrain))
    ytest_predict = np.array(model.predict(xtest))
    testing_errors.append(1 - metrics.accuracy_score(ytest,ytest_predict))
    training_errors.append(1 - metrics.accuracy_score(ytrain,ytrain_predict))

time_elapsed = (time.clock() - time_start)
print ("The computational time is %f") % time_elapsed
print ('The training error is %f'  % np.average(training_errors))

print ('The testing error is %f' % np.average(testing_errors))

The computational time is 0.320460
The training error is 0.088608
The testing error is 0.088608


### Multinomial Naïve Bayes

In [8]:
time_start = time.clock()
from sklearn.naive_bayes import MultinomialNB 
import numpy as np

testing_errors = []
training_errors = []
kf = KFold(790, n_folds=10, shuffle=True)
for train, test in kf:
    ytrain = data['class'][train]
    xtrain = train_data_features[train]
    ytest = data['class'][test]
    xtest = train_data_features[test]
    model = MultinomialNB()
    model.fit(xtrain, ytrain)
    ytrain_predict = np.array(model.predict(xtrain))
    ytest_predict = np.array(model.predict(xtest))
    testing_errors.append(1 - metrics.accuracy_score(ytest,ytest_predict))
    training_errors.append(1 - metrics.accuracy_score(ytrain,ytrain_predict))

time_elapsed = (time.clock() - time_start)
print ("The computational time is %f") % time_elapsed
print ('The training error is %f'  % np.average(training_errors))

print ('The testing error is %f' % np.average(testing_errors))

The computational time is 0.193436
The training error is 0.082700
The testing error is 0.083544


### Support Vector Machines

In [9]:
time_start = time.clock()
from sklearn import svm
testing_errors = []
training_errors = []
kf = KFold(790, n_folds=10, shuffle=True)
for train, test in kf:
    ytrain = data['class'][train]
    xtrain = train_data_features[train]
    ytest = data['class'][test]
    xtest = train_data_features[test]
    model = svm.SVC()
    model.fit(xtrain, ytrain)
    ytrain_predict = np.array(model.predict(xtrain))
    ytest_predict = np.array(model.predict(xtest))
    testing_errors.append(1 - metrics.accuracy_score(ytest,ytest_predict))
    training_errors.append(1 - metrics.accuracy_score(ytrain,ytrain_predict))

time_elapsed = (time.clock() - time_start)
print ("The computational time is %f") % time_elapsed
print ('The training error is %f'  % np.average(training_errors))

print ('The testing error is %f' % np.average(testing_errors))

The computational time is 2.920590
The training error is 0.088608
The testing error is 0.088608


### 3NN

In [10]:
time_start = time.clock()
from sklearn.neighbors import KNeighborsClassifier

testing_errors = []
training_errors = []
kf = KFold(790, n_folds=10, shuffle=True)
for train, test in kf:
    ytrain = data['class'][train]
    xtrain = train_data_features[train]
    ytest = data['class'][test]
    xtest = train_data_features[test]
    model = KNeighborsClassifier(n_neighbors=3)
    model.fit(xtrain, ytrain)
    ytrain_predict = np.array(model.predict(xtrain))
    ytest_predict = np.array(model.predict(xtest))
    testing_errors.append(1 - metrics.accuracy_score(ytest,ytest_predict))
    training_errors.append(1 - metrics.accuracy_score(ytrain,ytrain_predict))

time_elapsed = (time.clock() - time_start)
print ("The computational time is %f") % time_elapsed
print ('The training error is %f'  % np.average(training_errors))

print ('The testing error is %f' % np.average(testing_errors))

The computational time is 9.794936
The training error is 0.066667
The testing error is 0.091139


### 4NN

In [11]:
time_start = time.clock()
from sklearn.neighbors import KNeighborsClassifier

testing_errors = []
training_errors = []
kf = KFold(790, n_folds=10, shuffle=True)
for train, test in kf:
    ytrain = data['class'][train]
    xtrain = train_data_features[train]
    ytest = data['class'][test]
    xtest = train_data_features[test]
    model = KNeighborsClassifier(n_neighbors=4)
    model.fit(xtrain, ytrain)
    ytrain_predict = np.array(model.predict(xtrain))
    ytest_predict = np.array(model.predict(xtest))
    testing_errors.append(1 - metrics.accuracy_score(ytest,ytest_predict))
    training_errors.append(1 - metrics.accuracy_score(ytrain,ytrain_predict))

time_elapsed = (time.clock() - time_start)
print ("The computational time is %f") % time_elapsed
print ('The training error is %f'  % np.average(training_errors))

print ('The testing error is %f' % np.average(testing_errors))

The computational time is 9.932802
The training error is 0.072574
The testing error is 0.097468


### 5NN

In [12]:
time_start = time.clock()
from sklearn.neighbors import KNeighborsClassifier

testing_errors = []
training_errors = []
kf = KFold(790, n_folds=10, shuffle=True)
for train, test in kf:
    ytrain = data['class'][train]
    xtrain = train_data_features[train]
    ytest = data['class'][test]
    xtest = train_data_features[test]
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(xtrain, ytrain)
    ytrain_predict = np.array(model.predict(xtrain))
    ytest_predict = np.array(model.predict(xtest))
    testing_errors.append(1 - metrics.accuracy_score(ytest,ytest_predict))
    training_errors.append(1 - metrics.accuracy_score(ytrain,ytrain_predict))

time_elapsed = (time.clock() - time_start)
print ("The computational time is %f") % time_elapsed
print ('The training error is %f'  % np.average(training_errors))

print ('The testing error is %f' % np.average(testing_errors))

The computational time is 10.076653
The training error is 0.074965
The testing error is 0.093671
