### Data preprocessing

In [1]:
import time
time_start = time.clock()
import pandas as pd       
data = pd.read_csv('D:\ML\mixData.csv')
from bs4 import BeautifulSoup
import re
import nltk
from nltk.stem.snowball import EnglishStemmer
st = EnglishStemmer()
#nltk.download()
from nltk.corpus import stopwords # Import the stop word list
def review_to_words( raw_review ):
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review, "lxml").get_text() 
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    # 4. Convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 5. Remove stop words and tranform the words to their stem 
    meaningful_words = [w for w in words if not w in stops]
    stem_words = []
    num_meanWords = len(meaningful_words)
    for i in xrange( 0, num_meanWords):
        w = st.stem(meaningful_words[i])
        stem_words.append(w)
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( stem_words )) 
    
num_reviews = data["review"].size


clean_train_reviews = []
for i in xrange( 0, num_reviews ):
    clean_train_reviews.append( review_to_words( data["review"][i] ) )
    
time_elapsed = (time.clock() - time_start)
print ("The computational time is %f") % time_elapsed

The computational time is 2.906090


### Create the feature by using the bag of words (define number of feature at max_features)

In [2]:
time_start = time.clock()
from sklearn.feature_extraction.text import CountVectorizer 
#build a vocabulary that only consider the top max_features ordered by term frequency across the corpus
vectorizer = CountVectorizer(max_features = 884) 
train_data_features = vectorizer.fit_transform(clean_train_reviews)

train_data_features = train_data_features.toarray()
time_elapsed = (time.clock() - time_start)

print ("The computational time is %f") % time_elapsed
print train_data_features.shape

The computational time is 0.097415
(790L, 884L)


In [3]:
vocab = vectorizer.get_feature_names()
print vocab

[u'abl', u'absolut', u'accept', u'access', u'accessori', u'accord', u'account', u'act', u'action', u'activ', u'actual', u'ad', u'add', u'addit', u'address', u'admit', u'advertis', u'advic', u'advis', u'agent', u'ago', u'agre', u'ahead', u'allow', u'almost', u'along', u'alreadi', u'also', u'although', u'alway', u'amaz', u'amazon', u'amount', u'anoth', u'answer', u'anymor', u'anyon', u'anyth', u'anyway', u'anywher', u'apolog', u'appar', u'appear', u'appli', u'approv', u'april', u'around', u'arriv', u'ask', u'assist', u'assum', u'assur', u'attach', u'attempt', u'august', u'author', u'autom', u'automat', u'avail', u'ave', u'avoid', u'aw', u'away', u'back', u'backord', u'bad', u'bait', u'balanc', u'bank', u'base', u'basic', u'batteri', u'becom', u'begin', u'behind', u'believ', u'benefit', u'best', u'better', u'bewar', u'beyond', u'big', u'bill', u'birthday', u'black', u'blame', u'book', u'bother', u'bottom', u'bought', u'box', u'brand', u'broken', u'bs', u'buck', u'bunch', u'busi', u'button

In [4]:
from collections import Counter
y = data.values[:, -1]
print y.shape, Counter(y.tolist())
x = train_data_features
print x.shape

(790L,) Counter({0L: 720, 1L: 70})
(790L, 884L)


### Logistic Regression (use Sigmoid function)

In [5]:
time_start = time.clock()
import theano
import theano.tensor as T
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.cross_validation import KFold

training_step = 100
X = train_data_features
Xnew = X.astype(None).reshape(X.shape)   #Change to non-type
Y = np.array(data['class'])
n_sample = len(Y)
n_features = X.shape[1]


x = T.matrix('x')
y = T.vector('y')
w = theano.shared(np.zeros(n_features),name = 'w')    #Start at zero vector
b = theano.shared(0.,name = 'b')

#print('Initial model:')
#print(w.get_value(), b.get_value())

prob_y_x = 1/(1+T.exp(-T.dot(x,w)-b))
prediction = prob_y_x > 0.5
logli = T.sum(y*T.log(prob_y_x) + (1-y)*T.log(1-prob_y_x))
cost_function = -(logli)
gw , gb = T.grad(cost_function,[w,b])


train_model = theano.function(
                        inputs = [x,y], 
                        updates = [(w, w-0.1*gw), (b, b-0.1*gb)])
predict = theano.function(inputs = [x], outputs = prediction)


testing_errors =[]
training_errors =[]
kf = KFold(790, n_folds=10)
for train, test in kf:
    ytrain = Y[train]
    xtrain = Xnew[train]
    ytest = Y[test]
    xtest = Xnew[test]
    for i in range(training_step):
        train_model(xtrain,ytrain)
    ytrain_predict = np.array(predict(xtrain))
    ytest_predict = np.array(predict(xtest))
    testing_errors.append(1 - metrics.accuracy_score(ytest,ytest_predict))
    training_errors.append(1 - metrics.accuracy_score(ytrain,ytrain_predict))

time_elapsed = (time.clock() - time_start)
print ("The computational time is %f") % time_elapsed
print ('The training error is %f'  % np.average(training_errors))

print ('The testing error is %f' % np.average(testing_errors))


The computational time is 7.066550
The training error is 0.000844
The testing error is 0.011392


### Bernoulli Naïve Bayes

In [6]:
time_start = time.clock()
from sklearn.naive_bayes import BernoulliNB 
import numpy as np

testing_errors = []
training_errors = []
kf = KFold(790, n_folds=10, shuffle=True)
for train, test in kf:
    ytrain = data['class'][train]
    xtrain = train_data_features[train]
    ytest = data['class'][test]
    xtest = train_data_features[test]
    model = BernoulliNB()
    model.fit(xtrain, ytrain)
    ytrain_predict = np.array(model.predict(xtrain))
    ytest_predict = np.array(model.predict(xtest))
    testing_errors.append(1 - metrics.accuracy_score(ytest,ytest_predict))
    training_errors.append(1 - metrics.accuracy_score(ytrain,ytrain_predict))

time_elapsed = (time.clock() - time_start)
print ("The computational time is %f") % time_elapsed

print ('The training error is %f'  % np.average(training_errors))

print ('The testing error is %f' % np.average(testing_errors))

The computational time is 0.371844
The training error is 0.167089
The testing error is 0.173418


### Multinomial Naïve Bayes

In [7]:
time_start = time.clock()
from sklearn.naive_bayes import MultinomialNB 
import numpy as np

testing_errors = []
training_errors = []
kf = KFold(790, n_folds=10, shuffle=True)
for train, test in kf:
    ytrain = data['class'][train]
    xtrain = train_data_features[train]
    ytest = data['class'][test]
    xtest = train_data_features[test]
    model = MultinomialNB()
    model.fit(xtrain, ytrain)
    ytrain_predict = np.array(model.predict(xtrain))
    ytest_predict = np.array(model.predict(xtest))
    testing_errors.append(1 - metrics.accuracy_score(ytest,ytest_predict))
    training_errors.append(1 - metrics.accuracy_score(ytrain,ytrain_predict))
time_elapsed = (time.clock() - time_start)
print ("The computational time is %f") % time_elapsed

print ('The training error is %f'  % np.average(training_errors))

print ('The testing error is %f' % np.average(testing_errors))

The computational time is 0.188142
The training error is 0.035724
The testing error is 0.068354


### Support Vector Machines

In [8]:
time_start = time.clock()
from sklearn import svm
testing_errors = []
training_errors = []
kf = KFold(790, n_folds=10, shuffle=True)
for train, test in kf:
    ytrain = data['class'][train]
    xtrain = train_data_features[train]
    ytest = data['class'][test]
    xtest = train_data_features[test]
    model = svm.SVC()
    model.fit(xtrain, ytrain)
    ytrain_predict = np.array(model.predict(xtrain))
    ytest_predict = np.array(model.predict(xtest))
    testing_errors.append(1 - metrics.accuracy_score(ytest,ytest_predict))
    training_errors.append(1 - metrics.accuracy_score(ytrain,ytrain_predict))
    
time_elapsed = (time.clock() - time_start)
print ("The computational time is %f") % time_elapsed

print ('The training error is %f'  % np.average(training_errors))

print ('The testing error is %f' % np.average(testing_errors))

The computational time is 2.941601
The training error is 0.088608
The testing error is 0.088608


### 3NN

In [9]:
time_start = time.clock()
from sklearn.neighbors import KNeighborsClassifier

testing_errors = []
training_errors = []
kf = KFold(790, n_folds=10, shuffle=True)
for train, test in kf:
    ytrain = data['class'][train]
    xtrain = train_data_features[train]
    ytest = data['class'][test]
    xtest = train_data_features[test]
    model = KNeighborsClassifier(n_neighbors=3)
    model.fit(xtrain, ytrain)
    ytrain_predict = np.array(model.predict(xtrain))
    ytest_predict = np.array(model.predict(xtest))
    testing_errors.append(1 - metrics.accuracy_score(ytest,ytest_predict))
    training_errors.append(1 - metrics.accuracy_score(ytrain,ytrain_predict))
    
time_elapsed = (time.clock() - time_start)
print ("The computational time is %f") % time_elapsed

print ('The training error is %f'  % np.average(training_errors))

print ('The testing error is %f' % np.average(testing_errors))

The computational time is 11.363228
The training error is 0.071449
The testing error is 0.174684


### 4NN

In [10]:
time_start = time.clock()
from sklearn.neighbors import KNeighborsClassifier

testing_errors = []
training_errors = []
kf = KFold(790, n_folds=10, shuffle=True)
for train, test in kf:
    ytrain = data['class'][train]
    xtrain = train_data_features[train]
    ytest = data['class'][test]
    xtest = train_data_features[test]
    model = KNeighborsClassifier(n_neighbors=4)
    model.fit(xtrain, ytrain)
    ytrain_predict = np.array(model.predict(xtrain))
    ytest_predict = np.array(model.predict(xtest))
    testing_errors.append(1 - metrics.accuracy_score(ytest,ytest_predict))
    training_errors.append(1 - metrics.accuracy_score(ytrain,ytrain_predict))
    
time_elapsed = (time.clock() - time_start)
print ("The computational time is %f") % time_elapsed

print ('The training error is %f'  % np.average(training_errors))

print ('The testing error is %f' % np.average(testing_errors))

The computational time is 12.310483
The training error is 0.051617
The testing error is 0.136709


### 5NN

In [11]:
time_start = time.clock()
from sklearn.neighbors import KNeighborsClassifier

testing_errors = []
training_errors = []
kf = KFold(790, n_folds=10, shuffle=True)
for train, test in kf:
    ytrain = data['class'][train]
    xtrain = train_data_features[train]
    ytest = data['class'][test]
    xtest = train_data_features[test]
    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(xtrain, ytrain)
    ytrain_predict = np.array(model.predict(xtrain))
    ytest_predict = np.array(model.predict(xtest))
    testing_errors.append(1 - metrics.accuracy_score(ytest,ytest_predict))
    training_errors.append(1 - metrics.accuracy_score(ytrain,ytrain_predict))
    
time_elapsed = (time.clock() - time_start)
print ("The computational time is %f") % time_elapsed

print ('The training error is %f'  % np.average(training_errors))

print ('The testing error is %f' % np.average(testing_errors))

The computational time is 14.513743
The training error is 0.094233
The testing error is 0.191139
