In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the raw review table from Amazon.csv (a relative path, resolved against
# the kernel's working directory), report its (rows, columns) shape and
# preview the first five rows.
data = pd.read_csv('Amazon.csv')
print(data.shape)
data.head()

In [None]:
### Fill in missing summaries.
# 'newSummary' mirrors 'Summary', except that rows whose summary is null get
# the placeholder string 'nn' so the .str operations used later never see NaN.
summary_is_missing = data['Summary'].isnull()
data['newSummary'] = np.where(summary_is_missing, 'nn', data['Summary'])
## Per-column count of the null values still present in the frame
data.isnull().sum()

In [None]:
# Hand-crafted numeric features derived from Amazon.csv.
# Every assignment below appends one new column to `data`. The ORDER of the
# assignments matters: a later cell picks these columns up by integer position.
# Regex patterns are written as raw strings so that backslashes such as `\?`
# and `\S` reach the regex engine untouched instead of being parsed as
# (invalid) string escape sequences.

# --- features of the review text ---
# length of the review, in characters
data['reviewLen'] = data['Text'].str.len()
# number of semi-colons
data['hasSC'] = data['Text'].str.count(';')
# number of exclamation marks
data['hasEX'] = data['Text'].str.count('!')
# number of question marks ('?' is a regex metacharacter, hence the escape)
data['hasQ'] = data['Text'].str.count(r'\?')
# total count of common punctuation characters
data['punctCount'] = data['Text'].str.count(r'[.,!;:()/\?-]')
# punctuation density
# NOTE(review): despite the column name, this divides by the character count
# (reviewLen), not by the number of words - confirm which one is intended.
data['punctToWords'] = data['punctCount'] / data['reviewLen']
# average word length: character count (spaces included) floor-divided by the
# number of space-separated tokens
data['avWordLength'] = data['Text'].str.len() // (data['Text'].str.count(' ') + 1)

# --- features of the (null-filled) summary ---
# summary length, counted in non-whitespace characters
data['sumLen'] = data['newSummary'].str.count(r'\S')
# summary average word length, same floor-division recipe as for the review text
data['avSumWordLen'] = data['newSummary'].str.len() // (data['newSummary'].str.count(' ') + 1)
# summary exclamation marks
data['sumHasEX'] = data['newSummary'].str.count('!')
# summary question marks
data['sumHasQ'] = data['newSummary'].str.count(r'\?')

# --- metadata features ---
# 'Time' is parsed as seconds since the Unix epoch
data['date'] = pd.to_datetime(data['Time'], unit='s')
# day of the week (maybe Sundays are better times to write reviews)
data['day_of_week'] = data['date'].dt.dayofweek
# 1 when the score is less than four, 0 otherwise
data['scoreType'] = np.where(data['Score'] < 4, 1, 0)

# Author's note: time of day was also wanted, but the timestamps only carry dates.
# TODO: adjective-to-word ratio

print(data.shape)
data.head(5)


In [None]:


## Pull the numeric feature columns out of `data` as (n_rows, 1) column vectors.
# Columns are selected by NAME instead of by integer position, so this cell
# keeps working if the CSV gains/loses a column or the feature cell is
# reordered. Selecting with a one-element list (data[['col']]) yields a
# single-column frame, whose .values already has shape (n_rows, 1).
# NOTE(review): the positional version read position 7 for XScore; this
# assumes that position held the 'Score' column - confirm against the CSV header.
XScore = data[['Score']].values
XreviewLen = data[['reviewLen']].values
XhasSC = data[['hasSC']].values
XhasEX = data[['hasEX']].values
XhasQ = data[['hasQ']].values
XpunctCount = data[['punctCount']].values
XpunctToWords = data[['punctToWords']].values
XavWordLength = data[['avWordLength']].values
XsumLen = data[['sumLen']].values
XavSumWordLen = data[['avSumWordLen']].values
XsumHasEX = data[['sumHasEX']].values
XsumHasQ = data[['sumHasQ']].values
# The raw 'date' column is not part of the feature set; only the derived
# day_of_week is used.
Xday_of_week = data[['day_of_week']].values
XscoreType = data[['scoreType']].values

# Dense (n_rows, 14) block of hand-crafted features, one column per vector above.
Xtoadd = np.concatenate(
    (
        XScore, XreviewLen, XhasSC, XhasEX, XhasQ, XpunctCount, XpunctToWords,
        XavWordLength, XsumLen, XavSumWordLen, XsumHasEX, XsumHasQ,
        Xday_of_week, XscoreType,
    ),
    axis=1,
)

In [None]:
# report on training and test sets
def print_results(train_true=None, train_pred=None, test_true=None, test_pred=None):
    """Print error, accuracy and true-positive/true-negative rates.

    Called with no arguments (as the model cells do), the function reads the
    notebook-level globals ``y_train``, ``y_pred``, ``y_test`` and
    ``y_pred_test``. Any of the four arrays may instead be passed explicitly,
    which makes the report usable outside the notebook's global state.

    Parameters
    ----------
    train_true, train_pred : array-like of 0/1 or bool, optional
        True and predicted labels for the training set.
    test_true, test_pred : array-like of 0/1 or bool, optional
        True and predicted labels for the test set.

    Notes
    -----
    Rates are computed against the number of labels, which equals the number
    of rows of the matching feature matrix. A rate whose denominator is zero
    (for example the true-positive rate when there are no positives) is
    reported as ``nan`` instead of dividing by zero.
    """
    # Fall back to the notebook globals for anything not supplied.
    if train_true is None:
        train_true = y_train
    if train_pred is None:
        train_pred = y_pred
    if test_true is None:
        test_true = y_test
    if test_pred is None:
        test_pred = y_pred_test

    train_true, train_pred = np.asarray(train_true), np.asarray(train_pred)
    test_true, test_pred = np.asarray(test_true), np.asarray(test_pred)

    def _rate(hits, total):
        # An absent class (or an empty set) would otherwise divide by zero.
        return hits / total if total else float('nan')

    train_errors = (train_true != train_pred).sum()
    train_pos = train_true == 1
    print('Error rate on training set: ')
    print(_rate(train_errors, train_true.shape[0]))
    print('Accuracy rate on training set: ')
    print(1 - _rate(train_errors, train_true.shape[0]))
    print('True positive rate on training set:')
    print(_rate((train_pos & (train_pred == 1)).sum(), train_pos.sum()))
    print('**************')

    test_errors = (test_true != test_pred).sum()
    test_pos = test_true == 1
    test_neg = test_true == 0
    print('Error rate on test set: ')
    print(_rate(test_errors, test_true.shape[0]))
    print('Accuracy rate on test set: ')
    print(1 - _rate(test_errors, test_true.shape[0]))
    print('True positive rate on test set')
    print(_rate((test_pos & (test_pred == 1)).sum(), test_pos.sum()))
    print('True negative rate on test set')
    print(_rate((test_neg & (test_pred == 0)).sum(), test_neg.sum()))

In [None]:
# vectorize Bag of Words from review text; as sparse matrix
from sklearn.feature_extraction.text import HashingVectorizer
# Restrict the hashed vocabulary to 2 ** 17 feature columns.
# alternate_sign=False keeps every hashed value non-negative, which the
# MultinomialNB model fitted further down requires. It replaces the
# `non_negative=True` argument, which scikit-learn removed in version 0.21.
hv = HashingVectorizer(n_features=2 ** 17, alternate_sign=False)
X = hv.transform(data.Text)

In [None]:
# convert additional features to sparse matrix and concatenate onto the bag of words sparse matrix
from scipy.sparse import csr_matrix, hstack
XtoaddSparse = csr_matrix(Xtoadd)
# Keep only the hashed bag-of-words columns before appending. On the first run
# that is all of X; on a re-run it strips the hand-crafted columns a previous
# execution of this cell appended, so running the cell twice no longer grows X.
X_bow = X[:, :hv.n_features]
# format='csr' builds the row-sliceable CSR matrix directly instead of
# converting hstack's default output afterwards.
Xfinal = hstack([X_bow, XtoaddSparse], format='csr')
X = Xfinal

In [None]:
# size of feature set: (rows, columns) of the combined sparse matrix, i.e. the
# hashed bag-of-words columns plus the appended hand-crafted feature columns
print(X.shape)

In [None]:
# define y: the target vector, taken BY POSITION from column 12 of `data`
# NOTE(review): the feature cell reads the engineered columns from position 14
# onwards and 'newSummary' is added just before them, which suggests position
# 12 is the last column of the raw CSV - confirm which label that is, and
# consider selecting it by name so a change in column layout cannot silently
# swap the target.
y = data.iloc[:, 12].values
# displayed as the cell output: number of labels
y.shape

In [None]:
# create training and test sets
# train_test_split lives in sklearn.model_selection; the older
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

In [None]:
# feature scaling
from sklearn.preprocessing import StandardScaler
# The scaler is fitted on the training rows only and then applied unchanged to
# both splits, so no statistics from the test set leak into training.
# with_mean=False asks for scaling without centering.
sc = StandardScaler(with_mean=False).fit(X_train)
X_train_std, X_test_std = sc.transform(X_train), sc.transform(X_test)

In [None]:
# MODEL: linear SVM
# SGDClassifier with its default (hinge) loss trains a linear Support Vector
# Machine by stochastic gradient descent: the weights are nudged on every
# sample to reduce the cost. The number of passes over the data is left at the
# library default.
from sklearn import linear_model
# random_state pins the shuffling so the reported numbers are reproducible.
clf = linear_model.SGDClassifier(random_state=0)
# A single fit is enough; both predictions reuse the fitted model.
clf.fit(X_train_std, y_train)
y_pred = clf.predict(X_train_std)
y_pred_test = clf.predict(X_test_std)
print_results()

In [None]:
# MODEL: logistic regression
# loss='log_loss' selects logistic regression (this spelling needs
# scikit-learn >= 1.1; older releases called it 'log').
# alpha is the regularization parameter (library default is 0.0001); it
# penalizes the weights, in the hope of limiting overfitting - tinker with it.
# max_iter=50 with tol=None runs exactly 50 passes over the training data,
# which is what the removed `n_iter=50` argument used to mean.
# random_state fixes the seed so repeated runs give the same numbers.
# Ideas: loop through several alphas and graph the accuracy rate; graph the
# cost function to see when the number of iterations plateaus.
from sklearn import linear_model
clf = linear_model.SGDClassifier(
    loss='log_loss', max_iter=50, tol=None, alpha=0.00001, random_state=0)
# A single fit is enough; both predictions reuse the fitted model.
clf.fit(X_train_std, y_train)
y_pred = clf.predict(X_train_std)
y_pred_test = clf.predict(X_test_std)
print_results()

In [None]:
# MODEL: Naive Bayes
# Author's note: this improved the true positives - maybe use a different
# model specifically for true positives.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
# A single fit is enough; both predictions reuse the fitted model.
clf.fit(X_train_std, y_train)
y_pred = clf.predict(X_train_std)
y_pred_test = clf.predict(X_test_std)
print_results()

In [None]:
# MODEL: Perceptron
# SGDClassifier with loss='perceptron' trains a perceptron by stochastic
# gradient descent.
from sklearn import linear_model
# random_state pins the shuffling so the reported numbers are reproducible.
clf = linear_model.SGDClassifier(loss='perceptron', random_state=0)
# A single fit is enough; both predictions reuse the fitted model.
clf.fit(X_train_std, y_train)
y_pred = clf.predict(X_train_std)
y_pred_test = clf.predict(X_test_std)
print_results()