In [2]:
import nltk
# nltk.download()

In [3]:
files=['amazon_cells_labelled','imdb_labelled','yelp_labelled']
dataset={} #structure {'amazon': [[bad comments],[good comments]], ...}
for file in files:
    dataset[file]=[[] for i in range(2)]
    f = open(file+'.txt')
    for line in f:
        comment, label=line.split('\t')
        dataset[file][int(label)].append(comment)
    
for key, value in dataset.items():
    print('In',key,':',len(value[0]),'bad comments',len(value[1]),'good comments')

In amazon_cells_labelled : 500 bad comments 500 good comments
In imdb_labelled : 500 bad comments 500 good comments
In yelp_labelled : 500 bad comments 500 good comments


In [4]:
stopwords=nltk.corpus.stopwords.words('english')
stemmer=nltk.SnowballStemmer("english")
import re
for key, value in dataset.items():
    for comments in value:
        for i in range(len(comments)):
            lowered=comments[i].lower()
            word_list=re.findall(r"[a-z]+", lowered)
            filtered_words = [stemmer.stem(word) for word in word_list if word not in stopwords]
            comments[i] = filtered_words

In [5]:
trainData, trainLabel, testData, testLabel=[], [], [], []
for key, value in dataset.items():
    for i in range(2):
        trainData+=value[i][:400]
        trainLabel+=[i]*400
        testData+=value[i][400:]
        testLabel+=[i]*100

In [6]:
words=set()
for comment in trainData: words.update(set(comment))
words=list(words)
BoWTrainData=[]
for comment in trainData: 
    bag =[0 for _ in range(len(words))]
    for word in comment: 
        if word in words: bag[words.index(word)]+=1
    BoWTrainData.append(bag)
BoWTestData=[]
for comment in testData: 
    bag =[0 for _ in range(len(words))]
    for word in comment: 
        if word in words: bag[words.index(word)]+=1
    BoWTestData.append(bag)
# for i in range(2): print(BoWTrainData[i])

  We choose to do log-normalization.
  In the context of product review, the label of whether the review of the product is good or bad
  should align with human intuition, that is, the more positive and optimistic words in the review
  the more positive the review it is. However, a more confusing review will be comprehensive one,
  where people write both positive and negative comments all in one. In this case, we the best way
  we can tell the sentiment is to check the ratio of positive comment to negative comment. If we log
  them, we are making the difference of the two parts less noticeable if the comments are significantly
  long in terms of total word counts, and this is what we need, and it decreases the variance.
  And the reason of not using the others is the following:
  l1-norm is adjusting the weight of each word with respect to the length of comment linearly, which also make the difference of the two parts less noticeable
  however, it does not have the log property which make the difference addressable when comments are short
  but not when it is significantly long. l2-norm did it in an exponential way, which is not satisfactory
  when comment length vary when the length is small, say less than 5 critical words.
  The last approach of standardizing the data by subtracting the mean and dividing by the variance
  will do the exactly the reverse which increase the variance because our data are either positive
  or zero, which will gives a variance in between 0 and 1 which finally increase the relative
  distance of the data drastically if data are divided by the variance.

In [7]:
import numpy as np
logNormedBoWTrainData=np.log(np.array(BoWTrainData)+1)
logNormedBoWTestData=np.log(np.array(BoWTestData)+1)

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix,accuracy_score

logistic = LogisticRegression().fit(logNormedBoWTrainData, trainLabel)
gnb = GaussianNB().fit(logNormedBoWTrainData, trainLabel)

weight_vector = logistic.coef_
sorted_weight = [indexweight[0] for indexweight in sorted(enumerate(weight_vector[0]), key = lambda x: x[1])]
for i in range(10):
    print(words[sorted_weight[i]])
# TODO PRINT "THE TOP 10 MOST IMPORTANT" WORD STEM?

logistic_pred = logistic.predict(logNormedBoWTestData)
gnb_pred = gnb.predict(logNormedBoWTestData)
print('Using Logistic Regression:')
print(accuracy_score(testLabel, logistic_pred))
print(confusion_matrix(testLabel,logistic_pred))
print('Using Gaussian Naive Bayes:')
print(accuracy_score(testLabel, gnb_pred))
print(confusion_matrix(testLabel,gnb_pred))

bad
poor
worst
terribl
wast
slow
suck
aw
disappoint
stupid
Using Logistic Regression:
0.821666666667
[[265  35]
 [ 72 228]]
Using Gaussian Naive Bayes:
0.638333333333
[[267  33]
 [184 116]]


In [14]:
def NgramDataTransformation(n, row):
    ret = []
    for i in range(len(row) - n):
        stringToAdd = ""
        for j in range(n):
            stringToAdd += row[i+j]
        ret.append(stringToAdd)
    return ret

newTrainingData = [NgramDataTransformation(2, row) for row in trainData]
newTestData = [NgramDataTransformation(2, row) for row in testData]

words=set()
for comment in newTrainingData: words.update(set(comment))
words=list(words)
BoWnewTrainData=[]
for comment in newTrainingData: 
    bag =[0 for _ in range(len(words))]
    for word in comment: 
        if word in words: bag[words.index(word)]+=1
    BoWnewTrainData.append(bag)
BoWnewTestData=[]
for comment in newTestData: 
    bag =[0 for _ in range(len(words))]
    for word in comment: 
        if word in words: bag[words.index(word)]+=1
    BoWnewTestData.append(bag)
# for i in range(2): print(BoWnewTrainData[i])

In [15]:
import numpy as np
logNormedBoWnewTrainData=np.log(np.array(BoWnewTrainData)+1)
logNormedBoWnewTestData=np.log(np.array(BoWnewTestData)+1)

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix,accuracy_score

logistic = LogisticRegression().fit(logNormedBoWnewTrainData, trainLabel)
gnb = GaussianNB().fit(logNormedBoWnewTrainData, trainLabel)
logistic_pred = logistic.predict(logNormedBoWnewTestData)
gnb_pred = gnb.predict(logNormedBoWnewTestData)
print('Using Logistic Regression:')
print(accuracy_score(testLabel, logistic_pred))
print(confusion_matrix(testLabel,logistic_pred))
print('Using Gaussian Naive Bayes:')
print(accuracy_score(testLabel, gnb_pred))
print(confusion_matrix(testLabel,gnb_pred))

Using Logistic Regression:
0.571666666667
[[271  29]
 [228  72]]
Using Gaussian Naive Bayes:
0.576666666667
[[283  17]
 [237  63]]


In [17]:
mean=logNormedBoWTrainData.mean(0)
_,_,vTrain=np.linalg.svd(logNormedBoWTrainData - mean[np.newaxis, :])

In [21]:
def pca_implementation(q):
    matrix_w=vTrain[:q, :]

    reducedTrain = (logNormedBoWTrainData - mean[np.newaxis, :]).dot(matrix_w.T)
    reducedTest = (logNormedBoWTestData - mean[np.newaxis, :]).dot(matrix_w.T)

    logistic = LogisticRegression().fit(reducedTrain, trainLabel)
    gnb = GaussianNB().fit(reducedTrain, trainLabel)
    logistic_pred = logistic.predict(reducedTest)
    gnb_pred = gnb.predict(reducedTest)
    print('Using Logistic Regression:')
    print(accuracy_score(testLabel, logistic_pred))
    print(confusion_matrix(testLabel,logistic_pred))
    print('Using Gaussian Naive Bayes:')
    print(accuracy_score(testLabel, gnb_pred))
    print(confusion_matrix(testLabel,gnb_pred))
    
print("q = 10:")
pca_implementation(10)
print("q = 50:")
pca_implementation(50)
print("q = 100:")
pca_implementation(100)

q = 10:
Using Logistic Regression:
0.61
[[248  52]
 [182 118]]
Using Gaussian Naive Bayes:
0.586666666667
[[238  62]
 [186 114]]
q = 50:
Using Logistic Regression:
0.696666666667
[[256  44]
 [138 162]]
Using Gaussian Naive Bayes:
0.64
[[239  61]
 [155 145]]
q = 100:
Using Logistic Regression:
0.708333333333
[[240  60]
 [115 185]]
Using Gaussian Naive Bayes:
0.66
[[251  49]
 [155 145]]


In [22]:
from sklearn.decomposition import PCA
def pca_library(q):
    pca = PCA(n_components=q,svd_solver='full').fit(logNormedBoWTrainData)
    reducedTrain=pca.transform(logNormedBoWTrainData)
    # reducedTest=pca.transform(logNormedBoWTestData)
    reducedTest=(logNormedBoWTestData - pca.mean_[np.newaxis, :]).dot(pca.components_.T)

    logistic = LogisticRegression().fit(reducedTrain, trainLabel)
    gnb = GaussianNB().fit(reducedTrain, trainLabel)
    logistic_pred = logistic.predict(reducedTest)
    gnb_pred = gnb.predict(reducedTest)
    print('Using Logistic Regression:')
    print(accuracy_score(testLabel, logistic_pred))
    print(confusion_matrix(testLabel,logistic_pred))
    print('Using Gaussian Naive Bayes:')
    print(accuracy_score(testLabel, gnb_pred))
    print(confusion_matrix(testLabel,gnb_pred))

print("q = 10:")
pca_library(10)
print("q = 50:")
pca_library(50)
print("q = 100:")
pca_library(100)

q = 10:
Using Logistic Regression:
0.61
[[248  52]
 [182 118]]
Using Gaussian Naive Bayes:
0.586666666667
[[238  62]
 [186 114]]
q = 50:
Using Logistic Regression:
0.696666666667
[[256  44]
 [138 162]]
Using Gaussian Naive Bayes:
0.64
[[239  61]
 [155 145]]
q = 100:
Using Logistic Regression:
0.708333333333
[[240  60]
 [115 185]]
Using Gaussian Naive Bayes:
0.66
[[251  49]
 [155 145]]


Run logistic regression on bag of words performs the best in the prediction task, which has 0.8216
accuracy rate, because naive bayes will be better only when features are independent, but
also when dependencies of features from each other are similar between features, but this is
not the case in the context of reviews. 2-gram is worse because the combination of the adjacent
words provides less additional information on classifying the comment sentiment but strip more
relevant information provided by single word that is critical. That explains the reason that Logistic
regression and naive bayes are worse off after applying 2-gram. For PCA for bag of words, it reduces
dimensionality by performing SVD on rank q approximation, at the mean time, it reduces the
computation but also throw away less relevant features, therefore, it is reasonable to get less
accurate results. However, they are not off by a lot.