## Preprocessing and feature extraction

In [40]:
import sklearn, pandas
import library
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD, SparsePCA
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import average_precision_score, recall_score, f1_score, precision_score, classification_report
from sklearn.random_projection import sparse_random_matrix

df_train = pandas.read_csv('training.txt', delimiter = "\t", names = ["Review", "Rating"])
df_test = pandas.read_csv('test.txt', delimiter = "\t", names = ["Review", "Rating"])

text = df_train.Review
rating = df_train.Rating

text_test = df_test.Review
rating_test = df_test.Rating

X_train, y_train = library.balance_classes(text, rating)
X_test, y_test = library.balance_classes(text_test, rating_test)

#Learn vocabulary and idf from training set
vector = TfidfVectorizer().fit(X_train)
# transform the documents in the training data to a document-term matrix
X_train_vectorized = vector.transform(X_train)



# Task 1: 
### Run	5-fold	Cross	Validation	on	the	training.txt	using	5 learning	algorithms:  MLPClassifier(Neural network), MultinomialNB, LogisticRegression, AdaBoostClassifier, and SVC .  Report	the	average-precision,	average-recall	and	average-F1-scores.

## Neural Network Classifier, the optimal setting: hidden size (70, 50, 100), accuracy is 0.54

In [44]:
#Neural Network model
hsize = [(100,100), (70,50,100),(5,2),(5,)]

model_NN = []
for h in hsize:
    classifier_NN = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes= h, random_state=1)
    model_NN.append(classifier_NN)

X_train, X_val, y_train, y_val = train_test_split(X_train_vectorized,y_train,test_size = 0.3)

count = 0
for model in model_NN:
    model.fit(X_train, y_train)
    
    pred_NN = model.predict(X_val)
    scores = cross_val_score(model, X_train, y_train, cv = 5)
    prescore = precision_score(pred_NN, y_val, average='micro')
    rescore = recall_score(pred_NN, y_val, average='micro')
    f1score = f1_score(pred_NN, y_val, average='micro')
    
    print("hiden size: ", hsize[count], ' average cv score:', sum(scores)/len(scores))
    print("hiden size: ", hsize[count], ' average precision score:', prescore)
    print("hiden size: ", hsize[count], ' average recall score:', rescore)
    print("hiden size: ", hsize[count], ' average f1 score:', f1score)
    count +=1

hiden size:  (100, 100)  average cv score: 0.49569963290009866
hiden size:  (100, 100)  average precision score: 0.5188773805546275
hiden size:  (100, 100)  average recall score: 0.5188773805546275
hiden size:  (100, 100)  average f1 score: 0.5188773805546275
hiden size:  (70, 50, 100)  average cv score: 0.5269095480941693
hiden size:  (70, 50, 100)  average precision score: 0.5399264951553625
hiden size:  (70, 50, 100)  average recall score: 0.5399264951553625
hiden size:  (70, 50, 100)  average f1 score: 0.5399264951553625
hiden size:  (5, 2)  average cv score: 0.4717906923660453
hiden size:  (5, 2)  average precision score: 0.49381891079184764
hiden size:  (5, 2)  average recall score: 0.49381891079184764
hiden size:  (5, 2)  average f1 score: 0.49381891079184764
hiden size:  (5,)  average cv score: 0.4872504868403995
hiden size:  (5,)  average precision score: 0.4827931840962245
hiden size:  (5,)  average recall score: 0.4827931840962245
hiden size:  (5,)  average f1 score: 0.48279

## Naive Bayes Classifier, average cv score: 0.43

In [5]:
#naieve Base model
classifier_NB = MultinomialNB()

classifier_NB.fit(X_train_vectorized, y_train)
    
pred_NB = classifier_NB.predict(vector.transform(X_test))
scores = cross_val_score(model, X_train_vectorized, y_train, cv = 5)
prescore = precision_score(pred_NB, y_test, average='micro')
rescore = recall_score(pred_NB, y_test, average='micro')
f1score = f1_score(pred_NB, y_test, average='micro')
print('Naive Bayes Classifier: ')
print('average cv score:', sum(scores)/len(scores))
print('average precision score:', prescore)
print('average recall score:', rescore)
print('average f1 score:', f1score)


Naive Bayes Classifier: 
average cv score: 0.42736842105263156
average precision score: 0.506
average recall score: 0.506
average f1 score: 0.506


## Logistic Regression, the optimal setting: ('l2', 1.0), average score is  0.535

In [7]:
#Logistic regression model
penaty = ['l1', 'l2']
C = [1.0, 2.0, 3.0, 4.0]

model_LR = []
pc = []
for p in penaty:
    for c in C:
        classifier_LR = LogisticRegression(penalty = p, C = c)
        model_LR.append(classifier_LR)
        pc.append((p,c))

count = 0
for model in model_LR:
    model.fit(X_train_vectorized, y_train)
    
    pred_NN = model.predict(vector.transform(X_test))
    scores = cross_val_score(model, X_train_vectorized, y_train, cv = 5)
    prescore = precision_score(pred_NN, y_test, average='micro')
    rescore = recall_score(pred_NN, y_test, average='micro')
    f1score = f1_score(pred_NN, y_test, average='micro')
    
    print(pc[count], ' average cv score:', sum(scores)/len(scores))
    print(pc[count], ' average precision score:', prescore)
    print(pc[count], ' average recall score:', rescore)
    print(pc[count], ' average f1 score:', f1score)
    count +=1

('l1', 1.0)  average cv score: 0.4669674185463659
('l1', 1.0)  average precision score: 0.517
('l1', 1.0)  average recall score: 0.517
('l1', 1.0)  average f1 score: 0.517
('l1', 2.0)  average cv score: 0.4606516290726817
('l1', 2.0)  average precision score: 0.509
('l1', 2.0)  average recall score: 0.509
('l1', 2.0)  average f1 score: 0.509
('l1', 3.0)  average cv score: 0.4542355889724311
('l1', 3.0)  average precision score: 0.49
('l1', 3.0)  average recall score: 0.49
('l1', 3.0)  average f1 score: 0.49
('l1', 4.0)  average cv score: 0.44551378446115286
('l1', 4.0)  average precision score: 0.498
('l1', 4.0)  average recall score: 0.498
('l1', 4.0)  average f1 score: 0.498
('l2', 1.0)  average cv score: 0.4670676691729323
('l2', 1.0)  average precision score: 0.535
('l2', 1.0)  average recall score: 0.535
('l2', 1.0)  average f1 score: 0.535
('l2', 2.0)  average cv score: 0.46335839598997497
('l2', 2.0)  average precision score: 0.527
('l2', 2.0)  average recall score: 0.527
('l2',

## Adaboosting, the optimal setting: n_estimators = 50, average score is 0.47

In [8]:
#adaboosting model
n_estimators = [10, 20, 50, 100]

model_AB = []
for n in n_estimators:
    classifier_AB = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=n)
    model_AB.append(classifier_AB)

count = 0
for model in model_AB:
    model.fit(X_train_vectorized, y_train)
    
    pred = model.predict(vector.transform(X_test))
    scores = cross_val_score(model, X_train_vectorized, y_train, cv = 5)
    prescore = precision_score(pred, y_test, average='micro')
    rescore = recall_score(pred, y_test, average='micro')
    f1score = f1_score(pred, y_test, average='micro')
    
    print("n_estimators: ", n_estimators[count], ' average cv score:', sum(scores)/len(scores))
    print("n_estimators: ", n_estimators[count], ' average precision score:', prescore)
    print("n_estimators: ", n_estimators[count], ' average recall score:', rescore)
    print("n_estimators: ", n_estimators[count], ' average f1 score:', f1score)
    count +=1

n_estimators:  10  average cv score: 0.39458646616541354
n_estimators:  10  average precision score: 0.41
n_estimators:  10  average recall score: 0.41
n_estimators:  10  average f1 score: 0.41
n_estimators:  20  average cv score: 0.41563909774436086
n_estimators:  20  average precision score: 0.462
n_estimators:  20  average recall score: 0.462
n_estimators:  20  average f1 score: 0.462
n_estimators:  50  average cv score: 0.4140350877192982
n_estimators:  50  average precision score: 0.47
n_estimators:  50  average recall score: 0.47
n_estimators:  50  average f1 score: 0.47
n_estimators:  100  average cv score: 0.4109273182957393
n_estimators:  100  average precision score: 0.443
n_estimators:  100  average recall score: 0.443
n_estimators:  100  average f1 score: 0.443


## SVM, the opotimal setting: (1.0, 'linear') , average score is 0.528

In [10]:
#SVC model
C1 = [1.0, 2.0, 3.0, 4.0]
kernel = ['rbf', 'linear', 'poly', 'sigmoid']

model_svm = []
kc = []
for c in C1:
    for k in kernel:
        classifier_SVC = SVC(C=c, kernel = k)
        model_svm.append(classifier_SVC)
        kc.append((c,k))

count = 0
for model in model_svm:
    model.fit(X_train_vectorized, y_train)
    
    pred = model.predict(vector.transform(X_test))
    scores = cross_val_score(model, X_train_vectorized, y_train, cv = 5)
    prescore = precision_score(pred, y_test, average='micro')
    rescore = recall_score(pred, y_test, average='micro')
    f1score = f1_score(pred, y_test, average='micro')
    
    print(kc[count], ' average cv score:', sum(scores)/len(scores))
    print(kc[count], ' average precision score:', prescore)
    print(kc[count], ' average recall score:', rescore)
    print(kc[count], ' average f1 score:', f1score)
    count +=1


(1.0, 'rbf')  average cv score: 0.3625062656641604
(1.0, 'rbf')  average precision score: 0.396
(1.0, 'rbf')  average recall score: 0.396
(1.0, 'rbf')  average f1 score: 0.396
(1.0, 'linear')  average cv score: 0.4719799498746868
(1.0, 'linear')  average precision score: 0.528
(1.0, 'linear')  average recall score: 0.528
(1.0, 'linear')  average f1 score: 0.528
(1.0, 'poly')  average cv score: 0.2693734335839599
(1.0, 'poly')  average precision score: 0.301
(1.0, 'poly')  average recall score: 0.301
(1.0, 'poly')  average f1 score: 0.301
(1.0, 'sigmoid')  average cv score: 0.36260651629072677
(1.0, 'sigmoid')  average precision score: 0.396
(1.0, 'sigmoid')  average recall score: 0.396
(1.0, 'sigmoid')  average f1 score: 0.396
(2.0, 'rbf')  average cv score: 0.3625062656641604
(2.0, 'rbf')  average precision score: 0.396
(2.0, 'rbf')  average recall score: 0.396
(2.0, 'rbf')  average f1 score: 0.396
(2.0, 'linear')  average cv score: 0.46466165413533833
(2.0, 'linear')  average precisi

# Task 2: 
## 1. Perform feature selection using PCA and re-run the algorithms with their optimal settings
## 2. But PCA does not support sparse input, therefore here I am using TruncatedSVD instead to reduce the dimentionality.
## Result:
### 1. n_components increases, the cross validation score is higher.
### 2. In this example, n_component = 10, and model is neural network, the score is the highest.

In [41]:
#classifier model with optimal settings
classifier_NN = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes= (70, 50, 100), random_state=1)
classifier_NB = MultinomialNB()
classifier_LR = LogisticRegression(penalty = 'l2', C = 1.0)
classifier_AB = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=50)
classifier_SVC = SVC(C=1.0, kernel = 'linear')

models = [classifier_NN, classifier_LR,classifier_AB, classifier_SVC]
n_component = [2, 3, 4, 5, 10, 50, 100]
for n in n_component:
    svd = TruncatedSVD(n_components=n)
    svd.fit(X_train_vectorized)
    X = svd.transform(X_train_vectorized)

    for model in models:
        model.fit(X, y_train)
        scores = cross_val_score(model, X, y_train, cv = 5)
        print("n_component: ", n,' average cv score:', sum(scores)/len(scores))


n_component:  2  average cv score: 0.24010025062656642
n_component:  2  average cv score: 0.2218546365914787
n_component:  2  average cv score: 0.2336842105263158
n_component:  2  average cv score: 0.2270676691729323
n_component:  3  average cv score: 0.3237092731829574
n_component:  3  average cv score: 0.2771929824561404
n_component:  3  average cv score: 0.29834586466165414
n_component:  3  average cv score: 0.2744862155388471
n_component:  4  average cv score: 0.3651127819548872
n_component:  4  average cv score: 0.3413533834586466
n_component:  4  average cv score: 0.3335338345864662
n_component:  4  average cv score: 0.34205513784461156
n_component:  5  average cv score: 0.361203007518797
n_component:  5  average cv score: 0.351077694235589
n_component:  5  average cv score: 0.34135338345864663
n_component:  5  average cv score: 0.35328320802005014
n_component:  10  average cv score: 0.41423558897243107
n_component:  10  average cv score: 0.3991979949874687
n_component:  10  aver

# Task 3: 
## 1. Sentiment words processing
## 2. Use sentiment dataset positive.txt and netgative.txt to filter review text
## 3. 	Evaluate the algorithms on test.txt dataset

# Result: the accuracy doesn't become better

In [11]:
import sklearn, pandas
import library
from sklearn.feature_extraction.text import TfidfVectorizer
      
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text
 
# load the document
filename = 'positive-words.txt'
text_p = load_doc(filename)
filename = 'negative-words.txt'
text_n = load_doc(filename)
# merge two texts into one text
text = text_p+text_n

tokens = text.split()

a = df_test["Review"][0]
y = a.split(" ")
print(y)

l = []
for i in range(len(df_test)):
    a = df_test["Review"][i]
    for n in tokens:
        if n in a.split(" "):
            l.append(n)
    df_test["Review"][i] = " ".join(l)
    l = []

print(df_test.head())
                      

['Have', 'only', 'done', 'tapas', 'here.', '\xa0', 'Must.', '\xa0Get.', '\xa0Ham', 'croquettes.', '\xa0', 'Those', 'and', 'the', 'stuffed', 'plantains.', '\xa0Lollipop', 'chicken', 'good,', 'ribs', 'good,', 'Cuban', 'sandwich', 'ace,', 'empanadas', 'solid.', '\xa0Reasonably', 'priced,', 'service', 'attentive,', 'nice', 'walkway', 'to', 'the', 'restaurant.']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


                                              Review  Rating
0                                               nice     4.0
1  amazing attractively generous happy impressive...     5.0
2       attentive enough like nice ready loud scream     3.0
3                                    good liked loud     4.0
4                                     fun lead right     4.0


In [17]:
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

text_test_new = df_test.Review
rating_test_new = df_test.Rating

X_test_new, y_test_new = library.balance_classes(text_test_new, rating_test_new)
#Learn vocabulary and idf from training set
vector = TfidfVectorizer().fit(X_test_new)
# transform the documents in the training data to a document-term matrix
X_test_vectorized = vector.transform(X_test_new)

#classifier model with optimal settings
classifier_NN = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes= (70, 50, 100), random_state=1)
classifier_NB = MultinomialNB()
classifier_LR = LogisticRegression(penalty = 'l2', C = 1.0)
classifier_AB = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=50)
classifier_SVC = SVC(C=1.0, kernel = 'linear')

models = [classifier_NN, classifier_NB, classifier_LR,classifier_AB, classifier_SVC]

X_train1, X_val1, y_train1, y_val1 = train_test_split(X_test_vectorized, y_test_new,test_size = 0.3)

#Output the precision, recall and f1-score
for model in models:
    model.fit(X_train1, y_train1)  
    pred = model.predict(X_val1)
    scores = cross_val_score(model, X_train1, y_train1, cv = 5)
    prescore = precision_score(pred, y_val1, average='micro')
    rescore = recall_score(pred, y_val1, average='micro')
    f1score = f1_score(pred, y_val1, average='micro')

    print('average cv score:', sum(scores)/len(scores))
    print('average precision score:', prescore)
    print('average recall score:', rescore)
    print('average f1 score:', f1score)

average cv score: 0.3390121411120486
average precision score: 0.35
average recall score: 0.35
average f1 score: 0.35
average cv score: 0.3972283190137029
average precision score: 0.4066666666666667
average recall score: 0.4066666666666667
average f1 score: 0.4066666666666667
average cv score: 0.408597390891563
average precision score: 0.38333333333333336
average recall score: 0.38333333333333336
average f1 score: 0.38333333333333336
average cv score: 0.30872142042354805
average precision score: 0.2966666666666667
average recall score: 0.2966666666666667
average f1 score: 0.2966666666666667
average cv score: 0.4142724404796561
average precision score: 0.3566666666666667
average recall score: 0.3566666666666667
average f1 score: 0.35666666666666674


# Task 4
## 1. Perform evaluation on	the	test dataset using the optimal parameter settings that	 were obtained from the training set
## 2. Report its precision, recall and f-scores
## 3. For the best performing algorithm(Logistic Regression), compute precision and recall for every rating score separately
## Which	types	of	reviews	were	the	hardest	to	predict?


In [7]:
#classifier model with optimal settings
classifier_NN = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes= (70, 50, 100), random_state=1)
classifier_NB = MultinomialNB()
classifier_LR = LogisticRegression(penalty = 'l2', C = 1.0)
classifier_AB = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=50)
classifier_SVC = SVC(C=1.0, kernel = 'linear')

models = [classifier_NN, classifier_NB, classifier_LR,classifier_AB, classifier_SVC]

# transform the documents in the testing data to a document-term matrix
X_test_vectorized = vector.transform(X_test)
X_train, X_val, y_train, y_val = train_test_split(X_test_vectorized, y_test,test_size = 0.3)

#Output the precision, recall and f1-score
for model in models:
    model.fit(X_train, y_train)  
    pred = model.predict(X_val)
    scores = cross_val_score(model, X_train, y_train, cv = 5)
    prescore = precision_score(pred, y_val, average='micro')
    rescore = recall_score(pred, y_val, average='micro')
    f1score = f1_score(pred, y_val, average='micro')

    print('average cv score:', sum(scores)/len(scores))
    print('average precision score:', prescore)
    print('average recall score:', rescore)
    print('average f1 score:', f1score)
    

average cv score: 0.4927638277906814
average precision score: 0.4666666666666667
average recall score: 0.4666666666666667
average f1 score: 0.4666666666666667
average cv score: 0.502825343795269
average precision score: 0.5
average recall score: 0.5
average f1 score: 0.5
average cv score: 0.502928277174745
average precision score: 0.5133333333333333
average recall score: 0.5133333333333333
average f1 score: 0.5133333333333333
average cv score: 0.3487336310415127
average precision score: 0.37666666666666665
average recall score: 0.37666666666666665
average f1 score: 0.37666666666666665
average cv score: 0.5085199555434283
average precision score: 0.49666666666666665
average recall score: 0.49666666666666665
average f1 score: 0.49666666666666665


In [38]:
classifier_LR = LogisticRegression(penalty = 'l2', C = 1.0)

X_test_vectorized = vector.transform(X_test)
X_train, X_val, y_train, y_val = train_test_split(X_test_vectorized, y_test,test_size = 0.3)

classifier_LR.fit(X_train, y_train)  
pred = classifier_LR.predict(X_val)

precision = precision_score(pred, y_val, average=None)
recall = recall_score(pred, y_val, average=None)
print('                   score 1,  score 2,   score 3,   score 4,   socre 5')
print('precision score:', precision)
print('recall score:   ', recall)
    

                   score 1,  score 2,   score 3,   score 4,   socre 5
precision score: [0.57142857 0.51612903 0.27777778 0.37096774 0.71186441]
recall score:    [0.52941176 0.47058824 0.30612245 0.60526316 0.54545455]


### From the result, the rating score of the review is 3, the precision and recall is not good. That means if the the reviews are neural, don't contain sentiment(positive or negative) words, then they are hard to predict the rating score.

# Task 5
## Discuss	some ideas that could help improve	your prediction. 
### 1. Use sentiment analysis, we capture semantic similarities between words, and capture the sentiments of individual words within a review, in this way, it may help improve the accuracy of prediction. 
### 2. Use natural language processing method, such as Latent Semantic Analysis, LSA can finds hidden relationship between words in order to improve information  understanding. 