# COMP 8745 Final Project

In [54]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import KFold

In [137]:
data_train = pd.read_csv('training.txt', sep='\t', header=None,engine='python', error_bad_lines=False, warn_bad_lines=False)
data_train.columns = ['Reviews','Ratings']

data_test = pd.read_csv('test.txt', sep='\t',header=None,engine='python', error_bad_lines=False, warn_bad_lines=False)
data_test.columns = ['Reviews', 'Ratings']

X_train = data_train['Reviews']
y_train = data_train['Ratings']

X_test = data_test['Reviews']
y_test = data_test['Ratings']



In [179]:
positive_text = pd.read_csv('positive-words.txt', sep='delimiter', header=None,engine='python', 
                            error_bad_lines=False, warn_bad_lines=False)

positive_text.columns = ['sentiment_words']
pos_data = positive_text['sentiment_words']

negative_text = pd.read_csv('negative-words.txt', sep='delimiter', header=None,engine='python', 
                            error_bad_lines=False, warn_bad_lines=False)

negative_text.columns = ['sentiment_words']
neg_data = negative_text['sentiment_words']

filter_data = pos_data.append(neg_data)

In [55]:
kf = KFold(n_splits=5)

## Implementation of bag of words model using TfidfVectorizer

In [235]:
vectorizer = TfidfVectorizer(stop_words='english')
vector_train = vectorizer.fit_transform(X_train)

vector_test = vectorizer.transform(X_test)

# Task 1

## Neural Nets

In [165]:
hidden_layers = [(25, 25), (25, 25, 25), (30, 30, 30), (40, 40, 40), (40, 40, 40, 40), (50, 50, 50, 50), (100, 100, 100, 100)] 

### 5-Fold Cross Validation

In [122]:
fold = 1;
for train_index, test_index in kf.split(X_train):
    print('------------ Fold {} -----------'.format(fold))
    X_traincv, X_testcv = vector_train[train_index], vector_train[test_index]
    y_traincv, y_testcv = y_train[train_index], y_train[test_index]
    for layer in hidden_layers:
        nn = MLPClassifier(hidden_layer_sizes=layer, learning_rate_init=0.01)
        nn.fit(X_traincv, y_traincv)
        y_predcv = nn.predict(X_testcv)
        #accuracy_scores.append(metrics.accuracy_score(y_testcv, y_predcv))
        print('--------- Scores for hidden layers {}---------'.format(layer))
        print(metrics.classification_report(y_testcv, y_predcv))
    fold = fold + 1

------------ Fold 1 -----------
--------- Scores for hidden layers (25, 25)---------
             precision    recall  f1-score   support

        1.0       0.65      0.62      0.63       421
        2.0       0.32      0.39      0.35       311
        3.0       0.27      0.46      0.34       320
        4.0       0.42      0.29      0.34       481
        5.0       0.57      0.42      0.48       463

avg / total       0.46      0.43      0.44      1996

--------- Scores for hidden layers (25, 25, 25)---------
             precision    recall  f1-score   support

        1.0       0.59      0.62      0.61       421
        2.0       0.33      0.43      0.37       311
        3.0       0.34      0.33      0.34       320
        4.0       0.39      0.39      0.39       481
        5.0       0.55      0.43      0.48       463

avg / total       0.45      0.44      0.45      1996

--------- Scores for hidden layers (30, 30, 30)---------
             precision    recall  f1-score   support


--------- Scores for hidden layers (50, 50, 50, 50)---------
             precision    recall  f1-score   support

        1.0       0.35      0.62      0.45       118
        2.0       0.22      0.41      0.28       153
        3.0       0.29      0.29      0.29       279
        4.0       0.48      0.41      0.44       702
        5.0       0.58      0.49      0.53       743

avg / total       0.47      0.44      0.44      1995

--------- Scores for hidden layers (100, 100, 100, 100)---------
             precision    recall  f1-score   support

        1.0       0.34      0.59      0.43       118
        2.0       0.22      0.49      0.31       153
        3.0       0.27      0.36      0.31       279
        4.0       0.49      0.41      0.45       702
        5.0       0.60      0.39      0.47       743

avg / total       0.47      0.41      0.42      1995

------------ Fold 4 -----------
--------- Scores for hidden layers (25, 25)---------
             precision    recall  f1-scor

  'recall', 'true', average, warn_for)


--------- Scores for hidden layers (25, 25, 25)---------
             precision    recall  f1-score   support

        1.0       0.75      0.56      0.64       817
        2.0       0.49      0.37      0.42       788
        3.0       0.37      0.30      0.33       390
        4.0       0.00      0.00      0.00         0
        5.0       0.00      0.00      0.00         0

avg / total       0.57      0.43      0.49      1995

--------- Scores for hidden layers (30, 30, 30)---------
             precision    recall  f1-score   support

        1.0       0.79      0.51      0.62       817
        2.0       0.50      0.39      0.44       788
        3.0       0.30      0.33      0.31       390
        4.0       0.00      0.00      0.00         0
        5.0       0.00      0.00      0.00         0

avg / total       0.58      0.42      0.49      1995

--------- Scores for hidden layers (40, 40, 40)---------
             precision    recall  f1-score   support

        1.0       0.79     

## Naive Bayes

### 5-Fold Cross Validation

In [110]:
fold = 1
for train_index, test_index in kf.split(X_train):
    print('------------ Fold {} -----------'.format(fold))
    X_traincv, X_testcv = vector_train[train_index], vector_train[test_index]
    y_traincv, y_testcv = y_train[train_index], y_train[test_index]
    nb = MultinomialNB()
    nb.fit(X_traincv, y_traincv)
    y_predcv = nb.predict(X_testcv)
    fold = fold + 1
    print(metrics.classification_report(y_testcv, y_predcv))

------------ Fold 1 -----------
             precision    recall  f1-score   support

        1.0       0.68      0.53      0.60       421
        2.0       0.29      0.70      0.41       311
        3.0       0.24      0.41      0.30       320
        4.0       0.41      0.11      0.18       481
        5.0       0.67      0.34      0.45       463

avg / total       0.48      0.39      0.39      1996

------------ Fold 2 -----------
             precision    recall  f1-score   support

        1.0       0.54      0.72      0.62       415
        2.0       0.38      0.50      0.43       424
        3.0       0.33      0.38      0.35       395
        4.0       0.43      0.22      0.29       417
        5.0       0.64      0.42      0.51       345

avg / total       0.46      0.45      0.44      1996

------------ Fold 3 -----------
             precision    recall  f1-score   support

        1.0       0.30      0.69      0.42       118
        2.0       0.11      0.59      0.18       

  'recall', 'true', average, warn_for)


## Logistic Regression

### 5-Fold Cross Validation

#### Logistic Regression using L1 Regularization

In [114]:
C = [0.01, 0.1, 1, 10, 100]
fold = 1;
print('--------------- Logistic Regression using L1 Regularization ----------------')
for train_index, test_index in kf.split(X_train):
    print('------------ Fold {} -----------'.format(fold))
    X_traincv, X_testcv = vector_train[train_index], vector_train[test_index]
    y_traincv, y_testcv = y_train[train_index], y_train[test_index]
    for c in C:
        lgr_l1 = LogisticRegression(penalty='l1', C=c)
        lgr_l1.fit(X_traincv, y_traincv)
        y_predcv = lgr_l1.predict(X_testcv) 
        print('---------- Scores using strength parameter C = {} -----------'.format(c))
        print(metrics.classification_report(y_testcv, y_predcv))
    fold = fold + 1

--------------- Logistic Regression using L1 Regularization ----------------
------------ Fold 1 -----------
---------- Scores using strength parameter C = 0.01 -----------


  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       421
        2.0       0.16      1.00      0.27       311
        3.0       0.00      0.00      0.00       320
        4.0       0.00      0.00      0.00       481
        5.0       0.00      0.00      0.00       463

avg / total       0.02      0.16      0.04      1996

---------- Scores using strength parameter C = 0.1 -----------
             precision    recall  f1-score   support

        1.0       0.61      0.34      0.44       421
        2.0       0.23      0.61      0.33       311
        3.0       0.25      0.30      0.27       320
        4.0       0.44      0.03      0.06       481
        5.0       0.53      0.57      0.55       463

avg / total       0.43      0.36      0.33      1996

---------- Scores using strength parameter C = 1 -----------
             precision    recall  f1-score   support

        1.0       0.61      0.78      0.68       421
        2.0       0.3

---------- Scores using strength parameter C = 100 -----------
             precision    recall  f1-score   support

        1.0       0.46      0.59      0.52       225
        2.0       0.27      0.37      0.31       318
        3.0       0.47      0.32      0.38       608
        4.0       0.31      0.33      0.32       398
        5.0       0.50      0.49      0.49       446

avg / total       0.41      0.40      0.40      1995

------------ Fold 5 -----------
---------- Scores using strength parameter C = 0.01 -----------


  'recall', 'true', average, warn_for)


             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       817
        2.0       0.00      0.00      0.00       788
        3.0       0.00      0.00      0.00       390
        4.0       0.00      0.00      0.00         0

avg / total       0.00      0.00      0.00      1995

---------- Scores using strength parameter C = 0.1 -----------
             precision    recall  f1-score   support

        1.0       0.83      0.12      0.21       817
        2.0       0.00      0.00      0.00       788
        3.0       0.32      0.05      0.09       390
        4.0       0.00      0.00      0.00         0
        5.0       0.00      0.00      0.00         0

avg / total       0.40      0.06      0.10      1995

---------- Scores using strength parameter C = 1 -----------
             precision    recall  f1-score   support

        1.0       0.77      0.68      0.73       817
        2.0       0.61      0.23      0.33       788
        3.0       0.3

#### Logistic Regression using L2 Regularization

In [115]:
fold = 1;
print('--------------- Logistic Regression using L2 Regularization ----------------')
for train_index, test_index in kf.split(X_train):
    print('------------ Fold {} -----------'.format(fold))
    X_traincv, X_testcv = vector_train[train_index], vector_train[test_index]
    y_traincv, y_testcv = y_train[train_index], y_train[test_index]
    for c in C:
        lgr_l1 = LogisticRegression(penalty='l2', C=c)
        lgr_l1.fit(X_traincv, y_traincv)
        y_predcv = lgr_l1.predict(X_testcv) 
        print('---------- Scores using strength parameter C = {} -----------'.format(c))
        print(metrics.classification_report(y_testcv, y_predcv))
    fold = fold + 1

--------------- Logistic Regression using L2 Regularization ----------------
------------ Fold 1 -----------
---------- Scores using strength parameter C = 0.01 -----------
             precision    recall  f1-score   support

        1.0       0.77      0.19      0.31       421
        2.0       0.24      0.85      0.38       311
        3.0       0.21      0.39      0.27       320
        4.0       0.00      0.00      0.00       481
        5.0       0.68      0.30      0.42       463

avg / total       0.39      0.31      0.26      1996



  'precision', 'predicted', average, warn_for)


---------- Scores using strength parameter C = 0.1 -----------
             precision    recall  f1-score   support

        1.0       0.59      0.67      0.63       421
        2.0       0.31      0.55      0.40       311
        3.0       0.30      0.40      0.34       320
        4.0       0.54      0.11      0.18       481
        5.0       0.61      0.59      0.60       463

avg / total       0.49      0.45      0.43      1996

---------- Scores using strength parameter C = 1 -----------
             precision    recall  f1-score   support

        1.0       0.61      0.75      0.67       421
        2.0       0.34      0.44      0.39       311
        3.0       0.31      0.37      0.34       320
        4.0       0.47      0.24      0.32       481
        5.0       0.63      0.60      0.61       463

avg / total       0.49      0.48      0.47      1996

---------- Scores using strength parameter C = 10 -----------
             precision    recall  f1-score   support

        1.0 

---------- Scores using strength parameter C = 0.01 -----------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       817
        2.0       0.00      0.00      0.00       788
        3.0       0.00      0.00      0.00       390
        4.0       0.00      0.00      0.00         0
        5.0       0.00      0.00      0.00         0

avg / total       0.00      0.00      0.00      1995



  'recall', 'true', average, warn_for)


---------- Scores using strength parameter C = 0.1 -----------
             precision    recall  f1-score   support

        1.0       0.90      0.31      0.46       817
        2.0       0.86      0.01      0.02       788
        3.0       0.22      0.22      0.22       390
        4.0       0.00      0.00      0.00         0
        5.0       0.00      0.00      0.00         0

avg / total       0.75      0.17      0.24      1995

---------- Scores using strength parameter C = 1 -----------
             precision    recall  f1-score   support

        1.0       0.78      0.70      0.74       817
        2.0       0.63      0.22      0.33       788
        3.0       0.31      0.42      0.36       390
        4.0       0.00      0.00      0.00         0
        5.0       0.00      0.00      0.00         0

avg / total       0.63      0.46      0.50      1995

---------- Scores using strength parameter C = 10 -----------
             precision    recall  f1-score   support

        1.0 

## AdaBoosting

### 5-Fold Cross Validation

In [103]:
fold = 1
for train_index, test_index in kf.split(X_train):
    print('------------ Fold {} -----------'.format(fold))
    X_traincv, X_testcv = vector_train[train_index], vector_train[test_index]
    y_traincv, y_testcv = y_train[train_index], y_train[test_index]
    for n in range(40, 110, 10):
        adb = AdaBoostClassifier(n_estimators=n)
        adb.fit(X_traincv, y_traincv)
        y_predcv = adb.predict(X_testcv)
        print('--------- Scores for {} estimators---------'.format(n))
        print(metrics.classification_report(y_testcv, y_predcv))
    fold = fold + 1

------------ Fold 1 -----------
--------- Scores for 40 estimator---------
             precision    recall  f1-score   support

        1.0       0.47      0.80      0.59       421
        2.0       0.34      0.19      0.24       311
        3.0       0.31      0.35      0.33       320
        4.0       0.46      0.24      0.32       481
        5.0       0.54      0.58      0.56       463

avg / total       0.44      0.45      0.42      1996

--------- Scores for 50 estimator---------
             precision    recall  f1-score   support

        1.0       0.64      0.57      0.61       421
        2.0       0.30      0.50      0.38       311
        3.0       0.32      0.36      0.34       320
        4.0       0.46      0.25      0.32       481
        5.0       0.54      0.56      0.55       463

avg / total       0.47      0.45      0.45      1996

--------- Scores for 60 estimator---------
             precision    recall  f1-score   support

        1.0       0.65      0.57     

--------- Scores for 60 estimator---------
             precision    recall  f1-score   support

        1.0       0.48      0.55      0.51       225
        2.0       0.30      0.45      0.36       318
        3.0       0.63      0.24      0.34       608
        4.0       0.34      0.44      0.39       398
        5.0       0.49      0.56      0.52       446

avg / total       0.47      0.42      0.41      1995

--------- Scores for 70 estimator---------
             precision    recall  f1-score   support

        1.0       0.39      0.67      0.50       225
        2.0       0.33      0.36      0.35       318
        3.0       0.61      0.25      0.35       608
        4.0       0.34      0.43      0.38       398
        5.0       0.50      0.57      0.53       446

avg / total       0.46      0.42      0.41      1995

--------- Scores for 80 estimator---------
             precision    recall  f1-score   support

        1.0       0.43      0.65      0.52       225
        2.0     

  'recall', 'true', average, warn_for)


--------- Scores for 50 estimator---------
             precision    recall  f1-score   support

        1.0       0.74      0.49      0.59       817
        2.0       0.53      0.23      0.32       788
        3.0       0.28      0.36      0.32       390
        4.0       0.00      0.00      0.00         0
        5.0       0.00      0.00      0.00         0

avg / total       0.57      0.36      0.43      1995



  'recall', 'true', average, warn_for)


--------- Scores for 60 estimator---------
             precision    recall  f1-score   support

        1.0       0.75      0.49      0.59       817
        2.0       0.54      0.25      0.34       788
        3.0       0.31      0.39      0.35       390
        4.0       0.00      0.00      0.00         0
        5.0       0.00      0.00      0.00         0

avg / total       0.58      0.37      0.44      1995



  'recall', 'true', average, warn_for)


--------- Scores for 70 estimator---------
             precision    recall  f1-score   support

        1.0       0.75      0.49      0.59       817
        2.0       0.52      0.24      0.32       788
        3.0       0.30      0.38      0.33       390
        4.0       0.00      0.00      0.00         0
        5.0       0.00      0.00      0.00         0

avg / total       0.57      0.37      0.44      1995



  'recall', 'true', average, warn_for)


--------- Scores for 80 estimator---------
             precision    recall  f1-score   support

        1.0       0.75      0.49      0.59       817
        2.0       0.49      0.25      0.33       788
        3.0       0.31      0.39      0.34       390
        4.0       0.00      0.00      0.00         0
        5.0       0.00      0.00      0.00         0

avg / total       0.56      0.37      0.44      1995



  'recall', 'true', average, warn_for)


--------- Scores for 90 estimator---------
             precision    recall  f1-score   support

        1.0       0.77      0.50      0.60       817
        2.0       0.50      0.26      0.35       788
        3.0       0.30      0.37      0.33       390
        4.0       0.00      0.00      0.00         0
        5.0       0.00      0.00      0.00         0

avg / total       0.57      0.38      0.45      1995



  'recall', 'true', average, warn_for)


## SVM

### 5-Fold Cross Validation

#### SVC using Gausian Kernel

In [116]:
fold = 1
for train_index, test_index in kf.split(X_train):
    print('------------ Fold {} -----------'.format(fold))
    X_traincv, X_testcv = vector_train[train_index], vector_train[test_index]
    y_traincv, y_testcv = y_train[train_index], y_train[test_index]
    for c in C:
        svc = SVC(C=c, kernel='rbf')
        svc.fit(X_traincv, y_traincv)
        y_predcv = svc.predict(X_testcv)
        print('---------- Scores using C = {} -----------'.format(c))
        print(metrics.classification_report(y_testcv, y_predcv))
    fold = fold + 1

------------ Fold 1 -----------
---------- Scores using C = 0.01 -----------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       421
        2.0       0.16      1.00      0.27       311
        3.0       0.00      0.00      0.00       320
        4.0       0.00      0.00      0.00       481
        5.0       0.00      0.00      0.00       463

avg / total       0.02      0.16      0.04      1996



  'precision', 'predicted', average, warn_for)


---------- Scores using C = 0.1 -----------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       421
        2.0       0.16      1.00      0.27       311
        3.0       0.00      0.00      0.00       320
        4.0       0.00      0.00      0.00       481
        5.0       0.00      0.00      0.00       463

avg / total       0.02      0.16      0.04      1996

---------- Scores using C = 1 -----------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       421
        2.0       0.16      1.00      0.27       311
        3.0       0.00      0.00      0.00       320
        4.0       0.00      0.00      0.00       481
        5.0       0.00      0.00      0.00       463

avg / total       0.02      0.16      0.04      1996

---------- Scores using C = 10 -----------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       421
        2.0     

  'recall', 'true', average, warn_for)


---------- Scores using C = 0.1 -----------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       817
        2.0       0.00      0.00      0.00       788
        3.0       0.00      0.00      0.00       390
        4.0       0.00      0.00      0.00         0

avg / total       0.00      0.00      0.00      1995

---------- Scores using C = 1 -----------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       817
        2.0       0.00      0.00      0.00       788
        3.0       0.00      0.00      0.00       390
        4.0       0.00      0.00      0.00         0

avg / total       0.00      0.00      0.00      1995

---------- Scores using C = 10 -----------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       817
        2.0       0.00      0.00      0.00       788
        3.0       0.00      0.00      0.00       390
        4.0     

#### SVC using Poly Kernel

In [258]:
fold = 1
degree = [1, 2, 3, 5, 10]
for train_index, test_index in kf.split(X_train):
    print('------------ Fold {} -----------'.format(fold))
    X_traincv, X_testcv = vector_train[train_index], vector_train[test_index]
    y_traincv, y_testcv = y_train[train_index], y_train[test_index]
    for d in degree:
        svc = SVC(C=1000, kernel='poly', degree=d)
        svc.fit(X_traincv, y_traincv)
        y_predcv = svc.predict(X_testcv)
        print('---------- Scores using poly kernel of degree = {} -----------'.format(d))
        print(metrics.classification_report(y_testcv, y_predcv))
    fold = fold + 1

------------ Fold 1 -----------
---------- Scores using poly kernel of degree = 1 -----------
             precision    recall  f1-score   support

        1.0       0.60      0.71      0.65       421
        2.0       0.35      0.52      0.42       311
        3.0       0.30      0.32      0.31       320
        4.0       0.50      0.20      0.28       481
        5.0       0.58      0.62      0.60       463

avg / total       0.48      0.47      0.46      1996

---------- Scores using poly kernel of degree = 2 -----------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       421
        2.0       0.16      1.00      0.27       311
        3.0       0.00      0.00      0.00       320
        4.0       0.00      0.00      0.00       481
        5.0       0.00      0.00      0.00       463

avg / total       0.02      0.16      0.04      1996



  'precision', 'predicted', average, warn_for)


---------- Scores using poly kernel of degree = 3 -----------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       421
        2.0       0.16      1.00      0.27       311
        3.0       0.00      0.00      0.00       320
        4.0       0.00      0.00      0.00       481
        5.0       0.00      0.00      0.00       463

avg / total       0.02      0.16      0.04      1996

---------- Scores using poly kernel of degree = 5 -----------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       421
        2.0       0.16      1.00      0.27       311
        3.0       0.00      0.00      0.00       320
        4.0       0.00      0.00      0.00       481
        5.0       0.00      0.00      0.00       463

avg / total       0.02      0.16      0.04      1996

---------- Scores using poly kernel of degree = 10 -----------
             precision    recall  f1-score   support

        1.0

  'recall', 'true', average, warn_for)


---------- Scores using poly kernel of degree = 2 -----------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       817
        2.0       0.00      0.00      0.00       788
        3.0       0.00      0.00      0.00       390
        4.0       0.00      0.00      0.00         0

avg / total       0.00      0.00      0.00      1995

---------- Scores using poly kernel of degree = 3 -----------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       817
        2.0       0.00      0.00      0.00       788
        3.0       0.00      0.00      0.00       390
        4.0       0.00      0.00      0.00         0

avg / total       0.00      0.00      0.00      1995

---------- Scores using poly kernel of degree = 5 -----------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       817
        2.0       0.00      0.00      0.00       788
        3.0 

# Task 2

## Imlementation of TfidfVectorizer using sentiment words

In [241]:
vectorizer_filter = TfidfVectorizer(stop_words='english')
vectorizer_filter.fit_transform(filter_data)

vectorizer = TfidfVectorizer(stop_words='english', vocabulary=vectorizer_filter.get_feature_names())
vector_train = vectorizer.fit_transform(X_train)

vector_test = vectorizer.transform(X_test)

## Neural Nets

### 5-Fold Cross Validation using sentiment words

In [174]:
fold = 1;
for train_index, test_index in kf.split(X_train):
    print('------------ Fold {} -----------'.format(fold))
    X_traincv, X_testcv = vector_train[train_index], vector_train[test_index]
    y_traincv, y_testcv = y_train[train_index], y_train[test_index]
    for layer in hidden_layers:
        nn = MLPClassifier(hidden_layer_sizes=layer, learning_rate_init=0.01)
        nn.fit(X_traincv, y_traincv)
        y_predcv = nn.predict(X_testcv)
        print('-------------Accuracy = {} ------------------'.format(metrics.accuracy_score(y_testcv, y_predcv)))
        print('--------- Scores for hidden layers {}---------'.format(layer))
        print(metrics.classification_report(y_testcv, y_predcv))
    fold = fold + 1

------------ Fold 1 -----------
-------------Accuracy = 0.41282565130260523 ------------------
--------- Scores for hidden layers (25, 25)---------
             precision    recall  f1-score   support

        1.0       0.51      0.58      0.54       421
        2.0       0.28      0.30      0.29       311
        3.0       0.28      0.28      0.28       320
        4.0       0.40      0.38      0.39       481
        5.0       0.53      0.46      0.49       463

avg / total       0.41      0.41      0.41      1996

-------------Accuracy = 0.4088176352705411 ------------------
--------- Scores for hidden layers (25, 25, 25)---------
             precision    recall  f1-score   support

        1.0       0.55      0.55      0.55       421
        2.0       0.29      0.38      0.33       311
        3.0       0.26      0.32      0.29       320
        4.0       0.43      0.27      0.33       481
        5.0       0.51      0.50      0.50       463

avg / total       0.42      0.41      0

-------------Accuracy = 0.3879699248120301 ------------------
--------- Scores for hidden layers (40, 40, 40)---------
             precision    recall  f1-score   support

        1.0       0.32      0.53      0.40       118
        2.0       0.17      0.31      0.22       153
        3.0       0.26      0.37      0.30       279
        4.0       0.44      0.36      0.40       702
        5.0       0.57      0.41      0.48       743

avg / total       0.43      0.39      0.40      1995

-------------Accuracy = 0.37593984962406013 ------------------
--------- Scores for hidden layers (40, 40, 40, 40)---------
             precision    recall  f1-score   support

        1.0       0.31      0.47      0.37       118
        2.0       0.18      0.47      0.26       153
        3.0       0.24      0.33      0.28       279
        4.0       0.44      0.36      0.40       702
        5.0       0.62      0.37      0.46       743

avg / total       0.45      0.38      0.39      1995

---------

  'recall', 'true', average, warn_for)


-------------Accuracy = 0.37644110275689224 ------------------
--------- Scores for hidden layers (25, 25, 25)---------
             precision    recall  f1-score   support

        1.0       0.73      0.44      0.55       817
        2.0       0.43      0.30      0.36       788
        3.0       0.29      0.40      0.33       390
        4.0       0.00      0.00      0.00         0
        5.0       0.00      0.00      0.00         0

avg / total       0.53      0.38      0.43      1995

-------------Accuracy = 0.36290726817042607 ------------------
--------- Scores for hidden layers (30, 30, 30)---------
             precision    recall  f1-score   support

        1.0       0.75      0.40      0.52       817
        2.0       0.45      0.34      0.39       788
        3.0       0.29      0.35      0.32       390
        4.0       0.00      0.00      0.00         0
        5.0       0.00      0.00      0.00         0

avg / total       0.54      0.36      0.43      1995

------------

## Naive Bayes

In [232]:
fold = 1
for train_index, test_index in kf.split(X_train):
    print('------------ Fold {} -----------'.format(fold))
    X_traincv, X_testcv = vector_train[train_index], vector_train[test_index]
    y_traincv, y_testcv = y_train[train_index], y_train[test_index]
    nb = MultinomialNB()
    nb.fit(X_traincv, y_traincv)
    y_predcv = nb.predict(X_testcv)
    print('-------------Accuracy = {} ------------------'.format(metrics.accuracy_score(y_testcv, y_predcv)))
    print(metrics.classification_report(y_testcv, y_predcv))
    fold = fold + 1

------------ Fold 1 -----------
-------------Accuracy = 0.46593186372745493 ------------------
             precision    recall  f1-score   support

        1.0       0.70      0.63      0.67       421
        2.0       0.36      0.57      0.44       311
        3.0       0.30      0.40      0.34       320
        4.0       0.40      0.18      0.25       481
        5.0       0.56      0.59      0.57       463

avg / total       0.48      0.47      0.46      1996

------------ Fold 2 -----------
-------------Accuracy = 0.44138276553106215 ------------------
             precision    recall  f1-score   support

        1.0       0.56      0.65      0.60       415
        2.0       0.36      0.36      0.36       424
        3.0       0.33      0.34      0.34       395
        4.0       0.44      0.31      0.37       417
        5.0       0.48      0.57      0.52       345

avg / total       0.44      0.44      0.44      1996

------------ Fold 3 -----------
-------------Accuracy = 0.3192

  'recall', 'true', average, warn_for)


## Logistic Regression

#### Logistic Regression using L1 Regularization

In [186]:
fold = 1;
print('--------------- Logistic Regression using L1 Regularization ----------------')
for train_index, test_index in kf.split(X_train):
    print('------------ Fold {} -----------'.format(fold))
    X_traincv, X_testcv = vector_train[train_index], vector_train[test_index]
    y_traincv, y_testcv = y_train[train_index], y_train[test_index]
    for c in C:
        lgr_l1 = LogisticRegression(penalty='l1', C=c)
        lgr_l1.fit(X_traincv, y_traincv)
        y_predcv = lgr_l1.predict(X_testcv) 
        print('-------------Accuracy = {} ------------------'.format(metrics.accuracy_score(y_testcv, y_predcv)))
        print('---------- Scores using strength parameter C = {} -----------'.format(c))
        print(metrics.classification_report(y_testcv, y_predcv))
    fold = fold + 1

--------------- Logistic Regression using L1 Regularization ----------------
------------ Fold 1 -----------
-------------Accuracy = 0.15581162324649298 ------------------
---------- Scores using strength parameter C = 0.01 -----------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       421
        2.0       0.16      1.00      0.27       311
        3.0       0.00      0.00      0.00       320
        4.0       0.00      0.00      0.00       481
        5.0       0.00      0.00      0.00       463

avg / total       0.02      0.16      0.04      1996

-------------Accuracy = 0.4338677354709419 ------------------
---------- Scores using strength parameter C = 0.1 -----------


  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

        1.0       0.49      0.75      0.59       421
        2.0       0.31      0.39      0.34       311
        3.0       0.29      0.28      0.28       320
        4.0       0.46      0.10      0.16       481
        5.0       0.53      0.64      0.58       463

avg / total       0.43      0.43      0.40      1996

-------------Accuracy = 0.4779559118236473 ------------------
---------- Scores using strength parameter C = 1 -----------
             precision    recall  f1-score   support

        1.0       0.58      0.76      0.66       421
        2.0       0.38      0.43      0.40       311
        3.0       0.32      0.32      0.32       320
        4.0       0.47      0.22      0.30       481
        5.0       0.54      0.63      0.58       463

avg / total       0.47      0.48      0.46      1996

-------------Accuracy = 0.44488977955911824 ------------------
---------- Scores using strength parameter C = 10 -----------
    

-------------Accuracy = 0.42255639097744363 ------------------
---------- Scores using strength parameter C = 10 -----------
             precision    recall  f1-score   support

        1.0       0.41      0.62      0.49       225
        2.0       0.32      0.36      0.34       318
        3.0       0.52      0.31      0.39       608
        4.0       0.34      0.39      0.37       398
        5.0       0.52      0.54      0.53       446

avg / total       0.44      0.42      0.42      1995

-------------Accuracy = 0.3744360902255639 ------------------
---------- Scores using strength parameter C = 100 -----------
             precision    recall  f1-score   support

        1.0       0.39      0.56      0.46       225
        2.0       0.29      0.34      0.31       318
        3.0       0.44      0.26      0.32       608
        4.0       0.30      0.37      0.33       398
        5.0       0.46      0.47      0.46       446

avg / total       0.39      0.37      0.37      1995

--

  'recall', 'true', average, warn_for)


             precision    recall  f1-score   support

        1.0       0.70      0.56      0.62       817
        2.0       0.56      0.07      0.13       788
        3.0       0.30      0.26      0.28       390
        4.0       0.00      0.00      0.00         0
        5.0       0.00      0.00      0.00         0

avg / total       0.57      0.31      0.36      1995

-------------Accuracy = 0.4130325814536341 ------------------
---------- Scores using strength parameter C = 1 -----------
             precision    recall  f1-score   support

        1.0       0.73      0.56      0.64       817
        2.0       0.52      0.26      0.35       788
        3.0       0.31      0.41      0.36       390
        4.0       0.00      0.00      0.00         0
        5.0       0.00      0.00      0.00         0

avg / total       0.57      0.41      0.47      1995

-------------Accuracy = 0.3829573934837093 ------------------
---------- Scores using strength parameter C = 10 -----------
     

#### Logistic Regression using L2 Regularization

In [187]:
fold = 1;
print('--------------- Logistic Regression using L2 Regularization ----------------')
for train_index, test_index in kf.split(X_train):
    print('------------ Fold {} -----------'.format(fold))
    X_traincv, X_testcv = vector_train[train_index], vector_train[test_index]
    y_traincv, y_testcv = y_train[train_index], y_train[test_index]
    for c in C:
        lgr_l1 = LogisticRegression(penalty='l2', C=c)
        lgr_l1.fit(X_traincv, y_traincv)
        y_predcv = lgr_l1.predict(X_testcv) 
        print('-------------Accuracy = {} ------------------'.format(metrics.accuracy_score(y_testcv, y_predcv)))
        print('---------- Scores using strength parameter C = {} -----------'.format(c))
        print(metrics.classification_report(y_testcv, y_predcv))
    fold = fold + 1

--------------- Logistic Regression using L2 Regularization ----------------
------------ Fold 1 -----------
-------------Accuracy = 0.42084168336673344 ------------------
---------- Scores using strength parameter C = 0.01 -----------
             precision    recall  f1-score   support

        1.0       0.61      0.60      0.61       421
        2.0       0.30      0.55      0.39       311
        3.0       0.26      0.37      0.31       320
        4.0       0.55      0.02      0.05       481
        5.0       0.54      0.62      0.57       463

avg / total       0.47      0.42      0.38      1996

-------------Accuracy = 0.47044088176352705 ------------------
---------- Scores using strength parameter C = 0.1 -----------
             precision    recall  f1-score   support

        1.0       0.56      0.76      0.65       421
        2.0       0.37      0.45      0.41       311
        3.0       0.29      0.31      0.30       320
        4.0       0.50      0.15      0.23       48

-------------Accuracy = 0.44611528822055135 ------------------
---------- Scores using strength parameter C = 1 -----------
             precision    recall  f1-score   support

        1.0       0.43      0.69      0.53       225
        2.0       0.34      0.39      0.36       318
        3.0       0.55      0.30      0.39       608
        4.0       0.36      0.38      0.37       398
        5.0       0.54      0.61      0.57       446

avg / total       0.46      0.45      0.44      1995

-------------Accuracy = 0.4320802005012531 ------------------
---------- Scores using strength parameter C = 10 -----------
             precision    recall  f1-score   support

        1.0       0.44      0.64      0.52       225
        2.0       0.33      0.39      0.36       318
        3.0       0.52      0.31      0.39       608
        4.0       0.34      0.38      0.36       398
        5.0       0.53      0.57      0.54       446

avg / total       0.45      0.43      0.43      1995

----

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


-------------Accuracy = 0.35137844611528823 ------------------
---------- Scores using strength parameter C = 0.1 -----------
             precision    recall  f1-score   support

        1.0       0.76      0.57      0.65       817
        2.0       0.58      0.13      0.21       788
        3.0       0.25      0.35      0.29       390
        4.0       0.00      0.00      0.00         0
        5.0       0.00      0.00      0.00         0

avg / total       0.59      0.35      0.41      1995

-------------Accuracy = 0.41904761904761906 ------------------
---------- Scores using strength parameter C = 1 -----------
             precision    recall  f1-score   support

        1.0       0.75      0.58      0.65       817
        2.0       0.51      0.27      0.35       788
        3.0       0.30      0.39      0.34       390
        4.0       0.00      0.00      0.00         0
        5.0       0.00      0.00      0.00         0

avg / total       0.57      0.42      0.47      1995

--

## AdaBoosting

In [188]:
fold = 1
for train_index, test_index in kf.split(X_train):
    print('------------ Fold {} -----------'.format(fold))
    X_traincv, X_testcv = vector_train[train_index], vector_train[test_index]
    y_traincv, y_testcv = y_train[train_index], y_train[test_index]
    for n in range(40, 110, 10):
        adb = AdaBoostClassifier(n_estimators=n)
        adb.fit(X_traincv, y_traincv)
        y_predcv = adb.predict(X_testcv)
        print('-------------Accuracy = {} ------------------'.format(metrics.accuracy_score(y_testcv, y_predcv)))
        print('--------- Scores for {} estimators---------'.format(n))
        print(metrics.classification_report(y_testcv, y_predcv))
    fold = fold + 1

------------ Fold 1 -----------
-------------Accuracy = 0.437875751503006 ------------------
--------- Scores for 40 estimators---------
             precision    recall  f1-score   support

        1.0       0.51      0.72      0.60       421
        2.0       0.32      0.31      0.32       311
        3.0       0.28      0.33      0.31       320
        4.0       0.44      0.17      0.24       481
        5.0       0.53      0.62      0.57       463

avg / total       0.43      0.44      0.42      1996

-------------Accuracy = 0.4393787575150301 ------------------
--------- Scores for 50 estimators---------
             precision    recall  f1-score   support

        1.0       0.52      0.72      0.61       421
        2.0       0.31      0.33      0.32       311
        3.0       0.28      0.28      0.28       320
        4.0       0.44      0.20      0.27       481
        5.0       0.53      0.61      0.57       463

avg / total       0.43      0.44      0.42      1996

---------

-------------Accuracy = 0.4205513784461153 ------------------
--------- Scores for 70 estimators---------
             precision    recall  f1-score   support

        1.0       0.25      0.59      0.36       118
        2.0       0.25      0.35      0.29       153
        3.0       0.29      0.43      0.35       279
        4.0       0.49      0.28      0.36       702
        5.0       0.58      0.53      0.56       743

avg / total       0.46      0.42      0.43      1995

-------------Accuracy = 0.42656641604010026 ------------------
--------- Scores for 80 estimators---------
             precision    recall  f1-score   support

        1.0       0.26      0.58      0.36       118
        2.0       0.24      0.35      0.29       153
        3.0       0.29      0.43      0.35       279
        4.0       0.50      0.29      0.37       702
        5.0       0.59      0.54      0.56       743

avg / total       0.47      0.43      0.43      1995

-------------Accuracy = 0.4190476190476

  'recall', 'true', average, warn_for)


-------------Accuracy = 0.3433583959899749 ------------------
--------- Scores for 50 estimators---------
             precision    recall  f1-score   support

        1.0       0.74      0.42      0.53       817
        2.0       0.53      0.23      0.32       788
        3.0       0.32      0.42      0.36       390
        4.0       0.00      0.00      0.00         0
        5.0       0.00      0.00      0.00         0

avg / total       0.58      0.34      0.42      1995



  'recall', 'true', average, warn_for)


-------------Accuracy = 0.3954887218045113 ------------------
--------- Scores for 60 estimators---------
             precision    recall  f1-score   support

        1.0       0.75      0.44      0.55       817
        2.0       0.46      0.34      0.39       788
        3.0       0.31      0.42      0.36       390
        4.0       0.00      0.00      0.00         0
        5.0       0.00      0.00      0.00         0

avg / total       0.55      0.40      0.45      1995



  'recall', 'true', average, warn_for)


-------------Accuracy = 0.3919799498746867 ------------------
--------- Scores for 70 estimators---------
             precision    recall  f1-score   support

        1.0       0.73      0.43      0.55       817
        2.0       0.45      0.34      0.39       788
        3.0       0.32      0.41      0.36       390
        4.0       0.00      0.00      0.00         0
        5.0       0.00      0.00      0.00         0

avg / total       0.54      0.39      0.45      1995



  'recall', 'true', average, warn_for)


-------------Accuracy = 0.3829573934837093 ------------------
--------- Scores for 80 estimators---------
             precision    recall  f1-score   support

        1.0       0.73      0.45      0.55       817
        2.0       0.46      0.33      0.38       788
        3.0       0.30      0.36      0.33       390
        4.0       0.00      0.00      0.00         0
        5.0       0.00      0.00      0.00         0

avg / total       0.54      0.38      0.44      1995



  'recall', 'true', average, warn_for)


-------------Accuracy = 0.3919799498746867 ------------------
--------- Scores for 90 estimators---------
             precision    recall  f1-score   support

        1.0       0.72      0.45      0.55       817
        2.0       0.45      0.32      0.38       788
        3.0       0.31      0.41      0.36       390
        4.0       0.00      0.00      0.00         0
        5.0       0.00      0.00      0.00         0

avg / total       0.53      0.39      0.44      1995



  'recall', 'true', average, warn_for)


-------------Accuracy = 0.38847117794486213 ------------------
--------- Scores for 100 estimators---------
             precision    recall  f1-score   support

        1.0       0.72      0.42      0.53       817
        2.0       0.44      0.34      0.39       788
        3.0       0.32      0.41      0.36       390
        4.0       0.00      0.00      0.00         0
        5.0       0.00      0.00      0.00         0

avg / total       0.53      0.39      0.44      1995



  'recall', 'true', average, warn_for)


## SVM

#### SVC using Gausian Kernel

In [189]:
fold = 1
for train_index, test_index in kf.split(X_train):
    print('------------ Fold {} -----------'.format(fold))
    X_traincv, X_testcv = vector_train[train_index], vector_train[test_index]
    y_traincv, y_testcv = y_train[train_index], y_train[test_index]
    for c in C:
        svc = SVC(C=c, kernel='rbf')
        svc.fit(X_traincv, y_traincv)
        y_predcv = svc.predict(X_testcv)
        print('-------------Accuracy = {} ------------------'.format(metrics.accuracy_score(y_testcv, y_predcv)))
        print('---------- Scores using C = {} -----------'.format(c))
        print(metrics.classification_report(y_testcv, y_predcv))
    fold = fold + 1

------------ Fold 1 -----------
-------------Accuracy = 0.15581162324649298 ------------------
---------- Scores using C = 0.01 -----------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       421
        2.0       0.16      1.00      0.27       311
        3.0       0.00      0.00      0.00       320
        4.0       0.00      0.00      0.00       481
        5.0       0.00      0.00      0.00       463

avg / total       0.02      0.16      0.04      1996



  'precision', 'predicted', average, warn_for)


-------------Accuracy = 0.15581162324649298 ------------------
---------- Scores using C = 0.1 -----------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       421
        2.0       0.16      1.00      0.27       311
        3.0       0.00      0.00      0.00       320
        4.0       0.00      0.00      0.00       481
        5.0       0.00      0.00      0.00       463

avg / total       0.02      0.16      0.04      1996

-------------Accuracy = 0.15581162324649298 ------------------
---------- Scores using C = 1 -----------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       421
        2.0       0.16      1.00      0.27       311
        3.0       0.00      0.00      0.00       320
        4.0       0.00      0.00      0.00       481
        5.0       0.00      0.00      0.00       463

avg / total       0.02      0.16      0.04      1996

-------------Accuracy = 0.15581162324649

-------------Accuracy = 0.11278195488721804 ------------------
---------- Scores using C = 10 -----------
             precision    recall  f1-score   support

        1.0       0.11      1.00      0.20       225
        2.0       0.00      0.00      0.00       318
        3.0       0.00      0.00      0.00       608
        4.0       0.00      0.00      0.00       398
        5.0       0.00      0.00      0.00       446

avg / total       0.01      0.11      0.02      1995

-------------Accuracy = 0.3263157894736842 ------------------
---------- Scores using C = 100 -----------
             precision    recall  f1-score   support

        1.0       0.25      0.91      0.40       225
        2.0       0.20      0.22      0.21       318
        3.0       0.67      0.00      0.01       608
        4.0       0.38      0.58      0.46       398
        5.0       0.67      0.32      0.44       446

avg / total       0.49      0.33      0.27      1995

------------ Fold 5 -----------
--------

  'recall', 'true', average, warn_for)


-------------Accuracy = 0.0 ------------------
---------- Scores using C = 0.1 -----------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       817
        2.0       0.00      0.00      0.00       788
        3.0       0.00      0.00      0.00       390
        4.0       0.00      0.00      0.00         0

avg / total       0.00      0.00      0.00      1995

-------------Accuracy = 0.0 ------------------
---------- Scores using C = 1 -----------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       817
        2.0       0.00      0.00      0.00       788
        3.0       0.00      0.00      0.00       390
        4.0       0.00      0.00      0.00         0

avg / total       0.00      0.00      0.00      1995

-------------Accuracy = 0.0 ------------------
---------- Scores using C = 10 -----------
             precision    recall  f1-score   support

        1.0       0.00      0.00  

#### SVC using Poly Kernel

In [199]:
fold = 1
for train_index, test_index in kf.split(X_train):
    print('------------ Fold {} -----------'.format(fold))
    X_traincv, X_testcv = vector_train[train_index], vector_train[test_index]
    y_traincv, y_testcv = y_train[train_index], y_train[test_index]
    for d in degree:
        svc = SVC(C=500, kernel='poly', degree=d)
        svc.fit(X_traincv, y_traincv)
        y_predcv = svc.predict(X_testcv)
        print('-------------Accuracy = {} ------------------'.format(metrics.accuracy_score(y_testcv, y_predcv)))
        print('---------- Scores using poly kernel of degree = {} -----------'.format(d))
        print(metrics.classification_report(y_testcv, y_predcv))
    fold = fold + 1

------------ Fold 1 -----------
-------------Accuracy = 0.4468937875751503 ------------------
---------- Scores using poly kernel of degree = 1 -----------
             precision    recall  f1-score   support

        1.0       0.61      0.65      0.63       421
        2.0       0.31      0.59      0.41       311
        3.0       0.27      0.28      0.27       320
        4.0       0.47      0.15      0.22       481
        5.0       0.57      0.60      0.58       463

avg / total       0.47      0.45      0.43      1996

-------------Accuracy = 0.15581162324649298 ------------------
---------- Scores using poly kernel of degree = 2 -----------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       421
        2.0       0.16      1.00      0.27       311
        3.0       0.00      0.00      0.00       320
        4.0       0.00      0.00      0.00       481
        5.0       0.00      0.00      0.00       463

avg / total       0.02   

  'precision', 'predicted', average, warn_for)


-------------Accuracy = 0.15581162324649298 ------------------
---------- Scores using poly kernel of degree = 3 -----------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       421
        2.0       0.16      1.00      0.27       311
        3.0       0.00      0.00      0.00       320
        4.0       0.00      0.00      0.00       481
        5.0       0.00      0.00      0.00       463

avg / total       0.02      0.16      0.04      1996

-------------Accuracy = 0.15581162324649298 ------------------
---------- Scores using poly kernel of degree = 5 -----------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       421
        2.0       0.16      1.00      0.27       311
        3.0       0.00      0.00      0.00       320
        4.0       0.00      0.00      0.00       481
        5.0       0.00      0.00      0.00       463

avg / total       0.02      0.16      0.04      1996

--

-------------Accuracy = 0.11278195488721804 ------------------
---------- Scores using poly kernel of degree = 10 -----------
             precision    recall  f1-score   support

        1.0       0.11      1.00      0.20       225
        2.0       0.00      0.00      0.00       318
        3.0       0.00      0.00      0.00       608
        4.0       0.00      0.00      0.00       398
        5.0       0.00      0.00      0.00       446

avg / total       0.01      0.11      0.02      1995

------------ Fold 5 -----------
-------------Accuracy = 0.27769423558897244 ------------------
---------- Scores using poly kernel of degree = 1 -----------
             precision    recall  f1-score   support

        1.0       0.75      0.51      0.61       817
        2.0       0.50      0.04      0.07       788
        3.0       0.20      0.27      0.23       390
        4.0       0.00      0.00      0.00         0
        5.0       0.00      0.00      0.00         0

avg / total       0.54 

  'recall', 'true', average, warn_for)


-------------Accuracy = 0.0 ------------------
---------- Scores using poly kernel of degree = 2 -----------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       817
        2.0       0.00      0.00      0.00       788
        3.0       0.00      0.00      0.00       390
        4.0       0.00      0.00      0.00         0

avg / total       0.00      0.00      0.00      1995

-------------Accuracy = 0.0 ------------------
---------- Scores using poly kernel of degree = 3 -----------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       817
        2.0       0.00      0.00      0.00       788
        3.0       0.00      0.00      0.00       390
        4.0       0.00      0.00      0.00         0

avg / total       0.00      0.00      0.00      1995

-------------Accuracy = 0.0 ------------------
---------- Scores using poly kernel of degree = 5 -----------
             precision    recal

# Task 3

## Experiment using optimal parameters on test data without filtering

## Neural Net

In [203]:
nn = MLPClassifier(hidden_layer_sizes=(75, 75, 75, 75), learning_rate_init=0.01)
nn.fit(vector_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(75, 75, 75, 75), learning_rate='constant',
       learning_rate_init=0.01, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [211]:
y_pred = nn.predict(vector_test)
print('Accuracy : {}'.format(metrics.accuracy_score(y_test, y_pred)))
print('---------- Scores for test data ------------')
print(metrics.classification_report(y_test, y_pred))

Accuracy : 0.46846846846846846
---------- Scores for test data ------------
             precision    recall  f1-score   support

        1.0       0.56      0.66      0.60       199
        2.0       0.47      0.41      0.44       200
        3.0       0.41      0.39      0.40       200
        4.0       0.39      0.46      0.42       200
        5.0       0.51      0.44      0.47       200

avg / total       0.47      0.47      0.47       999



## Naive Bayes

In [213]:
nb = MultinomialNB()
nb.fit(vector_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [215]:
y_pred = nb.predict(vector_test)
print('Accuracy : {}'.format(metrics.accuracy_score(y_test, y_pred)))
print('---------- Scores for test data ------------')
print(metrics.classification_report(y_test, y_pred))

Accuracy : 0.5005005005005005
---------- Scores for test data ------------
             precision    recall  f1-score   support

        1.0       0.50      0.65      0.57       199
        2.0       0.49      0.33      0.39       200
        3.0       0.43      0.41      0.42       200
        4.0       0.44      0.52      0.48       200
        5.0       0.65      0.59      0.62       200

avg / total       0.50      0.50      0.50       999



## Logistic Regression

In [217]:
lgr = LogisticRegression(C=1, penalty='l2')
lgr.fit(vector_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [219]:
y_pred = lgr.predict(vector_test)
print('Accuracy : {}'.format(metrics.accuracy_score(y_test, y_pred)))
print('---------- Scores for test data ------------')
print(metrics.classification_report(y_test, y_pred))

Accuracy : 0.5325325325325325
---------- Scores for test data ------------
             precision    recall  f1-score   support

        1.0       0.54      0.76      0.64       199
        2.0       0.56      0.40      0.46       200
        3.0       0.46      0.38      0.42       200
        4.0       0.47      0.50      0.48       200
        5.0       0.62      0.62      0.62       200

avg / total       0.53      0.53      0.52       999



## AdaBoosting

In [222]:
adb = AdaBoostClassifier(n_estimators=90)
adb.fit(vector_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=90, random_state=None)

In [224]:
y_pred = adb.predict(vector_test)
print('Accuracy : {}'.format(metrics.accuracy_score(y_test, y_pred)))
print('---------- Scores for test data ------------')
print(metrics.classification_report(y_test, y_pred))

Accuracy : 0.46646646646646645
---------- Scores for test data ------------
             precision    recall  f1-score   support

        1.0       0.58      0.60      0.59       199
        2.0       0.41      0.38      0.39       200
        3.0       0.38      0.34      0.36       200
        4.0       0.39      0.39      0.39       200
        5.0       0.54      0.62      0.58       200

avg / total       0.46      0.47      0.46       999



## SVC

In [239]:
svc = SVC(C = 100000, kernel='poly', degree=1000)
svc.fit(vector_train, y_train)

SVC(C=100000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=1000, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [240]:
y_pred = svc.predict(vector_test)
print('Accuracy : {}'.format(metrics.accuracy_score(y_test, y_pred)))
print('---------- Scores for test data ------------')
print(metrics.classification_report(y_test, y_pred))

Accuracy : 0.2002002002002002
---------- Scores for test data ------------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       199
        2.0       0.00      0.00      0.00       200
        3.0       0.00      0.00      0.00       200
        4.0       0.20      1.00      0.33       200
        5.0       0.00      0.00      0.00       200

avg / total       0.04      0.20      0.07       999



  'precision', 'predicted', average, warn_for)


## Experiment on test data using optimal parameters using sentiment filtering

## Neural Net

In [243]:
nn = MLPClassifier(hidden_layer_sizes=(75, 75, 75, 75), learning_rate_init=0.01)
nn.fit(vector_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(75, 75, 75, 75), learning_rate='constant',
       learning_rate_init=0.01, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [245]:
y_pred = nn.predict(vector_test)
print('Accuracy : {}'.format(metrics.accuracy_score(y_test, y_pred)))
print('---------- Scores for test data ------------')
print(metrics.classification_report(y_test, y_pred))

Accuracy : 0.4144144144144144
---------- Scores for test data ------------
             precision    recall  f1-score   support

        1.0       0.57      0.57      0.57       199
        2.0       0.39      0.32      0.35       200
        3.0       0.32      0.33      0.33       200
        4.0       0.35      0.42      0.38       200
        5.0       0.47      0.43      0.45       200

avg / total       0.42      0.41      0.41       999



## Naive Bayes

In [246]:
nb = MultinomialNB()
nb.fit(vector_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [247]:
y_pred = nb.predict(vector_test)
print('Accuracy : {}'.format(metrics.accuracy_score(y_test, y_pred)))
print('---------- Scores for test data ------------')
print(metrics.classification_report(y_test, y_pred))

Accuracy : 0.47647647647647645
---------- Scores for test data ------------
             precision    recall  f1-score   support

        1.0       0.67      0.56      0.61       199
        2.0       0.44      0.40      0.42       200
        3.0       0.37      0.43      0.40       200
        4.0       0.39      0.44      0.41       200
        5.0       0.58      0.56      0.56       200

avg / total       0.49      0.48      0.48       999



## Logistic Regression

In [248]:
lgr = lgr = LogisticRegression(C=1, penalty='l2')
lgr.fit(vector_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [252]:
y_pred = lgr.predict(vector_test)
print('Accuracy : {}'.format(metrics.accuracy_score(y_test, y_pred)))
print('---------- Scores for test data ------------')
print(metrics.classification_report(y_test, y_pred))

Accuracy : 0.4744744744744745
---------- Scores for test data ------------
             precision    recall  f1-score   support

        1.0       0.59      0.64      0.62       199
        2.0       0.43      0.34      0.38       200
        3.0       0.38      0.41      0.39       200
        4.0       0.41      0.38      0.39       200
        5.0       0.54      0.60      0.57       200

avg / total       0.47      0.47      0.47       999



## AdaBoosting

In [253]:
adb = AdaBoostClassifier(n_estimators=90)
adb.fit(vector_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=90, random_state=None)

In [254]:
y_pred = adb.predict(vector_test)
print('Accuracy : {}'.format(metrics.accuracy_score(y_test, y_pred)))
print('---------- Scores for test data ------------')
print(metrics.classification_report(y_test, y_pred))

Accuracy : 0.46046046046046046
---------- Scores for test data ------------
             precision    recall  f1-score   support

        1.0       0.58      0.60      0.59       199
        2.0       0.45      0.38      0.41       200
        3.0       0.38      0.36      0.37       200
        4.0       0.38      0.39      0.38       200
        5.0       0.51      0.57      0.54       200

avg / total       0.46      0.46      0.46       999



## SVC

In [255]:
svc = SVC(C = 100000, kernel='poly', degree=1000)
svc.fit(vector_train, y_train)

SVC(C=100000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=1000, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [257]:
y_pred = svc.predict(vector_test)
print('Accuracy : {}'.format(metrics.accuracy_score(y_test, y_pred)))
print('---------- Scores for test data ------------')
print(metrics.classification_report(y_test, y_pred))

Accuracy : 0.2002002002002002
---------- Scores for test data ------------
             precision    recall  f1-score   support

        1.0       0.00      0.00      0.00       199
        2.0       0.00      0.00      0.00       200
        3.0       0.00      0.00      0.00       200
        4.0       0.20      1.00      0.33       200
        5.0       0.00      0.00      0.00       200

avg / total       0.04      0.20      0.07       999

(999,)


  'precision', 'predicted', average, warn_for)
