# Analysis process for sentiment analysis

In [1]:
import timeit

from gensim.models import word2vec
import numpy as np

from sentiment.utilities import dataset_split
from sentiment.utilities import load_data
from sentiment.utilities import text2vec
from sentiment.utilities import y_trainable

# Load the default corpus (presumably the training portion; the test portion
# is loaded separately below with category='test').
data = load_data()

# Preprocess and convert words and bigrams to integer IDs.
# We're using the 20000 most frequent words and 20000 most frequent bigrams
# from the IMDB dataset here: http://ai.stanford.edu/~amaas/data/sentiment/
X = text2vec(data.text)
y = data.ratings.values * 10  # ratings multiplied by 10

# Reviewer column; not referenced by any cell visible in this part of the notebook.
z = data.reviewers.values

# Entries 0 and 1 of the split result are used below as index sets into X and y.
# NOTE(review): entry 1 is treated as the validation set although the split is
# requested with holdout=0.3 and validation=0.0 -- confirm against dataset_split.
idc_all = dataset_split(y, holdout=0.3, validation=0.0)

# Features, untransformed targets and fit-ready targets for both partitions.
# The *_origin arrays are what the error metric is computed against later on.
X_train, y_train_origin = X[idc_all[0]], y[idc_all[0]]
y_train = y_trainable(y_train_origin)

X_val, y_val_origin = X[idc_all[1]], y[idc_all[1]]
y_val = y_trainable(y_val_origin)

# Separately loaded test corpus, run through the same vectorisation and
# the same x10 scaling of the ratings.
data_test = load_data(category='test')
X_test = text2vec(data_test.text)
y_test_origin = data_test.ratings.values * 10

Using TensorFlow backend.


## Classifier 1: SVM (Support Vector Machine)

In [2]:
from sklearn.svm import SVC
from sentiment.utilities import mean_absolute_error

# Support-vector classifier fitted on the training partition.
# probability=True turns on probability estimates; the predict() calls in this
# cell do not need it, but it is kept in case clf_1.predict_proba is used elsewhere.
clf_1 = SVC(
    C=2,
    probability=True,
)
clf_1.fit(X_train, y_train)

# Validation error, computed with the project's own mean_absolute_error helper
# against the untransformed targets (y_val_origin, not y_val).
# NOTE(review): the model is fit on y_trainable(...) output but scored against
# the *_origin values -- confirm both are on the same scale.
y_val_predicted = clf_1.predict(X_val)
mean_absolute_error(y_val_origin, y_val_predicted)

0.14337662337662338

In [3]:
# Score the fitted SVM on the separately loaded test set. The bare final
# expression is what the notebook displays as this cell's output.
y_test_predicted = clf_1.predict(X_test)
mean_absolute_error(y_test_origin, y_test_predicted)

0.15410199556541021

## Classifier 2: Naive Bayes

In [4]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings; fit() returns the estimator
# itself, so construction and fitting are chained.
clf_2 = GaussianNB().fit(X_train, y_train)

# Validation error against the untransformed targets, using the project's
# mean_absolute_error helper.
y_val_predicted = clf_2.predict(X_val)
mean_absolute_error(y_val_origin, y_val_predicted)

0.2257316017316017

In [5]:
# Score the fitted naive Bayes model on the separately loaded test set.
# Note that y_test_predicted is rebound here, replacing any earlier model's predictions.
y_test_predicted = clf_2.predict(X_test)
mean_absolute_error(y_test_origin, y_test_predicted)

0.24223946784922396

## Classifier 3: Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees with at least 3 samples per leaf.
# random_state is pinned so that a fresh "Restart & Run All" reproduces the
# same forest and therefore the same scores (bootstrap sampling and feature
# selection are otherwise random); 0 matches the seed used for the
# BernoulliRBM in this notebook.
clf_3 = RandomForestClassifier(
    min_samples_leaf=3,
    n_estimators=100,
    random_state=0,
)
clf_3.fit(X_train, y_train)

# Validation error against the untransformed targets, using the project's
# mean_absolute_error helper.
y_val_predicted = clf_3.predict(X_val)
mean_absolute_error(y_val_origin, y_val_predicted)

0.13045887445887444

In [None]:
# Score the fitted random forest on the separately loaded test set.
# Note that y_test_predicted is rebound here, replacing any earlier model's predictions.
y_test_predicted = clf_3.predict(X_test)
mean_absolute_error(y_test_origin, y_test_predicted)

0.16363636363636364

## Classifier 4: BernoulliRBM (with logistic regression)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import BernoulliRBM

# Restricted Boltzmann machine used as the feature-extraction stage.
# NOTE(review): BernoulliRBM is documented for binary / [0, 1]-valued inputs;
# the value range produced by text2vec is not visible here -- confirm.
rbm = BernoulliRBM(
    # more components tend to give better prediction performance, but larger fitting time
    n_components=800,
    learning_rate=0.07,
    batch_size=10,
    n_iter=50,
    random_state=0,
    verbose=True,  # prints one pseudo-likelihood line per iteration
)

# Logistic regression on top of the RBM features; a very large C means
# very weak regularisation.
logistic = LogisticRegression(C=10000.0)

# Chain both stages so a single fit()/predict() drives the whole model.
clf_4 = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])
clf_4.fit(X_train, y_train)

[BernoulliRBM] Iteration 1, pseudo-likelihood = -22.14, time = 0.54s
[BernoulliRBM] Iteration 2, pseudo-likelihood = -25.52, time = 0.74s
[BernoulliRBM] Iteration 3, pseudo-likelihood = -29.67, time = 0.74s
[BernoulliRBM] Iteration 4, pseudo-likelihood = -19.14, time = 0.73s
[BernoulliRBM] Iteration 5, pseudo-likelihood = -27.89, time = 0.74s
[BernoulliRBM] Iteration 6, pseudo-likelihood = -22.97, time = 0.76s
[BernoulliRBM] Iteration 7, pseudo-likelihood = -34.20, time = 1.36s
[BernoulliRBM] Iteration 8, pseudo-likelihood = -17.95, time = 0.91s
[BernoulliRBM] Iteration 9, pseudo-likelihood = -15.79, time = 0.94s
[BernoulliRBM] Iteration 10, pseudo-likelihood = -18.57, time = 1.35s
[BernoulliRBM] Iteration 11, pseudo-likelihood = -27.72, time = 1.11s
[BernoulliRBM] Iteration 12, pseudo-likelihood = -30.97, time = 0.68s
[BernoulliRBM] Iteration 13, pseudo-likelihood = -32.23, time = 0.67s
[BernoulliRBM] Iteration 14, pseudo-likelihood = -19.77, time = 0.67s
[BernoulliRBM] Iteration 15, 

In [None]:
# Validation error of the RBM + logistic-regression pipeline, computed with the
# project's mean_absolute_error helper against the untransformed targets.
y_val_predicted = clf_4.predict(X_val)
mean_absolute_error(y_val_origin, y_val_predicted)

In [None]:
# Score the fitted pipeline on the separately loaded test set.
# Note that y_test_predicted is rebound here, replacing any earlier model's predictions.
y_test_predicted = clf_4.predict(X_test)
mean_absolute_error(y_test_origin, y_test_predicted)