In [1]:
#importing libraries
import pandas as pd
import numpy as np
from sklearn import metrics

In [2]:
# Load the tab-separated file; it has no header row, so columns are numbered 0, 1, ...
dataset = pd.read_csv('dataset.tsv', sep='\t', header=None)
# Column 0 is used as the target and column 1 as the raw review text
y, X = dataset[0], dataset[1]

In [3]:
# Preview the first rows of the loaded frame (column 0 is assigned to y, column 1 to X above)
dataset.head()

Unnamed: 0,0,1
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


In [4]:
# Preview the last rows of the loaded frame
dataset.tail()

Unnamed: 0,0,1
6913,0,Brokeback Mountain was boring.
6914,0,So Brokeback Mountain was really depressing.
6915,0,"As I sit here, watching the MTV Movie Awards, ..."
6916,0,Ok brokeback mountain is such a horrible movie.
6917,0,"Oh, and Brokeback Mountain was a terrible movie."


In [5]:
# Sum of the label column; assuming labels are only 0/1 (as the previews suggest — confirm),
# this is the number of rows labelled 1
dataset[0].sum()

3943

In [6]:
# Total number of rows in the frame
len(dataset)

6918

In [7]:
# Text cleaning: keep letters only, lowercase, drop English stopwords, then Porter-stem
import re
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Built once up front: neither the stemmer nor the stopword set changes between
# reviews, so there is no reason to rebuild them for every word of every row.
ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over dataset[1] (the raw review column) rather than X: a later cell
# rebinds X to the count matrix, which would make this cell fail when re-run.
for text in dataset[1]:
    # Replace every non-letter character with a space
    review = re.sub('[^a-zA-Z]', ' ', text)
    review = review.lower()
    review = review.split()
    # Stopwords are filtered on the lowercased word before stemming
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    review = ' '.join(review)
    corpus.append(review)

In [8]:
# Echo the cleaned reviews.
# NOTE(review): this displays the whole list (one entry per dataset row); a slice
# such as corpus[:10] would keep the output readable.
corpus

['da vinci code book awesom',
 'first clive cussler ever read even book like relic da vinci code plausibl',
 'like da vinci code lot',
 'like da vinci code lot',
 'like da vinci code ultimatli seem hold',
 'even exagger midnight went wal mart buy da vinci code amaz cours',
 'love da vinci code want someth better differ',
 'thought da vinci code great kite runner',
 'da vinci code actual good movi',
 'thought da vinci code pretti good book',
 'da vinci code one beauti movi ive ever seen',
 'da vinci code amaz book get wrong',
 'turn light radio enjoy da vinci code',
 'da vinci code realli good',
 'love da vinci code',
 'love da vinci code',
 'night da vinci code beauti mind',
 'da vinci code awesom book',
 'thing enjoy da vinci code',
 'da vinci code slash amaz race',
 'hey love da vinci code',
 'also love da vinci code',
 'realli enjoy da vinci code thought would disappoint book',
 'like angel demon da vinci code',
 'da vinci code realli good movi',
 'yeah da vinci code awesom movi lik

In [9]:
# At this point X is still the raw review column taken from the frame
# (it is rebound to a count matrix in a later cell)
type(X)

pandas.core.series.Series

In [10]:
# corpus was built by appending cleaned strings to a Python list
type(corpus)

list

In [11]:
# Echo the raw review text column, for comparison with the cleaned corpus
X

0                 The Da Vinci Code book is just awesome.
1       this was the first clive cussler i've ever rea...
2                        i liked the Da Vinci Code a lot.
3                        i liked the Da Vinci Code a lot.
4       I liked the Da Vinci Code but it ultimatly did...
5       that's not even an exaggeration ) and at midni...
6       I loved the Da Vinci Code, but now I want some...
7       i thought da vinci code was great, same with k...
8           The Da Vinci Code is actually a good movie...
9       I thought the Da Vinci Code was a pretty good ...
10      The Da Vinci Code is one of the most beautiful...
11      The Da Vinci Code is an * amazing * book, do n...
12      then I turn on the light and the radio and enj...
13                     The Da Vinci Code was REALLY good.
14                               i love da vinci code....
15                                i loved da vinci code..
16      TO NIGHT:: THE DA VINCI CODE AND A BEAUTIFUL M...
17            

In [12]:
# Bag of Words (default settings): learn the vocabulary from the cleaned corpus
# and turn each review into a row of token counts
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
term_counts = cv.fit_transform(corpus)
# Dense array form of the count matrix; note this rebinds X (previously the raw text column)
X = term_counts.toarray()

In [13]:
# Splitting the dataset into the Training set and Test set (80% / 20%, fixed seed)
# train_test_split is imported from sklearn.model_selection: the old
# sklearn.cross_validation module was removed from scikit-learn (0.20), and this
# notebook already relies on sklearn.model_selection for GridSearchCV.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)



# Different classifiers - Unigram model

In [14]:
# Multinomial Naive Bayes: train on the training split, then score the held-out split
from sklearn.naive_bayes import MultinomialNB
NB = MultinomialNB().fit(X_train, y_train)
y_pred = NB.predict(X_test)
print('\nNaive Bayes')
# Accuracy is reported as a percentage
accuracy_pct = metrics.accuracy_score(y_test, y_pred) * 100
print('Accuracy Score: ', accuracy_pct, '%', sep='')
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix: ', conf_matrix, sep='\n')


Naive Bayes
Accuracy Score: 98.5549132948%
Confusion Matrix: 
[[597  10]
 [ 10 767]]


In [15]:
# Linear SVM: train on the training split, then score the held-out split
from sklearn.svm import LinearSVC
SVM = LinearSVC().fit(X_train, y_train)
y_pred = SVM.predict(X_test)
print('\nSupport Vector Machine')
# Accuracy is reported as a percentage
accuracy_pct = metrics.accuracy_score(y_test, y_pred) * 100
print('Accuracy Score: ', accuracy_pct, '%', sep='')
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix: ', conf_matrix, sep='\n')


Support Vector Machine
Accuracy Score: 99.4942196532%
Confusion Matrix: 
[[603   4]
 [  3 774]]


In [16]:
# K Nearest Neighbors (k = 3): train on the training split, then score the held-out split
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors = 3).fit(X_train, y_train)
y_pred = KNN.predict(X_test)
print('\nK Nearest Neighbors')
# Accuracy is reported as a percentage
accuracy_pct = metrics.accuracy_score(y_test, y_pred) * 100
print('Accuracy Score: ', accuracy_pct, '%', sep='')
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix: ', conf_matrix, sep='\n')


K Nearest Neighbors
Accuracy Score: 97.9046242775%
Confusion Matrix: 
[[586  21]
 [  8 769]]


In [17]:
# Logistic Regression: train on the training split, then score the held-out split
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression().fit(X_train, y_train)
y_pred = LR.predict(X_test)
# Shape of the held-out feature matrix: (rows, vocabulary size)
print(X_test.shape)
print('\nLogistic Regression')
# Accuracy is reported as a percentage
accuracy_pct = metrics.accuracy_score(y_test, y_pred) * 100
print('Accuracy Score: ', accuracy_pct, '%', sep='')
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix: ', conf_matrix, sep='\n')

(1384, 1707)

Logistic Regression
Accuracy Score: 99.6387283237%
Confusion Matrix: 
[[602   5]
 [  0 777]]


In [18]:
# Random Forest (50 trees, entropy criterion, fixed seed): train, then score the held-out split
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators = 50, criterion = 'entropy', random_state = 0).fit(X_train, y_train)
y_pred = RF.predict(X_test)
print('\nRandom Forest')
# Accuracy is reported as a percentage
accuracy_pct = metrics.accuracy_score(y_test, y_pred) * 100
print('Accuracy Score: ', accuracy_pct, '%', sep='')
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix: ', conf_matrix, sep='\n')


Random Forest
Accuracy Score: 98.7716763006%
Confusion Matrix: 
[[595  12]
 [  5 772]]


In [22]:
#Input Review
print('\nTest a custom review message')
print('Enter review to be analysed: ', end=" ")
raw_review = input()
# Clean the typed review exactly like the training corpus (letters only, lowercase,
# stopwords removed, Porter-stemmed). The vectorizer vocabulary was learned from
# stemmed tokens such as 'awesom', so an unprocessed word like 'awesome' would not
# match any feature and would be silently ignored by the model.
review_stopwords = set(stopwords.words('english'))
review_stemmer = PorterStemmer()
review_words = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
test = [' '.join(review_stemmer.stem(word) for word in review_words
                 if word not in review_stopwords)]
# Encode with the already-fitted vectorizer (transform only, no refit)
test_dtm = cv.transform(test)
predLabel = LR.predict(test_dtm)
# Index 0 -> label 0, index 1 -> label 1
tags = ['Negative','Positive']
#Display Output
print('The review is predicted',tags[predLabel[0]])


Test a custom review message
Enter review to be analysed:  awesome book
The review is predicted Positive


# BIGRAMS

In [None]:
# Creating the Bag of Words model, restricted to bigrams (ngram_range = (2,2))
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range = (2,2))
X = cv.fit_transform(corpus).toarray()

# Splitting the dataset into the Training set and Test set
# train_test_split is imported from sklearn.model_selection: the old
# sklearn.cross_validation module was removed from scikit-learn (0.20), and this
# notebook already relies on sklearn.model_selection for GridSearchCV.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [None]:
# Multinomial Naive Bayes: train on the training split, then score the held-out split
from sklearn.naive_bayes import MultinomialNB
NB = MultinomialNB().fit(X_train, y_train)
y_pred = NB.predict(X_test)
print('\nNaive Bayes')
# Accuracy is reported as a percentage
accuracy_pct = metrics.accuracy_score(y_test, y_pred) * 100
print('Accuracy Score: ', accuracy_pct, '%', sep='')
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix: ', conf_matrix, sep='\n')

In [None]:
# Linear SVM: train on the training split, then score the held-out split
from sklearn.svm import LinearSVC
SVM = LinearSVC().fit(X_train, y_train)
y_pred = SVM.predict(X_test)
print('\nSupport Vector Machine')
# Accuracy is reported as a percentage
accuracy_pct = metrics.accuracy_score(y_test, y_pred) * 100
print('Accuracy Score: ', accuracy_pct, '%', sep='')
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix: ', conf_matrix, sep='\n')

In [None]:
# K Nearest Neighbors (k = 3): train on the training split, then score the held-out split
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors = 3).fit(X_train, y_train)
y_pred = KNN.predict(X_test)
print('\nK Nearest Neighbors')
# Accuracy is reported as a percentage
accuracy_pct = metrics.accuracy_score(y_test, y_pred) * 100
print('Accuracy Score: ', accuracy_pct, '%', sep='')
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix: ', conf_matrix, sep='\n')

In [None]:
# Logistic Regression: train on the training split, then score the held-out split
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression().fit(X_train, y_train)
y_pred = LR.predict(X_test)
print('\nLogistic Regression')
# Accuracy is reported as a percentage
accuracy_pct = metrics.accuracy_score(y_test, y_pred) * 100
print('Accuracy Score: ', accuracy_pct, '%', sep='')
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix: ', conf_matrix, sep='\n')

In [None]:
# Random Forest (15 trees, entropy criterion, fixed seed): train, then score the held-out split
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators = 15, criterion = 'entropy', random_state = 0).fit(X_train, y_train)
y_pred = RF.predict(X_test)
print('\nRandom Forest')
# Accuracy is reported as a percentage
accuracy_pct = metrics.accuracy_score(y_test, y_pred) * 100
print('Accuracy Score: ', accuracy_pct, '%', sep='')
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix: ', conf_matrix, sep='\n')

# TRIGRAMS

In [None]:
# Creating the Bag of Words model, restricted to trigrams (ngram_range = (3,3)),
# with a document-frequency ceiling (max_df) and a vocabulary cap (max_features)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_df = .85, max_features = 1500, ngram_range = (3,3))
X = cv.fit_transform(corpus).toarray()

# Splitting the dataset into the Training set and Test set
# train_test_split is imported from sklearn.model_selection: the old
# sklearn.cross_validation module was removed from scikit-learn (0.20), and this
# notebook already relies on sklearn.model_selection for GridSearchCV.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [None]:
# Multinomial Naive Bayes: train on the training split, then score the held-out split
from sklearn.naive_bayes import MultinomialNB
NB = MultinomialNB().fit(X_train, y_train)
y_pred = NB.predict(X_test)
print('\nNaive Bayes')
# Accuracy is reported as a percentage
accuracy_pct = metrics.accuracy_score(y_test, y_pred) * 100
print('Accuracy Score: ', accuracy_pct, '%', sep='')
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix: ', conf_matrix, sep='\n')

In [None]:
# Linear SVM: train on the training split, then score the held-out split
from sklearn.svm import LinearSVC
SVM = LinearSVC().fit(X_train, y_train)
y_pred = SVM.predict(X_test)
print('\nSupport Vector Machine')
# Accuracy is reported as a percentage
accuracy_pct = metrics.accuracy_score(y_test, y_pred) * 100
print('Accuracy Score: ', accuracy_pct, '%', sep='')
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix: ', conf_matrix, sep='\n')

In [None]:
# K Nearest Neighbors (k = 3): train on the training split, then score the held-out split
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors = 3).fit(X_train, y_train)
y_pred = KNN.predict(X_test)
print('\nK Nearest Neighbors')
# Accuracy is reported as a percentage
accuracy_pct = metrics.accuracy_score(y_test, y_pred) * 100
print('Accuracy Score: ', accuracy_pct, '%', sep='')
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix: ', conf_matrix, sep='\n')

In [None]:
# Logistic Regression: train on the training split, then score the held-out split
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression().fit(X_train, y_train)
y_pred = LR.predict(X_test)
print('\nLogistic Regression')
# Accuracy is reported as a percentage
accuracy_pct = metrics.accuracy_score(y_test, y_pred) * 100
print('Accuracy Score: ', accuracy_pct, '%', sep='')
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix: ', conf_matrix, sep='\n')

In [None]:
# Random Forest (15 trees, entropy criterion, fixed seed): train, then score the held-out split
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators = 15, criterion = 'entropy', random_state = 0).fit(X_train, y_train)
y_pred = RF.predict(X_test)
print('\nRandom Forest')
# Accuracy is reported as a percentage
accuracy_pct = metrics.accuracy_score(y_test, y_pred) * 100
print('Accuracy Score: ', accuracy_pct, '%', sep='')
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix: ', conf_matrix, sep='\n')

In [None]:
# Render the image file 'accuracy.png' inline.
# NOTE(review): no cell in this notebook writes that file — presumably a pre-made
# chart of the accuracies above; confirm it ships next to the notebook.
from IPython.display import Image
Image(filename='accuracy.png') 

# Hyperparameters

In [None]:
# Disabled grid-search attempt, kept inside a string literal so nothing in it runs
# (the cell only echoes the string).
# NOTE(review): 'max_df' is used elsewhere in this notebook as a CountVectorizer
# argument, while the estimator here is LR — this grid would presumably be rejected
# if re-enabled; confirm before reviving, or delete the cell.
"""from sklearn.model_selection import GridSearchCV
parameters = [{'max_df': [1, 0.95, 0.9, 0.85]}]
grid_search = GridSearchCV(estimator = LR,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search = grid_search.fit(X, y)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_"""

In [None]:
from sklearn.model_selection import GridSearchCV
parameters = {'max_df':(1,0.9,0.8), 'min_df':[0.1,0.2,0.3]}
grid_search = GridSearchCV(estimator = LR,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           )

In [None]:
grid_search.fit(X_train, y_train)

In [None]:
grid_search.fit(X_train, y_train).best_params_

In [None]:
# Creating the Bag of Words model (terms present in more than 90% of reviews are dropped)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_df = .9)
X = cv.fit_transform(corpus).toarray()

# Re-split so the model below is trained and scored on the features just built.
# Without this step X_train / X_test still hold whichever split ran last, and the
# max_df setting above has no effect on the reported numbers.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# Fitting Logistic Regression model to the Training set
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(X_train, y_train)
y_pred = LR.predict(X_test)
print('\nLogistic Regression')
print('Accuracy Score: ',metrics.accuracy_score(y_test,y_pred)*100,'%',sep='')
print('Confusion Matrix: ',metrics.confusion_matrix(y_test,y_pred), sep = '\n')

In [None]:
# Creating the Bag of Words model (terms present in more than 80% of reviews are dropped)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_df = .8)
X = cv.fit_transform(corpus).toarray()

# Re-split so the model below is trained and scored on the features just built.
# Without this step X_train / X_test still hold whichever split ran last, and the
# max_df setting above has no effect on the reported numbers.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# Fitting Logistic Regression model to the Training set
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(X_train, y_train)
y_pred = LR.predict(X_test)
print('\nLogistic Regression')
print('Accuracy Score: ',metrics.accuracy_score(y_test,y_pred)*100,'%',sep='')
print('Confusion Matrix: ',metrics.confusion_matrix(y_test,y_pred), sep = '\n')

In [None]:
# Creating the Bag of Words model (terms present in more than 70% of reviews are dropped)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_df = .7)
X = cv.fit_transform(corpus).toarray()

# Re-split so the model below is trained and scored on the features just built.
# Without this step X_train / X_test still hold whichever split ran last, and the
# max_df setting above has no effect on the reported numbers.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# Fitting Logistic Regression model to the Training set
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(X_train, y_train)
y_pred = LR.predict(X_test)
print('\nLogistic Regression')
print('Accuracy Score: ',metrics.accuracy_score(y_test,y_pred)*100,'%',sep='')
print('Confusion Matrix: ',metrics.confusion_matrix(y_test,y_pred), sep = '\n')