In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['database.sqlite', 'hashes.txt', 'Reviews.csv']


In [2]:
import matplotlib.pyplot as plt
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')
import time
import tqdm

In [3]:
data = pd.read_csv('../input/Reviews.csv')

In [4]:
print(data.head(5))
print(data.shape)

   Id                        ...                                                                       Text
0   1                        ...                          I have bought several of the Vitality canned d...
1   2                        ...                          Product arrived labeled as Jumbo Salted Peanut...
2   3                        ...                          This is a confection that has been around a fe...
3   4                        ...                          If you are looking for the secret ingredient i...
4   5                        ...                          Great taffy at a great price.  There was a wid...

[5 rows x 10 columns]
(568454, 10)


In [5]:
#to remove the datapoints which are neither positive nor negative
data = data[data['Score']!=3]

In [6]:
data.shape

(525814, 10)

In [7]:
#for score>3, it is 'positive' review and score<3 is 'negative' review
def partition(x):
    if x>3:
        return 'positive'
    else:
        return 'negative'

In [8]:
data['Score'] = data['Score'].apply(partition)

In [9]:
sorted_data = data.sort_values(by='ProductId', axis=0)

In [10]:
final = sorted_data.drop_duplicates(subset={"UserId","Time","Text","ProfileName"})
final['Id'].size/sorted_data['Id'].size

0.6925890143662968

In [11]:
final = final[final.HelpfulnessNumerator<=final.HelpfulnessDenominator]

In [12]:
final['Score_n'] = final['Score'].apply(lambda x:1 if x == 'positive' else 0)

In [25]:
final_1 = final[:]

In [26]:
#to remove html tags in the text data
def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext

#to remove punctuations in the text data
def cleanpunc(raw_string):
    #print(raw_string)
    newstr = [char for char in raw_string.split() if char not in string.punctuation]
    #print(newstr)
    newstr = ' '.join(newstr)
    return newstr

def cleantext(raw_string):
    cleant = re.sub('[^a-zA-Z0-9\n]', ' ',raw_string)
    return cleant

In [27]:
final_1['Text'] = final_1['Text'].apply(cleanhtml)
final_1['Text'] = final_1['Text'].apply(cleanpunc)
final_1['Text'] = final_1['Text'].apply(cleantext)

final_data = final_1

X_train, X_test, y_train, y_test = train_test_split(final_data['Text'], final_data['Score_n'],test_size = 0.25, stratify = final_data['Score_n']  )

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

count_vect = CountVectorizer(ngram_range=(1,2))
X_train_count = count_vect.fit_transform(X_train.values)
X_test_count = count_vect.transform(X_test.values)

sc = StandardScaler(with_mean=False)
X_train_std = sc.fit_transform(X_train_count)
X_test_std = sc.transform(X_test_count)

(273128,)
(91043,)
(273128,)
(91043,)


In [16]:
start_time = time.clock()
knn = KNeighborsClassifier()
knn.fit(X_train_std, y_train)
y_pred = knn.predict(X_test_std)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("the time taken for the code execution is: ", (time.clock()-start_time)/60,"minutes")

0.8396
[[   0  401]
 [   0 2099]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       401
           1       0.84      1.00      0.91      2099

   micro avg       0.84      0.84      0.84      2500
   macro avg       0.42      0.50      0.46      2500
weighted avg       0.70      0.84      0.77      2500

the time taken for the code execution is:  0.04281418333333331 minutes


In [17]:
start_time = time.clock()
knn2 = KNeighborsClassifier()
gs = GridSearchCV(knn2, param_grid={'n_neighbors':[1,3,5,9]},scoring = 'f1_macro', cv=5)
gs.fit(X_train_std, y_train)
print("the time taken for the code execution is: ", (time.clock()-start_time)/60,"minutes")

the time taken for the code execution is:  1.2614259166666668 minutes


In [19]:
gs.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform')

In [None]:
#As per the grid search, the optimum number of neighbors is 1

In [23]:
start_time = time.clock()
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train_std, y_train)
y_pred = knn.predict(X_test_std)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("the time taken for the code execution is: ", (time.clock()-start_time)/60,"minutes")

0.8404
[[   3  398]
 [   1 2098]]
              precision    recall  f1-score   support

           0       0.75      0.01      0.01       401
           1       0.84      1.00      0.91      2099

   micro avg       0.84      0.84      0.84      2500
   macro avg       0.80      0.50      0.46      2500
weighted avg       0.83      0.84      0.77      2500

the time taken for the code execution is:  0.022910699999999905 minutes


In [24]:
#The poor result is due to imbalanced data and less number of training points. SO the entire training data is 
#considered for running the KNN with number of neighbors = 1

In [28]:
start_time = time.clock()
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train_std, y_train)
y_pred = knn.predict(X_test_std)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("the time taken for the code execution is: ", (time.clock()-start_time)/60,"minutes")

0.8437221972035193
[[  943 13335]
 [  893 75872]]
              precision    recall  f1-score   support

           0       0.51      0.07      0.12     14278
           1       0.85      0.99      0.91     76765

   micro avg       0.84      0.84      0.84     91043
   macro avg       0.68      0.53      0.52     91043
weighted avg       0.80      0.84      0.79     91043

the time taken for the code execution is:  51.151398666666665 minutes


In [None]:
#as all the training points are taken, the time taken for KNN processing increses. Hence grid search was done for 
#less number of training points as it is computationally expensive and time consuming

**PART 2: LOGISTIC REGRESSION**[](http://)

In [None]:
#Now logistic regression is used as training algorithm and results are compared.

In [29]:
#logistic regression with default vaules
start_time = time.clock()
lrmodel = LogisticRegression()
lrmodel.fit(X_train_std, y_train)
y_pred = lrmodel.predict(X_test_std)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Time taken by code to run is: ", (time.clock()-start_time)/60,'minutes')

0.9244642641389234
[[ 8948  5330]
 [ 1547 75218]]
              precision    recall  f1-score   support

           0       0.85      0.63      0.72     14278
           1       0.93      0.98      0.96     76765

   micro avg       0.92      0.92      0.92     91043
   macro avg       0.89      0.80      0.84     91043
weighted avg       0.92      0.92      0.92     91043

Time taken by code to run is:  6.14073965 minutes


In [None]:
#in comparison with KNN, the model has better values of precision, recall and f1-score. This is logistic regression
#with default values. However, a grid search will be conducted to find the best estimator

In [31]:
#run a grid search for finding the best hyper parameter
start_time = time.clock()
lrmodel_2 = LogisticRegression()
gs = GridSearchCV(lrmodel_2,param_grid={'C':[0.01,0.1,1,10]}, scoring = 'f1_macro',cv=5)
gs.fit(X_train_std, y_train)
print("Time taken by code to run is: ", (time.clock()-start_time)/60,'minutes')

Time taken by code to run is:  107.82228396666669 minutes


In [32]:
gs.best_estimator_

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [33]:
#as per the grid search the best C is 10

In [34]:
#logistic regression with C=10
start_time = time.clock()
lrmodel = LogisticRegression(C = 10)
lrmodel.fit(X_train_std, y_train)
y_pred = lrmodel.predict(X_test_std)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Time taken by code to run is: ", (time.clock()-start_time)/60,'minutes')

0.9241457333347979
[[ 9079  5199]
 [ 1707 75058]]
              precision    recall  f1-score   support

           0       0.84      0.64      0.72     14278
           1       0.94      0.98      0.96     76765

   micro avg       0.92      0.92      0.92     91043
   macro avg       0.89      0.81      0.84     91043
weighted avg       0.92      0.92      0.92     91043

Time taken by code to run is:  7.059089183333344 minutes


In [51]:
features  = count_vect.get_feature_names()
feat_weights = list(zip(features, lrmodel.coef_[0]))
feat_table = pd.DataFrame(feat_weights, columns = ['features','coefficients'])

In [52]:
feat_table = feat_table.sort_values(by = 'coefficients')

In [55]:
#top 20 positive features
feat_table[-25:]

Unnamed: 0,features,coefficients
776504,favorite,0.122884
944928,great product,0.12327
2102912,tastes great,0.124255
1265660,love the,0.126164
2180466,these are,0.12627
2180210,these,0.133417
1265685,love these,0.13477
166930,are excellent,0.136647
1115935,is,0.141674
1125091,is the,0.144654


In [56]:
#top 20 negative features
feat_table[0:20]

Unnamed: 0,features,coefficients
1451407,not worth,-0.286853
1445625,not buy,-0.158046
2343942,very disappointed,-0.147343
1449602,not recommend,-0.145028
2454570,would not,-0.1449
2452198,worst,-0.120319
192029,at all,-0.119305
2345880,very weak,-0.115304
1447452,not good,-0.11451
2164650,the worst,-0.114309


**Part 3: Naive bayes**

In [None]:
#bernoulli naive bayes is applied for the training data

In [41]:
start_time = time.clock()
ber_nb = BernoulliNB()
ber_nb.fit(X_train_std, y_train)
y_pred = ber_nb.predict(X_test_std)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Time taken by code to run is: ", (time.clock()-start_time)/60,'minutes')


0.8622519029469591
[[ 3229 11049]
 [ 1492 75273]]
              precision    recall  f1-score   support

           0       0.68      0.23      0.34     14278
           1       0.87      0.98      0.92     76765

   micro avg       0.86      0.86      0.86     91043
   macro avg       0.78      0.60      0.63     91043
weighted avg       0.84      0.86      0.83     91043

Time taken by code to run is:  0.03509916666668384 minutes


In [36]:
#run a grid search for finding the best hyper parameter
start_time = time.clock()
ber_nb1 = BernoulliNB()
gs = GridSearchCV(ber_nb1,param_grid={'alpha':[0.01,0.1,1,10]}, scoring = 'f1_macro',cv=5)
gs.fit(X_train_std, y_train)
print("Time taken by code to run is: ", (time.clock()-start_time)/60,'minutes')

Time taken by code to run is:  1.7005239833333083 minutes


In [37]:
gs.best_estimator_

BernoulliNB(alpha=0.01, binarize=0.0, class_prior=None, fit_prior=True)

In [40]:
#naive bayes with alpha = 0.01
start_time = time.clock()
ber_nb = BernoulliNB(alpha = 0.01)
ber_nb.fit(X_train_std, y_train)
y_pred = ber_nb.predict(X_test_std)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Time taken by code to run is: ", (time.clock()-start_time)/60,'minutes')

0.9237063805015212
[[ 8883  5395]
 [ 1551 75214]]
              precision    recall  f1-score   support

           0       0.85      0.62      0.72     14278
           1       0.93      0.98      0.96     76765

   micro avg       0.92      0.92      0.92     91043
   macro avg       0.89      0.80      0.84     91043
weighted avg       0.92      0.92      0.92     91043

Time taken by code to run is:  0.034288433333343464 minutes


In [58]:
features  = count_vect.get_feature_names()
feat_weights = list(zip(features, ber_nb.coef_[0]))
feat_table = pd.DataFrame(feat_weights, columns = ['features','coefficients'])
feat_table = feat_table.sort_values(by = 'coefficients')
#top 20 positive features
print(feat_table[-25:])
#top 20 negative features
print(feat_table[0:20])


        features  coefficients
942368     great     -1.296605
922004      good     -1.283715
179855        as     -1.274307
1224885     like     -1.257904
2365697      was     -1.252602
2185624     they     -1.238195
1955065       so     -1.168489
1444836      not     -1.130147
1503878       on     -1.122446
164826       are     -1.120527
2470562      you     -1.087420
988699      have     -0.947675
350197       but     -0.932092
2131204     that     -0.929570
2428685     with     -0.923221
1393900       my     -0.734621
1072457       in     -0.662278
835717       for     -0.615422
1473459       of     -0.549679
1115935       is     -0.480706
2196289     this     -0.454761
1129784       it     -0.416112
2224412       to     -0.382893
122412       and     -0.211613
2140077      the     -0.200902
                   features  coefficients
1091600     industrial kind    -12.347129
2013879   starkist followed    -12.347129
838850        for execution    -12.347129
2013887        starkist on

**Logistic regression using sgd classifier**

In [59]:
start_time = time.clock()
sgdcl = SGDClassifier(loss = 'log')
sgdcl.fit(X_train_std, y_train)
y_pred = sgdcl.predict(X_test_std)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Time taken by code to run is: ", (time.clock()-start_time)/60,'minutes')

0.8831650978109246
[[ 9793  4485]
 [ 6152 70613]]
              precision    recall  f1-score   support

           0       0.61      0.69      0.65     14278
           1       0.94      0.92      0.93     76765

   micro avg       0.88      0.88      0.88     91043
   macro avg       0.78      0.80      0.79     91043
weighted avg       0.89      0.88      0.89     91043

Time taken by code to run is:  0.06232443333331806 minutes


In [60]:
#the logisitc regression using sgd fares a slightly better recall for class '0'. However, f1 score is reduced

**SVM using SGD classifier**

In [61]:
start_time = time.clock()
sgdcl = SGDClassifier(loss = 'hinge')
sgdcl.fit(X_train_std, y_train)
y_pred = sgdcl.predict(X_test_std)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Time taken by code to run is: ", (time.clock()-start_time)/60,'minutes')

0.8825609876651692
[[ 9786  4492]
 [ 6200 70565]]
              precision    recall  f1-score   support

           0       0.61      0.69      0.65     14278
           1       0.94      0.92      0.93     76765

   micro avg       0.88      0.88      0.88     91043
   macro avg       0.78      0.80      0.79     91043
weighted avg       0.89      0.88      0.89     91043

Time taken by code to run is:  0.047794599999997446 minutes


In [None]:
svm = SVC()
