In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Heli\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Heli\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Heli\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Heli\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
df = pd.read_csv('amazon_alexa.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,4,5,31-Jul-18,Charcoal Fabric,Music,1


In [5]:
df.shape

(3150, 6)

### Removing Null values

In [6]:
df.isnull().sum()

Unnamed: 0          0
rating              0
date                0
variation           0
verified_reviews    0
feedback            0
dtype: int64

### Preprocessing data

In [7]:
import re
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [8]:
lm = WordNetLemmatizer()
sw = stopwords.words('English')

In [9]:
data = []
for i in df['verified_reviews']:
    t = i.lower()                        # conversion to lower case
    t = re.sub('[^A-Za-z]',' ',t)        # removal of punc and num
    t = word_tokenize(t)                 # word tokenization
    t = [i for i in t if i not in sw]    # stop words removal
    t = [lm.lemmatize(i) for i in t]     # lemmatization (walking -> walk)
    t = " ".join(t)                      # returns a sentence
    data.append(t) 
print(data)    



### Select x(independent feature) and y(dependent feature)

In [10]:
x = data
y = df['feedback']
print(type(x))
print(type(y))

<class 'list'>
<class 'pandas.core.series.Series'>


In [11]:
from sklearn.model_selection import train_test_split

In [12]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.30,random_state=42)
print(len(x_train))
print(len(x_test))
print(y_train.shape)
print(y_test.shape)

2205
945
(2205,)
(945,)


In [13]:
from sklearn.metrics import confusion_matrix,classification_report, accuracy_score

In [14]:
def eval_model(ytest,ypred):
    print('Accuracy Score: ',accuracy_score(ytest,ypred))
    print(confusion_matrix(ytest,ypred))
    print(classification_report(ytest,ypred))

### Count Vectorizer

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
cv = CountVectorizer(max_features=1500)
cv.fit(x_train)

x_train_cv = cv.transform(x_train)
x_test_cv = cv.transform(x_test)
print(x_train_cv.shape)
print(x_test_cv.shape)

(2205, 1500)
(945, 1500)


### Multinomial Naïve Bayes Classification

In [17]:
from sklearn.naive_bayes import MultinomialNB

In [18]:
mnb = MultinomialNB()
mnb.fit(x_train_cv,y_train)

MultinomialNB()

In [19]:
print('Train Score',mnb.score(x_train_cv,y_train))
print('Test Score',mnb.score(x_test_cv,y_test))

Train Score 0.9591836734693877
Test Score 0.9164021164021164


In [20]:
ypred_test = mnb.predict(x_test_cv)
print(ypred_test)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 

In [21]:
eval_model(y_test,ypred_test)

Accuracy Score:  0.9164021164021164
[[ 22  66]
 [ 13 844]]
              precision    recall  f1-score   support

           0       0.63      0.25      0.36        88
           1       0.93      0.98      0.96       857

    accuracy                           0.92       945
   macro avg       0.78      0.62      0.66       945
weighted avg       0.90      0.92      0.90       945



### Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [23]:
lb = LabelEncoder()
x1 = lb.fit_transform(x).reshape(-1, 1)

In [24]:
x1_train,x1_test,y_train,y_test = train_test_split(x1,y,test_size=0.30,random_state=42)
print(len(x1_train))
print(len(x1_test))
print(y_train.shape)
print(y_test.shape)

2205
945
(2205,)
(945,)


In [25]:
m1 = LogisticRegression()
m1.fit(x1_train,y_train)

LogisticRegression()

In [26]:
print('Train Score',m1.score(x1_train,y_train))
print('Test Score',m1.score(x1_test,y_test))

Train Score 0.9233560090702948
Test Score 0.9068783068783068


In [27]:
ypred_m1 = m1.predict(x1_test)
print(ypred_m1)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 

In [28]:
eval_model(y_test,ypred_m1)

Accuracy Score:  0.9068783068783068
[[  0  88]
 [  0 857]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        88
           1       0.91      1.00      0.95       857

    accuracy                           0.91       945
   macro avg       0.45      0.50      0.48       945
weighted avg       0.82      0.91      0.86       945



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### KNN Classification

In [29]:
from sklearn.neighbors import KNeighborsClassifier

In [30]:
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(x1_train,y_train)

KNeighborsClassifier(n_neighbors=11)

In [31]:
print('Train score',knn.score(x1_train,y_train))  # Train Acc
print('Test score',knn.score(x1_test,y_test))

Train score 0.9260770975056689
Test score 0.9068783068783068


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [32]:
ypred_k1 = knn.predict(x1_test)
print(ypred_k1)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [33]:
eval_model(y_test,ypred_k1)

Accuracy Score:  0.9068783068783068
[[  2  86]
 [  2 855]]
              precision    recall  f1-score   support

           0       0.50      0.02      0.04        88
           1       0.91      1.00      0.95       857

    accuracy                           0.91       945
   macro avg       0.70      0.51      0.50       945
weighted avg       0.87      0.91      0.87       945



 ## Multinomial Naïve Bayes Classification has the best accuracy