ML-MAJOR-FEB-ML-02-BM4<br>
Sentiment Analysis for Amazon reviews

In [1]:
#Importing general libraries

import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np

# Data Gather: Web Scraping

In [2]:
#Importing csv file

products = pd.read_csv('/content/drive/MyDrive/Machine Learning/Major Project/dataset/amazon_baby.csv')

In [3]:
#Dataframe head

products.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [4]:
#Dataframe tail

products.tail()

Unnamed: 0,name,review,rating
183526,Baby Teething Necklace for Mom Pretty Donut Sh...,Such a great idea! very handy to have and look...,5
183527,Baby Teething Necklace for Mom Pretty Donut Sh...,This product rocks! It is a great blend of fu...,5
183528,Abstract 2 PK Baby / Toddler Training Cup (Pink),This item looks great and cool for my kids.......,5
183529,"Baby Food Freezer Tray - Bacteria Resistant, B...",I am extremely happy with this product. I have...,5
183530,Best 2 Pack Baby Car Shade for Kids - Window S...,I love this product very mush . I have bought ...,5


In [5]:
#Review count

products['review'].value_counts()

good                                                                                                                                                                                                                                                                                                                                                                   5
very nice                                                                                                                                                                                                                                                                                                                                                              5
Very good                                                                                                                                                                                                                                                                             

In [6]:
#Rating count

products['rating'].value_counts()

5    107054
4     33205
3     16779
1     15183
2     11310
Name: rating, dtype: int64

In [7]:
products.shape

(183531, 3)

# Preprocessing

In [8]:
#Total raw data
len(products)

183531

In [9]:
#Non null values
len(products) - len(products.dropna())

1147

In [10]:
#Removing null values

products = products.dropna()

In [11]:
#Classifying clomuns and checking for review column

for i in range(0,len(products)-1):
    if type(products.iloc[i]['review']) != str:
        products.iloc[i]['review'] = str(products.iloc[i]['review'])

In [12]:
#Splitting positive and negative reviews

def sentiment(n):
    return 'Positive' if n >= 4 else 'Negative'
products = products[products['rating'] != 3] # removing median review value
products['sentiment'] = products['rating'].apply(sentiment)
products.head()

Unnamed: 0,name,review,rating,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,Positive
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Positive
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,Positive
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,Positive
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,Positive


In [13]:
#Sentiment count

products['sentiment'].value_counts()

Positive    139318
Negative     26361
Name: sentiment, dtype: int64

In [14]:
#Adding feature 

def combined_features(row):
    return row['name'] + ' '+ row['review']
products['all_features'] = products.apply(combined_features, axis=1)
products.head()

Unnamed: 0,name,review,rating,sentiment,all_features
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,Positive,Planetwise Wipe Pouch it came early and was no...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Positive,Annas Dream Full Quilt with 2 Shams Very soft ...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,Positive,Stop Pacifier Sucking without tears with Thumb...
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,Positive,Stop Pacifier Sucking without tears with Thumb...
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,Positive,Stop Pacifier Sucking without tears with Thumb...


In [15]:
#Splitting

x = products.iloc[:,4] # contains all features
y = products.iloc[:,3] # contains positive or negative

# Vectorization: CountVectorizer

In [16]:
# train test splitting

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=0,stratify=y)

In [17]:
#Shapes of x & y

print(x_train.shape)
print(x_test.shape)

(115975,)
(49704,)


In [18]:
#Unique values of y_train

np.unique(y_train,return_counts=True)

(array(['Negative', 'Positive'], dtype=object), array([18453, 97522]))

In [19]:
#Unique values of y_train

np.unique(y_test,return_counts=True)

(array(['Negative', 'Positive'], dtype=object), array([ 7908, 41796]))

In [20]:
#Vectorization & Importing Pipeline, CountVectorizer, LogisticRegression

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

review_model = Pipeline([('tfidf',CountVectorizer()),('model',LogisticRegression(max_iter=2000))]) # Pipelining ( combining CountVectorizer & LogisticRegression )

# Logistic Regression

In [21]:
#Fitting the Model

review_model.fit(x_train,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('model',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=2000,
                                    multi_class='auto', n_jobs=None,
                      

In [22]:
#Generating prediction model

y_pred = review_model.predict(x_test)

In [23]:
#Prediction

y_pred

array(['Positive', 'Positive', 'Positive', ..., 'Positive', 'Negative',
       'Positive'], dtype=object)

In [24]:
#Test

y_test

130708    Positive
135614    Positive
87955     Positive
163763    Positive
82752     Positive
            ...   
145268    Positive
138441    Positive
42890     Positive
182049    Positive
100627    Positive
Name: sentiment, Length: 49704, dtype: object

# Accuracy Tests

In [25]:
#Accuracy Score

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)*100

93.06695638178014

In [26]:
#Classification Report

from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

    Negative       0.73      0.81      0.77      7114
    Positive       0.97      0.95      0.96     42590

    accuracy                           0.93     49704
   macro avg       0.85      0.88      0.86     49704
weighted avg       0.93      0.93      0.93     49704



In [27]:
#Confusion Matrix

from sklearn.metrics import confusion_matrix
confusion_matrix(y_pred,y_test)

array([[ 5788,  1326],
       [ 2120, 40470]])

# Random Test

In [28]:
import random # random
ran = int(random.randint(1,1000))
ranstr = x_test.iloc[ran]
ranstr

'Evenflo Aura Select Travel System, Alhambra Excelente producto, trae la silla de carro con la base incluida (car seat) y es f&aacute;cil de instalar en el carro y en el coche para pasear al recien nacido.La compra y la entrega fueron r&aacute;pidas, excelente servicio.'

In [29]:
#Random Test

review_model.predict([ranstr])

array(['Positive'], dtype=object)

# Positive & Negative Splitting

In [30]:
#Positive Frame

pos_frame = products[products['sentiment']=='Positive']

In [31]:
pos_frame.head()

Unnamed: 0,name,review,rating,sentiment,all_features
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,Positive,Planetwise Wipe Pouch it came early and was no...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Positive,Annas Dream Full Quilt with 2 Shams Very soft ...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,Positive,Stop Pacifier Sucking without tears with Thumb...
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,Positive,Stop Pacifier Sucking without tears with Thumb...
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,Positive,Stop Pacifier Sucking without tears with Thumb...


In [32]:
#Negative Frame

neg_frame = products[products['sentiment']=='Negative']

In [33]:
neg_frame.head()

Unnamed: 0,name,review,rating,sentiment,all_features
21,Nature\'s Lullabies Second Year Sticker Calendar,I only purchased a second-year calendar for my...,2,Negative,Nature\'s Lullabies Second Year Sticker Calend...
41,"SoftPlay Giggle Jiggle Funbook, Happy Bear",This bear is absolutely adorable and I would g...,2,Negative,"SoftPlay Giggle Jiggle Funbook, Happy Bear Thi..."
50,"SoftPlay Cloth Book, Love",This book is boring. Nothing to stimulate my g...,1,Negative,"SoftPlay Cloth Book, Love This book is boring...."
70,Hunnt&reg; Falling Flowers and Birds Kids Nurs...,The reason:Small sizeHard to apply on the wall...,1,Negative,Hunnt&reg; Falling Flowers and Birds Kids Nurs...
71,Wall Decor Removable Decal Sticker - Colorful ...,Would not purchase again or recommend. The dec...,2,Negative,Wall Decor Removable Decal Sticker - Colorful ...


# Saving & Retrieving the model: ( Pickle )

In [34]:
import pickle # importing pickle

#save the model to disk
filename = 'Amazon_reviews.sav'
pickle.dump(review_model, open(filename, 'wb'))

In [35]:
#for heroku - joblib
import joblib
joblib.dump(review_model,'Amazon_reviews')

['Amazon_reviews']

In [36]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(x_test, y_test)*100
print(result)

93.06695638178014


# Web App using Streamlit

In [37]:
#Installing requirements

!pip install streamlit --quiet
!pip install pyngrok==4.1.1 --quiet
from pyngrok import ngrok

[K     |████████████████████████████████| 8.2MB 5.6MB/s 
[K     |████████████████████████████████| 4.2MB 45.0MB/s 
[K     |████████████████████████████████| 81kB 8.4MB/s 
[K     |████████████████████████████████| 163kB 29.8MB/s 
[K     |████████████████████████████████| 112kB 45.4MB/s 
[K     |████████████████████████████████| 122kB 36.7MB/s 
[K     |████████████████████████████████| 71kB 7.3MB/s 
[?25h  Building wheel for blinker (setup.py) ... [?25l[?25hdone
[31mERROR: google-colab 1.0.0 has requirement ipykernel~=4.10, but you'll have ipykernel 5.5.3 which is incompatible.[0m
  Building wheel for pyngrok (setup.py) ... [?25l[?25hdone


In [38]:
%%writefile app.py
import streamlit as st
import sklearn
import pickle
import pandas as pd
products = pd.read_csv('/content/drive/MyDrive/Machine Learning/Major Project/dataset/small_csv_data.csv')
filename = 'Amazon_reviews.sav'
model = pickle.load(open(filename, 'rb'))
st.title('Sentiment Analysis of Amazon Reviews')
ip = st.text_input('Enter your message')
op = model.predict([ip])
if st.button('Predict'):
  st.title(op[0])
st.table(products['Example Reviews'])

Writing app.py


In [39]:
!nohup streamlit run app.py &
url=ngrok.connect(port='8501')
url

nohup: appending output to 'nohup.out'


'http://2555211a13bc.ngrok.io'

In [40]:
pip install pipreqs

Collecting pipreqs
  Downloading https://files.pythonhosted.org/packages/9b/83/b1560948400a07ec094a15c2f64587b70e1a5ab5f7b375ba902fcab5b6c3/pipreqs-0.4.10-py2.py3-none-any.whl
Collecting yarg
  Downloading https://files.pythonhosted.org/packages/8b/90/89a2ff242ccab6a24fbab18dbbabc67c51a6f0ed01f9a0f41689dc177419/yarg-0.1.9-py2.py3-none-any.whl
Installing collected packages: yarg, pipreqs
Successfully installed pipreqs-0.4.10 yarg-0.1.9


In [41]:
!pipreqs

INFO: Successfully saved requirements file in /content/requirements.txt
