# Preprocessing

In [3]:
# Import required Libs for Preprocessing

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df= pd.read_csv('walmart_product_reviews_dataset.csv', sep=',')
df.head()

ModuleNotFoundError: No module named 'pandas'

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20639 entries, 0 to 20638
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   crawled_at         20639 non-null  object 
 1   verified           20639 non-null  bool   
 2   reviewed_at        20639 non-null  object 
 3   reviewed_by        20626 non-null  object 
 4   helpful_count      20639 non-null  int64  
 5   not_helpful_count  20639 non-null  int64  
 6   content            16208 non-null  object 
 7   raw_content        20639 non-null  object 
 8   average_rating     19609 non-null  float64
 9   review_heading     14020 non-null  object 
 10  product_url        20639 non-null  object 
 11  review_page_url    20639 non-null  object 
 12  rating             20639 non-null  float64
dtypes: bool(1), float64(2), int64(2), object(8)
memory usage: 1.9+ MB


In [None]:
df = df.dropna(subset=['review_heading'])
df = df.dropna(subset=['average_rating'])

In [None]:
# Shape of Dataset

print("Number of Rows: " + str(df.shape[0]))
print("Number of Columns: " + str(df.shape[1]))

Number of Rows: 13133
Number of Columns: 13


In [None]:
df['sentiment'] = np.where(df['rating'] > 3, 'positive',
                           np.where(df['rating'] == 3, 'neutral', 'negative'))

In [None]:
# Select Required Columns Only

tweet_df = df[['content','sentiment']]

In [None]:
tweet_df.head()

Unnamed: 0,content,sentiment
3,These go up very easily and I love the cordles...,positive
4,"When using the rod to open the blind, the rod ...",positive
5,one of the top brackets was broken. how do I ...,positive
6,I love the sting free pushing and pulling to a...,positive
7,Very agitating to order a 25 x 72 inch blind (...,neutral


In [None]:
# Select only Positive and Negative Reviews

tweet_df = tweet_df[tweet_df['sentiment'] != 'neutral']

In [None]:
# Attribute and Labels
X = tweet_df.content
y = tweet_df.sentiment

In [None]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Checking Train Set

print("Training Set X Items: " + str(len(X_train)))
print("Training Set y Items: " + str(len(y_train)))

Training Set X Items: 9936
Training Set y Items: 9936


In [None]:
# Checking Test Set

print("Test Set X Items: " + str(len(X_test)))
print("Test Set y Items: " + str(len(y_test)))

Test Set X Items: 2485
Test Set y Items: 2485


In [None]:
# Getting required labels only and encoding

review_labels_train = y_train.factorize()

In [None]:
review_labels_train[0]

array([0, 1, 1, ..., 1, 1, 1])

In [None]:
# Check Review Labels
review_labels_train[1]

Index(['negative', 'positive'], dtype='object')

# Next

In [None]:
# Importing required tf modules
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
vocab = X_train.values
vocab

array(["Ordered rocker for my Granddaughter's 1st birthday. Liked the fact that it was made in the usa and the price. Received rocker and am very disappointed with the quality. The finish looks like it was applied poorly. It is very rough in some areas and the nail (or staple holes) are very rough and unfinished. Will be sending it back.",
       'well I bought the first one you had before which was not a good buy but I kept it anyway then you can with this one which is much better thanks',
       'Great sheets my grands love the dogs', ...,
       "I can't believe I went all my life standing/prepping/cooking without this. What a way to spoil myself!",
       'I have used other waffle makers in the past. They either burn the waffles, the mix drips out, or the machine breaks after a few uses. I use this waffle maker a few times and I am blown away. It is extremely easy to use, extremely easy to clean, and the waffles come out perfectly with no stress.The waffles came out at the perfect 

In [None]:
# Vectorize a text corpus, by turning each text into sequence of integers

tokenizer = Tokenizer(num_words=8000,oov_token='OOV')
tokenizer.fit_on_texts(vocab)

vocab_size = len(tokenizer.word_index) + 1

print(tokenizer)
print(vocab_size)

<keras.src.preprocessing.text.Tokenizer object at 0x7c9a90d0f8e0>
11133


In [None]:
len(tokenizer.word_index)

11132

In [None]:
# # To save the tokenized vocab for Web app
# import pickle
# with open('tokenizer.pickle', 'wb') as handle:
#     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Store and Padding Converted Sequences
tweet = X_train.values

tweet_seqs = tokenizer.texts_to_sequences(tweet)

padded_sequence_train = pad_sequences(tweet_seqs, maxlen=200)
print(padded_sequence_train)

[[   0    0    0 ... 1262    6   96]
 [   0    0    0 ...   77  140  303]
 [   0    0    0 ...   26    2  809]
 ...
 [   0    0    0 ...    7    1  417]
 [   0    0    0 ... 3038   12  291]
 [   0    0    0 ...    9    5   38]]


In [None]:
# Check padded sequence element

print(padded_sequence_train[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
  142 1358   11   12 5059 1623  988  489    2  480   18    6   13   87
   14    2 2115    4    2   72  174 1358    4   74   22  275   19    2
   54    2  718  104   32    6   13 3353 1442    6   10   22  842   14
  144 1152    4    2 1869   64 2223  592   21   22  842    4 2632   60
   41 

In [None]:
# Build the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.layers import Embedding

embedding_vector_length = 32

model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_length,
                                     input_length=200) )
model.add(SpatialDropout1D(0.25))
model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 32)           356256    
                                                                 
 spatial_dropout1d (Spatial  (None, 200, 32)           0         
 Dropout1D)                                                      
                                                                 
 lstm (LSTM)                 (None, 50)                16600     
                                                                 
 dropout (Dropout)           (None, 50)                0         
                                                                 
 dense (Dense)               (None, 1)                 51        
                                                                 
Total params: 372907 (1.42 MB)
Trainable params: 372907 (1.42 MB)
Non-trainable params: 0 (0.00 Byte)
____________________

In [None]:
# # Used to save trained model

# model.save("sentiment_analysis.h5")
# print("Model Saved")

# Train Model

In [None]:
# Training the model
trained = model.fit(padded_sequence_train,review_labels_train[0],
                  validation_split=0.2, epochs=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Encoding and Padding Test Data to Check Accuracy

encoded_docs = tokenizer.texts_to_sequences(X_test)
padded_sequence_test = pad_sequences(encoded_docs, maxlen=200)
print(padded_sequence_test)

[[  0   0   0 ...  34  99 171]
 [  0   0   0 ... 398 204  95]
 [  0   0   0 ... 360   3 135]
 ...
 [  0   0   0 ...   4  22 437]
 [  0   0   0 ...  51 218   6]
 [  0   0   0 ...  26   2 797]]


In [None]:
sentiment_label_test = y_test.factorize()
sentiment_label_test[0]

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
score = model.evaluate(padded_sequence_test,sentiment_label_test[0],verbose=0)

In [None]:
print("Accuracy: {}".format(score[1]))

Accuracy: 0.05714285746216774


# Load Model

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import models

In [None]:
#Code to load the saved model
model = models.load_model('sentiment_analysis.h5')
print("Model Loaded")
model.summary()

Model Loaded
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 32)           372352    
                                                                 
 spatial_dropout1d_1 (Spati  (None, 200, 32)           0         
 alDropout1D)                                                    
                                                                 
 lstm_1 (LSTM)               (None, 50)                16600     
                                                                 
 dropout_1 (Dropout)         (None, 50)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 51        
                                                                 
Total params: 389003 (1.48 MB)
Trainable params: 389003 (1.48 MB)
Non-trainable params: 0 (0.00 Byte)
_____

In [None]:
score = model.evaluate(padded_sequence_test,sentiment_label_test[0],verbose=0)

# Accuracy

In [None]:
print("Accuracy: {}".format(score[1]))

Accuracy: 0.8193159103393555


# Test Run

In [None]:
!pip install Keras-Preprocessing

Collecting Keras-Preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Keras-Preprocessing
Successfully installed Keras-Preprocessing-1.1.2


In [None]:
import pickle
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [None]:
len(tokenizer.word_index)

11635

In [None]:
# Test Review Sentence

test_word ="""
These masks were a steal! 50 for 12$!! They are soft, breathable, light, comfortable and professional. It doesn’t hurt or irritate my ears and where they glue the straps to the mask isn’t hard and crusty like the basic blue ones. These are great!
"""

# test_word = """
# i'm so in love with this product, even the price is cheap but the quality is actually very nice
# """

# To
tw = tokenizer.texts_to_sequences([test_word])
tw = pad_sequences(tw,maxlen=200)

tw

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

In [None]:
prediction = int(model.predict(tw).round().item())
outcome = (review_labels_train[1][prediction]).capitalize()

print("Actual Review: " + test_word)
print("\nSentiment Analysis Outcome ==> The review shows " + (review_labels_train[1][prediction]).capitalize() + " sentiment.")
print("\n======================================================================================")

print("\nAccuracy Criteria \n\nProbability Closer to 0 == Negative Sentiment\nProbability Closer to 1 == Positive Sentiment")

prob = model.predict(tw)[0][0]

print("\n ==> Probability is " + str(prob)+ " (" + outcome + ")")

Actual Review: 
These masks were a steal! 50 for 12$!! They are soft, breathable, light, comfortable and professional. It doesn’t hurt or irritate my ears and where they glue the straps to the mask isn’t hard and crusty like the basic blue ones. These are great!


Sentiment Analysis Outcome ==> The review shows Positive sentiment.


Accuracy Criteria 

Probability Closer to 0 == Negative Sentiment
Probability Closer to 1 == Positive Sentiment

 ==> Probability is 0.8342032 (Positive)


In [None]:
import h5py

f = h5py.File('sentiment_analysis.h5', 'r')

In [None]:
import h5py
f = h5py.File('sentiment_analysis.h5','r')
for item in f.keys():
    print(item) #+ ":", f[item]

model_weights
optimizer_weights


In [None]:
with h5py.File('sentiment_analysis.h5', 'r') as f:
	data = f['model_weights']
	print(min(data))
	print(max(data))
	print(data)

dense_1
spatial_dropout1d_1
<HDF5 group "/model_weights" (5 members)>


In [None]:
with h5py.File('sentiment_analysis.h5', 'r') as f:
	data = f['optimizer_weights']
	print(min(data))
	print(max(data))
	print(data)

Adam
Adam
<HDF5 group "/optimizer_weights" (1 members)>


In [None]:
list(f.keys())

['model_weights', 'optimizer_weights']

In [None]:
with open('tokenizer.pickle', 'rb') as f:
    x = pickle.load(f)

In [None]:
x

<keras_preprocessing.text.Tokenizer at 0x7c9a7df26290>