In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import re
import os
import string
import nltk
import emoji
from string import punctuation
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Activation, Embedding, Flatten, Dropout, LSTM, Conv1D, MaxPooling1D, Bidirectional

Using TensorFlow backend.


In [2]:
reviews = pd.read_csv('AllProductReviews.csv')
reviews.head()

Unnamed: 0,ReviewTitle,ReviewBody,ReviewStar,Product
0,Honest review of an edm music lover\n,No doubt it has a great bass and to a great ex...,3,boAt Rockerz 255
1,Unreliable earphones with high cost\n,"This earphones are unreliable, i bought it be...",1,boAt Rockerz 255
2,Really good and durable.\n,"i bought itfor 999,I purchased it second time,...",4,boAt Rockerz 255
3,stopped working in just 14 days\n,Its sound quality is adorable. overall it was ...,1,boAt Rockerz 255
4,Just Awesome Wireless Headphone under 1000...😉\n,Its Awesome... Good sound quality & 8-9 hrs ba...,5,boAt Rockerz 255


## Data Cleaning

In [3]:
#Combine the reviewTitle and ReviewBody columns into one
reviews['review'] = reviews['ReviewTitle'] + reviews['ReviewBody']

In [4]:
complete_review = reviews['review']

In [5]:
word = set(nltk.corpus.words.words())
sw = set(stopwords.words('english'))

In [6]:
def clean(text):
    text = text.strip()
    text = text.lower()
    text = text.split()
    text = ' '.join([string for string in text if not string in sw])
    text = re.sub(r'(?<=[.,])(?=[^\s])', r' ', text)
    text = text.translate(str.maketrans('', '', string.punctuation+'0123456789'))
    text = ' '.join(text.split())
    text = ''.join(string for string in text if string not in emoji.UNICODE_EMOJI)
    text = " ".join(w for w in nltk.wordpunct_tokenize(text) if w.lower() in word)
    text = ' '.join([word for word in text.split() if len(word) > 2])
    return text
coll = complete_review.map(lambda x: clean(x))

## Keras Model Building

In [7]:
#Convert text into tokens with identitcal sequence length
max_seq_length = 100
vocab_size = 4000
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(coll)
sequences = tokenizer.texts_to_sequences(coll)
x_train = pad_sequences(sequences, maxlen=max_seq_length, padding='post', truncating='post')

In [8]:
y_vector = reviews['ReviewStar'] - 1
y_train = to_categorical(y_vector, num_classes=5)

In [9]:
#Create the model architecture
desired_embedding_size = 100
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=desired_embedding_size,
                    input_length = max_seq_length))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(5, activation='softmax'))
model.compile(optimizer='adagrad', loss='binary_crossentropy',
             metrics=['accuracy'])
model.fit(x_train, y_train, validation_split=0.75, epochs=10)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 3584 samples, validate on 10753 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x2574703fe48>

In [10]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          400000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               1280128   
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 165       
Total params: 1,690,629
Trainable params: 1,690,629
Non-trainable params: 0
____________________________________________