In [48]:
from distutils.version import LooseVersion
import warnings
import tensorflow as tf

# Check TensorFlow Version
assert LooseVersion(tf.__version__) >= LooseVersion('1.0'), 'Please use TensorFlow version 1.0 or newer.  You are using {}'.format(tf.__version__)
print('TensorFlow Version: {}'.format(tf.__version__))

# Check for a GPU
if not tf.test.gpu_device_name():
    warnings.warn('No GPU found. Please ensure you have installed TensorFlow correctly')
else:
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))

TensorFlow Version: 2.0.0
Default GPU Device: /device:GPU:0


In [49]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


from sklearn import metrics as mt
from matplotlib import pyplot as plt
%matplotlib inline

import pickle
import numpy as np
import pandas as pd

# graph visualization
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

keras.__version__

'2.3.1'

In [50]:
data = pd.read_csv("reviews.csv")
data

Unnamed: 0,ReviewTitle,ReviewBody,ReviewStar,Product
0,Honest review of an edm music lover\n,No doubt it has a great bass and to a great ex...,3,boAt Rockerz 255
1,Unreliable earphones with high cost\n,"This earphones are unreliable, i bought it be...",1,boAt Rockerz 255
2,Really good and durable.\n,"i bought itfor 999,I purchased it second time,...",4,boAt Rockerz 255
3,stopped working in just 14 days\n,Its sound quality is adorable. overall it was ...,1,boAt Rockerz 255
4,Just Awesome Wireless Headphone under 1000...ðŸ˜‰\n,Its Awesome... Good sound quality & 8-9 hrs ba...,5,boAt Rockerz 255
...,...,...,...,...
14332,Good\n,Good\n,4,JBL T110BT
14333,Amazing Product\n,An amazing product but a bit costly.\n,5,JBL T110BT
14334,Not bad\n,Sound\n,1,JBL T110BT
14335,a good product\n,the sound is good battery life is good but the...,5,JBL T110BT


In [51]:
data_title = data["ReviewTitle"]
data_body = data["ReviewBody"]
y = data["ReviewStar"]

In [52]:
X_data = data["ReviewTitle"].map(str) + " " + data["ReviewBody"]
X_data

0        Honest review of an edm music lover\n No doubt...
1        Unreliable earphones with high cost\n This  ea...
2        Really good and durable.\n i bought itfor 999,...
3        stopped working in just 14 days\n Its sound qu...
4        Just Awesome Wireless Headphone under 1000...ðŸ˜‰...
                               ...                        
14332                                        Good\n Good\n
14333    Amazing Product\n An amazing product but a bit...
14334                                    Not bad\n Sound\n
14335    a good product\n the sound is good battery lif...
14336    Average headphones , n overrated name\n M writ...
Length: 14337, dtype: object

In [53]:
%%time
NUM_TOP_WORDS = None
MAX_ART_LEN = 1000 # maximum and minimum number of words
NUM_CLASSES = 5

tokenizer = Tokenizer(num_words=NUM_TOP_WORDS)
tokenizer.fit_on_texts(X_data)
sequences = tokenizer.texts_to_sequences(X_data)

word_index = tokenizer.word_index
NUM_TOP_WORDS = len(word_index) if NUM_TOP_WORDS==None else NUM_TOP_WORDS
top_words = min((len(word_index),NUM_TOP_WORDS))
print('Found %s unique tokens. Distilled to %d top words.' % (len(word_index),top_words))

# X = pad_sequences(sequences, maxlen=MAX_ART_LEN)
X = pad_sequences(sequences)


y_ohe = keras.utils.to_categorical(y)
print('Shape of data tensor:', X.shape)
print('Shape of label tensor:', y_ohe.shape)
print(np.max(X))

Found 12156 unique tokens. Distilled to 12156 top words.
Shape of data tensor: (14337, 927)
Shape of label tensor: (14337, 6)
12156
Wall time: 650 ms


In [59]:
X

array([[   0,    0,    0, ...,  149,   37,  133],
       [   0,    0,    0, ..., 1634,    9,  301],
       [   0,    0,    0, ...,  150,   31,   45],
       ...,
       [   0,    0,    0, ...,   11,   70,    6],
       [   0,    0,    0, ...,   18,    3,    9],
       [   0,    0,    0, ...,   82,  298,  208]])

In [54]:
from sklearn.model_selection import train_test_split
# Split it into train / test subsets
X_train, X_test, y_train, y_test = train_test_split(X, y_ohe, test_size=0.2,
                                                            stratify=y, 
                                                            random_state=42)

# get rid of empty column at begginning
y_train = y_train[:,1:]
y_test = y_test[:,1:]

print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)
print(np.sum(y_train,axis=0))
print(np.sum(y_test,axis=0))

(11469, 927) (11469, 5)
(2868, 927) (2868, 5)
[1994.  751. 1203. 2551. 4970.]
[ 499.  188.  300.  638. 1243.]


In [55]:
from keras.models import Sequential, Input, Model
from keras.layers import Dense
from keras.layers import SimpleRNN
from keras.layers.embeddings import Embedding

max_review_length = 927
EMBED_SIZE = 50
input_holder = Input(shape=(X_train.shape[1], ))
input_embed = Embedding((np.max(X_train)+1), # input dimension (max int of OHE)
                EMBED_SIZE, # output dimension size
                input_length=max_review_length)(input_holder) # number of words in each sequence


x = SimpleRNN(25,dropout=0.2, recurrent_dropout=0.2)(input_embed)
x = Dense(NUM_CLASSES, activation='sigmoid')(x)
rnn=Model(inputs=input_holder,outputs=x)
rnn.compile(loss='binary_crossentropy', 
            optimizer='rmsprop', 
            metrics=['accuracy'])

rnn.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 927)               0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 927, 50)           607850    
_________________________________________________________________
simple_rnn_3 (SimpleRNN)     (None, 25)                1900      
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 130       
Total params: 609,880
Trainable params: 609,880
Non-trainable params: 0
_________________________________________________________________


In [56]:
rnn.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 11469 samples, validate on 2868 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x1d44f38edc8>

In [57]:
rnn.fit(X_train, y_train, 
        epochs=15, 
        batch_size=64, 
        validation_data=(X_test, y_test), 
        callbacks=[EarlyStopping(monitor='val_loss', patience=2)]
       )

Train on 11469 samples, validate on 2868 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.callbacks.History at 0x1d44f8c3a48>

In [58]:
rnn.save_weights('model_1_weights.h5')