In [1]:
# Working
# Implementation of https://tm3.ghost.io/2017/04/21/amazon-food-reviews-part-vi/

import time
from time import time
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.datasets import reuters
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import load_model

In [2]:
# Loading data
import pandas as pd
import os

# Cleaning data
from bs4 import BeautifulSoup

#apply the polarity score
import textblob
from textblob import TextBlob

# --- Configuration -----------------------------------------------------------
# toPickle = True  -> rebuild the cleaned/scored dataset from the raw CSV and
#                     write it to the pickle file below.
# toPickle = False -> skip the rebuild entirely (nothing in this block runs).
toPickle = False
inputFilePath = "/notebooks/data/amazon/"
outputFilePath = "/notebooks/output/amazon/"
inputFileName = "Review100.csv"
outputFileName = "amz_data.pkl"

if toPickle:
    filePath = inputFilePath + inputFileName
    data = pd.read_csv(filePath)
    print(data.dtypes)
    print('------------------------------')

    # Strip HTML markup from the raw review text.
    data['text_cln'] = data['Text'].map(lambda x: BeautifulSoup(x, "lxml").get_text())
    print(data['text_cln'][0:5])
    print('------------------------------')

    # Apply the TextBlob polarity score to each cleaned review text.
    data['tb_polarity'] = data['text_cln'].map(lambda x: TextBlob(x).sentiment.polarity)

    print(data['tb_polarity'][0:5])
    print('------------------------------')

    # Persist the result. to_pickle() does not create missing directories, so
    # make sure the target directory exists before writing; otherwise the whole
    # (potentially slow) rebuild above would be lost on the final step.
    picklePath = outputFilePath + outputFileName
    pickleDir = os.path.dirname(picklePath)
    if pickleDir:
        os.makedirs(pickleDir, exist_ok=True)
    data.to_pickle(picklePath)

In [3]:
# When the rebuild step is switched off, read the previously pickled frame back.
# NOTE(review): only unpickle files produced by this notebook; pickle can run code.
if not toPickle:
    data = pd.read_pickle("".join([outputFilePath, outputFileName]))



In [4]:
import datetime

# Normalize the review timestamp into a proper datetime column.
# 'Time' is treated as Unix epoch seconds. Converting it with pandas directly is
# vectorized and always interprets the value as UTC, whereas the previous
# datetime.fromtimestamp() + strftime() + re-parse round trip produced values
# that depended on the local timezone of whatever machine ran the kernel.
data2 = data.copy()
data2['datetime'] = pd.to_datetime(data2['Time'].astype('int64'), unit='s')


In [5]:
import sklearn.model_selection as sk

# Hold out 20% of the reviews for testing; the fixed random_state keeps the
# split reproducible between runs.
X_data = data2['text_cln']
y_target = data2['Score'].values
X_datatrain, X_datatest, y_train, y_test = sk.train_test_split(
    X_data,
    y_target,
    test_size=0.2,
    random_state=0,
)

In [6]:
# Show one cleaned review together with its score as a sanity check.
print(data2['text_cln'].iloc[10], data2['Score'].iloc[10], sep='\n')

I don't know if it's the cactus or the tequila or just the unique combination of ingredients, but the flavour of this hot sauce makes it one of a kind!  We picked up a bottle once on a trip we were on and brought it back home with us and were totally blown away!  When we realized that we simply couldn't find it anywhere in our city we were bummed.Now, because of the magic of the internet, we have a case of the sauce and are ecstatic because of it.If you love hot sauce..I mean really love hot sauce, but don't want a sauce that tastelessly burns your throat, grab a bottle of Tequila Picante Gourmet de Inclan.  Just realize that once you taste it, you will never want to use any other sauce.Thank you for the personal, incredible service!
5


In [14]:
import time
import numpy as np

# Encode every review as a fixed-length sequence of vocabulary ids.
# NOTE(review): tf.contrib is only available in TensorFlow 1.x - confirm the
# kernel's TensorFlow version before running this cell.
MAX_DOCUMENT_LENGTH = 800
learn = tf.contrib.learn
vocab_processor = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
start_time = time.time()  # shared reference point for the elapsed-time prints

# The vocabulary is fitted on the training split and then reused for the test split.
x_train = np.array([*vocab_processor.fit_transform(X_datatrain)])
print("--- %s seconds ---" % (time.time() - start_time))

x_test = np.array([*vocab_processor.transform(X_datatest)])
print("--- %s seconds ---" % (time.time() - start_time))

n_words = len(vocab_processor.vocabulary_)
print("--- %s seconds ---" % (time.time() - start_time))

print('Total words: %d' % n_words)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

# Confirm that the second dimension matches MAX_DOCUMENT_LENGTH (800).
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)


--- 0.018402814865112305 seconds ---
--- 0.02248692512512207 seconds ---
--- 0.022881269454956055 seconds ---
Total words: 1560
79 train sequences
20 test sequences
x_train shape: (79, 800)
x_test shape: (20, 800)
[[ 70 109   4 ...   0   0   0]
 [201 245 227 ...   0   0   0]
 [133   4   5 ...   0   0   0]
 ...
 [ 52  54   0 ...   0   0   0]
 [133   4   5 ...   0   0   0]
 [  0 259  90 ...   0   0   0]]


In [8]:
# Training hyper-parameters.
# An epoch is one full pass over the training data; this run makes 5 passes.
epochs = 5
# Number of samples per mini-batch (batches of 32 samples, not 32 batches).
batch_size = 32
# Length of each encoded review, reused from the vocabulary-processing step.
max_words = MAX_DOCUMENT_LENGTH

In [9]:
# Number of target classes. The scores are used directly as class indices, so
# the one-hot width is (largest label + 1). It is taken from the full label
# vector (y_target) rather than from y_train alone, so a label that happens to
# occur only in the test split cannot fall outside the one-hot matrix.
num_classes = int(np.max(y_target)) + 1
print(num_classes, 'classes')

print('Convert class vector to binary class matrix '
  '(for use with categorical_crossentropy)')
# Encode only while the labels are still a 1-D class vector. This keeps the cell
# safe to re-run: encoding an already one-hot matrix a second time would
# silently produce a wrongly shaped target.
if np.ndim(y_train) == 1:
    y_train = keras.utils.to_categorical(y_train, num_classes)
if np.ndim(y_test) == 1:
    y_test = keras.utils.to_categorical(y_test, num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

6 classes
Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (79, 6)
y_test shape: (20, 6)


In [10]:
print('Building model...')
# Feed-forward classifier: Dense(512) -> ReLU -> Dropout(0.5) -> Dense(num_classes) -> softmax.
model = Sequential([
    Dense(512, input_shape=(max_words,)),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax'),
])

Building model...
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [11]:
# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
print("--- %s seconds ---" % (time.time() - start_time))

# Train on the training split, holding out 10% of it for validation.
# The elapsed-time prints are measured from start_time, not from the start of fit().
# (Original author's note: timed at around 8 minutes.)
history = model.fit(
    x_train,
    y_train,
    validation_split=0.1,
    verbose=1,
    epochs=epochs,
    batch_size=batch_size,
)
print("--- %s seconds ---" % (time.time() - start_time))

--- 0.3277416229248047 seconds ---
Train on 71 samples, validate on 8 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
--- 1.516113519668579 seconds ---


In [12]:
# Evaluate on the held-out test split; score holds [loss, accuracy].
score = model.evaluate(x_test, y_test, verbose=1, batch_size=batch_size)
print("--- %s seconds ---" % (time.time() - start_time))

for label, value in zip(('Test score:', 'Test accuracy:'), score):
    print(label, value)

--- 1.544198751449585 seconds ---
Test score: 11.468515396118164
Test accuracy: 0.2


In [13]:
# Run inference once and derive both views from the same result.
# The previous version called predict_classes() and discarded its return value
# (only the last expression of a cell is displayed), then ran inference a second
# time through predict_proba(). Both helpers are deprecated Sequential-only
# wrappers around predict(); using predict() plus argmax is equivalent for a
# softmax output and keeps the class predictions available in a variable.
predicted_proba = model.predict(x_test, batch_size=batch_size, verbose=1)
predicted_classes = np.argmax(predicted_proba, axis=-1)
predicted_proba




array([[0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        2.0872548e-33, 0.0000000e+00],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        9.9709904e-01, 2.9009460e-03],
       [0.0000000e+00, 9.7543567e-22, 0.0000000e+00, 1.0000000e+00,
        3.3033477e-27, 0.0000000e+00],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.7725798e-25,
        0.0000000e+00, 0.0000000e+00],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
        1.0000000e+00, 2.7891183e-09],
       [0.0000000e+00, 1.0000000e+00,