# In this project we are going to use an LSTM on Amazon reviews to identify emotions in a sentence and determine their polarity.

# Libraries

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import re
import keras
from keras import Model
from tensorflow.keras.layers import Flatten, LSTM, Dense, Flatten, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from keras_preprocessing.text import Tokenizer
from keras.initializers import glorot_uniform
from sklearn import model_selection

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Load the data

In [2]:
#read in data

with open('./amazon_review_polarity_csv/train.csv', 'r', encoding='utf-8-sig') as file:
    text = file.readlines()

In [3]:
#create empty dataframe

x_train = pd.DataFrame()

In [4]:
#fill in dataframe

word = []
label = []
for n in text:
    n = n.split()
    label.append(1) if n[0] == "__label__2" else label.append(0)
    word.append(" ".join(n[1:]))
x_train['consumer_review'] = word
x_train['polarity_label'] = label

#view dataframe

x_train

Unnamed: 0,consumer_review,polarity_label
0,"even for the non-gamer"",""This sound track was ...",0
1,"best soundtrack ever to anything."",""I'm readin...",0
2,"soundtrack is my favorite music of all time, h...",0
3,"Soundtrack"",""I truly like this soundtrack and ...",0
4,"Pull Your Jaw Off The Floor After Hearing it"",...",0
...,...,...
3599995,"do it!!"",""The high chair looks great when it f...",0
3599996,"nice, low functionality"",""I have used this hig...",0
3599997,"but hard to clean"",""We have a small house, and...",0
3599998,"is it saying?"",""not sure what this book is sup...",0


# Prepare the data

In [5]:
_, x_set, _, y_set = model_selection.train_test_split(x_train['consumer_review'], x_train['polarity_label'], test_size = 0.02)

In [6]:
#data cleaning function
def data_prep(in_text):
    # remove puctuations and numbers
    out_text = re.sub('[^a-zA-Z]', ' ', in_text)
    # convert upper case to lower case
    out_text = "".join(list(map(lambda x: x.lower(), out_text)))
    # remove single character
    out_text = re.sub(r"\s+[a-zA-Z]\s+", ' ', out_text)
    return out_text

In [7]:
#create new list with clean data

text_set = []
for reviews in list(x_set):
    text_set.append(data_prep(reviews))

In [8]:
x_train = pd.DataFrame()
x_train['consumer_review'] = text_set
x_train['polarity_label'] = list(y_set)

In [10]:
#split data into 70% train and 30% test

x_train, x_test, y_train, y_test = model_selection.train_test_split(x_train['consumer_review'], x_train['polarity_label'], test_size = 0.30)

In [11]:
#convert to array

x_train = np.array(x_train.values.tolist())
x_test = np.array(x_test.values.tolist())
y_train = np.array(y_train.values.tolist())
y_test = np.array(y_test.values.tolist())

In [19]:
#tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)
word_index = tokenizer.word_index
total_size = len(word_index) + 1

print(total_size)

76613


In [21]:
# text to sequance

x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

In [24]:
# add padding to ensure the same length

max_length = 100
x_train = pad_sequences(x_train, padding = 'post', maxlen = max_length)
x_test = pad_sequences(x_test, padding = 'post', maxlen = max_length)

# Structure the model

## Structure the model using a Keras embedding layer, an LSTM layer and a dense layer

In [25]:
# Create model

model = Sequential()
model.add(Embedding(total_size, 20, input_length = max_length))
model.add(LSTM(32, dropout = 0.2, recurrent_dropout = 0.2))
model.add(Dense(1, activation = 'sigmoid'))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


# Compile the model

In [26]:
# compile

model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['acc'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 20)           1532260   
_________________________________________________________________
lstm (LSTM)                  (None, 32)                6784      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 1,539,077
Trainable params: 1,539,077
Non-trainable params: 0
_________________________________________________________________
None


# Train the model

In [27]:
model.fit(x_train, y_train, batch_size = 128, epochs = 5, verbose = 1, validation_data = (x_test, y_test))

Train on 50400 samples, validate on 21600 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x205104bcba8>

# Save the model

In [28]:
model.save('model.h5')