<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
# Load the first packages we'll need
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Load the review text and target dataset from the copy stored next to
# this notebook
local_csv = 'data_clean.csv'
data_clean = pd.read_csv(local_csv)

In [None]:
# Alternative data source: mount Google Drive so the dataset can be read
# from it (requires the google.colab package)
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

In [None]:
# Read in dataset from Drive when it is mounted; otherwise keep the frame
# already loaded from the local copy, so both loading routes share the same
# preprocessing below.
import os

drive_csv = "/content/drive/My Drive/helpful-reviews/data_clean.csv"
if os.path.exists(drive_csv):
    data_clean = pd.read_csv(drive_csv)
elif 'data_clean' not in globals():
    raise FileNotFoundError(
        "Dataset not found on Drive and no local copy has been loaded: "
        + drive_csv)

# Drop the 'Unnamed: 0' column (presumably a saved index -- TODO confirm).
# errors='ignore' keeps this working for a CSV that does not contain it, and
# the non-in-place form leaves any previously displayed frame untouched.
data_clean = data_clean.drop(columns='Unnamed: 0', errors='ignore')

# Create boolean column `helpful_1`: 1 when `helpful` is positive, else 0
data_clean['helpful_1'] = np.where(data_clean['helpful'] > 0, 1, 0)
data_clean.head()

In [None]:
# Work out how many rows go to each part of an 80% / 10% / 10%
# train / validation / test split (the test set reuses the validation size)
all_length = data_clean.shape[0]
train_len, val_len = (round(share * all_length) for share in (0.8, 0.1))

for label, size in (('Train set length:', train_len),
                    ('Validation set length:', val_len)):
    print(label, size)

In [None]:
# Train-val-test split: two successive random splits, each holding out
# `val_len` rows (first the test set, then the validation set)
from sklearn.model_selection import train_test_split

X = data_clean['text']
y = data_clean['helpful_1']

# Same hold-out size and seed for both splits
split_opts = dict(test_size=val_len, random_state=123)

# 1) carve the test set out of the full data
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_opts)
# 2) carve the validation set out of what remains
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                  **split_opts)

In [None]:
# Assemble the list of tokens to be removed: NLTK's English stopwords,
# every ASCII punctuation character, and a few markup/URL fragments
import string

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

extra_tokens = ['br', '.<', '..', '...', '``', "''", '--', 'http', 'https',
                'com', 'www']
stop_list = [*stopwords.words('english'), *string.punctuation, *extra_tokens]

In [None]:
# Import needed packages/classes
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras import callbacks
from keras.preprocessing import text, sequence
import tensorflow as tf

In [None]:
# Fit the vocabulary on the training reviews only, then encode every split
# as integer sequences of one uniform length (~ 1 min.)
tokenizer = text.Tokenizer(num_words=20000)
tokenizer.fit_on_texts(list(X_train))

# Reviews -> lists of token ids
X_train_tok, X_val_tok, X_test_tok = (
    tokenizer.texts_to_sequences(reviews)
    for reviews in (X_train, X_val, X_test)
)

# Token-id lists -> arrays with exactly 1000 ids per review
X_train_tok_pad, X_val_tok_pad, X_test_tok_pad = (
    sequence.pad_sequences(token_ids, maxlen=1000)
    for token_ids in (X_train_tok, X_val_tok, X_test_tok)
)

In [None]:
# Build and train a GRU network for binary classification
#
# Architecture: embedding -> 2 stacked GRUs (full sequences) -> max over time
# -> two dropout-regularised dense layers -> single sigmoid output unit.

# Take the input geometry from the tokenization step so the network always
# matches the arrays it is trained on.
max_len = X_train_tok_pad.shape[1]   # padded sequence length
vocab_size = tokenizer.num_words     # texts_to_sequences only emits ids below this
embedding_size = 128
n_epochs = 50

# Save the model each time validation accuracy improves. The monitored key
# must match the metric name passed to compile() below ('acc' -> 'val_acc').
# tf.keras callbacks are used because the model itself is a tf.keras model.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    '/content/drive/My Drive/helpful-reviews/gru_model_embed_50.h5',
    monitor='val_acc',
    save_best_only=True)
# No early stopping is used, so training always runs for all `n_epochs`.

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_size,
                                    input_shape=(max_len,)))
model.add(tf.keras.layers.GRU(25, return_sequences=True))
model.add(tf.keras.layers.GRU(25, return_sequences=True))
model.add(tf.keras.layers.GlobalMaxPool1D())
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(50, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(50, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
# Sigmoid keeps the output in (0, 1), as binary cross-entropy expects a
# probability; a ReLU output is unbounded above and can be exactly 0.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Naming the metric 'acc' fixes the history/log keys to 'acc' and 'val_acc',
# which is what the checkpoint monitors.
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['acc'])
history = model.fit(X_train_tok_pad, y_train, epochs=n_epochs,
                    batch_size=2048,
                    validation_data=(X_val_tok_pad, y_val),
                    callbacks=[checkpoint])

In [None]:
# Visualize loss and accuracy over training epochs
hist = history.history

# Keras logs accuracy under 'acc' or 'accuracy' depending on the version and
# on the metric name given to compile(); accept either spelling.
acc_key = 'acc' if 'acc' in hist else 'accuracy'

# Use the number of epochs actually recorded rather than assuming 50, so the
# plot still works if training is shortened, extended or stopped early.
epochs_run = len(hist['loss'])
x = list(range(1, epochs_run + 1))

plt.figure(figsize=(12, 10))
plt.plot(x, hist[acc_key], label='Train Accuracy')
plt.plot(x, hist['loss'], label='Train Loss')
plt.plot(x, hist['val_' + acc_key], label='Val. Accuracy')
plt.plot(x, hist['val_loss'], label='Val. Loss')
plt.title(f'Model performance over {epochs_run} training epochs')
plt.xlabel('Epochs')
plt.legend()
plt.show();