In [16]:
import numpy as np
import tensorflow as tf
from string import punctuation
from collections import Counter
from sklearn.metrics import accuracy_score
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, Activation, Dropout, Embedding, LSTM, Bidirectional, CuDNNLSTM, CuDNNGRU
from keras.wrappers.scikit_learn import KerasClassifier

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [2]:
# Read both raw text files fully into memory as single strings; they are
# split into individual reviews / labels by the cells that follow.
with open('./reviews.txt', 'r') as reviews_file, open('./labels.txt', 'r') as labels_file:
    reviews = reviews_file.read()
    labels = labels_file.read()

In [3]:
# Preprocessing
# Drop every ASCII punctuation character in one pass. str.translate does the
# filtering without building a per-character Python list, and it fails loudly
# (AttributeError) if this cell is re-run after `reviews` became a list.
all_text = reviews.translate(str.maketrans('', '', punctuation))
# Each line of the cleaned text is one review.
reviews = all_text.split('\n')
# Flatten all reviews into a single list of words for the vocabulary count.
all_text = ' '.join(reviews)
words = all_text.split()

In [4]:
# Encoding Words
# Frequency of every word across the whole corpus.
counts = Counter(words)
# Vocabulary ordered from the most frequent word to the least frequent one.
vocab = [word for word, _ in counts.most_common()]

# Map each vocab word to an integer id. Ids start at 1 so that 0 is never a
# real word (0 is used as the padding value when the features are built).
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))

# Integer-encode the reviews: one list of word ids per entry of `reviews`.
review_ints = [[vocab_to_int[word] for word in review.split()] for review in reviews]

In [5]:
# Encoding labels
# One label per line; 'positive' becomes 1 and every other value
# (including 'negative') becomes 0.
labels = [int(label == 'positive') for label in labels.split('\n')]

In [6]:
# Histogram of review lengths: number of word ids -> number of reviews.
review_lens = Counter(map(len, review_ints))
# A Counter returns 0 for a missing key, so this is safe with no empty reviews.
print("Zero-length reviews: {}".format(review_lens[0]))
# The Counter's keys are the lengths, so max() over it is the longest review.
print("Maximum review length: {}".format(max(review_lens)))

Zero-length reviews: 1
Maximum review length: 2514


In [7]:
# Removing zero length reviews
# Select the positions of the non-empty reviews and apply the same selection
# to both sequences so that reviews and labels stay aligned. Compared with
# deleting "the first empty review", this handles zero, one or several empty
# reviews, and the cell can be re-run safely (it also works once `labels`
# has already been converted to a NumPy array).
assert len(review_ints) == len(labels), "reviews and labels are misaligned"
non_empty_idx = [i for i, review in enumerate(review_ints) if len(review) > 0]
review_ints = [review_ints[i] for i in non_empty_idx]
labels = np.array([labels[i] for i in non_empty_idx])

In [8]:
# Build one fixed-width row of word ids per review:
#   - reviews shorter than seq_len words are left-padded with 0s
#   - reviews longer than seq_len words keep only their first seq_len words

seq_len = 200
# `[0] * n` is an empty list when n <= 0, so long reviews get no padding and
# are simply cut by the trailing slice; short ones come out exactly seq_len long.
features = np.asarray([
    ([0] * (seq_len - len(review)) + review)[:seq_len]
    for review in review_ints
])

In [9]:
# Sanity check: both arrays must have the same number of rows.
print(features.shape, labels.shape, sep='\n')

(25000, 200)
(25000,)


In [10]:
# Sequential train / validation / test partition (no shuffling): the first
# split_frac of the rows is the training set and the remaining rows are cut
# in half into a validation set and a test set.
split_frac = 0.8
split_idx = int(len(features) * split_frac)
# Number of held-out rows that go to validation; the rest go to test.
test_idx = int((len(features) - split_idx) * 0.5)
val_end = split_idx + test_idx

train_x, train_y = features[:split_idx], labels[:split_idx]
val_x, val_y = features[split_idx:val_end], labels[split_idx:val_end]
test_x, test_y = features[val_end:], labels[val_end:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(20000, 200) 
Validation set: 	(2500, 200) 
Test set: 		(2500, 200)


In [11]:
# Disabling the GPU (optional): RNN training can run slower on the GPU than on the CPU because of the extra overhead the GPU incurs.
# tf.config.experimental.set_visible_devices([], 'GPU')

## Vanilla RNN

In [12]:
def vanilla_rnn():
    """Build and compile a single-layer SimpleRNN binary classifier.

    The model reads sequences of 200 time steps with one feature per step
    and ends in a single sigmoid unit, like the other models in this notebook.

    Returns:
        A compiled ``Sequential`` model trained with binary cross-entropy
        and reporting accuracy.
    """
    model = Sequential()
    model.add(SimpleRNN(50, input_shape = (200,1), return_sequences = False))
    # The targets are binary (0/1), so the head is one sigmoid unit paired
    # with binary cross-entropy rather than a 46-way output.
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    # `learning_rate` is the supported argument name; `lr` is deprecated.
    adam = tf.optimizers.Adam(learning_rate = 0.001)
    model.compile(loss = 'binary_crossentropy', optimizer = adam, metrics = ['accuracy'])

    return model

# Train through the scikit-learn style wrapper; the validation split is passed
# to fit so that validation loss/accuracy are reported after every epoch.
model = KerasClassifier(build_fn = vanilla_rnn, epochs = 10, batch_size = 50, verbose = 1)
model.fit(train_x, train_y, validation_data = (val_x, val_y))
# Final score on the test split; accuracy_score expects (y_true, y_pred).
y_pred = model.predict(test_x)
print('Accuracy : ', accuracy_score(test_y, y_pred))

## Stacked Vanilla RNN

In [13]:
def stacked_vanilla_rnn():
    """Build and compile a two-layer (stacked) SimpleRNN binary classifier.

    The model reads sequences of 200 time steps with one feature per step
    and ends in a single sigmoid unit, like the other models in this notebook.

    Returns:
        A compiled ``Sequential`` model trained with binary cross-entropy
        and reporting accuracy.
    """
    model = Sequential()
    # The first recurrent layer must return the full sequence so that the
    # second recurrent layer receives one input per time step.
    model.add(SimpleRNN(50, input_shape = (200,1), return_sequences = True))
    model.add(SimpleRNN(50, return_sequences = False))
    # The targets are binary (0/1), so the head is one sigmoid unit paired
    # with binary cross-entropy rather than a 46-way softmax.
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    # `learning_rate` is the supported argument name; `lr` is deprecated.
    adam = tf.optimizers.Adam(learning_rate = 0.001)
    model.compile(loss = 'binary_crossentropy', optimizer = adam, metrics = ['accuracy'])

    return model

# Train through the scikit-learn style wrapper; the validation split is passed
# to fit so that validation loss/accuracy are reported after every epoch.
model = KerasClassifier(build_fn = stacked_vanilla_rnn, epochs = 10, batch_size = 50, verbose = 1)
model.fit(train_x, train_y, validation_data = (val_x, val_y))
# Final score on the test split; accuracy_score expects (y_true, y_pred).
y_pred = model.predict(test_x)
print('Accuracy : ', accuracy_score(test_y, y_pred))

## LSTM

In [14]:
def lstm():
    """Build and compile an Embedding -> LSTM binary classifier.

    Word ids are embedded into 128-dimensional vectors, passed through a
    128-unit LSTM with dropout before and after, and mapped to one sigmoid
    output.

    Returns:
        A compiled ``Sequential`` model trained with binary cross-entropy
        and reporting accuracy.
    """
    # Word ids run from 1 to len(vocab_to_int) and 0 is the padding id, so the
    # embedding table needs len(vocab_to_int) + 1 rows. Deriving the size from
    # the vocabulary keeps the layer valid for any corpus and avoids
    # allocating rows that can never be looked up.
    vocab_size = len(vocab_to_int) + 1

    model = Sequential()
    model.add(Embedding(vocab_size, 128))
    model.add(Dropout(0.3))
    model.add(LSTM(128))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))

    # `learning_rate` is the supported argument name; `lr` is deprecated.
    adam = tf.optimizers.Adam(learning_rate = 0.001)
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

    return model

# Train through the scikit-learn style wrapper; the validation split is passed
# to fit so that validation loss/accuracy are reported after every epoch.
model = KerasClassifier(build_fn = lstm, epochs = 10, batch_size = 50, verbose = 1)
model.fit(train_x, train_y, validation_data = (val_x, val_y))
# Final score on the test split; accuracy_score expects (y_true, y_pred).
y_pred = model.predict(test_x)
print('Accuracy : ', accuracy_score(test_y, y_pred))

Metal device set to: Apple M1


2022-03-18 15:45:13.186765: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-03-18 15:45:13.186851: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
  super(Adam, self).__init__(name, **kwargs)


Epoch 1/10


2022-03-18 15:45:13.398367: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-03-18 15:45:14.020522: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-18 15:45:14.161568: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-18 15:45:14.497372: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


2022-03-18 15:52:24.782493: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-18 15:52:24.820446: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Accuracy :  0.7904


## CuDNN LSTM

In [15]:
def CuDNN_LSTM():
    """Build and compile an Embedding -> CuDNNLSTM binary classifier.

    Same layout as the plain LSTM model in this notebook, with the recurrent
    layer swapped for ``CuDNNLSTM``.

    Returns:
        A compiled ``Sequential`` model trained with binary cross-entropy
        and reporting accuracy.
    """
    # Word ids run from 1 to len(vocab_to_int) and 0 is the padding id, so the
    # embedding table needs len(vocab_to_int) + 1 rows. Deriving the size from
    # the vocabulary keeps the layer valid for any corpus and avoids
    # allocating rows that can never be looked up.
    vocab_size = len(vocab_to_int) + 1

    model = Sequential()
    model.add(Embedding(vocab_size, 128))
    model.add(Dropout(0.3))
    model.add(CuDNNLSTM(128))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))

    # `learning_rate` is the supported argument name; `lr` is deprecated.
    adam = tf.optimizers.Adam(learning_rate = 0.001)
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

    return model

# Train through the scikit-learn style wrapper; the validation split is passed
# to fit so that validation loss/accuracy are reported after every epoch.
model = KerasClassifier(build_fn = CuDNN_LSTM, epochs = 10, batch_size = 50, verbose = 1)
model.fit(train_x, train_y, validation_data = (val_x, val_y))
# Final score on the test split; accuracy_score expects (y_true, y_pred).
y_pred = model.predict(test_x)
print('Accuracy : ', accuracy_score(test_y, y_pred))

Epoch 1/10


  super(Adam, self).__init__(name, **kwargs)
2022-03-18 15:52:27.543435: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


2022-03-18 15:59:39.405851: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Accuracy :  0.7712


## CuDNN GRU

In [17]:
def CuDNN_GRU():
    """Build and compile an Embedding -> CuDNNGRU binary classifier.

    Same layout as the LSTM models in this notebook, with the recurrent
    layer swapped for ``CuDNNGRU``.

    Returns:
        A compiled ``Sequential`` model trained with binary cross-entropy
        and reporting accuracy.
    """
    # Word ids run from 1 to len(vocab_to_int) and 0 is the padding id, so the
    # embedding table needs len(vocab_to_int) + 1 rows. Deriving the size from
    # the vocabulary keeps the layer valid for any corpus and avoids
    # allocating rows that can never be looked up.
    vocab_size = len(vocab_to_int) + 1

    model = Sequential()
    model.add(Embedding(vocab_size, 128))
    model.add(Dropout(0.3))
    model.add(CuDNNGRU(128))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))

    # `learning_rate` is the supported argument name; `lr` is deprecated.
    adam = tf.optimizers.Adam(learning_rate = 0.001)
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

    return model

# Train through the scikit-learn style wrapper; the validation split is passed
# to fit so that validation loss/accuracy are reported after every epoch.
model = KerasClassifier(build_fn = CuDNN_GRU, epochs = 10, batch_size = 50, verbose = 1)
model.fit(train_x, train_y, validation_data = (val_x, val_y))
# Final score on the test split; accuracy_score expects (y_true, y_pred).
y_pred = model.predict(test_x)
print('Accuracy : ', accuracy_score(test_y, y_pred))

Epoch 1/10


  super(Adam, self).__init__(name, **kwargs)
2022-03-18 16:04:03.543756: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


2022-03-18 16:11:46.545658: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Accuracy :  0.7844


## Bidirectional CuDNN LSTM

In [18]:
def Bidirectional_CuDNN_LSTM():
    """Build and compile an Embedding -> Bidirectional(CuDNNLSTM) classifier.

    Same layout as the CuDNN LSTM model in this notebook, with the recurrent
    layer wrapped in ``Bidirectional``.

    Returns:
        A compiled ``Sequential`` model trained with binary cross-entropy
        and reporting accuracy.
    """
    # Word ids run from 1 to len(vocab_to_int) and 0 is the padding id, so the
    # embedding table needs len(vocab_to_int) + 1 rows. Deriving the size from
    # the vocabulary keeps the layer valid for any corpus and avoids
    # allocating rows that can never be looked up.
    vocab_size = len(vocab_to_int) + 1

    model = Sequential()
    model.add(Embedding(vocab_size, 128))
    model.add(Dropout(0.3))
    model.add(Bidirectional(CuDNNLSTM(128)))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))

    # `learning_rate` is the supported argument name; `lr` is deprecated.
    adam = tf.optimizers.Adam(learning_rate = 0.001)
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

    return model

# Train through the scikit-learn style wrapper; the validation split is passed
# to fit so that validation loss/accuracy are reported after every epoch.
model = KerasClassifier(build_fn = Bidirectional_CuDNN_LSTM, epochs = 10, batch_size = 50, verbose = 1)
model.fit(train_x, train_y, validation_data = (val_x, val_y))
# Final score on the test split; accuracy_score expects (y_true, y_pred).
y_pred = model.predict(test_x)
print('Accuracy : ', accuracy_score(test_y, y_pred))

Epoch 1/10


  super(Adam, self).__init__(name, **kwargs)
2022-03-18 16:15:50.029647: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


2022-03-18 16:25:48.962204: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Accuracy :  0.764
