# Imports

In [11]:
import numpy as np 
import pandas as pd 
import bz2
import gc
import chardet
import re
import os
from google.colab import drive
from keras.models import Model, Sequential
from keras.layers import Dense, Embedding, Input, Conv1D, GlobalMaxPool1D, Dropout, concatenate, Layer, InputSpec, CuDNNLSTM
from keras.preprocessing import text, sequence
from keras.utils import pad_sequences
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from keras import activations, initializers, regularizers, constraints
from keras.utils.conv_utils import conv_output_length
from keras.regularizers import l2
from keras.constraints import maxnorm

# Configuring Kaggle API

In [12]:
# Mount Google Drive at /content/drive; the following shell cell copies the
# Kaggle API key (kaggle.json) from it.
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
%%bash
# Stop at the first failing command so a failed copy or download is not
# followed by misleading errors from the later steps.
set -euo pipefail

# Install the Kaggle API key directly into the folder where the API expects to
# find it, without leaving an intermediate copy in the working directory.
mkdir -p ~/.kaggle
cp "drive/MyDrive/Colab Notebooks/kaggle.json" ~/.kaggle/kaggle.json
chmod 600 ~/.kaggle/kaggle.json

# Amazon reviews dataset (train.ft.txt.bz2 / test.ft.txt.bz2).
kaggle datasets download -d bittlingmayer/amazonreviews
# -o: overwrite existing files without prompting, so the cell can be re-run.
unzip -o amazonreviews.zip
# Pre-trained GloVe Twitter 100-d vectors (glove.twitter.27B.100d.txt).
kaggle datasets download -d naominguyen7/glovetwitter100d
unzip -o glovetwitter100d.zip
# Output files of the reference kernel (early_weights.hdf5 and its log).
kaggle kernels output anshulrai/cudnnlstm-implementation-93-7-accuracy -p .

Downloading amazonreviews.zip to /content

Archive:  amazonreviews.zip
  inflating: test.ft.txt.bz2         
  inflating: train.ft.txt.bz2        
Downloading glovetwitter100d.zip to /content

Archive:  glovetwitter100d.zip
  inflating: glove.twitter.27B.100d.txt  
Output file downloaded to ./early_weights.hdf5
Kernel log downloaded to ./cudnnlstm-implementation-93-7-accuracy.log 


  0%|          | 0.00/493M [00:00<?, ?B/s]  0%|          | 1.00M/493M [00:00<04:18, 1.99MB/s]  1%|          | 3.00M/493M [00:00<01:32, 5.58MB/s]  1%|▏         | 7.00M/493M [00:00<00:38, 13.4MB/s]  2%|▏         | 10.0M/493M [00:00<00:31, 16.1MB/s]  3%|▎         | 15.0M/493M [00:01<00:20, 24.5MB/s]  4%|▍         | 20.0M/493M [00:01<00:15, 31.4MB/s]  5%|▍         | 24.0M/493M [00:01<00:15, 31.8MB/s]  6%|▌         | 28.0M/493M [00:01<00:14, 34.2MB/s]  7%|▋         | 33.0M/493M [00:01<00:12, 38.3MB/s]  8%|▊         | 38.0M/493M [00:01<00:11, 41.9MB/s]  9%|▊         | 43.0M/493M [00:01<00:12, 39.2MB/s] 10%|▉         | 48.0M/493M [00:01<00:11, 41.9MB/s] 11%|█         | 53.0M/493M [00:01<00:10, 44.5MB/s] 12%|█▏        | 58.0M/493M [00:02<00:11, 41.4MB/s] 13%|█▎        | 63.0M/493M [00:02<00:10, 43.7MB/s] 14%|█▍        | 68.0M/493M [00:02<00:10, 43.2MB/s] 15%|█▍        | 73.0M/493M [00:02<00:10, 42.6MB/s] 16%|█▌        | 78.0M/493M [00:02<00:09, 44.2MB/s] 17%|█▋        | 83.

In [14]:
# Open the bz2-compressed train and test files as binary streams
train_file, test_file = (
    bz2.BZ2File(archive_path)
    for archive_path in ('train.ft.txt.bz2', 'test.ft.txt.bz2')
)

In [15]:
# Read every line of both files into memory as raw bytes. The `with` block
# closes both handles as soon as reading finishes (or fails) instead of
# leaving them open until they are garbage-collected.
with train_file, test_file:
    train_file_lines = train_file.readlines()
    test_file_lines = test_file.readlines()

In [16]:
# Drop the file handle objects (not the files on disk); the lines read from
# them are already held in train_file_lines / test_file_lines.
del train_file, test_file

In the dataset, `__label__1` corresponds to 1- and 2-star reviews, and `__label__2` corresponds to 4- and 5-star reviews.

In [17]:
# Peek at the first raw record: a bytes line starting with the label prefix,
# followed by the review text and a trailing newline.
train_file_lines[0]

b'__label__2 Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^\n'

In [18]:
# Decode the raw byte lines into UTF-8 text so they can be handled as str
train_file_lines = [raw_line.decode('utf-8') for raw_line in train_file_lines]
test_file_lines = [raw_line.decode('utf-8') for raw_line in test_file_lines]

In [19]:
# Report how many reviews each split contains
print(
    f'train data length: {len(train_file_lines)}',
    f'test data length: {len(test_file_lines)}',
    sep='\n',
)

train data length: 3600000
test data length: 400000


In [20]:
# Getting training labels: '__label__1' -> 0, anything else -> 1.
# Split only once at the first space; splitting the whole review just to read
# the label prefix is wasted work on 3.6M lines.
train_labels = [0 if x.split(' ', 1)[0] == '__label__1' else 1 for x in train_file_lines]
# Strip only the trailing newline (rstrip instead of [:-1]) so that a line
# without a final '\n' does not lose its last character, then lowercase.
train_sentences = [x.split(' ', 1)[1].rstrip('\n').lower() for x in train_file_lines]

In [21]:
# Replace every digit in the training sentences with '0'.
# The pattern is a raw string: '\d' inside a normal string literal is an
# invalid escape sequence. It is compiled once instead of once per sentence.
digit_pattern = re.compile(r'\d')
# Update the list in place so a second full copy of the corpus is never held
# in memory.
for i, sentence in enumerate(train_sentences):
    train_sentences[i] = digit_pattern.sub('0', sentence)

In [22]:
# Test labels: '__label__1' -> 0, anything else -> 1.
# Split only once at the first space instead of tokenising the whole review.
test_labels = [0 if x.split(' ', 1)[0] == '__label__1' else 1 for x in test_file_lines]
# Strip only the trailing newline (rstrip instead of [:-1]) so that a line
# without a final '\n' does not lose its last character, then lowercase.
test_sentences = [x.split(' ', 1)[1].rstrip('\n').lower() for x in test_file_lines]

In [23]:
# Replace every digit in the test sentences with '0'.
# The pattern is a raw string: '\d' inside a normal string literal is an
# invalid escape sequence. It is compiled once instead of once per sentence.
digit_pattern = re.compile(r'\d')
# Update the list in place so a second full copy of the sentences is never
# held in memory.
for i, sentence in enumerate(test_sentences):
    test_sentences[i] = digit_pattern.sub('0', sentence)

## Tokenizing text

In [24]:
max_features = 20000  # vocabulary cap, passed to the tokenizer as num_words
maxlen = 100  # sequence length used when padding the tokenized reviews

# Build the word index from the training sentences only
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_sentences)

In [25]:
# Convert each training sentence into a sequence of integer word indices
tokenized_train = tokenizer.texts_to_sequences(train_sentences)

In [26]:
# Pad the tokenized training sequences to a common length of maxlen
X_train = pad_sequences(tokenized_train, maxlen=maxlen)

In [27]:
# Apply the tokenizer fitted on the training data to the test data, then pad
# the result to maxlen exactly like the training sequences.
tokenized_test = tokenizer.texts_to_sequences(test_sentences)
X_test = pad_sequences(tokenized_test, maxlen=maxlen)

In [28]:
'''
This code snippet maps tokens to their respective embedding
'''
EMBEDDING_FILE = 'glove.twitter.27B.100d.txt'
def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    Args:
        word: the token (first field of an embedding line).
        *arr: the remaining fields of the line, i.e. the vector components.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is ``arr`` converted to a
        float32 NumPy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

# Parse the embedding file into {token: float32 vector}. Each line is split on
# single spaces: the first field is the token, the rest are the components.
# The file is opened in a `with` block so the handle is closed once parsing
# finishes, and decoded explicitly as UTF-8 instead of relying on the
# platform's default encoding.
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    embeddings_index = dict(get_coefs(*line.rstrip().rsplit(' ')) for line in embedding_file)

In [29]:
# np.stack expects a real sequence; passing the dict view directly is
# deprecated, so materialise it as a list first.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Tokenizer indices start at 1 (row 0 is left for padding), so a vocabulary
# smaller than max_features needs len(word_index) + 1 rows to hold every word.
nb_words = min(max_features, len(word_index) + 1)
# Start from random vectors drawn with the same mean/std as the pre-trained
# embeddings; words missing from the embedding file keep these values.
# (Use np.zeros((nb_words, embed_size)) instead if computing the stats is too slow.)
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Guard on the actual matrix height (not max_features) so the row index
    # can never run past the end of embedding_matrix.
    if i >= nb_words: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

  if (await self.run_code(code, result,  async_=asy)):


In [30]:
# Cleaning memory: drop every large intermediate that later cells no longer
# use. This includes the decoded raw corpora (train_file_lines /
# test_file_lines), which are the biggest objects still alive at this point;
# only X_train, X_test, the label lists and embedding_matrix are needed for
# training and evaluation.
del tokenized_test, tokenized_train, tokenizer, train_sentences, test_sentences, word_index, embeddings_index, all_embs, nb_words
del train_file_lines, test_file_lines
gc.collect()

0

# Model training

In [34]:
batch_size = 128  # batch size used by both fit() and evaluate()
epochs = 7  # number of epochs passed to fit()
# Embedding dimension used by the model; it has to match the width of
# embedding_matrix (built from the 100-d GloVe file).
embed_size = 100

In [31]:
def cudnnlstm_model(conv_layers = 2, max_dilation_rate = 3):
    """Build and compile the Conv1D + CuDNNLSTM binary classifier.

    Architecture: embedding (initialised from ``embedding_matrix``, trainable)
    -> dropout -> two plain Conv1D layers -> three regularised Conv1D layers
    with strides 1, 1, 2 (256, 256, 512 filters) -> two parallel 512-unit
    CuDNNLSTM layers whose outputs are concatenated -> dropout -> Dense(64,
    relu) -> dropout -> Dense(1, sigmoid).

    Reads the module-level ``maxlen``, ``max_features``, ``embed_size`` and
    ``embedding_matrix``.

    Args:
        conv_layers: currently unused; kept so existing callers keep working.
        max_dilation_rate: currently unused; kept so existing callers keep working.

    Returns:
        A compiled Keras ``Model`` (binary cross-entropy loss, Adam optimizer,
        binary accuracy metric).
    """
    def _regularized():
        # Fresh regulariser/constraint objects per layer, exactly as if they
        # were spelled out inline on each layer.
        return dict(kernel_regularizer=l2(4e-6), bias_regularizer=l2(4e-6),
                    kernel_constraint=maxnorm(10), bias_constraint=maxnorm(10))

    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=True)(inp)
    x = Dropout(0.25)(x)
    x = Conv1D(2*embed_size, kernel_size = 3)(x)
    x = Conv1D(2*embed_size, kernel_size = 3)(x)
    for strides in [1, 1, 2]:
        # Filter count grows with the stride: 128 * 2**strides.
        x = Conv1D(128*2**(strides), strides = strides, kernel_size=3, **_regularized())(x)
    # NOTE(review): neither branch sets go_backwards, so despite the _f/_b
    # names both appear to read the sequence in the same direction - confirm
    # whether a true bidirectional pair was intended. Left unchanged so the
    # architecture stays the same for weights saved by this notebook.
    x_f = CuDNNLSTM(512, **_regularized())(x)
    x_b = CuDNNLSTM(512, **_regularized())(x)
    x = concatenate([x_f, x_b])
    x = Dropout(0.5)(x)
    x = Dense(64, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['binary_accuracy'])

    return model

In [32]:
# `cudnnlstm_model` names the builder function before this cell runs and the
# built model afterwards. Keep a separate handle on the builder so the cell can
# be re-run without re-executing the definition cell; later cells still find
# the model under `cudnnlstm_model`.
if not isinstance(cudnnlstm_model, Model):
    build_cudnnlstm_model = cudnnlstm_model
cudnnlstm_model = build_cudnnlstm_model()
cudnnlstm_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 100, 100)     2000000     ['input_1[0][0]']                
                                                                                                  
 dropout (Dropout)              (None, 100, 100)     0           ['embedding[0][0]']              
                                                                                                  
 conv1d (Conv1D)                (None, 98, 200)      60200       ['dropout[0][0]']                
                                                                                              

In [37]:
# Checkpoint target. This is the same filename the download cell fetched from
# the reference kernel, so a training run overwrites that file.
weight_path="early_weights.hdf5"
# Save weights only when the validation loss improves
checkpoint = ModelCheckpoint(
    weight_path,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
# Stop training after 5 epochs without a validation-loss improvement
early_stopping = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=5,
)
callbacks = [checkpoint, early_stopping]

In [None]:
# Train on the padded training sequences, holding out 20% for validation
cudnnlstm_model.fit(
    X_train,
    np.array(train_labels),
    batch_size=batch_size,
    epochs=epochs,
    shuffle = True,
    validation_split=0.20,
    callbacks=callbacks,
)

In [44]:
# Load the weights stored at weight_path, then report loss and accuracy on the
# test set
cudnnlstm_model.load_weights(weight_path)
score, acc = cudnnlstm_model.evaluate(
    X_test,
    np.array(test_labels),
    batch_size=batch_size,
)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.34738391637802124
Test accuracy: 0.8617975115776062
