### Template for NLP project

The aim of the project is to achieve the following:
 - Train a neural network that is **at least better than random guessing** on your dataset. The template contains the IMDB dataset for sentiment analysis, however, you can choose any other language related data set with the appropriate NLP task.
 - Investigate different neural network architectures (different hyperparameters, different layers, different pre-processing). Explain in the presentation, why the final network was selected! **Do not rely on black-box mechanisms.**
 

In [96]:
# tensorflow modules
import tensorflow as tf
#from tensorflow.keras.datasets import imdb
#from tensorflow.keras.preprocessing import sequence
#from tensorflow.keras import Sequential
#from tensorflow.keras.layers import Embedding, Dense, SimpleRNN, LayerNormalization
import matplotlib.pyplot as plt
import numpy as np

# if you have installed a different version, replace 'r2.6'  with your version in links provided below
print(tf.__version__)

2.9.1


In [97]:
# load imdb dataset
# links to dataset
# original dataset: https://ai.stanford.edu/~amaas/data/sentiment/
# version in tensorflow: https://www.tensorflow.org/versions/r2.6/api_docs/python/tf/keras/datasets/imdb

# select your vocabulary size
vocabularySize = 5000
# load data (it is already pre-processed)
# optional: add other pre.processing steps like stopword removal
(xTrain, yTrain), (xTest, yTest) = tf.keras.datasets.imdb.load_data(num_words=vocabularySize)
print('Loaded dataset with {} training samples, {} test samples'.format(len(xTrain), len(xTest)))

# look at the data
print('---review---')
print(xTrain[123])
print('---label---')
print(yTrain[123])

# look at the respective words
word2id = tf.keras.datasets.imdb.get_word_index()
id2word = {i: word for word, i in word2id.items()}
print('---review with words---')
print([id2word.get(i, ' ') for i in xTrain[123]])


# import nltk  # probably have to install that first
# nltk.download('stopwords') # this might take somne time
# from nltk.corpus import stopwords
#
# stopWords = set(stopwords.words('english'))
# stopWords.add("br") #added br tag to stopwords
# vocabularySize = vocabularySize-len(stopWords)
# print(vocabularySize)
# for index in range(len(xTrain)):
#     xTrain[index] = [ (w-len(stopWords))%vocabularySize for w in xTrain[index] if id2word.get(w) not in stopWords]
# for index in range(len(xTest)):
#     xTest[index] = [ (w-len(stopWords))%vocabularySize for w in xTest[index] if id2word.get(w) not in stopWords]
# #xTrain[123] = [w for w in xTrain[123] if id2word.get(w) not in stopWords]
#
# print(xTrain[123])

Loaded dataset with 25000 training samples, 25000 test samples
---review---
[1, 307, 5, 1301, 20, 1026, 2511, 87, 2775, 52, 116, 5, 31, 7, 4, 91, 1220, 102, 13, 28, 110, 11, 6, 137, 13, 115, 219, 141, 35, 221, 956, 54, 13, 16, 11, 2714, 61, 322, 423, 12, 38, 76, 59, 1803, 72, 8, 2, 23, 5, 967, 12, 38, 85, 62, 358, 99]
---label---
1
---review with words---
['the', 'version', 'to', 'date', 'on', 'list', 'draw', 'him', 'critical', 'very', 'love', 'to', 'by', 'br', 'of', 'its', 'tony', 'characters', 'was', 'one', 'life', 'this', 'is', 'go', 'was', 'best', 'least', 'should', 'so', 'done', 'result', 'no', 'was', 'with', 'this', 'understood', 'only', 'war', "couldn't", 'that', 'her', 'get', 'would', 'johnny', 'we', 'in', 'and', 'are', 'to', 'business', 'that', 'her', 'because', 'story', 'use', 'movies']


In [98]:
# get properties of the dataset
print('Maximum train review length: {}'.format(len(max(xTrain, key=len))))
print('Maximum test review length: {}'.format(len(max(xTest, key=len))))
print('Minimum train review length: {}'.format(len(min(xTrain, key=len))))
print('Minimum test review length: {}'.format(len(min(xTest, key=len))))

Maximum train review length: 2494
Maximum test review length: 2315
Minimum train review length: 11
Minimum test review length: 7


In [99]:
# select maximum number of words as input length
# pad or truncated (this is done automatically) your data
maxWords = 1000
xTrain = tf.keras.preprocessing.sequence.pad_sequences(xTrain, maxlen=maxWords)
xTest = tf.keras.preprocessing.sequence.pad_sequences(xTest, maxlen=maxWords)

In [100]:
# setup the neural network architecture
# check out the respective tensorflow help page: https://www.tensorflow.org/guide/keras/rnn
model=tf.keras.Sequential()

def original_code():
    # define size of embedding, see https://www.tensorflow.org/versions/r2.9/api_docs/python/tf/keras/layers/Embedding
    # optional: use a different embedding like word2vec or other options available within tensorflow
    embeddingSize = 128
    model.add(tf.keras.layers.Embedding(vocabularySize, embeddingSize, input_length=maxWords))

    # add recurrent layers:
    # e.g. a SimpleRNN (https://www.tensorflow.org/versions/r2.9/api_docs/python/tf/keras/layers/SimpleRNN) with
    # LayerNormalization (https://www.tensorflow.org/versions/r2.9/api_docs/python/tf/keras/layers/LayerNormalization)
    model.add(tf.keras.layers.SimpleRNN(100))
    model.add(tf.keras.layers.LayerNormalization())

def attempt1():
    embeddingSize = 64
    model.add(tf.keras.layers.Embedding(vocabularySize, embeddingSize, input_length=maxWords))

    model.add(tf.keras.layers.SimpleRNN(100))
    model.add(tf.keras.layers.LayerNormalization())

def attempt2():
    embeddingSize = 256
    model.add(tf.keras.layers.Embedding(vocabularySize, embeddingSize, input_length=maxWords))

    model.add(tf.keras.layers.SimpleRNN(100))
    model.add(tf.keras.layers.LayerNormalization())

def attempt3():
    embeddingSize = 128
    model.add(tf.keras.layers.Embedding(vocabularySize, embeddingSize, input_length=maxWords))

    model.add(tf.keras.layers.LSTM(128))
    model.add(tf.keras.layers.LayerNormalization())

def attempt4():
    embeddingSize = 64
    model.add(tf.keras.layers.Embedding(vocabularySize, embeddingSize, input_length=maxWords))

    model.add(tf.keras.layers.LSTM(4))
    model.add(tf.keras.layers.LayerNormalization())

def attempt5():
    embeddingSize = 64
    model.add(tf.keras.layers.Embedding(vocabularySize, embeddingSize, input_length=maxWords))

    model.add(tf.keras.layers.GRU(256))
    model.add(tf.keras.layers.LayerNormalization())

def attempt6():
    embeddingSize = 64
    model.add(tf.keras.layers.Embedding(vocabularySize, embeddingSize, input_length=maxWords))

    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(4)))
    model.add(tf.keras.layers.LayerNormalization())

def attempt7():
    embeddingSize = 64
    model.add(tf.keras.layers.Embedding(vocabularySize, embeddingSize, input_length=maxWords))

    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)))
    model.add(tf.keras.layers.LayerNormalization())

def attempt8():
    #maxWords = 250
    embeddingSize = 64
    model.add(tf.keras.layers.Embedding(vocabularySize, embeddingSize, input_length=maxWords))

    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(4)))
    model.add(tf.keras.layers.LayerNormalization())

def attempt9():
    #maxWords = 250
    embeddingSize = 64
    model.add(tf.keras.layers.Embedding(vocabularySize, embeddingSize, input_length=maxWords))

    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.GRU(4)))
    model.add(tf.keras.layers.LayerNormalization())

def attempt10():
    #maxWords = 250
    embeddingSize = 64
    model.add(tf.keras.layers.Embedding(vocabularySize, embeddingSize, input_length=maxWords))

    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
    model.add(tf.keras.layers.LayerNormalization())

def attempt11():
    embeddingSize = 64
    model.add(tf.keras.layers.Embedding(vocabularySize, embeddingSize, input_length=maxWords))

    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(4, return_sequences=True)))
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16, return_sequences=True)))
    model.add(tf.keras.layers.LayerNormalization())

    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(8, return_sequences=True)))
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True)))
    model.add(tf.keras.layers.LayerNormalization())

    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16, return_sequences=True)))
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
    model.add(tf.keras.layers.LayerNormalization())

def finalAttempt():
    embeddingSize = 64
    model.add(tf.keras.layers.Embedding(vocabularySize, embeddingSize, input_length=maxWords))

    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(4, return_sequences=True)))
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.GRU(16 ,return_sequences=True)))
    model.add(tf.keras.layers.LayerNormalization())

    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(8,return_sequences=True)))
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32 ,return_sequences=True)))
    model.add(tf.keras.layers.LayerNormalization())

    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16,return_sequences=True)))
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64)))
    model.add(tf.keras.layers.LayerNormalization())

finalAttempt()

# add layer for output
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# print model and check number of parameters
print(model.summary())

Model: "sequential_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_15 (Embedding)    (None, 1000, 64)          320000    
                                                                 
 bidirectional_30 (Bidirecti  (None, 1000, 8)          2208      
 onal)                                                           
                                                                 
 bidirectional_31 (Bidirecti  (None, 1000, 32)         2496      
 onal)                                                           
                                                                 
 layer_normalization_23 (Lay  (None, 1000, 32)         64        
 erNormalization)                                                
                                                                 
 bidirectional_32 (Bidirecti  (None, 1000, 16)         2624      
 onal)                                               

In [None]:
# set parameters for network training
batchSize = 64
numEpochs = 5

# train your model
model.compile(loss='binary_crossentropy',  optimizer='adam', metrics=['accuracy'])
xValid, yValid = xTrain[:batchSize], yTrain[:batchSize]
xTrain2, yTrain2 = xTrain[batchSize:], yTrain[batchSize:]
hist = model.fit(xTrain2, yTrain2, validation_data=(xValid, yValid), batch_size=batchSize, epochs=numEpochs)

# check result
scores = model.evaluate(xTest, yTest, verbose=0)
print('Test accuracy:', scores[1])

Epoch 1/5
  8/390 [..............................] - ETA: 39:48 - loss: 0.8063 - accuracy: 0.5195

In [None]:
# Visualise Model Accuracy
plt.plot(hist.history['accuracy'])

plt.plot(hist.history['val_accuracy'])

plt.title('Accuracy')

plt.ylabel('accuracy')

plt.xlabel('epochs')
plt.yticks(np.arange(0, 1, step=0.1))
plt.xticks(np.arange(0, 5, step=1))
plt.grid()

plt.legend(['train', 'test'], loc='lower right')

plt.show()

# Visualise Loss

plt.plot(hist.history['loss'])
plt.plot(hist.history['val_loss'])
plt.title('Loss')
plt.ylabel('loss')
plt.xlabel('epochs')
plt.xticks(np.arange(0, 5, step=1))
plt.grid()
plt.legend(['train', 'test'], loc='upper right')
plt.show()