## Imports and Loading Embeddings File

In [1]:
import sys
print(sys.executable)

E:\Program Files (x86)\Anaconda3\envs\dissertation\python.exe


In [2]:
import tensorflow as tf
# from keras.layers import Dense, Conv1D, GlobalMaxPooling1D
# from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer 
import numpy as np
sys.path.append('E:\GitHubProjects\dissertation\Scripts')
import helperfn as hf
%load_ext autoreload
%autoreload 2

In [3]:
from gensim.models import KeyedVectors
embedding_model = KeyedVectors.load_word2vec_format(r'E:\GitHubProjects\dissertation\word2vec\GoogleNews-vectors-negative300.bin', binary=True)
# If we don't plan to train the model any further, calling init_sims will make the model much more memory-efficient.
embedding_model.init_sims(replace=True)

## Loading and Pre-processing the dataset

In [4]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
#convert to set for faster retrival
stop = set(stop)
stop_to_remove = {'no', 'nor', 'not', 'very', 'don', "don't", "aren't", 'couldn', "couldn't", 'didn', "didn't",
      'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't",
      'isn', "isn't", 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't",
      'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"}

uni_names = {'anglia', 'ruskin', 'birmingham', 'brunel', 'buckinghamshire', 'cardiff', 'metropolitan', 'napier', 'heriot', 'watt', 'kingston', 'liverpool', 'metropolitan', 'south', 'middlesex', 'oxford', 'brookes', 'teesside', 'westminster', 'wolverhampton', 'suffolk', 'ltd', 'london', 'aberdeen', 'abertay', 'dundee', 'bedfordshire', 'cumbria', 'derby', 'east london', 'edinburgh', 'glasgow', 'northampton', 'salford', 'south', 'wales', 'stirling', 'strathclyde', 'sunderland', 'west', 'scotland', 'ulster', 'worcester'}

#remove negations as stopwords as they provide valauble meaning and should not be removed
stop =  stop.difference(stop_to_remove)

#Get Data
data = hf.merge_datasets(r'E:\GitHubProjects\dissertation\scraper\approved_datasets')
# data = hf.balance_dataset(data)

#Tokenizing the text using NLTK
# TODO ******. doesnt segment full stop 
# https://stackoverflow.com/questions/42056872/how-to-remove-in-strings-with-regexptokenizer
# tokenizer = RegexpTokenizer(r'([\w\']+|[]!"$%&\'()*+,./:;=#@?[\\^_`{|}~-])')
tokenizer = RegexpTokenizer(r'([\w\']+|\[+|]+|\!+|"+|\$+|%+|&+|\'+|\(+|\)+|\*+|\++|,+|\.+|:+|;+|=+|#+|@+|\?+|\[+|\^+|_+|`+|{+|\|+|\}+|~+|-+|]+)') 

data['tokenized_text'] = data['review'].apply(lambda x: tokenizer.tokenize(x)) 

#lower case
data['tokenized_text'] = data['tokenized_text'].apply(lambda x: hf.lower_token(x))

#remove stop words
data['tokenized_text'] = data['tokenized_text'].apply(lambda x: [item for item in x if item not in stop])

#remove university names as they impact accuracy, these words should be neutral sentiment 
data['tokenized_text'] = data['tokenized_text'].apply(lambda x: [item for item in x if item not in uni_names])

#reduce puncuations, remove duplicates next to each other and leave only one e.g. !!! to !
data['tokenized_text'] = data['tokenized_text'].apply(lambda x: hf.remove_punctuations(x))

X, y = data['tokenized_text'], list(data['score'])

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import Sequence

#vocab size
NUM_WORDS = 10000
tokenizer = Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

In [6]:
#finding the maximum number of words the longests sentence contains
# longest = 0
# for x in X_train:
#     if len(x.split()) > longest:
#         longest = len(x.split())

#VERSION FOR TOKENIZED ROWS
longest = 0
for x in X:
    if len(x) > longest:
        longest = len(x)

#padding sequences to have the same length
x_train_seq = pad_sequences(sequences, maxlen=longest+5)

In [7]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(x_train_seq, y, test_size=0.2, shuffle=True, stratify=y, random_state=8)
#Converting from list to numPy arrays (fixed errors after tensorflow... imports)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [8]:
#Word Embeddings
#length of vectors in embedding word2vec
embedding_vecor_length = 300
embedding_matrix = np.zeros((NUM_WORDS, embedding_vecor_length))

for word, i in tokenizer.word_index.items():
    if i >= NUM_WORDS:
        continue
    try:
        embedding_vector = embedding_model[word]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    except:
        pass
        # embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25), embedding_vecor_length)

In [11]:
from tensorflow.keras import layers, regularizers
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Flatten, MaxPooling1D, Embedding, GlobalMaxPooling1D, Conv1D, GlobalAveragePooling1D, MaxPool1D, concatenate, Activation, Input

In [14]:
    model_cnn = Sequential()
    e = Embedding(NUM_WORDS, 300, weights=[embedding_matrix], input_length=longest+5, trainable=False)
    model_cnn.add(e)
    model_cnn.add(Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=1))
    # model_cnn.add(GlobalMaxPooling1D())

    model_cnn.add(Conv1D(filters=50, kernel_size=2, padding='valid', activation='relu', strides=1))
    model_cnn.add(GlobalAveragePooling1D())
    # model_cnn.add(MaxPooling1D(pool_size=2, strides=1))

    model_cnn.add(Dense(256, activation='relu'))
    model_cnn.add(Dense(1, activation='sigmoid'))
    model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model_cnn.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=32, verbose=2)
    # model_cnn.summary()

Train on 2277 samples, validate on 570 samples
Epoch 1/5
2277/2277 - 2s - loss: 0.5491 - accuracy: 0.7585 - val_loss: 0.3957 - val_accuracy: 0.7982
Epoch 2/5
2277/2277 - 1s - loss: 0.3300 - accuracy: 0.8608 - val_loss: 0.2682 - val_accuracy: 0.8825
Epoch 3/5
2277/2277 - 1s - loss: 0.2837 - accuracy: 0.8863 - val_loss: 0.2474 - val_accuracy: 0.8982
Epoch 4/5
2277/2277 - 1s - loss: 0.2629 - accuracy: 0.8972 - val_loss: 0.2347 - val_accuracy: 0.8982
Epoch 5/5
2277/2277 - 1s - loss: 0.2397 - accuracy: 0.9038 - val_loss: 0.2216 - val_accuracy: 0.9070


<tensorflow.python.keras.callbacks.History at 0x18b3ad37948>

In [24]:
tweet_input = Input(shape=(longest+5,), dtype='int32')

tweet_encoder = Embedding(NUM_WORDS, 300, weights=[embedding_matrix], input_length=longest+5, trainable=False)(tweet_input)
bigram_branch = Conv1D(filters=100, kernel_size=4, padding='valid', activation='relu', strides=1)(tweet_encoder)
bigram_branch = GlobalMaxPooling1D()(bigram_branch)
trigram_branch = Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=1)(tweet_encoder)
trigram_branch = GlobalMaxPooling1D()(trigram_branch)
fourgram_branch = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(tweet_encoder)
fourgram_branch = GlobalMaxPooling1D()(fourgram_branch)
merged = concatenate([bigram_branch, trigram_branch, fourgram_branch], axis=1)

merged = Dense(256, activation='relu')(merged)
merged = Dropout(0.2)(merged)
merged = Dense(1)(merged)
output = Activation('sigmoid')(merged)
model = Model(inputs=[tweet_input], outputs=[output])
model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=32, verbose=2)
# model.summary()

Train on 2277 samples, validate on 570 samples
Epoch 1/5
2277/2277 - 2s - loss: 0.4046 - accuracy: 0.8142 - val_loss: 0.2643 - val_accuracy: 0.8982
Epoch 2/5
2277/2277 - 1s - loss: 0.1938 - accuracy: 0.9289 - val_loss: 0.2271 - val_accuracy: 0.9175
Epoch 3/5
2277/2277 - 1s - loss: 0.0962 - accuracy: 0.9728 - val_loss: 0.2508 - val_accuracy: 0.9211
Epoch 4/5
2277/2277 - 1s - loss: 0.0379 - accuracy: 0.9925 - val_loss: 0.2840 - val_accuracy: 0.9140
Epoch 5/5
2277/2277 - 1s - loss: 0.0173 - accuracy: 0.9982 - val_loss: 0.3110 - val_accuracy: 0.9211


<tensorflow.python.keras.callbacks.History at 0x1b414b82b48>

In [26]:
#multichannel with SVM
tweet_input = Input(shape=(longest+5,), dtype='int32')

tweet_encoder = Embedding(NUM_WORDS, 300, weights=[embedding_matrix], input_length=longest+5, trainable=False)(tweet_input)
bigram_branch = Conv1D(filters=100, kernel_size=4, padding='valid', activation='relu', strides=1)(tweet_encoder)
bigram_branch = GlobalMaxPooling1D()(bigram_branch)
trigram_branch = Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=1)(tweet_encoder)
trigram_branch = GlobalMaxPooling1D()(trigram_branch)
fourgram_branch = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(tweet_encoder)
fourgram_branch = GlobalMaxPooling1D()(fourgram_branch)
merged = concatenate([bigram_branch, trigram_branch, fourgram_branch], axis=1)

merged = Dense(256, activation='relu')(merged)

# merged = Dense(64, activation='relu')(merged)
merged = Dense(1, kernel_regularizer=regularizers.l2(0.01))(merged)
output = Activation('sigmoid')(merged)
model = Model(inputs=[tweet_input], outputs=[output])
model.compile(loss='hinge',
              optimizer='adadelta',
              metrics=['accuracy'])
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=32, verbose=2)

model.predict(X_test)

Train on 2277 samples, validate on 570 samples
Epoch 1/3
2277/2277 - 6s - loss: 1.0443 - accuracy: 0.3325 - val_loss: 1.0385 - val_accuracy: 0.3316
Epoch 2/3
2277/2277 - 4s - loss: 1.0355 - accuracy: 0.3325 - val_loss: 1.0300 - val_accuracy: 0.3316
Epoch 3/3
2277/2277 - 4s - loss: 1.0268 - accuracy: 0.3325 - val_loss: 1.0215 - val_accuracy: 0.3316


array([[-0.05468639],
       [-0.02446909],
       [-0.01243563],
       [ 0.02040684],
       [-0.00070055],
       [-0.00549423],
       [-0.01260153],
       [-0.01083313],
       [-0.05898526],
       [-0.02843187],
       [ 0.03050718],
       [ 0.04491062],
       [ 0.03583511],
       [-0.04048605],
       [-0.04850238],
       [ 0.04830488],
       [-0.03203132],
       [-0.02760016],
       [-0.0205056 ],
       [ 0.00997802],
       [-0.00446019],
       [-0.06048423],
       [ 0.052959  ],
       [-0.00928697],
       [ 0.02453944],
       [-0.05177166],
       [-0.02897179],
       [ 0.05246831],
       [-0.00298056],
       [ 0.00382729],
       [-0.01716235],
       [-0.07236092],
       [-0.00198305],
       [-0.01080196],
       [-0.06052648],
       [-0.01627334],
       [-0.0118429 ],
       [-0.01800928],
       [-0.01200761],
       [-0.0143432 ],
       [ 0.00756565],
       [-0.0595693 ],
       [ 0.01507306],
       [-0.03865719],
       [-0.02378934],
       [-0

In [15]:
from kerastuner import HyperModel
from kerastuner.tuners import RandomSearch

#TODO Test with and without padding
#TODO Test max and average pooling
#TODO Test strides, Test Dropout

def build_model(hp=None):
    model_cnn = Sequential()
    e = Embedding(NUM_WORDS, 300, weights=[embedding_matrix], input_length=longest+5, trainable=False)
    model_cnn.add(e)

    # model_cnn.add(Conv1D(filters=hp.Int('input_units', min_value=32, max_value=200, step=2), kernel_size=3, padding='valid', activation='relu', strides=1))

    model_cnn.add(Conv1D(filters=hp.Int(f'conv_units', min_value=32, max_value=256, step=32), kernel_size=hp.Int(f'kernel_size_units', min_value=1, max_value=5, step=1), padding='valid', activation=hp.Choice(f'conv_activation', values=['relu', 'tanh', 'selu',  'elu'], default='relu'), strides=hp.Int(f'strides_units', min_value=1, max_value=4, step=1)))

    model_cnn.add(GlobalMaxPooling1D())
    
    
    model_cnn.add(Dense(256, activation='relu'))
    model_cnn.add(Dense(1, activation='sigmoid'))
    model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model_cnn


tuner = RandomSearch(
    build_model,
    objective= 'val_accuracy',
    max_trials = 4,
    executions_per_trial = 1,
    directory = 'random_search',
    overwrite = True
)

tuner.search(
    x=X_train,
    y=y_train,
    epochs=1,
    batch_size=32,
    validation_data=(X_test, np.array(y_test))
)

Traceback (most recent call last):
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\kerastuner\engine\hypermodel.py", line 105, in build
    model = self.hypermodel.build(hp)
  File "<ipython-input-15-9b20c66f09e3>", line 20, in build_model
    model_cnn.add(Conv1D(filters=hp.Int(f'conv1_units', min_value=32, max_value=256, step=32), kernel_size=hp.Int(f'kernel1_size_units', min_value=1, max_value=5, step=1), padding='valid', activation=hp.Choice(f'conv1_activation', values=['relu', 'tanh', 'selu',  'elu'], default='relu'), strides=hp.Int(f'strides1_units', min_value=1, max_value=4, step=1)))
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\tensorflow_core\python\training\tracking\base.py", line 457, in _method_wrapper
    result = method(self, *args, **kwargs)
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\tensorflow_core\python\keras\engine\sequential.py", line 203, in add
    output_tensor = layer(

Traceback (most recent call last):
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\kerastuner\engine\hypermodel.py", line 105, in build
    model = self.hypermodel.build(hp)
  File "<ipython-input-15-9b20c66f09e3>", line 20, in build_model
    model_cnn.add(Conv1D(filters=hp.Int(f'conv1_units', min_value=32, max_value=256, step=32), kernel_size=hp.Int(f'kernel1_size_units', min_value=1, max_value=5, step=1), padding='valid', activation=hp.Choice(f'conv1_activation', values=['relu', 'tanh', 'selu',  'elu'], default='relu'), strides=hp.Int(f'strides1_units', min_value=1, max_value=4, step=1)))
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\tensorflow_core\python\training\tracking\base.py", line 457, in _method_wrapper
    result = method(self, *args, **kwargs)
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\tensorflow_core\python\keras\engine\sequential.py", line 203, in add
    output_tensor = layer(

Traceback (most recent call last):
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\kerastuner\engine\hypermodel.py", line 105, in build
    model = self.hypermodel.build(hp)
  File "<ipython-input-15-9b20c66f09e3>", line 20, in build_model
    model_cnn.add(Conv1D(filters=hp.Int(f'conv1_units', min_value=32, max_value=256, step=32), kernel_size=hp.Int(f'kernel1_size_units', min_value=1, max_value=5, step=1), padding='valid', activation=hp.Choice(f'conv1_activation', values=['relu', 'tanh', 'selu',  'elu'], default='relu'), strides=hp.Int(f'strides1_units', min_value=1, max_value=4, step=1)))
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\tensorflow_core\python\training\tracking\base.py", line 457, in _method_wrapper
    result = method(self, *args, **kwargs)
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\tensorflow_core\python\keras\engine\sequential.py", line 203, in add
    output_tensor = layer(

Traceback (most recent call last):
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\kerastuner\engine\hypermodel.py", line 105, in build
    model = self.hypermodel.build(hp)
  File "<ipython-input-15-9b20c66f09e3>", line 20, in build_model
    model_cnn.add(Conv1D(filters=hp.Int(f'conv1_units', min_value=32, max_value=256, step=32), kernel_size=hp.Int(f'kernel1_size_units', min_value=1, max_value=5, step=1), padding='valid', activation=hp.Choice(f'conv1_activation', values=['relu', 'tanh', 'selu',  'elu'], default='relu'), strides=hp.Int(f'strides1_units', min_value=1, max_value=4, step=1)))
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\tensorflow_core\python\training\tracking\base.py", line 457, in _method_wrapper
    result = method(self, *args, **kwargs)
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\tensorflow_core\python\keras\engine\sequential.py", line 203, in add
    output_tensor = layer(

Traceback (most recent call last):
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\kerastuner\engine\hypermodel.py", line 105, in build
    model = self.hypermodel.build(hp)
  File "<ipython-input-15-9b20c66f09e3>", line 20, in build_model
    model_cnn.add(Conv1D(filters=hp.Int(f'conv1_units', min_value=32, max_value=256, step=32), kernel_size=hp.Int(f'kernel1_size_units', min_value=1, max_value=5, step=1), padding='valid', activation=hp.Choice(f'conv1_activation', values=['relu', 'tanh', 'selu',  'elu'], default='relu'), strides=hp.Int(f'strides1_units', min_value=1, max_value=4, step=1)))
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\tensorflow_core\python\training\tracking\base.py", line 457, in _method_wrapper
    result = method(self, *args, **kwargs)
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\tensorflow_core\python\keras\engine\sequential.py", line 203, in add
    output_tensor = layer(

Traceback (most recent call last):
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\kerastuner\engine\hypermodel.py", line 105, in build
    model = self.hypermodel.build(hp)
  File "<ipython-input-15-9b20c66f09e3>", line 20, in build_model
    model_cnn.add(Conv1D(filters=hp.Int(f'conv1_units', min_value=32, max_value=256, step=32), kernel_size=hp.Int(f'kernel1_size_units', min_value=1, max_value=5, step=1), padding='valid', activation=hp.Choice(f'conv1_activation', values=['relu', 'tanh', 'selu',  'elu'], default='relu'), strides=hp.Int(f'strides1_units', min_value=1, max_value=4, step=1)))
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\tensorflow_core\python\training\tracking\base.py", line 457, in _method_wrapper
    result = method(self, *args, **kwargs)
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\tensorflow_core\python\keras\engine\sequential.py", line 203, in add
    output_tensor = layer(

RuntimeError: Too many failed attempts to build model.

In [11]:
from kerastuner import HyperModel
from kerastuner.tuners import RandomSearch

#TODO Test with and without padding
#TODO Test max and average pooling
#TODO Test strides, Test Dropout

def build_model(hp=None):
    model_cnn = Sequential()
    e = Embedding(NUM_WORDS, 300, weights=[embedding_matrix], input_length=longest+5, trainable=False)
    model_cnn.add(e)

    model_cnn.add(Conv1D(filters=hp.Int('input_units', min_value=32, max_value=256, step=32), kernel_size=3, padding='valid', activation='relu', strides=1))
    model_cnn.add(GlobalMaxPooling1D())

    for i in range(hp.Int('n_layers', 1, 2)):
        model_cnn.add(Conv1D(filters=hp.Int(f'conv_{i}_units', min_value=32, max_value=256, step=32), kernel_size=hp.Int(f'kernel_size_{i}_units', min_value=1, max_value=5, step=1), padding='valid', activation=hp.Choice(f'conv_{i}_activation', values=['relu', 'tanh', 'selu',  'elu'], default='relu'), strides=hp.Int(f'strides_{i}_units', min_value=1, max_value=4, step=1)))
        model_cnn.add(GlobalMaxPooling1D())

    model_cnn.add(Dense(256, activation='relu'))
    model_cnn.add(Dense(1, activation='sigmoid'))
    model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model_cnn


tuner = RandomSearch(
    build_model,
    objective= 'val_accuracy',
    max_trials = 50,
    executions_per_trial = 3,
    directory = 'log_dir',
    overwrite = True
)

tuner.search(
    x=X_train,
    y=y_train,
    epochs=2,
    batch_size=64,
    validation_data=(X_test, y_test)
)

Traceback (most recent call last):
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\kerastuner\engine\hypermodel.py", line 105, in build
    model = self.hypermodel.build(hp)
  File "<ipython-input-11-2d24448070c9>", line 17, in build_model
    model_cnn.add(Conv1D(filters=hp.Int(f'conv_{i}_units', min_value=32, max_value=256, step=32), kernel_size=hp.Int(f'kernel_size_{i}_units', min_value=1, max_value=5, step=1), padding='valid', activation=hp.Choice(f'conv_{i}_activation', values=['relu', 'tanh', 'selu',  'elu'], default='relu'), strides=hp.Int(f'strides_{i}_units', min_value=1, max_value=4, step=1)))
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\tensorflow_core\python\training\tracking\base.py", line 457, in _method_wrapper
    result = method(self, *args, **kwargs)
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\tensorflow_core\python\keras\engine\sequential.py", line 203, in add
    output_ten

Traceback (most recent call last):
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\kerastuner\engine\hypermodel.py", line 105, in build
    model = self.hypermodel.build(hp)
  File "<ipython-input-11-2d24448070c9>", line 17, in build_model
    model_cnn.add(Conv1D(filters=hp.Int(f'conv_{i}_units', min_value=32, max_value=256, step=32), kernel_size=hp.Int(f'kernel_size_{i}_units', min_value=1, max_value=5, step=1), padding='valid', activation=hp.Choice(f'conv_{i}_activation', values=['relu', 'tanh', 'selu',  'elu'], default='relu'), strides=hp.Int(f'strides_{i}_units', min_value=1, max_value=4, step=1)))
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\tensorflow_core\python\training\tracking\base.py", line 457, in _method_wrapper
    result = method(self, *args, **kwargs)
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\tensorflow_core\python\keras\engine\sequential.py", line 203, in add
    output_ten

Traceback (most recent call last):
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\kerastuner\engine\hypermodel.py", line 105, in build
    model = self.hypermodel.build(hp)
  File "<ipython-input-11-2d24448070c9>", line 17, in build_model
    model_cnn.add(Conv1D(filters=hp.Int(f'conv_{i}_units', min_value=32, max_value=256, step=32), kernel_size=hp.Int(f'kernel_size_{i}_units', min_value=1, max_value=5, step=1), padding='valid', activation=hp.Choice(f'conv_{i}_activation', values=['relu', 'tanh', 'selu',  'elu'], default='relu'), strides=hp.Int(f'strides_{i}_units', min_value=1, max_value=4, step=1)))
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\tensorflow_core\python\training\tracking\base.py", line 457, in _method_wrapper
    result = method(self, *args, **kwargs)
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\tensorflow_core\python\keras\engine\sequential.py", line 203, in add
    output_ten

Traceback (most recent call last):
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\kerastuner\engine\hypermodel.py", line 105, in build
    model = self.hypermodel.build(hp)
  File "<ipython-input-11-2d24448070c9>", line 17, in build_model
    model_cnn.add(Conv1D(filters=hp.Int(f'conv_{i}_units', min_value=32, max_value=256, step=32), kernel_size=hp.Int(f'kernel_size_{i}_units', min_value=1, max_value=5, step=1), padding='valid', activation=hp.Choice(f'conv_{i}_activation', values=['relu', 'tanh', 'selu',  'elu'], default='relu'), strides=hp.Int(f'strides_{i}_units', min_value=1, max_value=4, step=1)))
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\tensorflow_core\python\training\tracking\base.py", line 457, in _method_wrapper
    result = method(self, *args, **kwargs)
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\tensorflow_core\python\keras\engine\sequential.py", line 203, in add
    output_ten

Traceback (most recent call last):
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\kerastuner\engine\hypermodel.py", line 105, in build
    model = self.hypermodel.build(hp)
  File "<ipython-input-11-2d24448070c9>", line 17, in build_model
    model_cnn.add(Conv1D(filters=hp.Int(f'conv_{i}_units', min_value=32, max_value=256, step=32), kernel_size=hp.Int(f'kernel_size_{i}_units', min_value=1, max_value=5, step=1), padding='valid', activation=hp.Choice(f'conv_{i}_activation', values=['relu', 'tanh', 'selu',  'elu'], default='relu'), strides=hp.Int(f'strides_{i}_units', min_value=1, max_value=4, step=1)))
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\tensorflow_core\python\training\tracking\base.py", line 457, in _method_wrapper
    result = method(self, *args, **kwargs)
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\tensorflow_core\python\keras\engine\sequential.py", line 203, in add
    output_ten

Traceback (most recent call last):
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\kerastuner\engine\hypermodel.py", line 105, in build
    model = self.hypermodel.build(hp)
  File "<ipython-input-11-2d24448070c9>", line 17, in build_model
    model_cnn.add(Conv1D(filters=hp.Int(f'conv_{i}_units', min_value=32, max_value=256, step=32), kernel_size=hp.Int(f'kernel_size_{i}_units', min_value=1, max_value=5, step=1), padding='valid', activation=hp.Choice(f'conv_{i}_activation', values=['relu', 'tanh', 'selu',  'elu'], default='relu'), strides=hp.Int(f'strides_{i}_units', min_value=1, max_value=4, step=1)))
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\tensorflow_core\python\training\tracking\base.py", line 457, in _method_wrapper
    result = method(self, *args, **kwargs)
  File "E:\Program Files (x86)\Anaconda3\envs\dissertation\lib\site-packages\tensorflow_core\python\keras\engine\sequential.py", line 203, in add
    output_ten

RuntimeError: Too many failed attempts to build model.

## Confusion Matrix and Metrics

In [19]:
# tk_test = Tokenizer()
# tk_test.fit_on_texts(X_test)
# index_list = tk_test.texts_to_sequences(X_test)
# x_train_padded = pad_sequences(index_list, maxlen=longest+5)

score = model_cnn.evaluate(X_test, y_test, verbose=0)
print(f'Test loss: {score[0]} / Test accuracy: {score[1]}')

Test loss: 0.3234133296983039 / Test accuracy: 0.9180850982666016


In [12]:
from sklearn.metrics import confusion_matrix

y_pred = (model.predict(X_test).ravel()>0.5)+0 # predict and get class (0 if pred < 0.5 else 1)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
confusion_matrix(y_test, y_pred)

array([[166,  23],
       [ 20, 361]], dtype=int64)

In [13]:
print("Precision: ", tp / (tp+fp))
print("Recall: ", tp / (tp+fn))

Precision:  0.9401041666666666
Recall:  0.94750656167979


## Exporting tokenizer and trained model to store as files

In [14]:
import pickle as pk
# saving the tokenizer
with open('tokenizer_m1.pickle', 'wb') as handle:
    pk.dump(tokenizer, handle, protocol=pk.HIGHEST_PROTOCOL)
    
model.save("model_exported.h5")