In [None]:
import tensorflow as tf
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 

# Minimal LSTM smoke test: raw string -> token ids -> embeddings -> LSTM -> sigmoid.
# The previous version of this cell could not run: `Tokenizer` is a preprocessing
# utility rather than a layer, `Embedding` was handed a tensor as its vocabulary
# size, and the final `Dense` layer was constructed but never applied to `x`.
demo_max_tokens = 1000   # vocabulary cap shared by the vectorizer and the embedding
demo_seq_len = 15        # every input is padded/truncated to this many token ids

demo_vectorizer = tf.keras.layers.TextVectorization(max_tokens=demo_max_tokens,
                                                    output_mode='int',
                                                    output_sequence_length=demo_seq_len)
# The vectorizer needs a vocabulary before it can be called; a placeholder is enough here.
demo_vectorizer.adapt(tf.constant(["a tiny placeholder corpus so the layer has a vocabulary"]))

inputs = tf.keras.layers.Input(shape=(1,), dtype='string')
x = demo_vectorizer(inputs)
x = tf.keras.layers.Embedding(input_dim=demo_max_tokens, output_dim=32)(x)
x = tf.keras.layers.LSTM(64, activation='tanh')(x)
output = tf.keras.layers.Dense(1, activation='sigmoid')(x)
mod = tf.keras.Model(inputs, output, name='LSTM_TEST')

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 

# Read the training and test tweet tables from the local ./nlp folder,
# then preview the first rows of the training table.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('./nlp/train.csv', './nlp/test.csv')
)
train_df.head()

In [None]:
# Shuffle the dataset. Only the training dataset is shuffled.
# frac=1 draws every row once; random_state=42 makes the shuffle repeatable.
train_df_shuffled = train_df.sample(frac=1, random_state=42)
train_df_shuffled.head()

In [None]:
# Count the rows per target class to check class balance (1 = disaster, otherwise not disaster).
train_df.target.value_counts()

In [None]:
'''My Version'''

import random 
# Draw 5 distinct row positions and show each tweet with a readable label.
ind = random.sample(range(len(train_df_shuffled)), 5)
X = train_df_shuffled[['text', 'target']]
for row_pos in ind:
    sample = X.iloc[row_pos]
    if sample.target == 1:
        label = 'Disaster'
    else:
        label = 'Not Disaster'
    print(sample)
    print(label)
    print('\n')
    

In [None]:
#Visualize random samples
import random 
# random.randrange excludes its upper bound, so every drawn position is valid for
# .iloc (random.randint(0, n) could return n itself and raise IndexError).
random_indexes = [random.randrange(len(train_df_shuffled)) for _ in range(5)]
print(random_indexes)
for i in random_indexes:
    x = train_df_shuffled[['text', 'target']].iloc[i]
    # Look the label up by column name; integer keys on a label-indexed Series
    # rely on a deprecated positional fallback.
    target = x['target']
    print('Target {}:'.format(target), "Disaster" if target == 1 else "Not Disaster")
    print(x, '\n')

# Split data into training and validation sets from train data

In [None]:
from sklearn.model_selection import train_test_split as tts

# Hold out 10% of the shuffled tweets for validation (fixed seed for repeatability).
tweet_texts = train_df_shuffled['text'].to_numpy()
tweet_targets = train_df_shuffled['target'].to_numpy()
train_sentences, val_sentences, train_labels, val_labels = tts(
    tweet_texts,
    tweet_targets,
    test_size=0.1,
    random_state=42,
)
print(train_sentences[:5], train_labels[:5])

## The next step is to tokenize the text before embedding it. To do this, we use the Keras `TextVectorization` layer.
## Note: You can also use the text `Tokenizer` from `tf.keras.preprocessing`

In [None]:
# Total number of whitespace-separated words across all training tweets
# (a raw word count, not the number of unique words).
words_size = sum(len(tweet.split()) for tweet in train_sentences)
# Tweets differ in length, so the rounded average words per tweet is used later
# as the fixed sequence length for the vectorizer.
avg_words = round(words_size / len(train_sentences))
print(words_size)


In [None]:

# Build the layer that turns raw tweet strings into fixed-length integer token ids.
# The vocabulary itself is learned later, when the layer is adapted to the training data.
text_vectorizer = tf.keras.layers.TextVectorization(max_tokens=int(words_size/2), # Upper bound on the vocabulary size (half of the total training word count)
                                                    standardize='lower_and_strip_punctuation',
                                                    ngrams=None, 
                                                    split='whitespace', 
                                                    output_mode='int',
                                                    output_sequence_length=avg_words, # Each output row is padded/truncated to this many token ids
                                                    pad_to_max_tokens=True) # NOTE(review): the Keras docs describe this flag as applying only to the multi_hot/count/tf_idf output modes; padding of short tweets comes from output_sequence_length - confirm it is a no-op here

In [None]:
# Fit the text vectorizer to the training sentences only, so the vocabulary
# is built without looking at the validation split.
text_vectorizer.adapt(train_sentences)

In [None]:
# Try the vectorizer on one randomly chosen training tweet.
# `samp` is a hand-written example sentence kept for later cells.
samp = "The best coin is FLUX and STRATOS token is the worst token"
random_sentence = random.choice(train_sentences)
token_random_sentence = text_vectorizer(random_sentence)
random_sentence_length = len(random_sentence.split())
print(random_sentence, '\n')
print("Length Random Sentence:", random_sentence_length)
print(token_random_sentence)

In [None]:
# Get the unique words in the learned vocabulary.
unique_words = text_vectorizer.get_vocabulary()
# Take exactly ten entries from each end so the printed labels match the data
# (the head slice previously took 30 entries while being labelled "10").
# NOTE(review): assumes the vocabulary is ordered most-frequent first, as the labels imply - confirm.
Ten_most_common_words = [str(u) for u in unique_words[:10]]
Ten_least_common_words = unique_words[-10:]
print(len(unique_words))
print('10 most common words: {}'.format(Ten_most_common_words))
print('10 least common words: {}'.format(Ten_least_common_words))

# Create Embedding using the Embedding Layer

In [None]:
# Embedding hyper-parameters: vocabulary cap (matches the vectorizer's max_tokens),
# vector size per token, and the fixed sequence length produced by the vectorizer.
in_dim = int(words_size/2)
out_dim = 128
in_len = avg_words

embedding_config = dict(input_dim=in_dim, output_dim=out_dim, input_length=in_len)
embedding = tf.keras.layers.Embedding(**embedding_config)

In [None]:
# Push one random training tweet through vectorizer + embedding and inspect the result.
random_sentence = random.choice(train_sentences)
print('Original sentence: {}'.format(random_sentence))

random_vec = text_vectorizer(random_sentence)
print('Numeric Vectors: {}'.format(random_vec))

sentence_embeddings = embedding(random_vec)
print(sentence_embeddings, len(sentence_embeddings))

first_row = sentence_embeddings[0]
print(first_row, first_row.shape)

# Tensorboard Callback Function

In [None]:
import datetime

SAVE_DIR = "model_logs"       #Directory to save all model logs 

def create_tensorboard_callback(dir_name, experiment_name, keras_module=None):
  """
  Creates a TensorBoard callback instance to store log files.

  Stores log files with the filepath:
    "dir_name/experiment_name/current_datetime/"

  Args:
    dir_name: target directory to store TensorBoard log files
    experiment_name: name of experiment directory (e.g. efficientnet_model_1)
    keras_module: object exposing `callbacks.TensorBoard` (for example `tf.keras`
      or `tf_keras`). Defaults to `tf.keras`, so existing two-argument calls
      behave exactly as before. Pass the Keras flavour that built the model the
      callback will be attached to.

  Returns:
    The TensorBoard callback created by `keras_module.callbacks.TensorBoard`.
  """
  if keras_module is None:
    # Resolved lazily so the default keeps using the notebook-level `tf` import.
    keras_module = tf.keras
  log_dir = dir_name + "/" + experiment_name + "/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
  tensorboard_callback = keras_module.callbacks.TensorBoard(
      log_dir=log_dir
  )
  print(f"Saving TensorBoard log files to: {log_dir}")
  return tensorboard_callback

# Modelling a series of experiments 

. Create a model
. Build a model 
. Fit a model 
. Evaluate the model

### First Model = We start with a baseline: Naive Bayes with TF-IDF - Model 0

In [None]:
#First Model = We start with a baseline : Naives Bayes with TF-IDF - Model 0

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB 
from sklearn.pipeline import Pipeline 

# Baseline: TF-IDF features feeding a multinomial Naive Bayes classifier.
baseline_steps = [
    ("tfidf", TfidfVectorizer()),
    ("clf", MultinomialNB()),
]
model_0 = Pipeline(baseline_steps)
model_0.fit(train_sentences, train_labels)

# Accuracy on the held-out validation tweets.
baseline_score = model_0.score(val_sentences, val_labels)
print("Our baseline model achieves accuracy of {:.2f}%".format(baseline_score*100))

# Compare the first ten predictions with the first ten true labels.
baseline_preds = model_0.predict(val_sentences)
print(baseline_preds[:10])
print(val_labels[:10])

In [None]:
# Function to evaluate: accuracy, precision, recall, and F1 score 
from sklearn.metrics import accuracy_score, precision_recall_fscore_support 

def calculate_results(y_true, y_pred):
    """
        Summarise a classifier's predictions as accuracy, precision, recall and F1.

        Args:
            y_true: ground-truth labels.
            y_pred: predicted labels, same length as y_true.

        Returns:
            dict with keys "Accuracy" (a percentage, i.e. scaled by 100),
            "Precision", "Recall" and "F1" (the latter three use the
            'weighted' average and are not scaled).
    """
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average='weighted'
    )
    return {
        "Accuracy": accuracy_score(y_true, y_pred) * 100,
        "Precision": precision,
        "Recall": recall,
        "F1": f1,
    }

In [None]:
# Get baseline results: score the Naive Bayes predictions against the validation labels.
# These numbers are the reference point printed next to every later model.
baseline_results = calculate_results(y_true=val_labels, y_pred=baseline_preds)
print(baseline_results)

# Second Model = A Simple Dense Model

In [None]:
# Second model: feed-forward (dense) network on top of the learned embedding.
tf.keras.backend.clear_session()

inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
token_ids = text_vectorizer(inputs)
token_vectors = embedding(token_ids)
x = tf.keras.layers.Dense(10, activation='relu')(token_vectors)
# Average over the token axis so each tweet is represented by a single vector.
x = tf.keras.layers.GlobalAveragePooling1D()(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model_1 = tf.keras.Model(inputs, outputs, name="Model_1_FFNN")

model_1.summary()

model_1.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy'],
)

model_1_history = model_1.fit(
    x=train_sentences,
    y=train_labels,
    epochs=10,
    validation_data=(val_sentences, val_labels),
    callbacks=[
        create_tensorboard_callback(dir_name=SAVE_DIR,
                                    experiment_name='model_1_simple_dense'),
    ],
)

In [None]:
# Evaluate model_1 on the validation split; evaluate() returns the loss followed
# by the compiled metrics (here: accuracy).
loss, accuracy = model_1.evaluate(val_sentences, val_labels)
print("Model_1 Loss", loss, " |||| ", "Model_1_Accuracy", accuracy)

In [None]:
# predict() yields sigmoid probabilities; round them to 0/1 labels and drop the
# trailing singleton axis before scoring.
model_1_pred = model_1.predict(val_sentences)
model_1_preds = tf.squeeze(tf.round(model_1_pred))

model_1_results = calculate_results(y_true=val_labels, y_pred=model_1_preds)
for results in (model_1_results, baseline_results):
    print(results)

## Visualizing Learned Embedding

In [None]:
# Vocabulary learned by the text vectorizer: print its size and first ten entries,
# then show model_1's layers so the embedding layer can be located.
words_in_vocab = text_vectorizer.get_vocabulary()
print(len(words_in_vocab), words_in_vocab[:10])
model_1.summary()

In [None]:
# Get the weight matrix of the embedding layer used by model 1.
# The layer is looked up through the `embedding` object that built model_1 instead
# of a hard-coded auto-generated name (e.g. 'embedding_3'), which depends on how
# many layers the kernel has created and breaks after a restart.
embed_weights = model_1.get_layer(embedding.name).get_weights()[0]
print(embed_weights, embed_weights.shape)


## This code writes the embedding vectors and their metadata to TSV files, which can be visualized in the TensorFlow Embedding Projector
### https://projector.tensorflow.org/   (Load the vectors and metadata TSV files in the TensorFlow projector)
### https://www.tensorflow.org/text/guide/word_embeddings   (Read about word embeddings here)

In [None]:
#This code downloads the vectors and metadata tsv file. Visualize on the tensorflow projector
#https://projector.tensorflow.org/   (Load the vectors and metasata tsv in Tensorflow projector)
#https://www.tensorflow.org/text/guide/word_embeddings   (Read about word embedding here)
import io
# Write one embedding vector per line to vectors.tsv and the matching word to
# metadata.tsv. The context manager closes both files even if the loop raises.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
  for index, word in enumerate(words_in_vocab):
    if index == 0:
      continue  # skip 0, it's padding.
    vec = embed_weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")

# Third Model - Recurrent Neural Network
#### RNNs are useful for sequence data. They use the representation of a previous input to aid the representation of a later input.
#### Resources include:
#### MIT's sequence modelling lecture: https://authurwhywait.github.io/blog/2021/12/02/introduction_to_dl02/ 
#### Chris Olah's intro to LSTMs: https://colah.github.io/posts/2015-08-Understanding-LSTMs/ 
#### Andrej Karpathy's The Unreasonable Effectiveness of Recurrent Neural Networks: https://karpathy.github.io/2015/05/21/rnn-effectiveness/

In [None]:
# Create an LSTM model
tf.keras.backend.clear_session()

# Give this experiment its own embedding layer. The notebook-wide `embedding`
# layer has already been trained by model_1, so reusing it would start this
# model from pre-trained weights and make the comparison unfair.
model_2_embedding = tf.keras.layers.Embedding(input_dim=in_dim,
                                              output_dim=out_dim,
                                              name='model_2_embedding')

inputs = tf.keras.layers.Input(shape=(1,), dtype='string')
x = text_vectorizer(inputs)
x = model_2_embedding(x)
# When stacking recurrent layers, every layer except the last must return the
# full sequence so the next layer receives one vector per token.
x = tf.keras.layers.LSTM(64, return_sequences=True)(x)
x = tf.keras.layers.LSTM(64, activation='tanh')(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model_2 = tf.keras.Model(inputs, outputs, name='LSTM_model')

model_2.summary()

model_2.compile(
            loss=tf.keras.losses.BinaryCrossentropy(),
            optimizer=tf.keras.optimizers.Adam(),
            metrics=['accuracy']
)

model_2_history = model_2.fit(
                        train_sentences,
                        train_labels,
                        epochs=10,
                        validation_data=(val_sentences, val_labels), 
                        callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR,
                                                 experiment_name='model_2_lstm')]
)

In [None]:
# Turn model_2's probabilities into 0/1 labels, then eyeball the first ten
# predictions of every model so far against the true labels.
model_2_pred_probs = model_2.predict(val_sentences)
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
for tensor_preds in (model_2_preds, model_1_preds):
    print(np.array(tensor_preds[:10]))
print(baseline_preds[:10])
print(val_labels[:10])

model_2_results = calculate_results(val_labels, model_2_preds)
print(model_2_results)

# Fourth Model - Gated Recurrent Unit (GRU)

In [None]:
# Reset Keras global state before building the next experiment. The call
# parentheses are required; a bare `clear_session` reference does nothing.
tf.keras.backend.clear_session()

# Use a fresh embedding so this model does not start from weights that earlier
# experiments already trained through the shared `embedding` layer.
model_3_embedding = tf.keras.layers.Embedding(input_dim=in_dim,
                                              output_dim=out_dim,
                                              name='model_3_embedding')

inputs = tf.keras.layers.Input(shape=(1,), dtype='string')
x = text_vectorizer(inputs)
x = model_3_embedding(x)
# When stacking GRU layers, every layer except the last must return the full sequence.
# (GRU and LSTM layers can also be stacked on top of each other.)
x = tf.keras.layers.GRU(64, return_sequences=True)(x)
x = tf.keras.layers.GRU(64, activation='tanh')(x) 
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model_3 = tf.keras.Model(inputs, outputs, name='GRU_model')

model_3.summary()

model_3.compile(
                loss = tf.keras.losses.BinaryCrossentropy(), 
                optimizer = tf.keras.optimizers.Adam(), 
                metrics = ['accuracy']
)

model_3_history = model_3.fit(train_sentences, train_labels,
                              epochs = 10,
                              validation_data=(val_sentences, val_labels), 
                              callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR,
                                                 experiment_name='model_3_gru')]
)

In [None]:
# Round model_3's probabilities to labels and score them next to the baseline.
model_3_pred_probs = model_3.predict(val_sentences)
model_3_preds = tf.squeeze(tf.round(model_3_pred_probs))
print(model_3_preds[:10])

model_3_results = calculate_results(val_labels, model_3_preds)
for results in (model_3_results, baseline_results):
    print(results)

# Fifth Model - Bidirectional RNN 

#### A standard RNN reads a sequence from left to right. A bidirectional RNN also reads it from right to left and combines the two representations.
#### So, basically, each position is encoded using context from both directions of the sequence.

In [None]:
# Reset Keras global state before building the next experiment. The call
# parentheses are required; a bare `clear_session` reference does nothing.
tf.keras.backend.clear_session()

# Use a fresh embedding so this model does not start from weights that earlier
# experiments already trained through the shared `embedding` layer.
model_4_embedding = tf.keras.layers.Embedding(input_dim=in_dim,
                                              output_dim=out_dim,
                                              name='model_4_embedding')

inputs = tf.keras.layers.Input(shape=(1,), dtype='string') 
x = text_vectorizer(inputs)
x = model_4_embedding(x)
# Bidirectional runs the wrapped layer in both directions, so the feature size
# doubles (64 * 2). GRU and LSTM can be interchanged or even used together.
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(x)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, activation='tanh'))(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model_4 = tf.keras.Model(inputs, outputs, name='Bidirectional_Model')

model_4.summary()

model_4.compile(
                loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy']
)

model_4_history = model_4.fit(train_sentences, train_labels,
                              epochs=10,
                              validation_data=(val_sentences, val_labels), 
                              callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR,
                                                 experiment_name='model_4_bidirectional_lstm')]
)

In [None]:
# Round model_4's probabilities to labels and score them next to the baseline.
model_4_pred_probs = model_4.predict(val_sentences)
model_4_preds = tf.squeeze(tf.round(model_4_pred_probs))
print(model_4_preds[:10])

model_4_results = calculate_results(val_labels, model_4_preds)
for results in (model_4_results, baseline_results):
    print(results)

# Bidirectional GRU

In [None]:
# Reset Keras global state first, as the other experiment cells do.
tf.keras.backend.clear_session()

# Use a fresh embedding so this model does not start from weights that earlier
# experiments already trained through the shared `embedding` layer.
model_4_5_embedding = tf.keras.layers.Embedding(input_dim=in_dim,
                                                output_dim=out_dim,
                                                name='model_4_5_embedding')

inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = model_4_5_embedding(x)
# The first recurrent layer returns the full sequence so the second one can consume it.
x = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64, return_sequences=True))(x)
x = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64, activation='tanh'))(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model_4_5 = tf.keras.Model(inputs, outputs, name='Bidirectional_Model_GRU')

model_4_5.compile(
            loss = tf.keras.losses.BinaryCrossentropy(), 
            optimizer = tf.keras.optimizers.Adam(), 
            metrics = ['accuracy']
)

model_4_5_history = model_4_5.fit(train_sentences, train_labels, 
                                  epochs=10, 
                                  validation_data = (val_sentences, val_labels), 
                                  callbacks = [create_tensorboard_callback(dir_name=SAVE_DIR,
                                                 experiment_name='model_4_bidirectional_gru')]
)

model_4_5.summary()

In [None]:
# Round the bidirectional-GRU probabilities to labels and display its metrics.
model_4_5_pred_probs = model_4_5.predict(val_sentences)
model_4_5_preds = tf.squeeze(tf.round(model_4_5_pred_probs))

model_4_5_results = calculate_results(val_labels, model_4_5_preds)
model_4_5_results

# Sixth Model - Convolutional Neural Network for Text (and other types of sequences) 
#### Conv2D is used for image data, while Conv1D is applicable to one-dimensional sequences such as text

In [None]:
# Let us first do some testing

# Walk one example sentence through embedding -> Conv1D -> global max pooling
# to see what each stage produces.
demo_token_ids = text_vectorizer(['My name is Ikenna Oluigbo and i have two sons'])
embedding_test = embedding(demo_token_ids)

conv_1d = tf.keras.layers.Conv1D(
    filters=64,
    kernel_size=5,
    strides=1,
    activation='relu',
    padding='same',
)
max_pool = tf.keras.layers.GlobalMaxPool1D()

conv_1d_output = conv_1d(embedding_test)
max_pool_output = max_pool(conv_1d_output)

# Shapes of interest: embedding_test.shape, conv_1d_output.shape, max_pool_output.shape
print(max_pool_output)

In [None]:
# Reset Keras global state before building the next experiment. The call
# parentheses are required; a bare `clear_session` reference does nothing.
tf.keras.backend.clear_session()

# Use a fresh embedding so this model does not start from weights that earlier
# experiments already trained through the shared `embedding` layer.
model_5_embedding = tf.keras.layers.Embedding(input_dim=in_dim,
                                              output_dim=out_dim,
                                              name='model_5_embedding')

inputs = tf.keras.layers.Input(shape=(1,), dtype='string')
x = text_vectorizer(inputs)
x = model_5_embedding(x)
# Slide 64 filters over windows of 5 consecutive token vectors, then keep the
# strongest response of each filter across the tweet.
x = tf.keras.layers.Conv1D(filters=64, kernel_size=5, padding='valid', activation='relu')(x)
x = tf.keras.layers.GlobalMaxPool1D()(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model_5 = tf.keras.Model(inputs, outputs, name='Conv1D_Model')

model_5.summary()

model_5.compile(
                loss = tf.keras.losses.BinaryCrossentropy(), 
                optimizer = tf.keras.optimizers.Adam(), 
                metrics = ['accuracy']
)

model_5_history = model_5.fit(train_sentences, train_labels,
                              epochs=10, 
                              validation_data=(val_sentences, val_labels), 
                              callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR,
                                                 experiment_name='model_5_conv1d')]
)

In [None]:
# Round model_5's probabilities to labels and score them next to the baseline.
model_5_pred_probs = model_5.predict(val_sentences)
model_5_preds = tf.squeeze(tf.round(model_5_pred_probs))
print(model_5_preds[:10])

model_5_results = calculate_results(val_labels, model_5_preds)
for results in (model_5_results, baseline_results):
    print(results)


# Pretrained Model using Universal Sentence Encoder in Tensorflow Hub 
#### We have built our own models, let us try transfer learning for NLP 
#### https://www.kaggle.com/models/google/universal-sentence-encoder/tensorFlow2/universal-sentence-encoder

In [None]:
# Here is a sample to demonstrate how the pretrained model work 

import tensorflow_hub as hub
import tensorflow as tf

# Load the Universal Sentence Encoder and embed a few sentences; only the first
# 50 values of each sentence vector are printed.
use_demo_url = "https://www.kaggle.com/models/google/universal-sentence-encoder/TensorFlow2/universal-sentence-encoder/2"
embed = hub.load(use_demo_url)

embed_samples = embed([samp, random_sentence])
print(embed_samples[0][:50])

tt = 'Will Isak sign for Liverpool or remain at Newcastle'
ttt = embed([tt])
print('\n')
print(ttt[0][:50])



In [None]:
tf.keras.backend.clear_session()

import tensorflow_hub as hub
import tensorflow as tf 

# Wrap the pretrained Universal Sentence Encoder from TensorFlow Hub as a Keras layer.
use_layer_options = dict(
    input_shape=[],      # empty because the pretrained model accepts variable-length text
    trainable=False,     # keep the pretrained weights frozen
    dtype=tf.string,
    name='USE',
)
sentence_encoder_layer = hub.KerasLayer(
    "https://www.kaggle.com/models/google/universal-sentence-encoder/TensorFlow2/universal-sentence-encoder/2",
    **use_layer_options,
)


In [None]:
# Create the model using the Sequential API 
## For using a pretrained model, use tf_keras instead of tf.keras

import tf_keras

'''Repeated this here because pretrained models need tf_keras'''

import datetime

#SAVE_DIR = "model_logs"       #Directory to save all model logs 

def create_tensorboard_callback_pretrained(dir_name, experiment_name):
  """
  Build a `tf_keras` TensorBoard callback that logs to a timestamped folder.

  The log directory has the form:
    "dir_name/experiment_name/current_datetime/"

  Args:
    dir_name: target directory to store TensorBoard log files
    experiment_name: name of experiment directory (e.g. efficientnet_model_1)

  Returns:
    The `tf_keras.callbacks.TensorBoard` instance writing to that directory.
  """
  timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
  log_dir = "/".join([dir_name, experiment_name, timestamp])
  callback = tf_keras.callbacks.TensorBoard(log_dir=log_dir)
  print(f"Saving TensorBoard log files to: {log_dir}")
  return callback



# Frozen USE sentence embeddings -> small dense head -> sigmoid output.
model_6 = tf_keras.Sequential(
    [
        sentence_encoder_layer,
        tf_keras.layers.Dense(64, activation='relu'),   # extra capacity on top of the frozen encoder
        tf_keras.layers.Dense(1, activation='sigmoid'),
    ],
    name='model_6_USE',
)

model_6.summary()

model_6.compile(
    loss=tf_keras.losses.BinaryCrossentropy(),
    optimizer=tf_keras.optimizers.Adam(),
    metrics=['accuracy'],
)

model_6_history = model_6.fit(
    train_sentences,
    train_labels,
    epochs=10,
    validation_data=(val_sentences, val_labels),
    callbacks=[
        create_tensorboard_callback_pretrained(dir_name=SAVE_DIR,
                                               experiment_name='model_6_tfhub_USE'),
    ],
)

In [None]:
# Round model_6's probabilities to labels, score them, and compare the first 20
# predictions with the true labels element by element.
model_6_pred_probs = model_6.predict(val_sentences)
model_6_preds = tf.squeeze(tf.round(model_6_pred_probs))

model_6_results = calculate_results(val_labels, model_6_preds)
print(model_6_results)
print(baseline_results)
print('\n')

first_20_preds = model_6_preds[:20]
first_20_labels = val_labels[:20]
print(first_20_preds)
print(first_20_labels)
print(first_20_preds == first_20_labels)

In [None]:
'''Experimenting with BERT'''

import tensorflow_hub as hub 
import tf_keras
import tensorflow_text

# Build the BERT graph: raw strings -> hub preprocessing -> hub encoder.
bert_preprocess_url = "https://kaggle.com/models/tensorflow/bert/TensorFlow2/multi-cased-preprocess/3"
bert_encoder_url = "https://www.kaggle.com/models/tensorflow/bert/TensorFlow2/multi-cased-l-12-h-768-a-12/4"

text_input = tf_keras.layers.Input(shape=(), dtype=tf.string)
preprocessor = hub.KerasLayer(bert_preprocess_url)
encoder_inputs = preprocessor(text_input)

# trainable=True: the encoder weights are updated during training.
encoder = hub.KerasLayer(bert_encoder_url, trainable=True)
outputs = encoder(encoder_inputs)
pooled_output = outputs["pooled_output"]      # [batch_size, 768].
sequence_output = outputs["sequence_output"]  # [batch_size, seq_length, 768].


In [None]:
# Wrap the BERT graph as a model that maps raw text to the encoder's pooled output,
# and prepare one example sentence to embed with it.
embedding_model = tf_keras.Model(text_input, pooled_output)
sentences = tf.constant(["Will Liverpool sign Isak or will he remain at Newcastle?"])
# Uncomment to inspect the embedding of the example sentence:
#print(embedding_model(sentences))

In [None]:
# Classification head on top of BERT's pooled output, built with the functional API.
# (An empty Sequential model used to be created here first; it was immediately
# overwritten by the functional model below, so it has been removed.)
x = tf_keras.layers.Dense(64, activation='relu')(pooled_output)
x = tf_keras.layers.Dense(32, activation='relu')(x)
x = tf_keras.layers.Dense(16, activation='relu')(x)
output = tf_keras.layers.Dense(1, activation='sigmoid')(x)
model_bert = tf_keras.Model(text_input, output, name='test_BERT')

model_bert.compile(loss=tf_keras.losses.BinaryCrossentropy(),
                   optimizer=tf_keras.optimizers.Adam(), 
                   metrics=['acc'])

model_bert.fit(train_sentences, train_labels, epochs=2,
               validation_data=(val_sentences, val_labels))

model_bert.summary()

In [None]:
# Round the BERT model's probabilities to labels and print its validation metrics.
bert_pred_probs = model_bert.predict(val_sentences)
bert_pred = tf.squeeze(tf.round(bert_pred_probs))
bert_results = calculate_results(val_labels, bert_pred)
print(bert_results)

# TF Hub Pretrained USE but with 10% of training data
#### Transfer learning really helps when you don't have a large dataset

In [None]:
# Create a 10% subset of the training data set.
# Can you quickly spot a problem here? This samples from the full shuffled frame,
# which also contains the rows that became the validation split, so some sampled
# sentences can also appear in val_sentences (data leakage).
# These variables are kept only to illustrate the problem.

train_10_percent = train_df_shuffled[['text', 'target']].sample(frac=0.1, random_state=42)
train_sentences_10_percent = train_10_percent['text'].to_numpy()
train_labels_10_percent = train_10_percent['target'].to_numpy()
print(train_10_percent['target'].value_counts())

In [None]:
# Avoid the leakage by taking the 10% from train_sentences itself, which never
# contains validation rows.

total_len = len(train_sentences)
train_percent = int(total_len * 0.1)   # number of examples in ten percent
first_ten_percent = slice(None, train_percent)
train_sentences_10percent_shuffle = train_sentences[first_ten_percent]
train_labels_10percent_shuffle = train_labels[first_ten_percent]

# Class balance of the subset.
dt = pd.Series(train_labels_10percent_shuffle)
dt.value_counts()

In [None]:
# Build a model with the same architecture as model 6 by cloning it rather than
# retyping it. This is still a pretrained (TF Hub) model, so tf_keras is used.

model_7 = tf_keras.models.clone_model(model_6)
model_7._name = 'model_7_USE_10percent_data'   # rename the clone

model_7.compile(
    loss=tf_keras.losses.BinaryCrossentropy(),
    optimizer=tf_keras.optimizers.Adam(),
    metrics=['accuracy'],
)

# Train on the leak-free 10% subset only.
model_7.fit(
    train_sentences_10percent_shuffle,
    train_labels_10percent_shuffle,
    epochs=10,
    validation_data=(val_sentences, val_labels),
    callbacks=[
        create_tensorboard_callback_pretrained(dir_name=SAVE_DIR,
                                               experiment_name='model_7_tfhub_USE_10percent'),
    ],
)

model_7.summary()

In [None]:
# Round model_7's probabilities to labels and compare its metrics with model_6.
model_7_pred_probs = model_7.predict(val_sentences)
model_7_preds = tf.squeeze(tf.round(model_7_pred_probs))
print(model_7_preds[:10])

model_7_results = calculate_results(val_labels, model_7_preds)
for results in (model_7_results, model_6_results):
    print(results)
# Compare the two: model 7 was trained on only 10% of the training data.

# Comparing the Performance of each of our models

In [None]:
# Collect every model's metrics (rounded to 3 decimals) into one table,
# one row per model.
all_models = [baseline_results, model_1_results, model_2_results, model_3_results, 
              model_4_results, model_4_5_results, model_5_results, model_6_results, model_7_results]
all_index = ["baseline_results", "model_1_results", "model_2_results", "model_3_results", 
              "model_4_results", "model_4_5_results", "model_5_results", "model_6_results", "model_7_results"]

acc = [round(result['Accuracy'], 3) for result in all_models]
pre = [round(result['Precision'], 3) for result in all_models]
rec = [round(result['Recall'], 3) for result in all_models]
fscore = [round(result['F1'], 3) for result in all_models]

all_model_dict = {
    'Accuracy_Score': acc,
    'Precision_Score': pre,
    'Recall_Score': rec,
    'F1_Score': fscore,
}

models_performance = pd.DataFrame(all_model_dict, index=all_index)
models_performance


In [None]:
# Alternative way of putting all model results in a table: one column per model,
# then transpose so each model becomes a row.

results_by_model = {
    "0_baseline": baseline_results,
    "1_simple_dense": model_1_results,
    "2_lstm": model_2_results,
    "3_gru": model_3_results,
    "4_bidirectional_lstm": model_4_results,
    "4_5_bidirectional_gru": model_4_5_results,
    "5_conv1d": model_5_results,
    "6_tfhub_use_encoder": model_6_results,
    "7_tfhub_use_encoder_10percent": model_7_results,
}
model_dict = pd.DataFrame(results_by_model)

# Accuracy is stored as a percentage; bring it onto the same 0-1 scale as the other metrics.
model_dict.loc['Accuracy'] = model_dict.loc['Accuracy'] / 100
model_dict = model_dict.transpose()
model_dict

In [None]:
# Visualizing results: grouped bars, one group per model and one bar per metric.
# The legend is anchored outside the axes so it does not cover the bars.

model_dict.plot(kind='bar', figsize=(10,7)).legend(bbox_to_anchor=(1.0, 1.0))

In [None]:
# Accuracy only, with models ordered from highest to lowest.
model_dict['Accuracy'].sort_values(ascending=False).plot(kind='bar', figsize=(10,7))

# Uploading our model training logs to Tensorboard.dev 
#### Inspect it using tensorboard.dev: https://tensorboard.dev/

In [None]:
""" View Model Logs on Tensorboard """

# Launch TensorBoard on the "model_logs" directory that the TensorBoard callbacks write to.
# NOTE(review): run as a shell command this presumably stays in the foreground and keeps
# the cell busy until interrupted - confirm before using Run All.
!tensorboard --logdir model_logs

# Saving Models 
### The two main formats to save tensorflow models: 
#### 1. The HDF5 format 
#### 2. The SavedModel format (default)

In [None]:
# Let us save the best performing model, Model 6 (TF Hub Pretrained).
# The ".h5" extension selects the HDF5 format.

model_6.save("model_6.h5")

In [None]:
# Load saved model 
import tensorflow_hub as hub

# Reload the HDF5 file. The model contains a TF Hub layer, so its class is passed
# via custom_objects for Keras to be able to rebuild it.
loaded_model6 = tf_keras.models.load_model("model_6.h5", 
                                          custom_objects={"KerasLayer" : hub.KerasLayer})  #Custom Objects is needed for h5 format
loaded_model6.summary()

In [None]:
# Trying the second format, SavedModel: a path without a file extension is
# written as a directory rather than a single file.
# NOTE(review): assumes tf_keras defaults to SavedModel for extension-less paths - confirm.

model_6.save("model_6_savedformat")

In [None]:
# Reload the SavedModel directory; no custom_objects argument is passed here.
loaded_model6_savedformat = tf_keras.models.load_model("model_6_savedformat")
loaded_model6_savedformat.summary()

# Model driven data exploration also known as ACTIVE LEARNING 
#### Using a trained model to visualize the data 
#### 0 = Not Disaster |  1 = Disaster

In [None]:
# Raw sigmoid probabilities from model_6 (unrounded), one per validation tweet.
model_6_preds_prob = tf.squeeze(model_6.predict(val_sentences))

val_df = pd.DataFrame({
    "text": val_sentences,
    "target": val_labels,
    "pred": model_6_preds,
    "pred_prob": model_6_preds_prob,
})

#val_df.head()

""" Let us find the wrong predictions and sort by predictions probabilities """ 

is_wrong = val_df['target'] != val_df['pred']
most_wrong = val_df[is_wrong].sort_values('pred_prob', ascending=False)

# Print a handful of randomly chosen misclassified tweets.
num = 5
l = len(most_wrong)
for _ in range(num):
    curr = random.choice(range(l))
    wrong_row = most_wrong.iloc[curr]
    print(f"Target: {wrong_row['target']}, Pred: {wrong_row['pred']}, Prob: {wrong_row['pred_prob']}")
    print("Text: ", wrong_row['text'])
    print("\n")

# Making Predictions on Test Dataset

In [None]:
# Predict on a few randomly chosen test tweets and show label, probability and text.
test_sentences = test_df['text'].to_numpy()
length_test = len(test_sentences)
num_p = 5
for _ in range(num_p):
    picked = random.choice(range(length_test))
    tweet = test_sentences[picked]
    test_pred_prob = model_6.predict([tweet])
    test_pred = tf.squeeze(tf.round(test_pred_prob))
    print(f"Pred: {test_pred}, Prob: {test_pred_prob[0]}")
    print('Text: ', tweet)
    print('\n')

In [None]:
"""Making predictions from live tweets"""

# Predict on a hand-written tweet: first print the raw probability,
# then the rounded 0/1 label as a NumPy value.
tt = model_6.predict(['We escaped a bandit attack ystdy at around 2pm 20km from Talata-Mafara.'])
print(tt)
print(tf.squeeze(tf.round(tt)).numpy())

In [None]:
# Predict a 0/1 label for every test tweet and write id/target pairs to disk.
test_sentences = test_df['text'].to_numpy()
test_id = test_df['id'].to_numpy()

rounded_test_preds = tf.squeeze(tf.round(model_6.predict([test_sentences])))
pred_res = [int(label) for label in rounded_test_preds]

submission_columns = {'id': test_id, 'target': pred_res}
res_df = pd.DataFrame(submission_columns)
res_df.to_csv('NLP with Disaster Tweet', index=False)