<a href="https://colab.research.google.com/github/laxmiharikumar/deeplearning/blob/main/nlp_fundamentals_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Introduction to NLP

In [1]:
# Get the data
!wget "https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip"

--2023-01-26 22:24:36--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.203.128, 172.253.123.128, 142.250.97.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.203.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2023-01-26 22:24:36 (55.4 MB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [2]:
import zipfile

zip_file = zipfile.ZipFile("nlp_getting_started.zip")
zip_file.extractall()
zip_file.close()

In [3]:
## Visualize the data
import pandas as pd

train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")


In [4]:
train_df_shuffled = train_df.sample(frac=1, random_state=42)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [5]:
# How many examples of each target
train_df.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [6]:
# Lets visualize some code
import random

# random.seed(42)
random_index = random.randint(0, len(train_df_shuffled)-5)
for row in train_df_shuffled[["text", "target"]][random_index:random_index+5].itertuples():
  _, text, target = row
  print(f"Target: {target}", "(real_disaster)" if target > 0 else "(not a real disaster)")
  print(f"Text is: {text}")
  print("------------------\n")

Target: 0 (not a real disaster)
Text is: I liked a @YouTube video http://t.co/45TWHJ0l6m RomanAtwoodVlogs | RESCUED SICK KITTENS!!
------------------

Target: 1 (real_disaster)
Text is: #Bestnaijamade: 16yr old PKK suicide bomber who detonated bomb in ... http://t.co/KSAwlYuX02 bestnaijamade bestnaijamade bestnaijamade beÛ_
------------------

Target: 0 (not a real disaster)
Text is: @YoungHeroesID 4. Lava Blast Power Red #PantherAttack
------------------

Target: 1 (real_disaster)
Text is: Central Mass. fruit trees escape heavy damage after wind hail http://t.co/VbFfodtP6M
------------------

Target: 1 (real_disaster)
Text is: As of 2010 there were 17 Beluga deaths reported at #SeaWorld their average age 15 1/2 years #OpSeaWorld http://t.co/MZk5UjlFCV
------------------



In [7]:
# Split the data into training and validation sets
from sklearn.model_selection import train_test_split

In [8]:
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df_shuffled["text"].to_numpy(),
                                                                            train_df_shuffled["target"].to_numpy(),
                                                                            test_size=0.1,
                                                                            random_state=42)

In [9]:
# Check 10 samples
train_sentences[:10], train_labels[:10]

(array(['@mogacola @zamtriossu i screamed after hitting tweet',
        'Imagine getting flattened by Kurt Zouma',
        '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
        "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
        'Somehow find you and I collide http://t.co/Ee8RpOahPk',
        '@EvaHanderek @MarleyKnysh great times until the bus driver held us hostage in the mall parking lot lmfao',
        'destroy the free fandom honestly',
        'Weapons stolen from National Guard Armory in New Albany still missing #Gunsense http://t.co/lKNU8902JE',
        '@wfaaweather Pete when will the heat wave pass? Is it really going to be mid month? Frisco Boy Scouts have a canoe trip in Okla.',
        'Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt'],
       dtype=object), array([0, 

## Convert Text to Numbers

1. Tokenization
2. Embedding

In [10]:
import tensorflow as tf
from keras.layers import TextVectorization

In [11]:
text_vectorizer = TextVectorization(max_tokens=None,
                                    standardize="lower_and_strip_punctuation",
                                    split="whitespace",
                                    ngrams=None,
                                    output_mode="int",
                                    output_sequence_length=None, # how long do you want your sequences to be
                                    pad_to_max_tokens=False)

In [12]:
# Find the average number of words in a sentence
max_length = 0;
for i in train_sentences:
  max_length = max_length + len(i)
max_length = max_length / len(train_sentences)
max_length

100.84294263611152

In [13]:
round(sum([len(i.split()) for i in train_sentences]))

102087

In [14]:
max_vocab_length = 10000
max_length = round(sum([len(i.split()) for i in train_sentences]) / len(train_sentences))
max_length

15

In [15]:
text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length,
                                    pad_to_max_tokens=True)

In [16]:
# Fit the text vectorizer to the train sentences
text_vectorizer.adapt(train_sentences)

In [17]:
# Create a sample sentence and tokenize it
sample_sentence="There's a flood in my street!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[264,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,   0,
          0,   0]])>

In [18]:
## See top 5 and bottom 5 words 
all_words = text_vectorizer.get_vocabulary()
print(f"Number of words: {len(all_words)}")
print(f"Top 5 words: {all_words[:5]}")
print(f"Bottom 5 words: {all_words[-5:]}")

Number of words: 10000
Top 5 words: ['', '[UNK]', 'the', 'a', 'in']
Bottom 5 words: ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1']


### Embedding
To create an embedding we use Tensorflow's Embedding layer

Parameters
* `input_dim` - size of vocab (10000)
* `output_dim` - size of each output embedding vector
* `input_length` - length of sequences being passed to embedding layer (15) 

In [19]:
embedding = tf.keras.layers.Embedding(input_dim = max_vocab_length,
                                      output_dim = 128,
                                      input_length = max_length)

In [20]:
sample_sentence = random.choice(train_sentences)
print(f"The sentence is: {sample_sentence}")

The sentence is: Still no plans? Don't worry we got you covered. Plenty of Seismic IPA and Seismic Squeeze Radler to help... http://t.co/A8nMdkd3rV


In [21]:
embed_op = embedding(text_vectorizer([sample_sentence]))
embed_op

<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 0.04135367, -0.04297298,  0.01889981, ..., -0.00715023,
          0.02596622,  0.01632125],
        [ 0.01815074, -0.00092636, -0.01173472, ...,  0.017931  ,
          0.02832935, -0.00241455],
        [ 0.03333833, -0.00031959,  0.00850451, ...,  0.00495659,
         -0.03048179,  0.03254688],
        ...,
        [-0.03757721,  0.0112638 ,  0.01497832, ...,  0.00176647,
         -0.01097985, -0.00880669],
        [-0.03309876, -0.00781668, -0.02436919, ...,  0.00317317,
          0.03854719,  0.0057398 ],
        [ 0.00827412,  0.00477184,  0.03447337, ..., -0.00939039,
         -0.00980262, -0.04778599]]], dtype=float32)>

### Model 0 - Baseline model with scikit - Text Classification with Multinomial Naive Bayes

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer ## Convert text to numbers
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [23]:
# Create tokenization and modeling pipeline
model_0 = Pipeline([
                  ("tfidf", TfidfVectorizer()),
                  ("clf", MultinomialNB())
])

# Fit the pipeline to the training data
model_0.fit(train_sentences, train_labels)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [24]:
# Evaluate out baseline model
baseline_score = model_0.score(val_sentences, val_labels)
print(f"The score of baseline model is: {baseline_score*100:.2f}%")

The score of baseline model is: 79.27%


In [25]:
# Make Predictions
baseline_preds = model_0.predict(val_sentences)
print(f"Baseline model predictions: {baseline_preds[:10]}")
print(f"Actual values: {val_labels[:10]}")

Baseline model predictions: [1 1 1 0 0 1 1 1 1 0]
Actual values: [0 0 1 1 1 1 1 1 1 0]


In [26]:
# Function that returns evaluation metrics
from sklearn import metrics

def calculate_results(y_true, y_pred):
  eval_metrics = {}
  eval_metrics["accuracy"] = metrics.accuracy_score(y_true, y_pred) * 100
  eval_metrics["precision"] = metrics.precision_score(y_true, y_pred) * 100
  eval_metrics["recall"] = metrics.recall_score(y_true, y_pred) * 100
  eval_metrics["f1_score"] = metrics.f1_score(y_true, y_pred) * 100

  return eval_metrics

In [27]:
baseline_results = calculate_results(val_labels, baseline_preds)
baseline_results

{'accuracy': 79.26509186351706,
 'precision': 88.6178861788618,
 'recall': 62.643678160919535,
 'f1_score': 73.4006734006734}

### Model 1 - Feed forward neural network (A simple dense model)

In [28]:
# Create a tensorboard callback (need to create a new one for each model)

import datetime

def create_tensorboard_callback(dir_name, experiment_name):
  """
  Creates a TensorBoard callback instand to store log files.

  Stores log files with the filepath:
    "dir_name/experiment_name/current_datetime/"

  Args:
    dir_name: target directory to store TensorBoard log files
    experiment_name: name of experiment directory (e.g. efficientnet_model_1)
  """
  log_dir = dir_name + "/" + experiment_name + "/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
  tensorboard_callback = tf.keras.callbacks.TensorBoard(
      log_dir=log_dir
  )
  print(f"Saving TensorBoard log files to: {log_dir}")
  return tensorboard_callback


In [29]:
SAVE_DIR = "model_logs"

In [30]:
# Build a model using functional API
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x) # Condense the feature vector for each tokento one vector
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)
model_1 = tf.keras.Model(inputs,outputs, name="model_1_dense")

In [31]:
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
N

In [32]:
# Compile the model
model_1.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [33]:
# Fit the model
model_1_history = model_1.fit(x=train_sentences,
                              y=train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR,
                                                                     experiment_name="model_1_dense")]
                        )

Saving TensorBoard log files to: model_logs/model_1_dense/20230126-222450
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [34]:
model_1.evaluate(val_sentences, val_labels)



[0.47762688994407654, 0.7860892415046692]

In [35]:
model_1_preds = model_1.predict(val_sentences)
model_1_preds[:10]



array([[0.39852282],
       [0.761952  ],
       [0.9976947 ],
       [0.09993853],
       [0.10411252],
       [0.94020826],
       [0.9136869 ],
       [0.99409026],
       [0.9643234 ],
       [0.21410784]], dtype=float32)

In [36]:
# Convert model prediction probabilities into labels
model_1_preds = tf.squeeze(tf.round(model_1_preds))

In [37]:
model_1_results = calculate_results(val_labels, model_1_preds)
model_1_results

{'accuracy': 78.60892388451444,
 'precision': 82.45614035087719,
 'recall': 67.52873563218391,
 'f1_score': 74.24960505529226}

In [38]:
baseline_results

{'accuracy': 79.26509186351706,
 'precision': 88.6178861788618,
 'recall': 62.643678160919535,
 'f1_score': 73.4006734006734}

In [39]:
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
N

In [40]:
# Get the weight matrix of embedding layer
# these are the numerical representations of each token in our training data, which have been learned for 5 epochs
embed_weights = model_1.get_layer("embedding").get_weights()[0]
embed_weights.shape # same as vocab size and embedding dim

(10000, 128)

### RNNs

Premise of a recurrent neural network is to use the representation of a previous input to aid the representation of a later input

To Read
1. MIT's sequence modelling lecture
2. Chris Olah's intro to LSTM
3. word2vec
4. Word Embeddings
5. Unreasonable effectiveness of RNNs

### Model 2 - LSTM

Long Short Term Memory 

Structure of an RNN 
- Input (text) -> Tokenize -> Embedding-> Layers (RNNs/dense) ->  Output (label probability)

In [41]:
# Build a model using functional API
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
print(f"After emedding the shape is: {x.shape}")
# x = tf.keras.layers.LSTM(units=64, return_sequences=True)(x) # when you stack LSTM layers you need to return sequences
# print(f"After first LSTM the shape is: {x.shape}")
x = tf.keras.layers.LSTM(64)(x)
print(f"After second LSTM shape is: {x.shape}")
# x = tf.keras.layers.Dense(64, activation="relu")(x)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)
model_2 = tf.keras.Model(inputs, outputs, name="model_2_lstm")

After emedding the shape is: (None, 15, 128)
After second LSTM shape is: (None, 64)


In [42]:
# Get a summary
model_2.summary()

Model: "model_2_lstm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,329,473
Trainable params: 1,329,473
Non-trainable params: 0
____________________________________________

In [43]:
# Compile the model
model_2.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [44]:
# Fit the model
model_2_history = model_2.fit(x=train_sentences,
                              y=train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR,
                                                                     experiment_name="model_2_lstm")])

Saving TensorBoard log files to: model_logs/model_2_lstm/20230126-222536
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [45]:
# Make predictions with model 2
model_2_pred_probs = model_2.predict(val_sentences)
model_2_pred_probs[:10]



array([[7.0613466e-04],
       [5.5498362e-01],
       [9.9990100e-01],
       [1.1658367e-02],
       [1.6974936e-04],
       [9.9974692e-01],
       [7.3307854e-01],
       [9.9993640e-01],
       [9.9988067e-01],
       [2.5762850e-01]], dtype=float32)

In [46]:
# Convert pred probabilities to labels
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
model_2_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [47]:
# Calculate model_2 results
model_2_results = calculate_results(val_labels, model_2_preds)
model_2_results

{'accuracy': 77.69028871391076,
 'precision': 82.2463768115942,
 'recall': 65.22988505747126,
 'f1_score': 72.75641025641025}

### Model 3 - RNN using GRU

Gated Recurrent Unit

GRU cell has similar features to LSTM but has lower number of parameters

In [48]:
# Build using functional API
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.GRU(units=64)(x)
outputs=tf.keras.layers.Dense(1, activation="sigmoid")(x)
model_3 = tf.keras.Model(inputs, outputs, name="model_3_gru")

In [49]:
# Get the model summary
model_3.summary()

Model: "model_3_gru"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 gru (GRU)                   (None, 64)                37248     
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,317,313
Trainable params: 1,317,313
Non-trainable params: 0
_____________________________________________

In [50]:
# Compile the model
model_3.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [51]:
# Fit the model
model_3_history = model_3.fit(x=train_sentences,
                              y=train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR,
                                                                    experiment_name="model_3_gru")])

Saving TensorBoard log files to: model_logs/model_3_gru/20230126-222628
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [52]:
# Get the predictions
model_3_pred_probs = model_3.predict(val_sentences)
model_3_pred_probs[:10]



array([[1.38045279e-02],
       [7.74851263e-01],
       [9.99916553e-01],
       [1.21154204e-01],
       [4.38189090e-05],
       [9.99813437e-01],
       [4.04991269e-01],
       [9.99970734e-01],
       [9.99950349e-01],
       [6.74697638e-01]], dtype=float32)

In [53]:
# Convert pred probabilities to labels
model_3_preds = tf.squeeze(tf.round(model_3_pred_probs))
model_3_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 0., 1., 1., 1.], dtype=float32)>

In [54]:
# Get model_3 results
model_3_results = calculate_results(val_labels, model_3_preds)
model_3_results

{'accuracy': 76.9028871391076,
 'precision': 78.47682119205298,
 'recall': 68.10344827586206,
 'f1_score': 72.92307692307692}

### Model 4: Bidirectional RNN

Normal RNNs go from left to right. However bidirectional RNN goes from left to right and right to left

In [57]:
# Build a bidirectional RNN in tensorflow
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=64))(x)
# x = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64))(x)
outputs=tf.keras.layers.Dense(1, activation="sigmoid")(x)
model_4 = tf.keras.Model(inputs, outputs, name="model_4_bidirectional")

In [58]:
# Get the model summary
model_4.summary()

Model: "model_4_bidirectional"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 bidirectional_2 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1,378,945
Trainable params: 1,3

In [59]:
# Compile the model
model_4.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [60]:
# Fit the model
model_4_history = model_4.fit(x=train_sentences,
                              y=train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR,
                                                                     experiment_name="model_4_bidirectional")])

Saving TensorBoard log files to: model_logs/model_4_bidirectional/20230126-223419
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [61]:
# Make predictions
model_4_pred_probs = model_4.predict(val_sentences)
model_4_pred_probs[:10]



array([[5.6787204e-02],
       [9.2426914e-01],
       [9.9993420e-01],
       [1.0841317e-01],
       [2.7345560e-04],
       [9.9959606e-01],
       [9.4492507e-01],
       [9.9997598e-01],
       [9.9995643e-01],
       [9.2648071e-01]], dtype=float32)

In [62]:
# Convert pred probs into labels
model_4_pred = tf.squeeze(tf.round(model_4_pred_probs))
model_4_pred[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32)>

In [63]:
# Calculate results
model_4_results = calculate_results(val_labels, model_4_pred)
model_4_results

{'accuracy': 76.24671916010499,
 'precision': 77.02265372168284,
 'recall': 68.39080459770115,
 'f1_score': 72.45053272450532}

### Model 5 - Conv1D

Typical structure:

Inputs(text) -> Tokenization -> Embedding -> Layers (Conv1D + Pooling) -> Outputs (class probabilities)

In [73]:
# Build using functional API
inputs=tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
print(f"Shape after embedding: {x.shape}")
x = tf.keras.layers.Conv1D(filters=64, # number of hidden units
                           kernel_size=5, # look at 5 words at a time
                           strides=1,
                           activation="relu",
                           padding="valid")(x) # padding is often necessary when the kernel extends beyond the activation map. valid means output is smaller than input shape.  same means output is same as input shape.
print(f"Shape after conv1D: {x.shape}")
x = tf.keras.layers.GlobalMaxPooling1D()(x) # condenses the max value from all 15 tokens to 1 value. equivalent to "get the most important features" or "get the feature with the highest value"
print(f"Shape after maxpool: {x.shape}")
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)
print(f"Shape after dense: {outputs.shape}")
model_5 = tf.keras.Model(inputs, outputs, name="model_5_conv1D")

Shape after embedding: (None, 15, 128)
Shape after conv1D: (None, 11, 64)
Shape after maxpool: (None, 64)
Shape after dense: (None, 1)


In [74]:
# Get the model summary
model_5.summary()

Model: "model_5_conv1D"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 conv1d_2 (Conv1D)           (None, 11, 64)            41024     
                                                                 
 global_max_pooling1d_1 (Glo  (None, 64)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_6 (Dense)             (None, 1)              

In [75]:
# Compile the model
model_5.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [76]:
# Fit the model
model_5_history = model_5.fit(x=train_sentences,
                              y=train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR,
                                                                     experiment_name="model_5_conv1D")])

Saving TensorBoard log files to: model_logs/model_5_conv1D/20230126-234214
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Make the predictions
model_5_pred_probs = model_5.predict(val_sentences)
model_5_pred_probs[:10]

In [71]:
# Convert pred probs to labels
model_5_pred = tf.squeeze(tf.round(model_5_pred_probs))
model_5_pred[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32)>

In [72]:
# Calculate results
model_5_results = calculate_results(val_labels, model_5_pred)
model_5_results

{'accuracy': 75.8530183727034,
 'precision': 76.11464968152866,
 'recall': 68.67816091954023,
 'f1_score': 72.20543806646526}

### Model 6 - Tensorflow Hub Pretrained Sentence Encoder

In [81]:
import tensorflow_hub as hub

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
embed_samples = embed(["There's a flood in my street!",
                       "when you call the universal sentence encoder on a sentence, it turns it into numbers"])
print(embed_samples[0][:50])

tf.Tensor(
[-0.01157025  0.02485911  0.02878051 -0.012715    0.03971541  0.08827761
  0.02680988  0.05589838 -0.01068731 -0.00597293  0.00639321 -0.01819516
  0.00030816  0.09105889  0.05874645 -0.03180629  0.01512474 -0.05162925
  0.00991366 -0.06865345 -0.04209306  0.0267898   0.03011009  0.00321065
 -0.00337968 -0.04787356  0.0226672  -0.00985927 -0.04063615 -0.01292093
 -0.04666382  0.05630299 -0.03949255  0.00517682  0.02495827 -0.07014439
  0.0287151   0.0494768  -0.00633978 -0.08960193  0.02807119 -0.00808364
 -0.01360601  0.05998649 -0.10361788 -0.05195372  0.00232958 -0.02332531
 -0.03758106  0.03327729], shape=(50,), dtype=float32)


In [82]:
embed_samples.shape

TensorShape([2, 512])