<a href="https://colab.research.google.com/github/laxmiharikumar/deeplearning/blob/main/nlp_fundamentals_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Introduction to NLP

In [1]:
# Get the data
!wget "https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip"

--2023-01-26 18:30:24--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 64.233.191.128, 173.194.74.128, 173.194.192.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|64.233.191.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2023-01-26 18:30:24 (72.4 MB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [2]:
import zipfile

zip_file = zipfile.ZipFile("nlp_getting_started.zip")
zip_file.extractall()
zip_file.close()

In [3]:
## Visualize the data
import pandas as pd

train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")


In [4]:
train_df_shuffled = train_df.sample(frac=1, random_state=42)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [5]:
# How many examples of each target
train_df.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [6]:
# Lets visualize some code
import random

# random.seed(42)
random_index = random.randint(0, len(train_df_shuffled)-5)
for row in train_df_shuffled[["text", "target"]][random_index:random_index+5].itertuples():
  _, text, target = row
  print(f"Target: {target}", "(real_disaster)" if target > 0 else "(not a real disaster)")
  print(f"Text is: {text}")
  print("------------------\n")

Target: 0 (not a real disaster)
Text is: Reddit Will Now Quarantine OffensiveåÊContent https://t.co/MjbIUvbMo6 http://t.co/I5cdTD8ftj
------------------

Target: 0 (not a real disaster)
Text is: #hot  Reddit's new content policy goes into effect many horrible subreddits banned or quarantined http://t.co/algtcN8baf #prebreak #best
------------------

Target: 0 (not a real disaster)
Text is: Texas Seeks Comment on Rules for Changes to Windstorm Insurer http://t.co/BZ07c9WthX via @ijournal
------------------

Target: 0 (not a real disaster)
Text is: What's missing in the #asae15 exhibitor emails? Value. http://t.co/r8cepRqxlE #assnchat
------------------

Target: 1 (real_disaster)
Text is: The EFAK would be designed for building occupants once they evacuate and report to their evacuation assembly sites
------------------



In [7]:
# Split the data into training and validation sets
from sklearn.model_selection import train_test_split

In [8]:
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df_shuffled["text"].to_numpy(),
                                                                            train_df_shuffled["target"].to_numpy(),
                                                                            test_size=0.1,
                                                                            random_state=42)

In [9]:
# Check 10 samples
train_sentences[:10], train_labels[:10]

(array(['@mogacola @zamtriossu i screamed after hitting tweet',
        'Imagine getting flattened by Kurt Zouma',
        '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
        "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
        'Somehow find you and I collide http://t.co/Ee8RpOahPk',
        '@EvaHanderek @MarleyKnysh great times until the bus driver held us hostage in the mall parking lot lmfao',
        'destroy the free fandom honestly',
        'Weapons stolen from National Guard Armory in New Albany still missing #Gunsense http://t.co/lKNU8902JE',
        '@wfaaweather Pete when will the heat wave pass? Is it really going to be mid month? Frisco Boy Scouts have a canoe trip in Okla.',
        'Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt'],
       dtype=object), array([0, 

## Convert Text to Numbers

1. Tokenization
2. Embedding

In [10]:
import tensorflow as tf
from keras.layers import TextVectorization

In [11]:
text_vectorizer = TextVectorization(max_tokens=None,
                                    standardize="lower_and_strip_punctuation",
                                    split="whitespace",
                                    ngrams=None,
                                    output_mode="int",
                                    output_sequence_length=None, # how long do you want your sequences to be
                                    pad_to_max_tokens=False)

In [12]:
# Find the average number of words in a sentence
max_length = 0;
for i in train_sentences:
  max_length = max_length + len(i)
max_length = max_length / len(train_sentences)
max_length

100.84294263611152

In [13]:
round(sum([len(i.split()) for i in train_sentences]))

102087

In [14]:
max_vocab_length = 10000
max_length = round(sum([len(i.split()) for i in train_sentences]) / len(train_sentences))
max_length

15

In [15]:
text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length,
                                    pad_to_max_tokens=True)

In [16]:
# Fit the text vectorizer to the train sentences
text_vectorizer.adapt(train_sentences)

In [17]:
# Create a sample sentence and tokenize it
sample_sentence="There's a flood in my street!"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[264,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,   0,
          0,   0]])>

In [18]:
## See top 5 and bottom 5 words 
all_words = text_vectorizer.get_vocabulary()
print(f"Number of words: {len(all_words)}")
print(f"Top 5 words: {all_words[:5]}")
print(f"Bottom 5 words: {all_words[-5:]}")

Number of words: 10000
Top 5 words: ['', '[UNK]', 'the', 'a', 'in']
Bottom 5 words: ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1']


### Embedding
To create an embedding we use Tensorflow's Embedding layer

Parameters
* `input_dim` - size of vocab (10000)
* `output_dim` - size of each output embedding vector
* `input_length` - length of sequences being passed to embedding layer (15) 

In [19]:
embedding = tf.keras.layers.Embedding(input_dim = max_vocab_length,
                                      output_dim = 128,
                                      input_length = max_length)

In [20]:
sample_sentence = random.choice(train_sentences)
print(f"The sentence is: {sample_sentence}")

The sentence is: Free Ebay Sniping RT? http://t.co/RqIPGQslT6 Chevrolet : Avalanche Ltz Lifted 4x4 Truck ?Please Favorite &amp; Share


In [21]:
embed_op = embedding(text_vectorizer([sample_sentence]))
embed_op

<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.02428705, -0.02553303,  0.00989275, ..., -0.01367463,
         -0.03089025, -0.03926803],
        [ 0.03957475,  0.01874969,  0.01164664, ..., -0.02911686,
          0.00281934, -0.01032578],
        [ 0.00807588,  0.00777727, -0.02933869, ...,  0.01414574,
         -0.00269455,  0.02611843],
        ...,
        [-0.0476573 , -0.03304435, -0.01669575, ..., -0.01930666,
          0.028467  ,  0.01790127],
        [ 0.03667644,  0.00962435,  0.02681459, ..., -0.01598229,
         -0.01530118, -0.044081  ],
        [-0.01555747,  0.03334272,  0.04902834, ..., -0.0344921 ,
         -0.03488463, -0.01610807]]], dtype=float32)>

### Model 0 - Baseline model with scikit - Text Classification with Multinomial Naive Bayes

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer ## Convert text to numbers
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [23]:
# Create tokenization and modeling pipeline
model_0 = Pipeline([
                  ("tfidf", TfidfVectorizer()),
                  ("clf", MultinomialNB())
])

# Fit the pipeline to the training data
model_0.fit(train_sentences, train_labels)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [24]:
# Evaluate out baseline model
baseline_score = model_0.score(val_sentences, val_labels)
print(f"The score of baseline model is: {baseline_score*100:.2f}%")

The score of baseline model is: 79.27%


In [25]:
# Make Predictions
baseline_preds = model_0.predict(val_sentences)
print(f"Baseline model predictions: {baseline_preds[:10]}")
print(f"Actual values: {val_labels[:10]}")

Baseline model predictions: [1 1 1 0 0 1 1 1 1 0]
Actual values: [0 0 1 1 1 1 1 1 1 0]


In [26]:
# Function that returns evaluation metrics
from sklearn import metrics

def calculate_results(y_true, y_pred):
  eval_metrics = {}
  eval_metrics["accuracy"] = metrics.accuracy_score(y_true, y_pred) * 100
  eval_metrics["precision"] = metrics.precision_score(y_true, y_pred) * 100
  eval_metrics["recall"] = metrics.recall_score(y_true, y_pred) * 100
  eval_metrics["f1_score"] = metrics.f1_score(y_true, y_pred) * 100

  return eval_metrics

In [27]:
baseline_results = calculate_results(val_labels, baseline_preds)
baseline_results

{'accuracy': 79.26509186351706,
 'precision': 88.6178861788618,
 'recall': 62.643678160919535,
 'f1_score': 73.4006734006734}

### Model 1 - Feed forward neural network (A simple dense model)

In [28]:
# Create a tensorboard callback (need to create a new one for each model)

import datetime

def create_tensorboard_callback(dir_name, experiment_name):
  """
  Creates a TensorBoard callback instand to store log files.

  Stores log files with the filepath:
    "dir_name/experiment_name/current_datetime/"

  Args:
    dir_name: target directory to store TensorBoard log files
    experiment_name: name of experiment directory (e.g. efficientnet_model_1)
  """
  log_dir = dir_name + "/" + experiment_name + "/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
  tensorboard_callback = tf.keras.callbacks.TensorBoard(
      log_dir=log_dir
  )
  print(f"Saving TensorBoard log files to: {log_dir}")
  return tensorboard_callback


In [29]:
SAVE_DIR = "model_logs"

In [30]:
# Build a model using functional API
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x) # Condense the feature vector for each tokento one vector
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)
model_1 = tf.keras.Model(inputs,outputs, name="model_1_dense")

In [31]:
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
N

In [32]:
# Compile the model
model_1.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [33]:
# Fit the model
model_1_history = model_1.fit(x=train_sentences,
                              y=train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR,
                                                                     experiment_name="model_1_dense")]
                        )

Saving TensorBoard log files to: model_logs/model_1_dense/20230126-183039
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [34]:
model_1.evaluate(val_sentences, val_labels)



[0.4773145914077759, 0.7860892415046692]

In [35]:
model_1_preds = model_1.predict(val_sentences)
model_1_preds[:10]



array([[0.40695548],
       [0.7787206 ],
       [0.9973857 ],
       [0.09256705],
       [0.12117622],
       [0.93339205],
       [0.9258426 ],
       [0.993929  ],
       [0.9633111 ],
       [0.23966594]], dtype=float32)

In [36]:
# Convert model prediction probabilities into labels
model_1_preds = tf.squeeze(tf.round(model_1_preds))

In [37]:
model_1_results = calculate_results(val_labels, model_1_preds)
model_1_results

{'accuracy': 78.60892388451444,
 'precision': 82.00692041522491,
 'recall': 68.10344827586206,
 'f1_score': 74.41130298273156}

In [38]:
baseline_results

{'accuracy': 79.26509186351706,
 'precision': 88.6178861788618,
 'recall': 62.643678160919535,
 'f1_score': 73.4006734006734}

In [39]:
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
N

In [41]:
# Get the weight matrix of embedding layer
# these are the numerical representations of each token in our training data, which have been learned for 5 epochs
embed_weights = model_1.get_layer("embedding").get_weights()[0]
embed_weights.shape # same as vocab size and embedding dim

(10000, 128)

### RNNs

Premise of a recurrent neural network is to use the representation of a previous input to aid the representation of a later input

To Read
1. MIT's sequence modelling lecture
2. Chris Olah's intro to LSTM
3. word2vec
4. Word Embeddings
5. Unreasonable effectiveness of RNNs

### Model 2 - LSTM

Long Short Term Memory 

Structure of an RNN 
- Input (text) -> Tokenize -> Embedding-> Layers (RNNs/dense) ->  Output (label probability)

In [44]:
# Build a model using functional API
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
print(f"After emedding the shape is: {x.shape}")
# x = tf.keras.layers.LSTM(units=64, return_sequences=True)(x) # when you stack LSTM layers you need to return sequences
# print(f"After first LSTM the shape is: {x.shape}")
x = tf.keras.layers.LSTM(64)(x)
print(f"After second LSTM shape is: {x.shape}")
# x = tf.keras.layers.Dense(64, activation="relu")(x)
outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)
model_2 = tf.keras.Model(inputs, outputs, name="model_2_lstm")

After emedding the shape is: (None, 15, 128)
After second LSTM shape is: (None, 64)


In [45]:
# Get a summary
model_2.summary()

Model: "model_2_lstm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 lstm_3 (LSTM)               (None, 64)                49408     
                                                                 
 dense_4 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,329,473
Trainable params: 1,329,473
Non-trainable params: 0
____________________________________________

In [46]:
# Compile the model
model_2.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [48]:
# Fit the model
model_2_history = model_2.fit(x=train_sentences,
                              y=train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR,
                                                                     experiment_name="model_2_lstm")])

Saving TensorBoard log files to: model_logs/model_2_lstm/20230126-190556
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [50]:
# Make predictions with model 2
model_2_pred_probs = model_2.predict(val_sentences)
model_2_pred_probs[:10]



array([[4.5380117e-03],
       [6.9668442e-01],
       [9.9971396e-01],
       [1.2726453e-02],
       [5.0827081e-04],
       [9.9838001e-01],
       [6.6886729e-01],
       [9.9985588e-01],
       [9.9973172e-01],
       [5.4011935e-01]], dtype=float32)

In [51]:
# Convert pred probabilities to labels
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
model_2_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32)>

In [53]:
# Calculate model_2 results
model_2_results = calculate_results(val_labels, model_2_preds)
model_2_results

{'accuracy': 77.55905511811024,
 'precision': 79.7979797979798,
 'recall': 68.10344827586206,
 'f1_score': 73.48837209302326}

### Model 3 - RNN using GRU

Gated Recurrent Unit

GRU cell has similar features to LSTM but has lower number of parameters

In [55]:
# Build using functional API
inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.GRU(units=64)(x)
outputs=tf.keras.layers.Dense(1, activation="sigmoid")(x)
model_3 = tf.keras.Model(inputs, outputs, name="model_3_gru")

In [56]:
# Get the model summary
model_3.summary()

Model: "model_3_gru"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 15)               0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 gru_1 (GRU)                 (None, 64)                37248     
                                                                 
 dense_6 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,317,313
Trainable params: 1,317,313
Non-trainable params: 0
_____________________________________________

In [57]:
# Compile the model
model_3.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [59]:
# Fit the model
model_3_history = model_3.fit(x=train_sentences,
                              y=train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR,
                                                                    experiment_name="model_3_gru")])

Saving TensorBoard log files to: model_logs/model_3_gru/20230126-195708
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [60]:
# Get the predictions
model_3_pred_probs = model_3.predict(val_sentences)
model_3_pred_probs[:10]



array([[2.8336328e-03],
       [8.1761211e-01],
       [9.9971485e-01],
       [5.7008203e-02],
       [1.3122936e-04],
       [9.9882621e-01],
       [7.5565475e-01],
       [9.9993545e-01],
       [9.9975216e-01],
       [9.3692315e-01]], dtype=float32)

In [61]:
# Convert pred probabilities to labels
model_3_preds = tf.squeeze(tf.round(model_3_pred_probs))
model_3_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32)>

In [62]:
# Get model_3 results
model_3_results = calculate_results(val_labels, model_3_preds)