In [1]:
# NLP
# NLP has the goal of deriving information out of natural language (sequences of text or speech)
# another common term is sequence to sequence seq2seq


In [2]:
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-4994eb2f-aef6-930f-406e-a3a73a6742b9)


In [3]:
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

--2025-05-17 17:32:12--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py’


2025-05-17 17:32:13 (93.8 MB/s) - ‘helper_functions.py’ saved [10246/10246]



In [4]:
# Import series of helper functions for the notebook
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

In [5]:
# Download data (same as from Kaggle)
!wget "https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip"

# Unzip data
unzip_data("nlp_getting_started.zip")

--2025-05-17 17:32:22--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.10.207, 142.251.12.207, 172.217.194.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.10.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2025-05-17 17:32:24 (693 KB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [6]:
# Turn .csv files into pandas DataFrame's
import pandas as pd
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [7]:
# Shuffle training dataframe
train_df_shuffled = train_df.sample(frac=1, random_state=42) # shuffle with random_state=42 for reproducibility
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [8]:
# The test data doesn't have a target (that's what we'd try to predict)
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [9]:
# How many examples of each class?
train_df.target.value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,4342
1,3271


In [10]:
# How many samples total?
print(f"Total training samples: {len(train_df)}")
print(f"Total test samples: {len(test_df)}")
print(f"Total samples: {len(train_df) + len(test_df)}")

Total training samples: 7613
Total test samples: 3263
Total samples: 10876


In [11]:
# Let's visualize some random training examples
import random
random_index = random.randint(0, len(train_df)-5) # create random indexes not higher than the total number of samples
for row in train_df_shuffled[["text", "target"]][random_index:random_index+5].itertuples():
  _, text, target = row
  print(f"Target: {target}", "(real disaster)" if target > 0 else "(not real disaster)")
  print(f"Text:\n{text}\n")
  print("---\n")

Target: 1 (real disaster)
Text:
Oh the usual. Mass murder and world domination plans over coffee. How's your day going?

---

Target: 1 (real disaster)
Text:
Thunder pounds north goes black
a deep bruise on the sky's chest
wind cries its pain.  
A summer storm has a tough life
short violent.

---

Target: 0 (not real disaster)
Text:
@emmerdale is Ross really dead?? #AskCharley

---

Target: 1 (real disaster)
Text:
Photo: blue by @forest.fires source: http://t.co/awXR24zsqh http://t.co/o9A26Fn27y

---

Target: 0 (not real disaster)
Text:
I added a video to a @YouTube playlist http://t.co/f2TqMFh1Yb Cher Lloyd - Sirens

---



In [12]:
from sklearn.model_selection import train_test_split

# Use train_test_split to split training data into training and validation sets
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df_shuffled["text"].to_numpy(),
                                                                            train_df_shuffled["target"].to_numpy(),
                                                                            test_size=0.1, # dedicate 10% of samples to validation set
                                                                            random_state=42) # random state for reproducibility

In [13]:

# Check the lengths
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

(6851, 6851, 762, 762)

In [14]:
# View the first 10 training sentences and their labels
train_sentences[:10], train_labels[:10]

(array(['@mogacola @zamtriossu i screamed after hitting tweet',
        'Imagine getting flattened by Kurt Zouma',
        '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
        "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
        'Somehow find you and I collide http://t.co/Ee8RpOahPk',
        '@EvaHanderek @MarleyKnysh great times until the bus driver held us hostage in the mall parking lot lmfao',
        'destroy the free fandom honestly',
        'Weapons stolen from National Guard Armory in New Albany still missing #Gunsense http://t.co/lKNU8902JE',
        '@wfaaweather Pete when will the heat wave pass? Is it really going to be mid month? Frisco Boy Scouts have a canoe trip in Okla.',
        'Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt'],
       dtype=object),
 array([0,

In [15]:
# Converting text to numbers: tokenization | embedding

In [16]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

In [17]:
text_vectorizer = TextVectorization(max_tokens=None, # how many words in vocabulary add <OOV> if > max_t
                                    standardize="lower_and_strip_punctuation",
                                    split="whitespace",
                                    ngrams=None, # create groups of n-words
                                    output_mode="int", # how to map tokens to numbers
                                    output_sequence_length=None, #how long do you want your sequences to be
                                    )
                                   # pad_to_max_tokens=True)



In [18]:
# Find the avg number of tokens (words) in the training tweets (divide sum of lengths of each tweet by length of dataset)

In [19]:
sum([len(i.split()) for i in train_sentences]) / len(train_sentences)

14.901036345059115

In [20]:
max_vocab_length=10000 # max number of words to have in our vocab
max_length = 15

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

In [21]:
text_vectorizer.adapt(train_sentences)

In [22]:
sample_senetence = "There's a flood in my street!"
text_vectorizer([sample_senetence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[264,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,   0,
          0,   0]])>

In [23]:
random_sentence = random.choice(train_sentences)
print(f"Original: \n {random_sentence} \n\nVectorized: \n {text_vectorizer([random_sentence])}")


Original: 
 Free Kindle Book - Aug 3-7 - Thriller - Desolation Run by @jamessnyder22 http://t.co/sgXb6E5Yda 

Vectorized: 
 [[ 268 1827  747  795    1 2259  552  340   18    1    1    0    0    0
     0]]


In [24]:
# get the unique words in the vocab

In [25]:
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5] # most common tokens (notice the [UNK] token for "unknown" words)
bottom_5_words = words_in_vocab[-5:] # least common tokens
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Top 5 most common words: {top_5_words}")
print(f"Bottom 5 most common words: {bottom_5_words}")

Number of words in vocab: 10000
Top 5 most common words: ['', '[UNK]', np.str_('the'), np.str_('a'), np.str_('in')]
Bottom 5 most common words: [np.str_('pages'), np.str_('paeds'), np.str_('pads'), np.str_('padres'), np.str_('paddytomlinson1')]


In [26]:
# Creating an Embedding using Embedding layer

In [27]:
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim=max_vocab_length,
                             output_dim=128,
                             input_length=max_length)

embedding



<Embedding name=embedding, built=False>

In [28]:
random_sentence  = random.choice(train_sentences)
print(f"Original sentence: \n {random_sentence} \n\nEmbedded version: ")

sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original sentence: 
 #fitness Knee Damage Solution http://t.co/pUMbrNeBJE 

Embedded version: 


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 0.04567105,  0.01747039,  0.02439656, ..., -0.0076021 ,
          0.0449609 , -0.01542629],
        [ 0.04092577,  0.04407748,  0.03518382, ..., -0.03714675,
         -0.00069659, -0.01461409],
        [ 0.02360506, -0.04184911,  0.02828064, ..., -0.00048428,
          0.02808489, -0.04134011],
        ...,
        [ 0.01707271, -0.04513588, -0.00151999, ..., -0.03767676,
          0.00015952, -0.02008103],
        [ 0.01707271, -0.04513588, -0.00151999, ..., -0.03767676,
          0.00015952, -0.02008103],
        [ 0.01707271, -0.04513588, -0.00151999, ..., -0.03767676,
          0.00015952, -0.02008103]]], dtype=float32)>

In [29]:
### model 0: baseline model sci-kit (Naive Bayes TF-IDF)

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

model_0 = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", MultinomialNB()), # model the text. clf = classifier
])

model_0.fit(train_sentences, train_labels)

In [31]:
baseline_score = model_0.score(val_sentences, val_labels)
print(f"our baseline model achieves an accuracy of: {baseline_score*100:.2f}%")

our baseline model achieves an accuracy of: 79.27%


In [32]:
baseline_preds = model_0.predict(val_sentences)
baseline_preds[:20]

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1])

In [33]:
train_labels

array([0, 0, 1, ..., 1, 1, 0])

In [34]:
## EVALUATION FUNCTION FOR MODEL EXPERIMENTS

In [35]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def caluclate_results(y_true, y_pred):
  """
  Calcualates model accuracy, precision, recall and f1 score of a binary classification model
  """
  model_accurancy = accuracy_score(y_true, y_pred) * 100
  # precision, recall and f1-score using *weighted* average
  model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  model_results = {"accuracy": model_accurancy,
                   "precision": model_precision,
                   "recall": model_recall,
                   "f1": model_f1}

  return model_results


In [36]:
baseline_results = caluclate_results(y_true=val_labels,
                                     y_pred=baseline_preds)
baseline_results

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

In [37]:
from helper_functions import create_tensorboard_callback

SAVE_DIR = "model_logs"

from tensorflow.keras import layers

inputs = layers.Input(shape=(), dtype=tf.string) # inputs  are 1-dimensional strings
x = text_vectorizer(inputs)
x = embedding(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense")

model_1.summary()

In [38]:
model_1.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [39]:
model_1_history = model_1.fit(x=train_sentences,
                              y=train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR,
                                                                     experiment_name="model_1_dense")])

Saving TensorBoard log files to: model_logs/model_1_dense/20250517-173234
Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - accuracy: 0.6416 - loss: 0.6525 - val_accuracy: 0.7690 - val_loss: 0.5370
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8134 - loss: 0.4703 - val_accuracy: 0.7848 - val_loss: 0.4757
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.8590 - loss: 0.3523 - val_accuracy: 0.7927 - val_loss: 0.4580
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.8854 - loss: 0.3004 - val_accuracy: 0.7861 - val_loss: 0.4610
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.9139 - loss: 0.2454 - val_accuracy: 0.7848 - val_loss: 0.4767


In [40]:
model_1.evaluate(val_sentences, val_labels)

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7778 - loss: 0.5100


[0.47668319940567017, 0.7847769260406494]

In [41]:
model_1_pred_probs = model_1.predict(val_sentences)
model_1_pred_probs.shape

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


(762, 1)

In [42]:
model_1_preds = tf.squeeze(tf.round(model_1_pred_probs))
model_1_preds

<tf.Tensor: shape=(762,), dtype=float32, numpy=
array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 1., 0., 1., 0.,
       1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 1., 1., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 0., 0.,
       0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0.,
       0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1.,
       0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 1., 1., 1., 0.,
       1., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0.,
       1., 1., 1., 0., 1., 0., 0., 0., 1., 1., 0., 1., 1., 1., 1., 0., 0.,
       1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0.,
       0., 1., 1., 1., 1., 1., 0., 1., 0., 0., 0., 0

In [43]:
model_1_results = caluclate_results(y_true=val_labels, y_pred=model_1_preds)
model_1_results

{'accuracy': 78.4776902887139,
 'precision': 0.7900209877855079,
 'recall': 0.7847769028871391,
 'f1': 0.781600538849599}

In [44]:
import numpy as np
np.array(list(model_1_results.values())) > np.array(list(baseline_results.values()))

array([False, False, False, False])

In [45]:
words_in_vocab

['',
 '[UNK]',
 np.str_('the'),
 np.str_('a'),
 np.str_('in'),
 np.str_('to'),
 np.str_('of'),
 np.str_('and'),
 np.str_('i'),
 np.str_('is'),
 np.str_('for'),
 np.str_('on'),
 np.str_('you'),
 np.str_('my'),
 np.str_('with'),
 np.str_('it'),
 np.str_('that'),
 np.str_('at'),
 np.str_('by'),
 np.str_('this'),
 np.str_('from'),
 np.str_('be'),
 np.str_('are'),
 np.str_('was'),
 np.str_('have'),
 np.str_('like'),
 np.str_('as'),
 np.str_('up'),
 np.str_('so'),
 np.str_('just'),
 np.str_('but'),
 np.str_('me'),
 np.str_('im'),
 np.str_('your'),
 np.str_('not'),
 np.str_('amp'),
 np.str_('out'),
 np.str_('its'),
 np.str_('will'),
 np.str_('an'),
 np.str_('no'),
 np.str_('has'),
 np.str_('fire'),
 np.str_('after'),
 np.str_('all'),
 np.str_('when'),
 np.str_('we'),
 np.str_('if'),
 np.str_('now'),
 np.str_('via'),
 np.str_('new'),
 np.str_('more'),
 np.str_('get'),
 np.str_('or'),
 np.str_('about'),
 np.str_('what'),
 np.str_('he'),
 np.str_('people'),
 np.str_('news'),
 np.str_('been'),
 n

In [46]:
# get the weight matrix of embedding layer
# these are the numerical representations of each token in our training data
embed_weights = model_1.get_layer("embedding").get_weights()[0]
embed_weights.shape

(10000, 128)

In [47]:
#create embedding fiels
import io

out_v = io.open("vectors.tsv", 'w', encoding="utf-8")
out_m = io.open("metadata.tsv", 'w', encoding="utf-8")

for index, word in enumerate(words_in_vocab):
  if index == 0:
    continue # skip 0, it's padding
  vec = embed_weights[index]
  out_v.write("\t".join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")

out_v.close()
out_m.close()

In [48]:
try:
  from google.colab import files
  files.download("vectors.tsv")
  files.download("metadata.tsv")
except Exception:
  pass

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# RNN are useful for sequence data

# the premise of a rnn is
#to use the representation of a previous input to aid the representation of a later input

In [53]:
# LSTM - one of the most popular LSTM cells
# Input -> Tokenize -> Embedding -> Layers -> Output
# Create an Lstm model

from tensorflow.keras import layers
input = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
print(x.shape)
x = layers.LSTM(units=64)(x)
print(x.shape)
#x = layers.LSTM(units=64, return_sequences=True)(x)
#print(x.shape)
#x = layers.LSTM(64, return_sequences=True)(x)
#print(x.shape)
#x = layers.LSTM(64, activation='relu')(x)
#print(x.shape)
outputs = layers.Dense(1, activation="sigmoid")(x)
model_2 = tf.keras.Model(inputs, outputs, name="model_2_LSTM")


(None, 15, 128)
(None, 64)


In [54]:
model_2.summary()

In [55]:
model_2.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [56]:
model_2_history = model_2.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels),
                              callbacks=[create_tensorboard_callback(SAVE_DIR, "model_2_LSTM")])

Saving TensorBoard log files to: model_logs/model_2_LSTM/20250517-175420
Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.8839 - loss: 0.3078 - val_accuracy: 0.7690 - val_loss: 0.5914
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9478 - loss: 0.1470 - val_accuracy: 0.7782 - val_loss: 0.6478
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9505 - loss: 0.1336 - val_accuracy: 0.7808 - val_loss: 0.6990
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9569 - loss: 0.1085 - val_accuracy: 0.7756 - val_loss: 0.7922
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.9712 - loss: 0.0740 - val_accuracy: 0.7795 - val_loss: 0.8413


In [57]:
model_2_pred_probs = model_2.predict(val_sentences)
model_2_pred_probs[:10]

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


array([[9.9952519e-03],
       [6.7727339e-01],
       [9.9962568e-01],
       [4.5938648e-02],
       [9.2142489e-04],
       [9.9771297e-01],
       [7.9632497e-01],
       [9.9977046e-01],
       [9.9956816e-01],
       [3.6017975e-01]], dtype=float32)

In [58]:
model_2_preds = tf.squeeze(tf.round(model_2_pred_probs))
model_2_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [60]:
model_2_results = caluclate_results(y_true=val_labels, y_pred=model_2_preds)
model_2_results

{'accuracy': 77.95275590551181,
 'precision': 0.7849754984084437,
 'recall': 0.7795275590551181,
 'f1': 0.7761172470890803}