In [1]:
# NLP
# Natural language processing (NLP) aims to derive information from natural language (sequences of text or speech).
# Another common term for this kind of problem is sequence-to-sequence (seq2seq).


In [2]:
# List the GPUs visible to this runtime (one line per device)
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-37fa1d72-b5a4-b845-46e5-8ac9627901f8)


In [3]:
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

--2025-05-15 09:54:04--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py’


2025-05-15 09:54:04 (87.7 MB/s) - ‘helper_functions.py’ saved [10246/10246]



In [4]:
# Import series of helper functions for the notebook
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys

In [5]:
# Download data (same as from Kaggle)
!wget "https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip"

# Unzip data
unzip_data("nlp_getting_started.zip")

--2025-05-15 09:54:11--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.4.207, 172.253.118.207, 74.125.200.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.4.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2025-05-15 09:54:13 (709 KB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [6]:
# Load the train and test CSV files into pandas DataFrames
import pandas as pd

train_df, test_df = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [7]:
# Shuffle the rows of the training DataFrame
train_df_shuffled = train_df.sample(
    frac=1,           # sample 100% of the rows, i.e. a full shuffle
    random_state=42,  # fixed seed so the shuffled order is reproducible
)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [8]:
# The test data doesn't have a target column (that's what we'd try to predict)
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [9]:
# How many training examples are there of each class (target 0 vs target 1)?
train_df["target"].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,4342
1,3271


In [10]:
# Report the size of the training set, the test set and both combined
print(
    f"Total training samples: {len(train_df)}\n"
    f"Total test samples: {len(test_df)}\n"
    f"Total samples: {len(train_df) + len(test_df)}"
)

Total training samples: 7613
Total test samples: 3263
Total samples: 10876


In [11]:
# Print five consecutive rows of the shuffled training data, starting at a random offset
import random

# randint is inclusive on both ends, so capping at len - 5 guarantees a full 5-row window
random_index = random.randint(0, len(train_df) - 5)
sample_rows = train_df_shuffled[["text", "target"]].iloc[random_index:random_index + 5]
for text, target in sample_rows.itertuples(index=False, name=None):
  verdict = "(real disaster)" if target > 0 else "(not real disaster)"
  print(f"Target: {target}", verdict)
  print(f"Text:\n{text}\n")
  print("---\n")

Target: 0 (not real disaster)
Text:
Throw that water at me until I drown and my last words are choke me http://t.co/tUBE4NBqNz

---

Target: 0 (not real disaster)
Text:
babe I'm gonna ruin you if you let me stay

---

Target: 0 (not real disaster)
Text:
The Whirlwind! Scourge of Europe! RT @whedonesque Or you could just watch the Fanged Four http://t.co/Q0JHDcU6Ly

---

Target: 0 (not real disaster)
Text:
Creation of AI
Climate change
Bioterrorism
Mass automation of workforce
Contact with other life
Wealth inequality

Yea we've got it easy

---

Target: 0 (not real disaster)
Text:
Help me survive the zombie apocalypse at the Zombie Fun Run on November 15th. https://t.co/kgSwhSr7Mn #teamsurvivors #zombiefunrun2014

---



In [12]:
from sklearn.model_selection import train_test_split

# Hold out part of the (already shuffled) training data as a validation set
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_df_shuffled["text"].to_numpy(),
    train_df_shuffled["target"].to_numpy(),
    test_size=0.1,    # dedicate 10% of samples to the validation set
    random_state=42,  # fixed seed so the split is reproducible
)

In [13]:

# Sanity-check the split sizes: sentences and labels must match within each split
tuple(len(split) for split in (train_sentences, train_labels, val_sentences, val_labels))

(6851, 6851, 762, 762)

In [14]:
# View the first 10 training sentences and their labels
# (the cell output is a tuple of two NumPy array slices)
train_sentences[:10], train_labels[:10]

(array(['@mogacola @zamtriossu i screamed after hitting tweet',
        'Imagine getting flattened by Kurt Zouma',
        '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
        "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
        'Somehow find you and I collide http://t.co/Ee8RpOahPk',
        '@EvaHanderek @MarleyKnysh great times until the bus driver held us hostage in the mall parking lot lmfao',
        'destroy the free fandom honestly',
        'Weapons stolen from National Guard Armory in New Albany still missing #Gunsense http://t.co/lKNU8902JE',
        '@wfaaweather Pete when will the heat wave pass? Is it really going to be mid month? Frisco Boy Scouts have a canoe trip in Okla.',
        'Patient-reported outcomes in long-term survivors of metastatic colorectal cancer - British Journal of Surgery http://t.co/5Yl4DC1Tqt'],
       dtype=object),
 array([0,

In [15]:
# Converting text to numbers: tokenization and embedding

In [16]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

In [17]:
# Spell out the main TextVectorization options to show what can be tuned
text_vectorizer = TextVectorization(
    max_tokens=None,  # vocabulary size cap (None = no cap); when set, words beyond the cap map to the out-of-vocabulary token
    standardize="lower_and_strip_punctuation",  # how to clean the raw text
    split="whitespace",  # how to split text into tokens
    ngrams=None,  # create groups of n words (None = single words only)
    output_mode="int",  # how to map tokens to numbers
    output_sequence_length=None,  # how long each output sequence should be (None = no fixed length)
)



In [18]:
# Find the avg number of tokens (words) in the training tweets (divide sum of lengths of each tweet by length of dataset)

In [19]:
# Average number of whitespace-separated tokens per training sentence
sum(len(sentence.split()) for sentence in train_sentences) / len(train_sentences)

14.901036345059115

In [20]:
max_vocab_length = 10_000  # max number of words to have in our vocab
max_length = 15  # output sequence length; close to the ~14.9 average tokens per training tweet

text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length,
)

In [21]:
# Fit the text vectorizer to the training sentences only
text_vectorizer.adapt(train_sentences)

In [22]:
# Vectorize a custom sentence; the output is padded with zeros up to max_length tokens.
# NOTE(review): "sample_senetence" is a misspelling of "sample_sentence"; the name is left
# unchanged here in case later cells refer to it.
sample_senetence = "There's a flood in my street!"
text_vectorizer([sample_senetence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[264,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,   0,
          0,   0]])>

In [27]:
# Pick a random training sentence and show it next to its vectorized form
random_sentence = random.choice(train_sentences)
vectorized_sentence = text_vectorizer([random_sentence])
print(f"Original: \n {random_sentence} \n\nVectorized: \n {vectorized_sentence}")


Original: 
 3. excessive engine failure rate significant maintenance constantly emerging structural defects. Phew that's a lot I say. 

Vectorized: 
 [[ 118    1 3887  320 1264 3416 2057 3089    1  384 5847 4925  215    3
   505]]


In [None]:
# get the unique words in the vocab

In [31]:
# Inspect the learned vocabulary: its size plus the tokens at the start and end of the list
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5] # most common tokens (notice the '' padding entry and the [UNK] token for "unknown" words)
bottom_5_words = words_in_vocab[-5:] # least common tokens
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Top 5 most common words: {top_5_words}")
# Label corrected: the tail of the vocabulary holds the *least* common words
print(f"Bottom 5 least common words: {bottom_5_words}")

Number of words in vocab: 10000
Top 5 most common words: ['', '[UNK]', np.str_('the'), np.str_('a'), np.str_('in')]
Bottom 5 most common words: [np.str_('pages'), np.str_('paeds'), np.str_('pads'), np.str_('padres'), np.str_('paddytomlinson1')]


In [32]:
# Creating an Embedding using Embedding layer

In [33]:
from tensorflow.keras import layers

# Embedding maps each integer token id to a dense vector.
# `input_length` is deliberately not passed: it is deprecated in Keras 3, where it only
# triggers a deprecation warning. The sequence length comes from the input itself.
embedding = layers.Embedding(input_dim=max_vocab_length,  # size of the vocabulary
                             output_dim=128)              # size of each embedding vector

embedding



<Embedding name=embedding, built=False>

In [34]:
# Embed a random training sentence: text -> integer token ids -> dense vectors
random_sentence = random.choice(train_sentences)
print(f"Original sentence: \n {random_sentence} \n\nEmbedded version: ")

token_ids = text_vectorizer([random_sentence])
sample_embed = embedding(token_ids)
sample_embed

Original sentence: 
 I'm liable to sound like a wounded animal during sex if the ?? is good lol 

Embedded version: 


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 0.00016645, -0.03123934,  0.00575968, ..., -0.00063642,
          0.01067158, -0.01435257],
        [-0.00394195, -0.02738487,  0.04237573, ..., -0.04460721,
         -0.00108249,  0.04934095],
        [ 0.03644122, -0.01756961,  0.01437226, ..., -0.04052138,
          0.02807328, -0.01259668],
        ...,
        [-0.04590349,  0.04638496, -0.01454872, ...,  0.01245527,
         -0.00013759,  0.02600235],
        [ 0.04922017, -0.02852803, -0.0390541 , ...,  0.00137415,
          0.01651067,  0.01686888],
        [ 0.04218146, -0.02440755, -0.00582181, ..., -0.02385071,
         -0.01224537, -0.01497616]]], dtype=float32)>

In [35]:
### Model 0: baseline model with scikit-learn (TF-IDF + Multinomial Naive Bayes)

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Baseline pipeline: TF-IDF features fed into a Multinomial Naive Bayes classifier
model_0 = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),  # convert raw text into TF-IDF features
    ("clf", MultinomialNB()),      # clf = classifier
])

# Fit on the training split; fit() is the last expression, so its return value is the cell output
model_0.fit(train_sentences, train_labels)

In [42]:
# Score the baseline pipeline on the validation split and report it as a percentage
baseline_score = model_0.score(val_sentences, val_labels)
baseline_accuracy_pct = baseline_score * 100
print(f"our baseline model achieves an accuracy of: {baseline_accuracy_pct:.2f}%")

our baseline model achieves an accuracy of: 79.27%


In [44]:
# Predict labels for the validation sentences and preview the first 20 predictions
baseline_preds = model_0.predict(val_sentences)
baseline_preds[:20]

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1])

In [45]:
# Inspect the training labels
train_labels

array([0, 0, 1, ..., 1, 1, 0])

In [46]:
## EVALUATION FUNCTION FOR MODEL EXPERIMENTS

In [51]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def caluclate_results(y_true, y_pred):
  """
  Compute accuracy, precision, recall and F1 score for a binary classification model.

  Args:
    y_true: ground-truth labels.
    y_pred: predicted labels, compared against y_true.

  Returns:
    A dict with the keys "accuracy", "precision", "recall" and "f1".
    Accuracy is multiplied by 100 (a percentage); precision, recall and f1
    are returned exactly as computed by precision_recall_fscore_support
    with average="weighted".
  """
  # The fourth returned element (support) is not needed here
  precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  return {
      "accuracy": accuracy_score(y_true, y_pred) * 100,  # percentage, unlike the other metrics
      "precision": precision,
      "recall": recall,
      "f1": f1,
  }


In [52]:
# Evaluate the baseline predictions against the validation labels
baseline_results = caluclate_results(val_labels, baseline_preds)
baseline_results

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}