In [1]:
import zipfile

from fontTools.misc.cython import returns
!curl -O https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip

zip_ref = zipfile.ZipFile('nlp_getting_started.zip')
zip_ref.extractall()
zip_ref.close()

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  2  593k    2 15114    0     0  31649      0  0:00:19 --:--:--  0:00:19 31752
 19  593k   19  113k    0     0  94595      0  0:00:06  0:00:01  0:00:05 94691
100  593k  100  593k    0     0   317k      0  0:00:01  0:00:01 --:--:--  318k


In [2]:
import pandas as pd

train_df = pd.read_csv("nlp_getting_started/train.csv")
test_df = pd.read_csv("nlp_getting_started/test.csv")
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
train_df_shuffled = train_df.sample(frac=1, random_state=42)  # shuffle with random_state=42 for reproducibility
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [4]:
train_df.target.value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [5]:
import random

random_index = random.randint(0, len(train_df) - 5)
for row in train_df[["text", "target"]][random_index:random_index + 5].itertuples():
    _, text, target = row
    print(f"target: {target}", "(Real disaster)" if target > 0 else "(Not Real disaster)")
    print(f"Text:\n{text}\n")
    print("--------------\n")

target: 0 (Not Real disaster)
Text:
aggressif is so bloody aggressive

--------------

target: 0 (Not Real disaster)
Text:
I entered to #win the ENTIRE set of butterLONDON Lip Crayons via @be_ram0s. - Go enter! #bbloggers http://t.co/DsB3lDfuxU

--------------

target: 0 (Not Real disaster)
Text:
@slsandpet Hey Sally sorry have you emailed me? Been AWOL bloody work ARGH! @ResignInShame

--------------

target: 0 (Not Real disaster)
Text:
I'm over here listening to Bloody Jay.  ???? https://t.co/CIyty0FgpR

--------------

target: 0 (Not Real disaster)
Text:
@LauradeHolanda I have the Forrest version from '83 that's bloody awful as well :))) xxx

--------------



In [6]:
#Split data into training and validation sets
from sklearn.model_selection import train_test_split

train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df_shuffled["text"].to_numpy(),
                                                                            train_df_shuffled["target"].to_numpy(),
                                                                            test_size=0.1, random_state=42)

In [7]:
len(train_sentences), len(val_sentences), len(train_labels), len(val_labels)

(6851, 762, 6851, 762)

In [8]:
train_sentences[:5]

array(['@mogacola @zamtriossu i screamed after hitting tweet',
       'Imagine getting flattened by Kurt Zouma',
       '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
       "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
       'Somehow find you and I collide http://t.co/Ee8RpOahPk'],
      dtype=object)

In [9]:
#converting text to number
import tensorflow as tf
from keras.api.layers import TextVectorization

text_vectorizer = TextVectorization(
    max_tokens=500000, standardize="lower_and_strip_punctuation",
    split="whitespace", ngrams=None, output_mode="int",
    output_sequence_length=None, pad_to_max_tokens=True,
)


In [10]:
len(train_sentences[0].split())

7

In [11]:
round(sum([len(i.split()) for i in train_sentences])) / len(train_sentences)

14.901036345059115

In [12]:
max_vocab_length = 10000
max_length = 15
text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length
)
text_vectorizer.adapt(train_sentences)

In [13]:
sample_sentences = "There's a flood in my street !!"
text_vectorizer([sample_sentences])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[264,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,   0,
          0,   0]], dtype=int64)>

In [14]:
#Choose a random sentence frm dataset and tokenize it
random_sentence = random.choice(train_sentences)
print(f"Original sentence: \n {random_sentence}\
      \n\nVectorize version:")
text_vectorizer([random_sentence])

Original sentence: 
 Police expand search for missing pregnant woman in Beloeil: Police in Richelieu-Saint-Laurent are expanding th... http://t.co/hMuyzmv8qH      

Vectorize version:


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[  77, 5686, 1259,   10,  373, 2788,  410,    4,    1,   77,    4,
        9090,   22,    1, 2657]], dtype=int64)>

In [15]:
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5]
bottom_5_words = words_in_vocab[-5:]
print(f"Number of word in vocabulary: {len(words_in_vocab)}")
print(f"5 most common word: {top_5_words}")
print(f"5 least common word: {bottom_5_words}")

Number of word in vocabulary: 10000
5 most common word: ['', '[UNK]', 'the', 'a', 'in']
5 least common word: ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1']


In [37]:
#Creating an Embedding using an Embedding layer
from keras.api.layers import Embedding
embeddings=Embedding(input_dim=max_vocab_length,
                     output_dim=128,
                     embeddings_initializer="uniform",
                     input_length=max_length, name="embeddings_1")
embeddings



<Embedding name=embeddings_1, built=False>

In [38]:
random_sentence = random.choice(train_sentences)
print(f"Original sentence: \n {random_sentence}\
\n\nVectorize version:")

sample_embeded=embeddings(text_vectorizer([random_sentence]))
sample_embeded

Original sentence: 
 [+]

Such a lonely day
And it's mine
It's a day that I'm glad I survived ...

??

Vectorize version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.01665516, -0.02942224, -0.00595887, ..., -0.04852667,
          0.04887315, -0.03544787],
        [ 0.01412356,  0.04909846,  0.00063423, ...,  0.0457469 ,
          0.02934206, -0.04095821],
        [-0.03495141,  0.04980837, -0.00785287, ...,  0.01508777,
          0.00809494, -0.04684956],
        ...,
        [ 0.02654089, -0.01128205, -0.02125224, ..., -0.03377541,
         -0.00277038, -0.02502468],
        [ 0.00332323,  0.040421  , -0.01412728, ...,  0.01925405,
         -0.01898351,  0.00821656],
        [ 0.01991207,  0.03674955,  0.00288671, ..., -0.0347579 ,
          0.04881075,  0.00805764]]], dtype=float32)>

In [39]:
sample_embeded[0][0],sample_embeded[0][0].shape, random_sentence

(<tf.Tensor: shape=(128,), dtype=float32, numpy=
 array([-1.66551583e-02, -2.94222366e-02, -5.95886633e-03, -4.43805829e-02,
         3.60893123e-02, -5.25380298e-03,  3.12057771e-02, -4.68354821e-02,
        -1.63707882e-03,  1.01805218e-02, -3.75890136e-02, -4.80220430e-02,
         3.93572114e-02,  5.35768270e-03,  9.96768475e-03, -4.54457775e-02,
         2.41997950e-02,  1.34179108e-02,  1.84972025e-02, -2.65546683e-02,
         1.93908066e-03, -1.70351975e-02, -1.63520463e-02, -3.07843927e-02,
        -1.56900510e-02,  9.60708782e-03,  1.19077079e-02,  8.27535987e-05,
        -6.69144467e-03, -3.96767631e-02,  2.34132521e-02, -4.92560640e-02,
        -2.24029664e-02, -7.42755830e-04, -3.97365466e-02, -1.68263912e-02,
        -3.53172049e-02,  4.00389358e-03, -1.81152225e-02,  4.18650843e-02,
         3.56175415e-02,  1.90275647e-02, -2.88972985e-02,  6.34778664e-03,
        -3.01017407e-02, -2.16735359e-02,  1.78789161e-02,  2.30079405e-02,
         4.45651449e-02,  1.14970915e-0

In [40]:
from sklearn.pipeline import Pipeline
#model 0
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

model0 = Pipeline([
    ("tfidf", TfidfVectorizer()), #convert words into numbers using tfidf
    ("clf", MultinomialNB()) #model the text
])

model0.fit(train_sentences, train_labels)

In [41]:
baseline_score=model0.score(val_sentences, val_labels)
print(f"Baseline score: {baseline_score*100:.2f}%")

Baseline score: 79.27%


In [42]:
baseline_preds=model0.predict(val_sentences)
baseline_preds[:20]

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
      dtype=int64)

In [43]:
train_labels[:20]

array([0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0],
      dtype=int64)

In [44]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def calculate_result(y_true, y_pred):
    model_accuracy = accuracy_score(y_true, y_pred) * 100
    model_prediction, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    model_results = {
        "accuracy": model_accuracy,
        "precision": model_prediction,
        "recall": model_recall,
        "f1": model_f1,
    }
    return model_results

In [45]:
baseline_result=calculate_result(y_true=val_labels, y_pred=baseline_preds)
baseline_result

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

In [46]:
from helper_functions import create_tensorboard_callback

SAVE_DIR="model_logs"

In [47]:
from keras.api.layers import Input,Dense,GlobalAveragePooling1D
from keras.api.models import Model
inputs=Input(shape=(1,), dtype=tf.string)
x=text_vectorizer(inputs)
x=embeddings(x)
x=GlobalAveragePooling1D()(x)
outputs=Dense(1, activation="sigmoid")(x)
model1=Model(inputs=inputs, outputs=outputs, name="model1_dense")

In [48]:
from keras.api.optimizers import Adam
model1.compile(loss="binary_crossentropy", optimizer=Adam(), metrics=["accuracy"])
model1.summary()

In [49]:
model1_history=model1.fit(train_sentences, train_labels, epochs=5, validation_data=(val_sentences, val_labels),
                          callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR,experiment_name="simple-dense-model")],verbose=2)

Saving TensorBoard log files to: model_logs/simple-dense-model/20250416-003043
Epoch 1/5
215/215 - 2s - 10ms/step - accuracy: 0.7035 - loss: 0.6074 - val_accuracy: 0.7441 - val_loss: 0.5348
Epoch 2/5
215/215 - 1s - 6ms/step - accuracy: 0.8162 - loss: 0.4444 - val_accuracy: 0.7900 - val_loss: 0.4718
Epoch 3/5
215/215 - 1s - 6ms/step - accuracy: 0.8594 - loss: 0.3521 - val_accuracy: 0.7874 - val_loss: 0.4630
Epoch 4/5
215/215 - 1s - 6ms/step - accuracy: 0.8880 - loss: 0.2888 - val_accuracy: 0.7940 - val_loss: 0.4707
Epoch 5/5
215/215 - 1s - 6ms/step - accuracy: 0.9088 - loss: 0.2426 - val_accuracy: 0.7887 - val_loss: 0.4823


In [50]:
model1.evaluate(val_sentences, val_labels)

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7836 - loss: 0.5158 


[0.4822697937488556, 0.7887139320373535]

In [51]:
embeddings.weights

[<Variable path=embeddings_1/embeddings, shape=(10000, 128), dtype=float32, value=[[-0.00105956  0.06630866 -0.05052552 ...  0.01238488  0.01555996
   -0.05597138]
  [ 0.00749016  0.04296309  0.02691201 ...  0.00592512 -0.03651454
    0.03019136]
  [-0.02736591  0.03094904 -0.01883821 ... -0.00969164  0.0668201
   -0.01755818]
  ...
  [-0.04330193 -0.02654817 -0.00121553 ...  0.00478987 -0.04170083
    0.01119641]
  [-0.00715868  0.02432529 -0.04117876 ...  0.06612311  0.01719463
   -0.00361164]
  [ 0.04292446  0.04501513 -0.05462654 ...  0.03519523  0.02726359
   -0.0319547 ]]>]

In [54]:
embed_weights=model1.get_layer("embeddings_1").get_weights()[0]
print(embed_weights.shape)

(10000, 128)


In [59]:
model_1_pred_probs=model1.predict(val_sentences)
model_1_pred_probs[:10]

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


array([[0.33146945],
       [0.7698919 ],
       [0.9980194 ],
       [0.11929566],
       [0.11007538],
       [0.9190936 ],
       [0.906498  ],
       [0.99298733],
       [0.9559882 ],
       [0.2587864 ]], dtype=float32)

In [61]:
model1_preds=tf.squeeze(tf.round(model_1_pred_probs))
model1_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [62]:
model1_results=calculate_result(y_true=val_labels, y_pred=model1_preds)
model1_results

{'accuracy': 78.87139107611549,
 'precision': 0.7958623979922417,
 'recall': 0.7887139107611548,
 'f1': 0.785056193671008}

In [65]:
import numpy as np
np.array(list(model1_results.values()))>np.array(list(baseline_result.values()))

array([False, False, False, False])