In [1]:
# import zipfile
#
# from fontTools.misc.cython import returns
# !curl -O https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
#
# zip_ref = zipfile.ZipFile('nlp_getting_started.zip')
# zip_ref.extractall()
# zip_ref.close()

In [2]:
import pandas as pd

train_df = pd.read_csv("nlp_getting_started/train.csv")
test_df = pd.read_csv("nlp_getting_started/test.csv")
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
train_df_shuffled = train_df.sample(frac=1, random_state=42)  # shuffle with random_state=42 for reproducibility
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [4]:
train_df.target.value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [5]:
import random

random_index = random.randint(0, len(train_df) - 5)
for row in train_df[["text", "target"]][random_index:random_index + 5].itertuples():
    _, text, target = row
    print(f"target: {target}", "(Real disaster)" if target > 0 else "(Not Real disaster)")
    print(f"Text:\n{text}\n")
    print("--------------\n")

target: 1 (Real disaster)
Text:
More Natural Disaster Research Urgent http://t.co/5Cm0LfZhxn via #JakartaPost

--------------

target: 0 (Not Real disaster)
Text:
I'm a disaster?? https://t.co/VCV73BUaCZ

--------------

target: 0 (Not Real disaster)
Text:
@LovelyLikeLaura I can see why one of your favorite books is 'Beautiful Disaster' it may now be one of mine??

--------------

target: 1 (Real disaster)
Text:
å¬'Only the sea knows how many are dead' @MSF_Sea after last disaster in #Mediterranean turned into a massgrave  http://t.co/m0utLDif77

--------------

target: 0 (Not Real disaster)
Text:
Beautiful disaster // Jon McLaughlin is such a good song

--------------



In [6]:
#Split data into training and validation sets
from sklearn.model_selection import train_test_split

train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df_shuffled["text"].to_numpy(),
                                                                            train_df_shuffled["target"].to_numpy(),
                                                                            test_size=0.1, random_state=42)

In [7]:
len(train_sentences), len(val_sentences), len(train_labels), len(val_labels)

(6851, 762, 6851, 762)

In [8]:
train_sentences[:5]

array(['@mogacola @zamtriossu i screamed after hitting tweet',
       'Imagine getting flattened by Kurt Zouma',
       '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
       "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
       'Somehow find you and I collide http://t.co/Ee8RpOahPk'],
      dtype=object)

In [9]:
#converting text to number
import tensorflow as tf
from keras.api.layers import TextVectorization

text_vectorizer = TextVectorization(
    max_tokens=500000, standardize="lower_and_strip_punctuation",
    split="whitespace", ngrams=None, output_mode="int",
    output_sequence_length=None, pad_to_max_tokens=True,
)


In [10]:
len(train_sentences[0].split())

7

In [11]:
round(sum([len(i.split()) for i in train_sentences])) / len(train_sentences)

14.901036345059115

In [12]:
max_vocab_length = 10000
max_length = 15
text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length
)
text_vectorizer.adapt(train_sentences)

In [13]:
sample_sentences = "There's a flood in my street !!"
text_vectorizer([sample_sentences])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[264,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,   0,
          0,   0]], dtype=int64)>

In [14]:
#Choose a random sentence frm dataset and tokenize it
random_sentence = random.choice(train_sentences)
print(f"Original sentence: \n {random_sentence}\
      \n\nVectorize version:")
text_vectorizer([random_sentence])

Original sentence: 
 Bed time. Don't wake me up unless revolution or Armageddon start.      

Vectorize version:


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[1040,   92,   63,  722,   31,   27, 1744, 3467,   53,  443,  667,
           0,    0,    0,    0]], dtype=int64)>

In [15]:
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5]
bottom_5_words = words_in_vocab[-5:]
print(f"Number of word in vocabulary: {len(words_in_vocab)}")
print(f"5 most common word: {top_5_words}")
print(f"5 least common word: {bottom_5_words}")

Number of word in vocabulary: 10000
5 most common word: ['', '[UNK]', 'the', 'a', 'in']
5 least common word: ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1']


In [16]:
#Creating an Embedding using an Embedding layer
from keras.api.layers import Embedding
embeddings=Embedding(input_dim=max_vocab_length,
                     output_dim=128,
                     embeddings_initializer="uniform",
                     input_length=max_length, name="embeddings_1")
embeddings



<Embedding name=embeddings_1, built=False>

In [17]:
random_sentence = random.choice(train_sentences)
print(f"Original sentence: \n {random_sentence}\
\n\nVectorize version:")

sample_embeded=embeddings(text_vectorizer([random_sentence]))
sample_embeded

Original sentence: 
 late night mcdonalds with friends = hilarious although my car is wrecked and there's half a steak pastie in the industrial estate

Vectorize version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 0.02153241, -0.03238304,  0.02198416, ..., -0.03415438,
          0.01593081,  0.04640279],
        [-0.04341969, -0.01572249, -0.02905985, ...,  0.01954875,
          0.04137622, -0.027309  ],
        [ 0.01112512,  0.036932  ,  0.00547595, ...,  0.00402074,
          0.0219307 ,  0.03124971],
        ...,
        [-0.00972886,  0.01944676, -0.03812282, ..., -0.00771857,
         -0.00413267,  0.02558854],
        [ 0.04873742,  0.00712948, -0.04618287, ..., -0.00282793,
         -0.01827381, -0.0417719 ],
        [-0.00596889, -0.03081917, -0.00821077, ...,  0.01358683,
          0.04816519,  0.03692839]]], dtype=float32)>

In [18]:
sample_embeded[0][0],sample_embeded[0][0].shape, random_sentence

(<tf.Tensor: shape=(128,), dtype=float32, numpy=
 array([ 0.02153241, -0.03238304,  0.02198416, -0.02892491, -0.02828711,
         0.00807458,  0.00031047,  0.00928439, -0.02651612,  0.00453838,
        -0.02090596,  0.01070954,  0.00280907, -0.0085783 ,  0.04899809,
        -0.02215507,  0.01645121, -0.04464303,  0.04511135, -0.01830501,
         0.00461465,  0.0262882 , -0.01641557,  0.01064236, -0.03270749,
         0.00468332,  0.03915206,  0.01315503, -0.04674742,  0.01821804,
         0.01242018, -0.03281577,  0.03211558, -0.03162804,  0.01684523,
        -0.03387548,  0.02274704, -0.04321848,  0.01333723, -0.04869003,
        -0.03851522,  0.03457772, -0.02612773, -0.0306596 , -0.02972591,
         0.01530026,  0.00255289, -0.01431168, -0.0353044 , -0.03253063,
         0.03896293,  0.029511  , -0.01056363,  0.03604325,  0.00312525,
        -0.03065456, -0.04453642,  0.04819813, -0.03833803, -0.02034226,
        -0.00749434, -0.03859544, -0.03910185,  0.01152992, -0.03283212,
  

In [19]:
from sklearn.pipeline import Pipeline
#model 0
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

model0 = Pipeline([
    ("tfidf", TfidfVectorizer()), #convert words into numbers using tfidf
    ("clf", MultinomialNB()) #model the text
])

model0.fit(train_sentences, train_labels)

In [20]:
baseline_score=model0.score(val_sentences, val_labels)
print(f"Baseline score: {baseline_score*100:.2f}%")

Baseline score: 79.27%


In [21]:
baseline_preds=model0.predict(val_sentences)
baseline_preds[:20]

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
      dtype=int64)

In [22]:
train_labels[:20]

array([0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0],
      dtype=int64)

In [23]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_result(y_true, y_pred):
    model_accuracy = accuracy_score(y_true, y_pred) * 100
    model_prediction, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    model_results = {
        "accuracy": model_accuracy,
        "precision": model_prediction,
        "recall": model_recall,
        "f1": model_f1,
    }
    return model_results

In [24]:
baseline_result=calculate_result(y_true=val_labels, y_pred=baseline_preds)
baseline_result

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

In [25]:
from helper_functions import create_tensorboard_callback

SAVE_DIR="model_logs"

In [26]:
from keras.api.layers import Input,Dense,GlobalAveragePooling1D
from keras.api.models import Model
inputs=Input(shape=(1,), dtype=tf.string)
x=text_vectorizer(inputs)
x=embeddings(x)
x=GlobalAveragePooling1D()(x)
outputs=Dense(1, activation="sigmoid")(x)
model1=Model(inputs=inputs, outputs=outputs, name="model1_dense")

In [27]:
from keras.api.optimizers import Adam
model1.compile(loss="binary_crossentropy", optimizer=Adam(), metrics=["accuracy"])
model1.summary()

In [28]:
model1_history=model1.fit(train_sentences, train_labels, epochs=5, validation_data=(val_sentences, val_labels),
                          callbacks=[create_tensorboard_callback(dir_name=SAVE_DIR,experiment_name="simple-dense-model")],verbose=2)

Saving TensorBoard log files to: model_logs/simple-dense-model/20250419-221552
Epoch 1/5
215/215 - 4s - 17ms/step - accuracy: 0.6979 - loss: 0.6078 - val_accuracy: 0.7546 - val_loss: 0.5373
Epoch 2/5
215/215 - 2s - 8ms/step - accuracy: 0.8171 - loss: 0.4438 - val_accuracy: 0.7861 - val_loss: 0.4770
Epoch 3/5
215/215 - 2s - 8ms/step - accuracy: 0.8613 - loss: 0.3510 - val_accuracy: 0.7874 - val_loss: 0.4634
Epoch 4/5
215/215 - 2s - 10ms/step - accuracy: 0.8897 - loss: 0.2872 - val_accuracy: 0.7900 - val_loss: 0.4716
Epoch 5/5
215/215 - 2s - 9ms/step - accuracy: 0.9096 - loss: 0.2407 - val_accuracy: 0.7822 - val_loss: 0.4789


In [29]:
model1.evaluate(val_sentences, val_labels)

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7696 - loss: 0.5120


[0.47889477014541626, 0.7821522355079651]

In [30]:
embeddings.weights

[<Variable path=embeddings_1/embeddings, shape=(10000, 128), dtype=float32, value=[[-0.03852179 -0.04297213 -0.01192745 ...  0.03213843  0.02696556
   -0.02262829]
  [-0.00738348 -0.03287888 -0.00837778 ... -0.03891818 -0.00846155
   -0.02191011]
  [-0.04508296 -0.04964537 -0.010082   ... -0.04412369  0.03848629
   -0.03929289]
  ...
  [ 0.00176894 -0.04877473 -0.0083646  ...  0.02678731  0.00220319
    0.02145778]
  [-0.05977643 -0.03733831  0.03309556 ... -0.05898897 -0.01910268
    0.0151341 ]
  [-0.09594821 -0.07744503  0.04406755 ... -0.08655727 -0.06634388
    0.02949214]]>]

In [31]:
embed_weights=model1.get_layer("embeddings_1").get_weights()[0]
print(embed_weights.shape)

(10000, 128)


In [32]:
model_1_pred_probs=model1.predict(val_sentences)
model_1_pred_probs[:10]

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step


array([[0.37727714],
       [0.8184652 ],
       [0.997603  ],
       [0.117115  ],
       [0.125233  ],
       [0.942799  ],
       [0.9312243 ],
       [0.9924305 ],
       [0.96624935],
       [0.24578662]], dtype=float32)

In [33]:
model1_preds=tf.squeeze(tf.round(model_1_pred_probs))
model1_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [34]:
model1_results=calculate_result(y_true=val_labels, y_pred=model1_preds)
model1_results

{'accuracy': 78.21522309711287,
 'precision': 0.7868451603977311,
 'recall': 0.7821522309711286,
 'f1': 0.779088324447517}

In [35]:
import numpy as np
np.array(list(model1_results.values()))>np.array(list(baseline_result.values()))

array([False, False, False, False])

In [36]:
words_in_vocab=text_vectorizer.get_vocabulary()
len(words_in_vocab),words_in_vocab[:10]

(10000, ['', '[UNK]', 'the', 'a', 'in', 'to', 'of', 'and', 'i', 'is'])

In [37]:
model1.summary()

In [40]:
embed_weights=model1.get_layer("embeddings_1").get_weights()[0]
embed_weights.shape

(10000, 128)

In [41]:
import io
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(words_in_vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = embed_weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()