In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import zipfile
import pandas as pd
import numpy as np
import random

In [None]:
!wget https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip

--2024-04-18 14:40:37--  https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 108.177.12.207, 74.125.26.207, 172.217.193.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|108.177.12.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607343 (593K) [application/zip]
Saving to: ‘nlp_getting_started.zip’


2024-04-18 14:40:37 (95.1 MB/s) - ‘nlp_getting_started.zip’ saved [607343/607343]



In [None]:
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

--2024-04-18 11:54:41--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py’


2024-04-18 11:54:42 (47.6 MB/s) - ‘helper_functions.py’ saved [10246/10246]



In [None]:
# Now you can import your function
from helper_functions import create_tensorboard_callback, unzip_data, plot_loss_curves, compare_historys

In [None]:
unzip_data('nlp_getting_started.zip')

In [None]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [None]:
train_df_shuffled = train_df.sample(frac=1, random_state=42)
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [None]:
train_df.target.value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [None]:
train_df.target.value_counts()[0]/len(train_df)*100

57.03402075397347

In [None]:
train_df.target.value_counts()[1]/len(train_df)*100

42.96597924602653

In [None]:
# Show five consecutive rows of the shuffled training frame, starting at a random offset.
random_index = random.randint(0, len(train_df) - 5)
sample_rows = train_df_shuffled[["text", "target"]].iloc[random_index:random_index + 5]
for text, target in sample_rows.itertuples(index=False):
  if target > 0:
    verdict = "(real disaster)"
  else:
    verdict = "(not real disaster)"
  print(f"Target: {target}", verdict)
  print(f"Text:\n{text}\n")
  print("---\n")

Target: 0 (not real disaster)
Text:
Currently want to drive my car off a cliff and fall to my death.

---

Target: 0 (not real disaster)
Text:
'It hasn't collapsed because the Greek people are still being played for as fools by Tsipras he costÛ_' ÛÓ WallyBaiter http://t.co/gbRNuLp3fH

---

Target: 1 (real disaster)
Text:
UPDATE 1-Russian 'food crematoria' provoke outrage amid crisis famine memories: * Russian society still recal... http://t.co/J2erZbMjQD

---

Target: 0 (not real disaster)
Text:
General News Û¢åÊ'Demolition of houses on waterways begins at Achimota Mile 7 ' via @233liveOnline. Full story at http://t.co/iO7kUUg1uq

---

Target: 0 (not real disaster)
Text:
Photo: forrestmankins: Colorado camping. http://t.co/S0VgTkhW7V

---



In [None]:
# Hold out 10% of the shuffled tweets for validation; random_state keeps the split repeatable.
sentences = train_df_shuffled["text"].to_numpy()
labels = train_df_shuffled["target"].to_numpy()
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    sentences,
    labels,
    test_size=0.1,
    random_state=42,
)

In [None]:
train_labels.shape, val_labels.shape, train_sentences.shape, val_sentences.shape

((6851,), (762,), (6851,), (762,))

# **Tokenizer**

Word-level tokenization

In [None]:
# Word-level vectorizer with every option written out explicitly.
# NOTE: this instance is superseded by the configured vectorizer created a few cells further down.
text_vectorizer = layers.TextVectorization(
    max_tokens=None,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    ngrams=None,
    output_mode="int",
    output_sequence_length=None,
    pad_to_max_tokens=False,
)

In [None]:
# Average number of whitespace-separated tokens per training sentence, rounded to an int.
# np.mean is used instead of the builtin `sum` so this cell keeps working even when the
# name `sum` has been rebound in the running kernel (which is what produces
# "TypeError: 'int' object is not callable").
round(float(np.mean([len(sentence.split()) for sentence in train_sentences])))

TypeError: 'int' object is not callable

In [None]:
# Vectorizer settings: cap on vocabulary size and the fixed number of tokens kept per sentence.
max_vocab_length = 10000
max_length = 15

text_vectorizer = layers.TextVectorization(
    max_tokens=max_vocab_length,
    output_sequence_length=max_length,
    output_mode="int",
)

In [None]:
text_vectorizer.adapt(train_sentences)

In [None]:
words_in_vocab = text_vectorizer.get_vocabulary()
words_in_vocab[:5]

['', '[UNK]', 'the', 'a', 'in']

In [None]:
# Trainable embedding: one 128-dimensional vector per token id in [0, max_vocab_length).
# The layer is named explicitly because a later cell retrieves it with
# model_1.get_layer('embedding'); an auto-generated name would turn into "embedding_1"
# if this cell were re-run in the same kernel, and that lookup would then fail.
# `input_length` is intentionally omitted: it is deprecated in Keras 3 and the layer
# takes the sequence length from whatever tensor it is called on.
embedding = layers.Embedding(input_dim=max_vocab_length,
                             output_dim=128,
                             name="embedding")

embedding

<keras.src.layers.core.embedding.Embedding at 0x78b03d349120>

In [None]:
random_sentence = random.choice(train_sentences)

sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[ 0.03565862, -0.01033003, -0.01864718, ...,  0.01106099,
         -0.0329092 ,  0.03117971],
        [-0.00199997, -0.02933523,  0.0103354 , ...,  0.03355156,
         -0.02008897,  0.01802984],
        [-0.0375211 , -0.01270992, -0.02642171, ...,  0.04323239,
          0.04471685,  0.00097612],
        ...,
        [-0.03256723,  0.03802327, -0.0241738 , ...,  0.04117694,
          0.01811675, -0.00968892],
        [-0.03256723,  0.03802327, -0.0241738 , ...,  0.04117694,
          0.01811675, -0.00968892],
        [-0.03256723,  0.03802327, -0.0241738 , ...,  0.04117694,
          0.01811675, -0.00968892]]], dtype=float32)>

In [None]:
sample_embed[0][0], sample_embed[0][0].shape, random_sentence

(<tf.Tensor: shape=(128,), dtype=float32, numpy=
 array([ 0.03565862, -0.01033003, -0.01864718, -0.00554155,  0.00408251,
         0.01776457, -0.01085478, -0.01546165, -0.00148969,  0.04539653,
        -0.00971016, -0.01489978,  0.00266638, -0.04286582,  0.0142331 ,
        -0.0107279 , -0.04994383,  0.04748831, -0.0413591 ,  0.04074809,
        -0.0265231 ,  0.01928708,  0.04400006,  0.01929868, -0.0313251 ,
         0.0271752 ,  0.03690842, -0.01914697,  0.04136935, -0.02841464,
         0.03419888, -0.0472103 , -0.03898551, -0.0311281 , -0.02732431,
         0.02229353,  0.01871051,  0.01425009,  0.02752156,  0.01339965,
        -0.03520106,  0.01305094, -0.02771625,  0.02786047, -0.01455928,
         0.02756016, -0.03931076,  0.01861567, -0.00639253,  0.0429661 ,
        -0.03920343, -0.00359518,  0.04876646, -0.01446038, -0.0034264 ,
         0.0291229 , -0.00990884, -0.0112932 , -0.02097582,  0.0408037 ,
         0.03566371,  0.00276359,  0.00990468, -0.02304274,  0.01213851,
  

In [None]:
# Baseline model: TF-IDF features fed into a multinomial naive Bayes classifier.
baseline_steps = [
    ("tfidf", TfidfVectorizer()),
    ("clf", MultinomialNB()),
]
model_0 = Pipeline(baseline_steps)

model_0.fit(train_sentences, train_labels)

In [None]:
train_labels.shape, val_labels.shape, train_sentences.shape, val_sentences.shape

((6851,), (762,), (6851,), (762,))

In [None]:
baseline_score = model_0.score(val_sentences, val_labels)

In [None]:
baseline_score

0.7926509186351706

In [None]:
baseline_preds = model_0.predict(val_sentences)

In [None]:
baseline_preds

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,

In [None]:
def calculate_results(y_true, y_preds):
  """Collect accuracy, precision, recall and F1 for a set of predictions.

  Args:
    y_true: ground-truth labels.
    y_preds: predicted labels to score against y_true.

  Returns:
    dict with the keys "accuracy", "precision", "recall" and "f1".
    Accuracy is scaled to a percentage (accuracy_score * 100); the other three
    are the weighted-average values from precision_recall_fscore_support and
    are left on that function's native scale.
  """
  precision, recall, f1, _support = precision_recall_fscore_support(
      y_true, y_preds, average="weighted")
  return {
      "accuracy": accuracy_score(y_true, y_preds) * 100,
      "precision": precision,
      "recall": recall,
      "f1": f1,
  }

In [None]:
baseline_results = calculate_results(y_true=val_labels,
                                     y_preds=baseline_preds)
baseline_results

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

In [None]:
SAVE_DIR = "model_logs"

In [None]:
# Build model_1 with the Keras Functional API: raw strings in, one sigmoid unit out.
# Shapes in the comments are taken from the model summary printed in the next cell.
inputs = layers.Input(shape=(1,), dtype=tf.string)  # one string per example
x = text_vectorizer(inputs)  # strings -> integer token ids, (None, 15)
x = embedding(x)  # token ids -> embedding vectors, (None, 15, 128)
x = layers.GlobalAveragePooling1D()(x)  # pool away the token axis, (None, 128)
outputs = layers.Dense(1, activation="sigmoid")(x)  # single sigmoid output, (None, 1)
model_1 = tf.keras.Model(inputs, outputs, name="model_1_dense")

In [None]:
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (Text  (None, 15)                0         
 Vectorization)                                                  
                                                                 
 embedding (Embedding)       (None, 15, 128)           1280000   
                                                                 
 global_average_pooling1d (  (None, 128)               0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1280129 (4.88 MB)
Trainable params: 128

In [None]:
# Binary cross-entropy pairs with the single sigmoid output; Adam is used with its default settings.
model_1.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train for 5 epochs, validating on the held-out split and writing TensorBoard logs under SAVE_DIR.
tensorboard_cb = create_tensorboard_callback(dir_name=SAVE_DIR,
                                             experiment_name="Model_1_Dense")
model_1.fit(train_sentences,
            train_labels,
            epochs=5,
            validation_data=(val_sentences, val_labels),
            callbacks=[tensorboard_cb])

Saving TensorBoard log files to: model_logs/Model_1_Dense/20240418-115534
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x78b03d375ab0>

In [None]:
model_1_preds = model_1.predict(val_sentences)
model_1_preds.shape



(762, 1)

In [None]:
model_1_results = calculate_results(y_true=val_labels,
                                    y_preds=tf.squeeze(tf.round(model_1_preds)))
model_1_results

{'accuracy': 78.74015748031496,
 'precision': 0.7932296029485675,
 'recall': 0.7874015748031497,
 'f1': 0.7841130596930417}

In [None]:
np.array(list(baseline_results.values())) > np.array(list(model_1_results.values()))

array([ True,  True,  True,  True])

In [None]:
baseline_results, model_1_results

({'accuracy': 79.26509186351706,
  'precision': 0.8111390004213173,
  'recall': 0.7926509186351706,
  'f1': 0.7862189758049549},
 {'accuracy': 78.74015748031496,
  'precision': 0.7932296029485675,
  'recall': 0.7874015748031497,
  'f1': 0.7841130596930417})

In [None]:
weights = model_1.get_layer('embedding').get_weights()[0]
vocab = text_vectorizer.get_vocabulary()

In [None]:
import io

# Export the learned embedding matrix as two line-aligned TSV files:
# vectors.tsv holds one tab-separated vector per line, metadata.tsv the matching token.
# The `with` block guarantees both files are flushed and closed even if a write fails part-way.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
  for index, word in enumerate(vocab):
    if index == 0:
      continue  # skip 0, it's padding.
    vec = weights[index]
    out_v.write('\t'.join(map(str, vec)) + "\n")
    out_m.write(word + "\n")

In [None]:
# Scratch arithmetic: subtract 2**0 through 2**7 from zero.
# The accumulator is named `total` rather than `sum` so the builtin sum() stays callable in
# every other cell, and the loop variable no longer reuses `x`, which the model-building
# cell uses for its Keras tensor.
total = 0
for exponent in range(8):
  total -= 2**exponent
total

-255

In [None]:
# tf.range builds a 1-D sequence and accepts no `shape` argument, which is what raised the TypeError.
# NOTE(review): the saved output further down shows a 3x3 float32 tensor of signed,
# non-sequential values, so a random-normal draw looks like the intent — confirm.
matrix = tf.random.normal(shape=(3, 3), dtype=tf.dtypes.float32)

TypeError: Got an unexpected keyword argument 'shape'

In [None]:
matrix *= 500

TypeError: unsupported operand type(s) for *=: 'method' and 'int'

In [None]:
np.max(matrix), np.min(matrix)

(<bound method _EagerTensorBase.numpy of <tf.Tensor: shape=(3, 3), dtype=float32, numpy=
 array([[-286.40338 ,  625.4731  ,  -47.459167],
        [  28.034765, -621.8519  , -344.89468 ],
        [ 906.45215 , -255.2908  ,   60.29392 ]], dtype=float32)>>,
 <bound method _EagerTensorBase.numpy of <tf.Tensor: shape=(3, 3), dtype=float32, numpy=
 array([[-286.40338 ,  625.4731  ,  -47.459167],
        [  28.034765, -621.8519  , -344.89468 ],
        [ 906.45215 , -255.2908  ,   60.29392 ]], dtype=float32)>>)