In [1]:
import zipfile

from fontTools.misc.cython import returns
!curl -O https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip

zip_ref = zipfile.ZipFile('nlp_getting_started.zip')
zip_ref.extractall()
zip_ref.close()

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  593k  100  593k    0     0  1306k      0 --:--:-- --:--:-- --:--:-- 1315k


In [2]:
import pandas as pd

train_df = pd.read_csv("nlp_getting_started/train.csv")
test_df = pd.read_csv("nlp_getting_started/test.csv")
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
train_df_shuffled = train_df.sample(frac=1, random_state=42)  # shuffle with random_state=42 for reproducibility
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [4]:
train_df.target.value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [5]:
import random

random_index = random.randint(0, len(train_df) - 5)
for row in train_df[["text", "target"]][random_index:random_index + 5].itertuples():
    _, text, target = row
    print(f"target: {target}", "(Real disaster)" if target > 0 else "(Not Real disaster)")
    print(f"Text:\n{text}\n")
    print("--------------\n")

target: 0 (Not Real disaster)
Text:
@crowdtappers @joinvroom OMG I remember the meltdown the day I did her hair like ELSA and not ANNA.... OHHHH THE HORROR!!! LOL #tangletalk

--------------

target: 0 (Not Real disaster)
Text:
@DmoneyDemi I had my meltdown yesterday.  I'm going to miss you so much. You are forsure my DTB for life. When I get back watchout ??

--------------

target: 0 (Not Real disaster)
Text:
Why must I have a meltdown every few days? ??

--------------

target: 0 (Not Real disaster)
Text:
@nprfreshair I really can't believe he is skipping out before the Republican meltdown...I mean 'debate'.

--------------

target: 0 (Not Real disaster)
Text:
CommoditiesåÊAre Crashing Like It's 2008 All Over Again http://t.co/nJD1N5TxXe via @business

--------------



In [6]:
#Split data into training and validation sets
from sklearn.model_selection import train_test_split

train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df_shuffled["text"].to_numpy(),
                                                                            train_df_shuffled["target"].to_numpy(),
                                                                            test_size=0.1, random_state=42)

In [7]:
len(train_sentences), len(val_sentences), len(train_labels), len(val_labels)

(6851, 762, 6851, 762)

In [8]:
train_sentences[:5]

array(['@mogacola @zamtriossu i screamed after hitting tweet',
       'Imagine getting flattened by Kurt Zouma',
       '@Gurmeetramrahim #MSGDoing111WelfareWorks Green S welfare force ke appx 65000 members har time disaster victim ki help ke liye tyar hai....',
       "@shakjn @C7 @Magnums im shaking in fear he's gonna hack the planet",
       'Somehow find you and I collide http://t.co/Ee8RpOahPk'],
      dtype=object)

In [9]:
#converting text to number
import tensorflow as tf
from keras.api.layers import TextVectorization

text_vectorizer = TextVectorization(
    max_tokens=500000, standardize="lower_and_strip_punctuation",
    split="whitespace", ngrams=None, output_mode="int",
    output_sequence_length=None, pad_to_max_tokens=True,
)


In [10]:
len(train_sentences[0].split())

7

In [11]:
round(sum([len(i.split()) for i in train_sentences])) / len(train_sentences)

14.901036345059115

In [12]:
max_vocab_length = 10000
max_length = 15
text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length
)
text_vectorizer.adapt(train_sentences)

In [13]:
sample_sentences = "There's a flood in my street !!"
text_vectorizer([sample_sentences])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[264,   3, 232,   4,  13, 698,   0,   0,   0,   0,   0,   0,   0,
          0,   0]], dtype=int64)>

In [14]:
#Choose a random sentence frm dataset and tokenize it
random_sentence = random.choice(train_sentences)
print(f"Original sentence: \n {random_sentence}\
      \n\nVectorize version:")
text_vectorizer([random_sentence])

Original sentence: 
 #Colorado #News Motorcyclist bicyclist injured in Denver collision on Broadway: At least two people were tak... http://t.co/2iAFPmqJeP      

Vectorize version:


<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[1033,   58, 1499, 1715,  243,    4, 1145,  584,   11, 1889,   17,
         506,  116,   57,   67]], dtype=int64)>

In [15]:
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5]
bottom_5_words = words_in_vocab[-5:]
print(f"Number of word in vocabulary: {len(words_in_vocab)}")
print(f"5 most common word: {top_5_words}")
print(f"5 least common word: {bottom_5_words}")

Number of word in vocabulary: 10000
5 most common word: ['', '[UNK]', 'the', 'a', 'in']
5 least common word: ['pages', 'paeds', 'pads', 'padres', 'paddytomlinson1']


In [16]:
#Creating an Embedding using an Embedding layer
from keras.api.layers import Embedding
embeddings=Embedding(input_dim=max_vocab_length,
                     output_dim=128,
                     embeddings_initializer="uniform",
                     input_length=max_length)
embeddings



<Embedding name=embedding, built=False>

In [17]:
random_sentence = random.choice(train_sentences)
print(f"Original sentence: \n {random_sentence}\
\n\nVectorize version:")

sample_embeded=embeddings(text_vectorizer([random_sentence]))
sample_embeded

Original sentence: 
 Mourning notices for stabbing arson victims stir Û÷politics of griefÛª in Israel: Posters for Shira Banki and A... http://t.co/3GZ5zQQTHe

Vectorize version:


<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.04551202,  0.03857634, -0.04914736, ..., -0.00042182,
          0.00797714, -0.0322746 ],
        [ 0.00513594,  0.00448005,  0.04345074, ..., -0.00479954,
         -0.01601674, -0.02639924],
        [ 0.03940158,  0.02386623,  0.03672632, ...,  0.01182682,
         -0.03413142, -0.00098972],
        ...,
        [-0.00104886, -0.03518275,  0.04880024, ..., -0.03561976,
          0.01794698, -0.02935177],
        [ 0.03940158,  0.02386623,  0.03672632, ...,  0.01182682,
         -0.03413142, -0.00098972],
        [-0.01197406,  0.04185488,  0.02048573, ...,  0.04349026,
         -0.00376817,  0.04197248]]], dtype=float32)>

In [18]:
sample_embeded[0][0],sample_embeded[0][0].shape, random_sentence

(<tf.Tensor: shape=(128,), dtype=float32, numpy=
 array([-0.04551202,  0.03857634, -0.04914736,  0.03190819,  0.01092755,
         0.03874786, -0.01460918, -0.01313291, -0.005229  , -0.00774373,
        -0.04829104,  0.04012263, -0.00630523,  0.01836142,  0.03542963,
         0.01726628, -0.01959338,  0.02918119,  0.03516959, -0.00859635,
        -0.03029349, -0.0059541 ,  0.03951808,  0.00530795, -0.02214882,
        -0.00377263, -0.00040275,  0.02228657, -0.03483798,  0.03321543,
        -0.03898922,  0.01631714, -0.01419834,  0.04465567, -0.01530501,
         0.0417786 , -0.04070181,  0.01553083,  0.00480266,  0.00581226,
         0.03334181,  0.03907721, -0.00577771, -0.02947348,  0.02928985,
         0.00961913,  0.03579931,  0.04564463,  0.04468948,  0.04359681,
         0.00730104,  0.01127845,  0.04284218, -0.00359859, -0.00207865,
         0.04111966,  0.04910049, -0.01738505,  0.04286157, -0.03838556,
        -0.0436048 ,  0.01420348,  0.03002555,  0.01972202,  0.02095455,
  

In [19]:
from sklearn.pipeline import Pipeline
#model 0
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

model0 = Pipeline([
    ("tfidf", TfidfVectorizer()), #convert words into numbers using tfidf
    ("clf", MultinomialNB()) #model the text
])

model0.fit(train_sentences, train_labels)

In [20]:
baseline_score=model0.score(val_sentences, val_labels)
print(f"Baseline score: {baseline_score*100:.2f}%")

Baseline score: 79.27%


In [21]:
baseline_preds=model0.predict(val_sentences)
baseline_preds[:20]

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
      dtype=int64)

In [22]:
train_labels[:20]

array([0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0],
      dtype=int64)

In [25]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def calculate_result(y_true, y_pred):
    model_accuracy = accuracy_score(y_true, y_pred) * 100
    model_prediction, model_recall, model_f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    model_results = {
        "accuracy": model_accuracy,
        "precision": model_prediction,
        "recall": model_recall,
        "f1": model_f1,
    }
    return model_results

In [26]:
baseline_result=calculate_result(y_true=val_labels, y_pred=baseline_preds)
baseline_result

{'accuracy': 79.26509186351706,
 'precision': 0.8111390004213173,
 'recall': 0.7926509186351706,
 'f1': 0.7862189758049549}

In [None]:
from helper_functions import create_tensorboard_callback

SAVE_DIR="model_logs"

In [28]:
from keras.api.layers import Input,Dense
from keras.api.models import Model
inputs=Input(shape=(1,), dtype=tf.string)
x=text_vectorizer(inputs)
x=embeddings(x)
outputs=Dense(1, activation="sigmoid")(x)
model1=Model(inputs=inputs, outputs=outputs, name="model1_dense")



<Embedding name=embedding, built=True>