### Import the data set

In [34]:
import pandas as pd
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


### # How many samples of each class?

In [35]:
train_df.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [36]:
train_df_shuffled = train_df.sample(frac=1, random_state=42) # shuffle with random_state=42 for reproducibility
train_df_shuffled.head()

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


### Let's visualize some random training examples

In [37]:
import random
random_index = random.randint(0, len(train_df)-5) # create random indexes not higher than the total number of samples
for row in train_df_shuffled[["text", "target"]][random_index:random_index+5].itertuples():
  _, text, target = row
  print(f"Target: {target}", "(real disaster)" if target > 0 else "(not real disaster)")
  print(f"Text:\n{text}\n")
  print("---\n")

Target: 0 (not real disaster)
Text:
I have the biggest crush on you &amp; I dont know if you'll ever know it ??

---

Target: 0 (not real disaster)
Text:
@KirCut1 lets get a dope picture together and have the dopest explosion ????

---

Target: 1 (real disaster)
Text:
Suspect in latest theater attack had psychological issues http://t.co/3huhZxliiG

---

Target: 0 (not real disaster)
Text:
@DavidCovucci We can't because a sinkhole swallowed every taco place in the neighborhood

---

Target: 1 (real disaster)
Text:
@Kirafrog @mount_wario Did you get wrecked again?

---



In [38]:
from sklearn.model_selection import train_test_split

# Use train_test_split to split training data into training and validation sets
train_sentences, val_sentences, train_labels, val_labels = train_test_split(train_df_shuffled["text"].to_numpy(),
                                                                            train_df_shuffled["target"].to_numpy(),
                                                                            test_size=0.1,
                                                                            random_state=42) # random state for reproducibility

In [39]:
# Check the lengths
len(train_sentences), len(train_labels), len(val_sentences), len(val_labels)

(6851, 6851, 762, 762)

In [40]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization # after TensorFlow 2.6

# Before TensorFlow 2.6
# from tensorflow.keras.layers.experimental.preprocessing import TextVectorization 
# Note: in TensorFlow 2.6+, you no longer need "layers.experimental.preprocessing"
# you can use: "tf.keras.layers.TextVectorization"

# Use the default TextVectorization variables
text_vect = TextVectorization(max_tokens=None, # how many words in the vocabulary (all of the different words in your text)
                                    standardize="lower_and_strip_punctuation", # how to process text
                                    split="whitespace", # how to split tokens
                                    ngrams=None, # create groups of n-words?
                                    output_mode="int", # how to map tokens to numbers
                                    output_sequence_length=None) # how long should the output sequence of tokens be?


In [41]:
# Fit the text vectorizer to the training text
text_vect.adapt(train_sentences)

In [42]:
# Create sample sentence and tokenize it
sample_sentence = "There is flood in my city"
text_vect([sample_sentence])

<tf.Tensor: shape=(1, 6), dtype=int64, numpy=array([[ 74,   9, 232,   4,  13, 182]], dtype=int64)>

In [43]:
# Create sample sentence and tokenize it
sample_sentence = "There is flood in my city and we are looking for help"
text_vect([sample_sentence])

<tf.Tensor: shape=(1, 12), dtype=int64, numpy=
array([[ 74,   9, 232,   4,  13, 182,   7,  46,  22, 884,  10, 148]],
      dtype=int64)>

In [44]:
# Find average number of tokens (words) in training Tweets
round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))

15

In [45]:
# Setup text vectorization with custom variables
max_vocab_length = 10000 # max number of words to have in our vocabulary
max_length = 15 # max length our sequences will be (e.g. how many words from a Tweet does our model see?)

text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

In [46]:
# Fit the text vectorizer to the training text
text_vectorizer.adapt(train_sentences)

In [47]:
# Create sample sentence and tokenize it
sample_sentence = "There is flood in my city"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[ 74,   9, 232,   4,  13, 182,   0,   0,   0,   0,   0,   0,   0,
          0,   0]], dtype=int64)>

In [48]:
# Create sample sentence and tokenize it
sample_sentence = "There is flood in my city and we are looking for help"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[ 74,   9, 232,   4,  13, 182,   7,  46,  22, 884,  10, 148,   0,
          0,   0]], dtype=int64)>

## DRAWBACKS of Textvectorization: 
##           1. creats very huge matrix
##           2. results in sparse matrix representation
##           3. provides static vector representation

## Word Embedding

In [49]:
tf.random.set_seed(42)
from tensorflow.keras import layers

embedding = layers.Embedding(input_dim=max_vocab_length, # set input shape
                             output_dim=128, # set size of embedding vector
                             embeddings_initializer="uniform", # default, intialize randomly
                             input_length=max_length, # how long is each input
                             name="embedding_1") 



In [50]:
sample_sentence = "There is flood in my city"
sample_embed = embedding(text_vectorizer([sample_sentence]))
sample_embed

<tf.Tensor: shape=(1, 15, 128), dtype=float32, numpy=
array([[[-0.04161851,  0.02555401,  0.03059632, ...,  0.01822953,
          0.01464251, -0.04292045],
        [ 0.03323453,  0.04985398, -0.00108425, ..., -0.04525739,
         -0.04970834, -0.01337311],
        [-0.03690611,  0.04052639, -0.03470866, ..., -0.00616246,
         -0.03034323,  0.04261291],
        ...,
        [ 0.00785639,  0.00100154, -0.04740683, ...,  0.037011  ,
         -0.0412913 , -0.01938312],
        [ 0.00785639,  0.00100154, -0.04740683, ...,  0.037011  ,
         -0.0412913 , -0.01938312],
        [ 0.00785639,  0.00100154, -0.04740683, ...,  0.037011  ,
         -0.0412913 , -0.01938312]]], dtype=float32)>

In [51]:
# Check out a single token's embedding
sample_embed[0][0]

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([-0.04161851,  0.02555401,  0.03059632, -0.01319492,  0.02548862,
        0.0184299 ,  0.00765024,  0.01213402, -0.03109097, -0.03191024,
        0.0155153 ,  0.03752157, -0.00980986, -0.02124889, -0.01659371,
       -0.0180951 ,  0.01655449, -0.02162659, -0.01845462,  0.0122849 ,
       -0.03900913, -0.04460765, -0.03791111, -0.0330929 ,  0.02078063,
        0.00836837,  0.00143768, -0.03502387, -0.02615548, -0.0005996 ,
       -0.03172252,  0.0486537 ,  0.02887782, -0.03824252, -0.0270465 ,
       -0.01473473, -0.03352197, -0.04126145, -0.04876558,  0.03251504,
        0.0102616 ,  0.03238047, -0.03751427,  0.04423592,  0.0385224 ,
        0.03503698, -0.04129225, -0.03040377, -0.01148222,  0.03695824,
       -0.00780533, -0.01735145,  0.00818031,  0.0133373 ,  0.04201298,
       -0.0063285 , -0.04324958,  0.03536199,  0.04854539,  0.00397132,
        0.0234475 ,  0.00103261, -0.00258385, -0.00196835, -0.02755803,
        0.041401

In [52]:
# Create LSTM model
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.LSTM(64)(x) # return vector for whole sequence
x = layers.Dense(64, activation="relu")(x) # optional dense layer on top of output of LSTM cell
outputs = layers.Dense(1, activation="sigmoid")(x)
model = tf.keras.Model(inputs, outputs, name="model_2_LSTM")

In [53]:
# Compile model
model.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [54]:
# Fit model
model_history = model.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [55]:
# Make predictions on the validation dataset
model_pred_probs = model.predict(val_sentences)
model_pred_probs.shape, model_pred_probs[:10] # view the first 10



((762, 1),
 array([[1.5833689e-02],
        [7.2575074e-01],
        [9.9856305e-01],
        [4.7345631e-02],
        [4.5973939e-04],
        [9.9703228e-01],
        [9.2987216e-01],
        [9.9937958e-01],
        [9.9713320e-01],
        [2.9848439e-01]], dtype=float32))

In [56]:
### We can turn these prediction probabilities into prediction classes by rounding to the nearest integer 
### (by default, prediction probabilities under 0.5 will go to 0 and those over 0.5 will go to 1).

# Round out predictions and reduce to 1-dimensional array
model_preds = tf.squeeze(tf.round(model_pred_probs))
model_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [57]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model_acc = accuracy_score(val_labels, model_preds) * 100
model_acc

76.11548556430446

### Model 2: GRU

* Another popular and effective RNN component is the GRU or gated recurrent unit.

* The GRU cell has similar features to an LSTM cell but has less parameters.

In [58]:
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.GRU(64)(x) # return vector for whole sequence
x = layers.Dense(64, activation="relu")(x) # optional dense layer on top of output of LSTM cell
outputs = layers.Dense(1, activation="sigmoid")(x)
model = tf.keras.Model(inputs, outputs, name="model_2_LSTM")

In [59]:
# Compile GRU model
model.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [60]:
model_history = model.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [61]:
# Make predictions on the validation dataset
model_pred_probs = model.predict(val_sentences)
model_pred_probs.shape, model_pred_probs[:10]



((762, 1),
 array([[1.9721978e-03],
        [7.3158938e-01],
        [9.9994552e-01],
        [1.6574892e-01],
        [2.8520526e-05],
        [9.9995482e-01],
        [9.9671900e-01],
        [9.9997663e-01],
        [9.9996781e-01],
        [8.6742634e-01]], dtype=float32))

In [62]:
model_preds = tf.squeeze(tf.round(model_pred_probs))
model_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32)>

In [63]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model_acc = accuracy_score(val_labels, model_preds) * 100
model_acc

76.11548556430446

### Model 3: Bidirectonal RNN model

* A standard RNN will process a sequence from left to right, where as a bidirectional RNN will process the sequence from left to right and then again from right to left.
 * In practice, many sequence models often see and improvement in performance when using bidirectional RNN's.

* However, this improvement in performance often comes at the cost of longer training times and increased model parameters (since the model goes left to right and right to left, the number of trainable parameters doubles).

In [64]:
# Set random seed and create embedding layer (new embedding layer for each model)
tf.random.set_seed(42)
from tensorflow.keras import layers
model_4_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     input_length=max_length,
                                     name="embedding_4")

# Build a Bidirectional RNN in TensorFlow
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_4_embedding(x)
# x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x) # stacking RNN layers requires return_sequences=True
x = layers.Bidirectional(layers.LSTM(64))(x) # bidirectional goes both ways so has double the parameters of a regular LSTM layer
outputs = layers.Dense(1, activation="sigmoid")(x)
model_4 = tf.keras.Model(inputs, outputs, name="model_4_Bidirectional")

In [65]:
# Compile
model_4.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [67]:
# Fit the model (takes longer because of the bidirectional layers)
model_4_history = model_4.fit(train_sentences,
                              train_labels,
                              epochs=5,
                              validation_data=(val_sentences, val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [68]:
# Make predictions with bidirectional RNN on the validation data
model_4_pred_probs = model_4.predict(val_sentences)
model_4_pred_probs[:10]



array([[0.0978686 ],
       [0.90104336],
       [0.99965   ],
       [0.30254355],
       [0.0058037 ],
       [0.9983175 ],
       [0.96804327],
       [0.99970883],
       [0.99980676],
       [0.20917854]], dtype=float32)

In [69]:
# Convert prediction probabilities to labels
model_4_preds = tf.squeeze(tf.round(model_4_pred_probs))
model_4_preds[:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 1., 1., 0., 0., 1., 1., 1., 1., 0.], dtype=float32)>

In [70]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model_acc = accuracy_score(val_labels, model_4_preds) * 100
model_acc

76.37795275590551