## Natural Language Processing with Disaster Tweets

## Predict which Tweets are about real disasters and which ones are not

Twitter has become an important communication channel in times of emergency. The ubiquity of smartphones enables people to announce an emergency they’re observing in real time. Because of this, more agencies are interested in programmatically monitoring Twitter.

We have access to a dataset of 10,000 tweets that were hand-classified.

### Import libraries

In [4]:
# Download spaCy's en_core_web_sm model; the import cell below loads it as the `en_core_web_sm` module
!python -m spacy download en_core_web_sm -q

[K     |████████████████████████████████| 12.0 MB 5.1 MB/s 
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf
import json
import pathlib
import os
import io
import warnings
import en_core_web_sm
import plotly.graph_objects as go

from google.colab import drive
from spacy.lang.en.stop_words import STOP_WORDS
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout
from plotly.subplots import make_subplots
from sklearn.metrics import f1_score

# Suppress every Python warning for the rest of the session to keep cell outputs short.
# NOTE(review): this is a blanket filter, so deprecation and pandas warnings are hidden too — consider narrowing it.
warnings.filterwarnings('ignore')

### Import data


In [6]:
# Make Google Drive visible to the Colab runtime
drive.mount('/content/drive')

# Read the raw training set from Drive and preview the first rows
train_csv_path = '/content/drive/MyDrive/Colab Notebooks/Module 10 - Projects/06 - NLP with Disaster Tweets/src/train.csv'
df = pd.read_csv(train_csv_path, encoding="utf-8")
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


### Data preprocessing

In [7]:
# Keep only the tweet text and its label; the other columns are not used below
df = df.loc[:, ["text", "target"]]
df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
# SLOW CELL: spaCy is run on every tweet.
# The result is written to train_clean.csv, which the next cell reloads, so this cell
# only needs to be re-run when the cleaning logic itself changes.

# Load the small English spaCy pipeline (used here for lemmatisation)
nlp = en_core_web_sm.load()


def clean_tweet(raw_text):
  """Return a lower-cased, lemmatised, stop-word-free version of one tweet.

  Args:
    raw_text: the original tweet as a string.

  Returns:
    The remaining lemmas joined by single spaces; "" when nothing is left.
  """
  # Keep letters, digits and spaces only (punctuation is removed, not replaced by a space)
  kept = "".join(ch for ch in raw_text if ch.isalnum() or ch == " ")
  # Collapse runs of spaces and trim the ends. The earlier str.replace(" +", " ") searched for
  # the literal two-character string " +" (str.replace is not a regex), so it never collapsed anything.
  normalised = " ".join(kept.split()).lower()
  # Drop a token when either its lemma or its surface form is a stop word
  return " ".join(
      token.lemma_
      for token in nlp(normalised)
      if token.lemma_ not in STOP_WORDS and token.text not in STOP_WORDS
  )


# assign() builds a new frame instead of writing a column into a frame derived from a selection
df = df.assign(text_clean=df["text"].apply(clean_tweet))

# Drop tweets that cleaned down to nothing. clean_tweet always returns a string, so a notna()
# filter cannot catch them, and an empty string would come back as NaN when the CSV is reloaded.
df = df[df["text_clean"].str.len() > 0]
df.to_csv("/content/drive/MyDrive/Colab Notebooks/Module 10 - Projects/06 - NLP with Disaster Tweets/src/train_clean.csv", index=False)
df

Unnamed: 0,text,target,text_clean
0,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake allah forgive
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,All residents asked to 'shelter in place' are ...,1,resident ask shelter place notify officer evac...
3,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfire evacuation order...
4,Just got sent this photo from Ruby #Alaska as ...,1,send photo ruby alaska smoke wildfire pour school
...,...,...,...
7608,Two giant cranes holding a bridge collapse int...,1,giant crane hold bridge collapse nearby home h...
7609,@aria_ahrary @TheTawniest The out of control w...,1,ariaahrary thetawniest control wild fire calif...
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,m194 0104 utc5 km s volcano hawaii httptcozdto...
7611,Police investigating after an e-bike collided ...,1,police investigate ebike collide car little po...


In [9]:
# Reload the cleaned dataset written by the cleaning cell, so that cell does not have to be re-run
clean_csv_path = '/content/drive/MyDrive/Colab Notebooks/Module 10 - Projects/06 - NLP with Disaster Tweets/src/train_clean.csv'
df = pd.read_csv(clean_csv_path, encoding="utf-8")

In [10]:
# Upper bound passed to the tokenizer as num_words
vocab_size = 10000

# Fit a Keras tokenizer on the cleaned tweets so that each word maps to an integer id
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(df["text_clean"])

# Encode every cleaned tweet as a list of word ids and record how many ids it produced
encoded_texts = tokenizer.texts_to_sequences(df["text_clean"])
df["text_encoded"] = encoded_texts
df["len_text"] = df["text_encoded"].map(len)

# Discard tweets whose encoded sequence is empty
has_tokens = df["len_text"] != 0
df = df[has_tokens]
df.head()

Unnamed: 0,text,target,text_clean,text_encoded,len_text
0,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake allah forgive,"[3679, 410, 167, 1368, 1944]",5
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,"[112, 2, 156, 504, 5569, 5570, 955]",7
2,All residents asked to 'shelter in place' are ...,1,resident ask shelter place notify officer evac...,"[1369, 442, 1692, 318, 5571, 288, 182, 1692, 3...",11
3,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfire evacuation order...,"[2312, 6, 2313, 68, 182, 289, 34]",7
4,Just got sent this photo from Ruby #Alaska as ...,1,send photo ruby alaska smoke wildfire pour school,"[175, 117, 5572, 1693, 168, 68, 2314, 103]",8


In [11]:
# The encoded tweets are Python lists of different lengths, so they are first padded into one
# rectangular array: padding="post" appends zeros at the end of the shorter sequences.
reviews_pad = tf.keras.preprocessing.sequence.pad_sequences(
    df["text_encoded"],
    padding="post",
)

# Pair each padded sequence with its label in a single tf.data dataset
labels = df["target"].to_numpy()
full_ds = tf.data.Dataset.from_tensor_slices((reviews_pad, labels))

In [31]:
# Train / test split (70 / 30)
SEED = 42  # fixes the split so it is identical after a kernel restart
n_samples = df.shape[0]
TAKE_SIZE = int(0.7 * n_samples)
BATCH_SIZE = 128

# One global shuffle decides which observations go to train and which to test.
# reshuffle_each_iteration=False keeps that order identical on every pass, so take() and skip()
# below always see the same partition; the seed makes the partition reproducible across runs.
shuffled_ds = full_ds.shuffle(n_samples, seed=SEED, reshuffle_each_iteration=False)

# The training set is reshuffled on every epoch.
train_data_w = shuffled_ds.take(TAKE_SIZE).shuffle(TAKE_SIZE)
# The test set is deliberately NOT reshuffled: order is irrelevant for validation metrics, and a
# stable order keeps labels and predictions aligned when they are collected in separate passes.
test_data_w = shuffled_ds.skip(TAKE_SIZE)

# Group both sets into batches of BATCH_SIZE observations
train_data = train_data_w.batch(BATCH_SIZE)
test_data = test_data_w.batch(BATCH_SIZE)

In [32]:
# Sanity check: print the padded token ids and the labels of a single training batch
first_batch = train_data.take(1)
for text, target in first_batch:
  print(text, target)

tf.Tensor(
[[ 498  232  274 ...    0    0    0]
 [1888  215 2108 ...    0    0    0]
 [9093 9094 1164 ...    0    0    0]
 ...
 [  73  149  363 ...    0    0    0]
 [  23 4646  355 ...    0    0    0]
 [ 120 3369 1193 ...    0    0    0]], shape=(128, 25), dtype=int32) tf.Tensor(
[0 1 1 1 1 0 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 1 1 1 0 1 1 0 0 0 0 1 0
 0 0 0 1 1 0 1 0 0 1 0 0 1 0 1 1 1 0 0 1 0 0 1 1 0 0 1 0 1 0 0 0 1 0 1 1 1
 0 1 1 1 1 1 0 1 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 1 1 0 0 0
 1 0 0 0 0 0 1 0 0 1 0 0 1 1 0 1 0], shape=(128,), dtype=int64)


## Classification Modeling

LSTMs (Long Short-Term Memory networks)

In [33]:
# Size of the embedding table: the num_words bound the tokenizer was created with
vocab_size = tokenizer.num_words
# The model is fed sequences of df.len_text.max() time steps (the length of the longest encoded tweet)

model = tf.keras.Sequential(
    [
     # Input word embedding layer.
     # mask_zero=True marks id 0 (the value pad_sequences appends) as padding, so the LSTMs skip
     # those time steps instead of treating them as words. Without it, the state returned by the
     # last LSTM is computed after all the trailing zeros of a post-padded tweet.
     Embedding(vocab_size, 64, input_shape=[df.len_text.max(),], mask_zero=True, name="embedding"),

     # Keeps one output per time step so the next LSTM still receives a sequence
     LSTM(units=64, return_sequences=True),

     # Returns only the last output
     LSTM(units=32, return_sequences=False),

     # Classical dense layers once the data is flat
     Dense(16, activation='relu'),
     Dense(8, activation='relu'),

     # NOTE(review): no dropout is applied. The recorded training history (training accuracy
     # near 0.98) suggests overfitting may be worth revisiting — TODO confirm against validation curves.

     # Output layer: a single sigmoid neuron for the binary target
     Dense(1, activation="sigmoid", name="output_layer")
    ]
)

In [34]:
# Print the layer-by-layer summary (output shapes and parameter counts) to check the architecture
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 64)            640000    
                                                                 
 lstm_4 (LSTM)               (None, 25, 64)            33024     
                                                                 
 lstm_5 (LSTM)               (None, 32)                12416     
                                                                 
 dense_4 (Dense)             (None, 16)                528       
                                                                 
 dense_5 (Dense)             (None, 8)                 136       
                                                                 
 output_layer (Dense)        (None, 1)                 9         
                                                                 
Total params: 686,113
Trainable params: 686,113
Non-tr

In [35]:
# Binary classification set-up: Adam with its default settings and binary cross-entropy loss
adam_optimizer = tf.keras.optimizers.Adam()
bce_loss = tf.keras.losses.BinaryCrossentropy()
# This metric is logged under the "binary_accuracy" / "val_binary_accuracy" history keys,
# which are the keys the plotting cell reads.
accuracy_metric = tf.keras.metrics.BinaryAccuracy()

model.compile(
    optimizer=adam_optimizer,
    loss=bce_loss,
    metrics=[accuracy_metric]
)

In [36]:
# Write training logs to ./logs so they can be inspected with TensorBoard
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")
fit_callbacks = [tensorboard_callback]

# Train for 20 epochs, evaluating on the held-out set at the end of each epoch
model.fit(
    train_data,
    validation_data=test_data,
    epochs=20,
    callbacks=fit_callbacks
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f9ba4081f50>

In [37]:
# Persist the trained model (HDF5) and its per-epoch training history (JSON) on Drive
saved_model_path = "/content/drive/MyDrive/Colab Notebooks/Module 10 - Projects/06 - NLP with Disaster Tweets/model_LSTM.h5"
saved_history_path = "/content/drive/MyDrive/Colab Notebooks/Module 10 - Projects/06 - NLP with Disaster Tweets/LSTM_history.json"

model.save(saved_model_path)

with open(saved_history_path, 'w') as history_file:
  json.dump(model.history.history, history_file)

## Classification Evaluation

Visualizing the training process and interpreting the results for our predictive models

In [38]:
# Reload the saved history and model from Drive so the evaluation can run without retraining
history_json_path = "/content/drive/MyDrive/Colab Notebooks/Module 10 - Projects/06 - NLP with Disaster Tweets/LSTM_history.json"
model_h5_path = "/content/drive/MyDrive/Colab Notebooks/Module 10 - Projects/06 - NLP with Disaster Tweets/model_LSTM.h5"

with open(history_json_path, 'r') as history_file:
  LSTM_history = json.load(history_file)

model_LSTM = tf.keras.models.load_model(model_h5_path)
model_LSTM.summary()

# Display the per-epoch metrics as the cell output
LSTM_history

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 64)            640000    
                                                                 
 lstm_4 (LSTM)               (None, 25, 64)            33024     
                                                                 
 lstm_5 (LSTM)               (None, 32)                12416     
                                                                 
 dense_4 (Dense)             (None, 16)                528       
                                                                 
 dense_5 (Dense)             (None, 8)                 136       
                                                                 
 output_layer (Dense)        (None, 1)                 9         
                                                                 
Total params: 686,113
Trainable params: 686,113
Non-tr

{'binary_accuracy': [0.5675320029258728,
  0.8171557784080505,
  0.904815673828125,
  0.9341610074043274,
  0.9529721736907959,
  0.961249053478241,
  0.9672686457633972,
  0.9721595048904419,
  0.9747930765151978,
  0.9762979745864868,
  0.9776147603988647,
  0.9811888933181763,
  0.9828818440437317,
  0.9834461808204651,
  0.980812668800354,
  0.9810007810592651,
  0.9823175072669983,
  0.9823175072669983,
  0.9806245565414429,
  0.9826937317848206],
 'loss': [0.6788202524185181,
  0.44187626242637634,
  0.2670855224132538,
  0.19144946336746216,
  0.14533112943172455,
  0.12138615548610687,
  0.103753000497818,
  0.09354479610919952,
  0.08533422648906708,
  0.08330875635147095,
  0.07356011867523193,
  0.0672183632850647,
  0.062029674649238586,
  0.05716395378112793,
  0.0537089966237545,
  0.05308963730931282,
  0.047715578228235245,
  0.045865803956985474,
  0.045269403606653214,
  0.0441364161670208],
 'val_binary_accuracy': [0.6546730995178223,
  0.7784115672111511,
  0.779289

In [39]:
# Two panels side by side: losses on the left, accuracies on the right
fig = make_subplots(rows=1, cols=2)

# (history key, subplot column) for each curve, in drawing order
curves = [
    ("loss", 1),
    ("val_loss", 1),
    ("binary_accuracy", 2),
    ("val_binary_accuracy", 2),
]

for metric_name, col_idx in curves:
  fig.add_trace(
      go.Scatter(y=LSTM_history[metric_name], mode='lines', name=metric_name),
      row=1, col=col_idx
  )

fig.update_layout(height=600, width=1200, title_text="Model Performance")
fig.show()