## Natural Language Processing with Disaster Tweets

## Predict which Tweets are about real disasters and which ones are not

Twitter has become an important communication channel in times of emergency. The ubiquity of smartphones enables people to announce an emergency they’re observing in real time. Because of this, more agencies are interested in programmatically monitoring Twitter.

We have access to a dataset of 10,000 tweets that were hand-classified.

### Import libraries

In [1]:
# Download the spaCy English model `en_core_web_sm`, which the import cell below imports as a package.
!python -m spacy download en_core_web_sm -q

[K     |████████████████████████████████| 12.0 MB 3.8 MB/s 
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import json
import pathlib
import os
import io
import warnings
import en_core_web_sm
import plotly.graph_objects as go

from google.colab import drive
from spacy.lang.en.stop_words import STOP_WORDS
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout
from plotly.subplots import make_subplots
from sklearn.metrics import f1_score

# NOTE(review): this silences every warning category for the rest of the notebook,
# so genuine problems (e.g. pandas or TensorFlow deprecation warnings) are hidden as well.
warnings.filterwarnings('ignore')

### Import data


In [3]:
# Mount Google Drive so that files stored there are reachable under /content/drive.
drive.mount('/content/drive')

# Raw, hand-labelled training tweets.
train_csv_path = '/content/drive/MyDrive/Colab Notebooks/disaster-tweets/src/train.csv'
df = pd.read_csv(train_csv_path, encoding="utf-8")
df.head()

Mounted at /content/drive


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


### Data preprocessing

In [4]:
# let's take the columns we're interested in: the tweet text and its label
# .copy() makes `df` an independent frame; later cells add columns to it, and doing
# that on a plain column selection is an assignment on a slice (SettingWithCopyWarning)
df = df[["text", "target"]].copy()
df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
# DO NOT RUN THIS COMMAND (TAKES TIME)
# IMPORT INSTEAD THE CLEANED DATASET IN THE NEXT CELL

# spacy and english initialisation
nlp = en_core_web_sm.load()


def _lemmatize_without_stop_words(text):
    """Return the space-joined lemmas of `text`, skipping tokens whose text or lemma is a stop word."""
    return " ".join(
        token.lemma_
        for token in nlp(text)
        if token.lemma_ not in STOP_WORDS and token.text not in STOP_WORDS
    )


# 1) keep only alphanumeric characters and spaces (punctuation, '#', '@', ... are removed)
df["text_clean"] = df["text"].apply(lambda x: "".join(ch for ch in x if ch.isalnum() or ch == " "))

# 2) collapse runs of spaces, lowercase and trim
#    a regex replace is needed: plain str.replace(" +", " ") only matches the literal
#    two-character string " +", so repeated spaces were never collapsed
df["text_clean"] = df["text_clean"].str.replace(r" +", " ", regex=True).str.lower().str.strip()

# 3) lemmatize and remove stop words
df["text_clean"] = df["text_clean"].apply(_lemmatize_without_stop_words)

# drop missing AND empty results: an empty string is written as an empty CSV field,
# which pd.read_csv (next cell) would load back as NaN
df = df[df["text_clean"].notna() & df["text_clean"].ne("")]
df.to_csv("/content/drive/MyDrive/Colab Notebooks/disaster-tweets/src/train_clean.csv", index=False)
df

Unnamed: 0,text,target,text_clean
0,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake allah forgive
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,All residents asked to 'shelter in place' are ...,1,resident ask shelter place notify officer evac...
3,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfire evacuation order...
4,Just got sent this photo from Ruby #Alaska as ...,1,send photo ruby alaska smoke wildfire pour school
...,...,...,...
7608,Two giant cranes holding a bridge collapse int...,1,giant crane hold bridge collapse nearby home h...
7609,@aria_ahrary @TheTawniest The out of control w...,1,ariaahrary thetawniest control wild fire calif...
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,m194 0104 utc5 km s volcano hawaii httptcozdto...
7611,Police investigating after an e-bike collided ...,1,police investigate ebike collide car little po...


In [5]:
# Reload the cleaned tweets written by the (slow) cleaning cell above.
clean_csv_path = '/content/drive/MyDrive/Colab Notebooks/disaster-tweets/src/train_clean.csv'
df = pd.read_csv(clean_csv_path, encoding="utf-8")

In [6]:
# vocabulary size handed to the tokenizer as `num_words`
vocab_size = 10000

# fit a Keras Tokenizer on the cleaned tweets ...
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size)
cleaned_tweets = df["text_clean"]
tokenizer.fit_on_texts(cleaned_tweets)

# ... then encode each tweet as a list of integer word indices and record its length
df["text_encoded"] = tokenizer.texts_to_sequences(cleaned_tweets)
df["len_text"] = df["text_encoded"].apply(len)

# tweets that encode to an empty sequence are discarded
df = df[df["len_text"].ne(0)]
df.head()

Unnamed: 0,text,target,text_clean,text_encoded,len_text
0,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake allah forgive,"[3679, 410, 167, 1368, 1944]",5
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,"[112, 2, 156, 504, 5569, 5570, 955]",7
2,All residents asked to 'shelter in place' are ...,1,resident ask shelter place notify officer evac...,"[1369, 442, 1692, 318, 5571, 288, 182, 1692, 3...",11
3,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfire evacuation order...,"[2312, 6, 2313, 68, 182, 289, 34]",7
4,Just got sent this photo from Ruby #Alaska as ...,1,send photo ruby alaska smoke wildfire pour school,"[175, 117, 5572, 1693, 168, 68, 2314, 103]",8


In [7]:
# TensorFlow cannot build a tensor dataset from a column of variable-length Python lists,
# so the encoded tweets are first packed into a single rectangular numpy array.
# pad_sequences appends zeros at the end (padding="post") so that all sequences have equal length.
encoded_tweets = df["text_encoded"]
reviews_pad = tf.keras.preprocessing.sequence.pad_sequences(encoded_tweets, padding="post")

# pair every padded sequence with its label
labels = df["target"].values
full_ds = tf.data.Dataset.from_tensor_slices((reviews_pad, labels))

In [8]:
# train test split (70/30)
n_samples = df.shape[0]
TAKE_SIZE = int(0.7 * n_samples)
BATCH_SIZE = 128
SPLIT_SEED = 42  # fixed so that the same tweets end up in train / test on every run

# .shuffle() randomises the order once so that take/skip below produce a random split
# reshuffle_each_iteration=False is very important: without it the order would change on
# every pass over the data and tweets would move between the train and the test sets
# seed=SPLIT_SEED additionally keeps the split identical across kernel restarts, so a model
# reloaded from disk is still evaluated on tweets it was not trained on
shuffled_ds = full_ds.shuffle(n_samples, seed=SPLIT_SEED, reshuffle_each_iteration=False)

# first TAKE_SIZE tweets -> train, the remainder -> test; each subset gets its own shuffle
train_data_w = shuffled_ds.take(TAKE_SIZE).shuffle(TAKE_SIZE)
test_data_w = shuffled_ds.skip(TAKE_SIZE).shuffle(n_samples - TAKE_SIZE)

# .batch on both sets to organise them by batches of BATCH_SIZE observations
train_data = train_data_w.batch(BATCH_SIZE)
test_data = test_data_w.batch(BATCH_SIZE)

In [9]:
# sanity check: print the padded sequences and the labels of one training batch
for batch in train_data.take(1):
  text, target = batch
  print(text, target)

tf.Tensor(
[[1238    2 2060 ...    0    0    0]
 [ 584   86    0 ...    0    0    0]
 [4385  681 1050 ...    0    0    0]
 ...
 [6576  495   69 ...    0    0    0]
 [  14  181  985 ...    0    0    0]
 [  75  374 2030 ...    0    0    0]], shape=(128, 25), dtype=int32) tf.Tensor(
[0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 0 1 1 0 1 1 0 0 1 0
 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 1 1 1 1 1 0 1 1 1 0 0 1 1 0 1 1 0
 0 1 0 1 0 1 0 1 1 1 1 1 0 1 1 1 1 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 1 0 0 1 1
 1 1 1 1 0 0 1 0 0 1 0 0 0 1 0 0 1], shape=(128,), dtype=int64)


## Classification Modeling

LSTMs (Long Short Term Memory networks)

In [10]:
vocab_size = tokenizer.num_words
# text.shape[1] = df.len_text.max()
max_len = df.len_text.max()

# the network is assembled layer by layer, in the order the data flows through it
model = tf.keras.Sequential()

# input word embedding layer: one 64-dimensional vector per word index
# NOTE(review): mask_zero is not set, so the zero padding is fed to the LSTMs like any other index
model.add(Embedding(vocab_size, 64, input_shape=[max_len,], name="embedding"))

# first layer LSTM, maintains the sequential nature (one output per time step)
model.add(LSTM(units=64, return_sequences=True))

# second layer LSTM, returns the last output only
model.add(LSTM(units=32, return_sequences=False))

# classical dense layers once the data is flat
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))

# dropout layer to prevent from overfitting (currently disabled)
# model.add(Dropout(rate=0.7))

# output layer: a single sigmoid neuron, since there is one binary target
model.add(Dense(1, activation="sigmoid", name="output_layer"))

In [11]:
# Print the layers of the model with their output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 64)            640000    
                                                                 
 lstm (LSTM)                 (None, 25, 64)            33024     
                                                                 
 lstm_1 (LSTM)               (None, 32)                12416     
                                                                 
 dense (Dense)               (None, 16)                528       
                                                                 
 dense_1 (Dense)             (None, 8)                 136       
                                                                 
 output_layer (Dense)        (None, 1)                 9         
                                                                 
Total params: 686,113
Trainable params: 686,113
Non-trai

In [12]:
# Adam optimizer with its default settings, binary cross-entropy for the sigmoid output,
# and binary accuracy as the tracked metric (stored in the history as "binary_accuracy").
optimizer = tf.keras.optimizers.Adam()
loss_fn = tf.keras.losses.BinaryCrossentropy()
tracked_metrics = [tf.keras.metrics.BinaryAccuracy()]

model.compile(optimizer=optimizer, loss=loss_fn, metrics=tracked_metrics)

In [13]:
# write TensorBoard logs to the "logs" directory during training
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

# 50 epochs on the training batches; the held-out split is passed as validation data
fit_options = dict(
    epochs=50,
    validation_data=test_data,
    callbacks=[tensorboard_callback],
)
model.fit(train_data, **fit_options)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f23ee74c050>

In [14]:
# Persist the trained network and its per-epoch metrics on Drive so the evaluation
# section can reload them.
model_path = "/content/drive/MyDrive/Colab Notebooks/disaster-tweets/model.h5"
history_path = "/content/drive/MyDrive/Colab Notebooks/disaster-tweets/history.json"

model.save(model_path)

with open(history_path, 'w') as history_file:
  json.dump(model.history.history, history_file)

## Classification Evaluation

Visualizing the training process and interpreting the results for our predictive models

In [15]:
# Reload the saved training history and model from Drive.
history_path = "/content/drive/MyDrive/Colab Notebooks/disaster-tweets/history.json"
model_path = "/content/drive/MyDrive/Colab Notebooks/disaster-tweets/model.h5"

with open(history_path, 'r') as history_file:
  LSTM_history = json.load(history_file)

model_LSTM = tf.keras.models.load_model(model_path)
model_LSTM.summary()

# display the per-epoch metrics as the cell output
LSTM_history

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 64)            640000    
                                                                 
 lstm (LSTM)                 (None, 25, 64)            33024     
                                                                 
 lstm_1 (LSTM)               (None, 32)                12416     
                                                                 
 dense (Dense)               (None, 16)                528       
                                                                 
 dense_1 (Dense)             (None, 8)                 136       
                                                                 
 output_layer (Dense)        (None, 1)                 9         
                                                                 
Total params: 686,113
Trainable params: 686,113
Non-trai

{'binary_accuracy': [0.6260346174240112,
  0.8431151509284973,
  0.9106470942497253,
  0.9367945790290833,
  0.9537246227264404,
  0.9620015025138855,
  0.9608728289604187,
  0.9651994109153748,
  0.9719713926315308,
  0.97516930103302,
  0.977990984916687,
  0.9778028726577759,
  0.9804364442825317,
  0.9817531704902649,
  0.9815650582313538,
  0.9796839952468872,
  0.9828818440437317,
  0.9836342930793762,
  0.9778028726577759,
  0.9791196584701538,
  0.980812668800354,
  0.9798721075057983,
  0.9794958829879761,
  0.9825056195259094,
  0.984762966632843,
  0.9813769459724426,
  0.9817531704902649,
  0.9817531704902649,
  0.9838224053382874,
  0.9825056195259094,
  0.9843867421150208,
  0.9840105175971985,
  0.9836342930793762,
  0.9798721075057983,
  0.9825056195259094,
  0.9825056195259094,
  0.9838224053382874,
  0.9834461808204651,
  0.9825056195259094,
  0.983258068561554,
  0.983258068561554,
  0.9840105175971985,
  0.9823175072669983,
  0.9813769459724426,
  0.9834461808204651

In [16]:
# One row, two panels: losses on the left, accuracies on the right.
fig = make_subplots(rows=1, cols=2)

# (history key, subplot column) for each curve, in drawing order
curves = [
    ("loss", 1),
    ("val_loss", 1),
    ("binary_accuracy", 2),
    ("val_binary_accuracy", 2),
]

for metric_name, panel_col in curves:
    fig.add_trace(
        go.Scatter(y=LSTM_history[metric_name], mode='lines', name=metric_name),
        row=1, col=panel_col
    )

fig.update_layout(height=600, width=1200, title_text="Model Performance")
fig.show()