# Twitter Disaster Prediction

### Import libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn import linear_model, model_selection, preprocessing, metrics, svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier

import nltk
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
import string
# nltk.download('stopwords')
# nltk.download('punkt')

import matplotlib.pyplot as plt
from plotnine import *

# tensorflow and Keras
import tensorflow as tf
from tensorflow import keras
import tensorflow_hub as hub

### Read Data

In [2]:
# Load the training set, the test set and the sample submission (in that order)
train, test, subm_samp = (
    pd.read_csv( csv_path )
    for csv_path in (
        '../data/nlp-getting-started/train.csv',
        '../data/nlp-getting-started/test.csv',
        '../data/nlp-getting-started/sample_submission.csv',
    )
)

In [3]:
# Dimensions of the training frame as (rows, columns), then a preview of its first ten rows
print(train.shape)
train.iloc[:10]

(7613, 5)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [4]:
##
## Number of training rows carrying each target label
is_class_one  = train['target'] == 1
is_class_zero = train['target'] == 0
print( ' Class 1 Size : ' + str( len(train[is_class_one]) ) )
print( ' Class 0 Size : ' + str( len(train[is_class_zero]) ) )

 Class 1 Size : 3271
 Class 0 Size : 4342


## Feature Engineering

In [5]:
###
### Function to process data
###
def textProcessing(dt, textCol, cols = ['id','keyword','text','tokenized','new_text','target'] ):
    """Lowercase, strip punctuation, tokenize and stopword-filter a text column.

    Parameters
    ----------
    dt : pandas.DataFrame
        Input frame. It is copied; the caller's frame is not modified.
    textCol : str
        Name of the column in `dt` that holds the raw text.
    cols : list of str
        Columns of the processed copy to return. The default list includes
        'target', so pass an explicit list for frames without that column.
        The default list is never mutated here.

    Returns
    -------
    pandas.DataFrame
        The selected `cols` of the processed copy. 'tokenized' holds the list
        of tokens left after stopword removal and 'new_text' holds those
        tokens joined by single spaces. 'tmp_text' (the token list before
        stopword removal) is also available to `cols`.
    """
    # Work on a copy so the caller's frame is untouched
    dt_copy = dt.copy()

    # English stopwords as a set for fast membership tests
    stop_words = set(stopwords.words('english'))

    # Build the punctuation-stripping table once instead of once per row
    punct_table = str.maketrans('', '', string.punctuation)

    # Lowercase; missing text is treated as an empty string so the string
    # operations below do not fail on NaN
    lowered = dt[textCol].fillna('').str.lower()

    # Strip punctuation, then tokenize (column-wise apply, no row objects needed)
    dt_copy['tmp_text'] = lowered.apply(lambda text: word_tokenize(text.translate(punct_table)))

    # Remove stopwords
    dt_copy['tokenized'] = dt_copy['tmp_text'].apply(
        lambda tokens: [tok for tok in tokens if tok not in stop_words]
    )

    # Merge the remaining tokens back into a single string
    dt_copy['new_text'] = dt_copy['tokenized'].apply(' '.join)

    # Return only the requested columns
    return dt_copy[ cols ]

# Run the text-cleaning pipeline on the training tweets and preview the first five rows
newtrain = textProcessing( dt=train, textCol='text' )
newtrain.head()

Unnamed: 0,id,keyword,text,tokenized,new_text,target
0,1,,Our Deeds are the Reason of this #earthquake M...,"[deeds, reason, earthquake, may, allah, forgiv...",deeds reason earthquake may allah forgive us,1
1,4,,Forest fire near La Ronge Sask. Canada,"[forest, fire, near, la, ronge, sask, canada]",forest fire near la ronge sask canada,1
2,5,,All residents asked to 'shelter in place' are ...,"[residents, asked, shelter, place, notified, o...",residents asked shelter place notified officer...,1
3,6,,"13,000 people receive #wildfires evacuation or...","[13000, people, receive, wildfires, evacuation...",13000 people receive wildfires evacuation orde...,1
4,7,,Just got sent this photo from Ruby #Alaska as ...,"[got, sent, photo, ruby, alaska, smoke, wildfi...",got sent photo ruby alaska smoke wildfires pou...,1


### Read GloVe vectors

In [6]:
# this chunk takes a bit to run
# NOTE(review): absolute, machine-specific path -- point it at your local copy of the GloVe file
glovePath = '/Users/joaquimlyrio/Downloads/glove/glove.twitter.27B.25d.txt'

# Maps each word (first field of a line) to its vector (remaining fields, as float32)
embeddings_dict = {}

# Decode explicitly as UTF-8; without `encoding`, open() uses the platform's
# default encoding, which is not UTF-8 everywhere.
# NOTE(review): assumes the GloVe text file is UTF-8 encoded -- confirm for your download
with open( glovePath, 'r', encoding='utf-8' ) as f:
    for line in f:
        values = line.split()
        if not values:
            # blank line: nothing to parse (values[0] would raise IndexError)
            continue
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [7]:
from scipy import spatial

# Sanity check: cosine distance between the vectors of two words.
# The vectors are passed as stored (1-D arrays); scipy documents 1-D input for
# this function, so reshaping them into (n, 1) columns is unnecessary and can be
# rejected by SciPy versions that validate the input dimensionality.
spatial.distance.cosine( embeddings_dict['fire'], embeddings_dict['house'] )

0.11160975694656372

### Create embeddings for each tweet based on GloVe word embeddings (word -> sentence)

In [8]:
###
### Tweet embedding is the average of the word embeddings
def computeGloVeEnsemble( s, embeddings ):
    """Average the embedding vectors of the tokens in `s`.

    Parameters
    ----------
    s : sequence of str
        Tokens of one sentence/tweet. May be empty.
    embeddings : dict
        Maps a token to its embedding vector (array-like; all vectors are
        expected to share one shape).

    Returns
    -------
    numpy.ndarray
        Element-wise mean (float64) of the vectors of the tokens found in
        `embeddings`. If no token is found (including an empty `s`), a vector
        of NaN with the embedding shape is returned, so such rows can be
        removed later with `dropna()`.

    Raises
    ------
    ValueError
        If no token is found and `embeddings` is empty, because the vector
        shape cannot be inferred.
    """
    # Tokens without an embedding are ignored
    found = [ embeddings[w] for w in s if w in embeddings ]

    if not found:
        # Infer the vector shape from any stored vector instead of relying on
        # one specific word being present in the vocabulary
        reference = next( iter(embeddings.values()), None )
        if reference is None:
            raise ValueError('embeddings is empty; cannot infer the vector dimension')
        # Explicit NaN vector (the same values a 0/0 average would produce)
        return np.full( np.shape(reference), np.nan )

    # Accumulate in float64 so the result dtype does not depend on which tokens matched
    return np.mean( found, axis=0, dtype=np.float64 )

In [9]:
##
## One GloVe-based embedding per tweet (average of its word vectors)
dt_copy = newtrain.loc[:, ['tokenized']].copy()
gloveSeries = dt_copy.apply(
    lambda tweet: computeGloVeEnsemble( tweet['tokenized'], embeddings=embeddings_dict ),
    axis=1,
)

# expand the single column of vectors into one column per embedding dimension
gloveEmbeds = pd.DataFrame(gloveSeries)[0].apply(pd.Series)
gloveEmbeds



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,-0.034642,0.113201,-0.430404,0.489391,-0.853268,-0.306259,1.077509,-0.201949,-0.086634,0.437167,...,-0.115637,0.230185,0.121293,0.576177,-0.770221,0.263890,0.306265,-0.436617,0.242523,-0.605413
1,-0.567261,-0.202308,-0.182597,-0.002299,-0.911867,0.029404,0.044439,-0.149501,0.482768,-0.156152,...,0.251632,0.237843,0.109339,0.164636,0.505321,0.013704,-0.441899,0.354123,-0.906471,0.207224
2,-0.600594,0.683745,-0.279191,-0.182460,-0.222019,-0.768375,0.441783,-1.233742,0.358456,0.232217,...,0.012621,0.572267,-0.258214,0.147398,-0.713238,0.231145,-0.565903,0.002023,-0.839046,-0.239976
3,-0.302245,0.713562,-0.572892,0.051555,-0.331429,-0.620134,0.610448,-1.426075,0.374652,0.257453,...,-0.229748,0.337154,-0.285332,0.569386,-0.700277,-0.067718,-0.516196,0.041319,-0.958743,-0.079488
4,-0.440428,0.247316,0.190015,0.146781,-0.571416,0.001253,0.602678,-0.746124,0.165429,0.126911,...,0.120295,0.539177,-0.081443,-0.047724,0.087765,0.124569,-0.118026,0.481347,-0.423808,0.124419
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,-0.648327,0.070490,0.380065,-0.042914,-0.387247,0.036434,0.478167,-1.198946,0.579818,0.453287,...,-0.100243,0.376008,-0.045428,0.473099,-0.313125,-0.304531,-0.226257,0.297763,-1.010939,-0.234808
7609,-0.135133,0.096568,-0.276388,-0.126443,-0.288394,0.141316,0.764192,-0.630848,0.299536,-0.072727,...,-0.019655,0.220042,0.173761,0.262831,-0.165517,0.061640,-0.072648,0.181287,-0.851537,-0.043328
7610,-1.168000,-0.450815,0.532550,-0.399675,-1.363600,-0.095002,0.954495,-0.814990,0.556580,0.270772,...,-0.002895,0.338860,0.395045,1.074420,0.028868,-0.702900,-0.445012,0.069295,-1.341100,0.638075
7611,-0.287354,0.493432,-0.094455,-0.263650,0.140191,-0.228304,0.218582,-0.772906,0.312802,-0.002916,...,0.336800,0.562909,0.074350,0.353500,-0.345130,0.084602,0.363871,0.330678,-0.892794,-0.284000


In [10]:
# Attach the labels (aligned on the row index), then drop every row that contains a NaN
gloveEmbeds = gloveEmbeds.assign( target=newtrain['target'] ).dropna()
gloveEmbeds.shape

(7610, 26)

### Split Train/Test sets

In [11]:
# Hold out 25% of the rows as a test set (fixed random_state so the split is repeatable).
# Features are every column except 'target'; labels are the 'target' column.
X_train, X_test, y_train, y_test = train_test_split(gloveEmbeds.drop(columns='target'),
                                                    gloveEmbeds.target,
                                                    test_size=0.25,
                                                    random_state=123)


# Shape and class balance of each split
print('Train shape:' + str(X_train.shape) )
print('Proportion of class 0 in train: ' + str( np.round( 100*np.sum(y_train==0) / len(y_train), 2 ) ) + '%' )
print('Proportion of class 1 in train: ' + str( np.round( 100*np.sum(y_train==1) / len(y_train), 2 ) ) + '%' )
print('')
print('Test shape:' + str(X_test.shape) )
# These two lines report the test split, so they are labelled "test" (they previously said "train")
print('Proportion of class 0 in test: ' + str( np.round( 100*np.sum(y_test==0) / len(y_test), 2 ) ) + '%' )
print('Proportion of class 1 in test: ' + str( np.round( 100*np.sum(y_test==1) / len(y_test), 2 ) ) + '%' )

Train shape:(5707, 25)
Proportion of class 0 in train: 56.82%
Proportion of class 1 in train: 43.18%

Test shape:(1903, 25)
Proportion of class 0 in train: 57.65%
Proportion of class 1 in train: 42.35%


## Models

### (1) Bernoulli Naive Bayes

In [12]:
##
## Bernoulli Naive-Bayes: fit on the training split, then score both splits
print('')
print('Bernoulli Naive Bayes')
print('')

clf_nb = BernoulliNB()
clf_nb.fit( X_train, y_train )

# Predict once per split and reuse the predictions for both metrics
nb_pred_train = clf_nb.predict(X_train)
nb_pred_test  = clf_nb.predict(X_test)

print( 'Train 0/1 Accuracy: ' + str( np.mean( nb_pred_train == y_train ) ) )
print( 'Test 0/1 Accuracy: ' + str( np.mean( nb_pred_test == y_test ) ) )
print('')
nb_f1_train = metrics.f1_score( y_true = y_train, y_pred = nb_pred_train )
nb_f1_test  = metrics.f1_score( y_true = y_test, y_pred = nb_pred_test )
print( 'Train F1 Score: ' + str( nb_f1_train ) )
print( 'Test F1 Score: ' + str( nb_f1_test ) )


Bernoulli Naive Bayes

Train 0/1 Accuracy: 0.7333099702120204
Test 0/1 Accuracy: 0.7299001576458224

Train F1 Score: 0.7026182102383745
Test F1 Score: 0.6933174224343676


### (2) Ridge Classifier

In [13]:
print('')
print('Ridge Classifier')
print('')

from sklearn.linear_model import RidgeClassifierCV

# Ridge classifier with the regularisation strength picked from this grid (cv = 5)
ridge_alphas = [1e-3, 1e-2, 1e-1, 1, 5, 10]
clf_ridge = RidgeClassifierCV( alphas=ridge_alphas, cv = 5 )
clf_ridge.fit( X_train, y_train )

# Predict once per split and reuse the predictions for both metrics
ridge_pred_train = clf_ridge.predict(X_train)
ridge_pred_test  = clf_ridge.predict(X_test)

print( 'Train 0/1 Accuracy: ' + str( np.mean( ridge_pred_train == y_train ) ) )
print( 'Test 0/1 Accuracy: ' + str( np.mean( ridge_pred_test == y_test ) ) )
print('')
ridge_f1_train = metrics.f1_score( y_true = y_train, y_pred = ridge_pred_train )
ridge_f1_test  = metrics.f1_score( y_true = y_test, y_pred = ridge_pred_test )
print( 'Train F1 Score: ' + str( ridge_f1_train ) )
print( 'Test F1 Score: ' + str( ridge_f1_test ) )


Ridge Classifier

Train 0/1 Accuracy: 0.779568950411775
Test 0/1 Accuracy: 0.7682606410930111

Train F1 Score: 0.72472647702407
Test F1 Score: 0.7100591715976331


### (3) Feed-Forward Neural Network

In [14]:
# `tf` and `keras` are already imported in the notebook's first (imports) cell

##
## Fix TensorFlow's global random seed so repeated runs of this notebook are
## comparable (same value as the random_state used for the train/test split)
tf.random.set_seed(123)

##
## Define model's architecture
## Only the first layer declares the input shape (one value per feature column);
## each later layer takes its input size from the layer before it.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation='tanh', input_shape=(X_train.shape[1],) ),
    tf.keras.layers.Dense(32, activation='relu' ),
    tf.keras.layers.Dense(32, activation='tanh' ),
    tf.keras.layers.Dense(8, activation='relu' ),
    # single sigmoid unit: output in (0, 1), paired with binary cross-entropy below
    tf.keras.layers.Dense(1, activation='sigmoid' ),
])

##
## Compile model
model.compile(optimizer=keras.optimizers.Adam(),
              loss='binary_crossentropy',
              metrics=['accuracy'])

##
## Print model summary
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 16)                416       
_________________________________________________________________
dense_1 (Dense)              (None, 32)                544       
_________________________________________________________________
dense_2 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_3 (Dense)              (None, 8)                 264       
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 9         
Total params: 2,289
Trainable params: 2,289
Non-trainable params: 0
_________________________________________________________________


In [15]:
## Train for 20 epochs; the held-out split is passed as validation data
model.fit( x=X_train, y=y_train, epochs=20, validation_data=(X_test, y_test), verbose=1 )

Train on 5707 samples, validate on 1903 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1a409fca50>

In [16]:
##
## Score the network on both splits
print('')
print('Neural Net - Feed-forward')
print('')

# round the sigmoid outputs to 0/1 labels and flatten them to 1-D
preds_train = np.round( model.predict(X_train) ).reshape(-1)
preds_test  = np.round( model.predict(X_test) ).reshape(-1)

acc_train = np.mean( preds_train == y_train )
acc_test  = np.mean( preds_test == y_test )
print( 'Train 0/1 Accuracy: ' + str( acc_train ) )
print( 'Test 0/1 Accuracy: ' + str( acc_test ) )
print('')
f1_train = metrics.f1_score( y_true = y_train, y_pred = preds_train )
f1_test  = metrics.f1_score( y_true = y_test, y_pred = preds_test )
print( 'Train F1 Score: ' + str( f1_train ) )
print( 'Test F1 Score: ' + str( f1_test ) )


Neural Net - Feed-forward

Train 0/1 Accuracy: 0.8239004731032066
Test 0/1 Accuracy: 0.7976878612716763

Train F1 Score: 0.7802317953203586
Test F1 Score: 0.7468770545693624
