# Twitter Disaster Prediction

### Import Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn import linear_model, model_selection, preprocessing, metrics, svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier

import nltk
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
import string
# nltk.download('stopwords')
# nltk.download('punkt')

from plotnine import *

# tensorflow and Keras
import tensorflow as tf
from tensorflow import keras
import tensorflow_hub as hub


### Read Data

In [2]:
# Load the competition files: the labelled tweets, the tweets to score,
# and the sample submission file.
train     = pd.read_csv('../data/nlp-getting-started/train.csv')
test      = pd.read_csv('../data/nlp-getting-started/test.csv')
subm_samp = pd.read_csv('../data/nlp-getting-started/sample_submission.csv')

# Preview the first five training rows (head() defaults to n=5)
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
##
## Number of training rows per value of the `target` label
print( ' Class 1 Size : ' + str( len( train.loc[ train['target'] == 1 ] ) ) )
print( ' Class 0 Size : ' + str( len( train.loc[ train['target'] == 0 ] ) ) )

 Class 1 Size : 3271
 Class 0 Size : 4342


## Feature Engineering

### Text Pre-processing

In [4]:
###
### Function to process data
###
def textProcessing(dt, textCol, cols = ['id','keyword','text','tokenized','new_text','target'] ):
    """Lower-case, strip punctuation, tokenize and remove stopwords from a text column.

    Parameters
    ----------
    dt : pandas.DataFrame
        Input frame. It is copied; the caller's frame is not modified.
    textCol : str
        Name of the column in ``dt`` that holds the raw text.
    cols : list of str
        Columns of the processed frame to return. The default includes
        ``'target'``, so pass an explicit list for frames without that column.

    Returns
    -------
    pandas.DataFrame
        The selected ``cols`` of the processed copy. ``'tokenized'`` holds the
        list of tokens left after stopword removal and ``'new_text'`` holds
        those tokens joined by single spaces.

    Raises
    ------
    KeyError
        If ``textCol`` or a name in ``cols`` is not a column of the frame.
    """
    # Work on a copy so the caller's frame stays untouched
    dt_copy = dt.copy()

    # Build the stopword set and the punctuation-deleting table once, not per row
    stop_words = set(stopwords.words('english'))
    punct_table = str.maketrans('', '', string.punctuation)

    # Missing text becomes '' so the string operations never hit a float NaN;
    # the vectorised .str methods replace the former row-wise DataFrame.apply.
    cleaned = ( dt_copy[textCol]
                .fillna('')
                .astype(str)
                .str.lower()
                .str.translate(punct_table) )

    # Tokenize (kept in 'tmp_text', as before, in case a caller selects it)
    dt_copy['tmp_text'] = cleaned.apply(word_tokenize)

    # Remove stopwords
    dt_copy['tokenized'] = dt_copy['tmp_text'].apply(
        lambda tokens: [tok for tok in tokens if tok not in stop_words] )

    # Merge the surviving tokens back into one space-separated string
    dt_copy['new_text'] = dt_copy['tokenized'].apply(' '.join)

    # Return only the requested columns
    return dt_copy[ cols ]

# Clean the training tweets; the default `cols` keeps the `target` column
newtrain = textProcessing( dt = train, textCol = 'text' )
newtrain.head()

Unnamed: 0,id,keyword,text,tokenized,new_text,target
0,1,,Our Deeds are the Reason of this #earthquake M...,"[deeds, reason, earthquake, may, allah, forgiv...",deeds reason earthquake may allah forgive us,1
1,4,,Forest fire near La Ronge Sask. Canada,"[forest, fire, near, la, ronge, sask, canada]",forest fire near la ronge sask canada,1
2,5,,All residents asked to 'shelter in place' are ...,"[residents, asked, shelter, place, notified, o...",residents asked shelter place notified officer...,1
3,6,,"13,000 people receive #wildfires evacuation or...","[13000, people, receive, wildfires, evacuation...",13000 people receive wildfires evacuation orde...,1
4,7,,Just got sent this photo from Ruby #Alaska as ...,"[got, sent, photo, ruby, alaska, smoke, wildfi...",got sent photo ruby alaska smoke wildfires pou...,1


### Split Train / Test Set

In [5]:
# Hold out 25% of the labelled tweets for evaluation (fixed seed => repeatable split)
X_train, X_test, y_train, y_test = train_test_split(newtrain.new_text, 
                                                    newtrain.target, 
                                                    test_size=0.25, 
                                                    random_state=123)


# Report size and class balance of each split.
# The hold-out lines previously said "in train"; they now say "in test".
print('Train shape:' + str(X_train.shape) )
print('Proportion of class 0 in train: ' + str( np.round( 100*np.sum(y_train==0) / len(y_train), 2 ) ) + '%' )
print('Proportion of class 1 in train: ' + str( np.round( 100*np.sum(y_train==1) / len(y_train), 2 ) ) + '%' )
print('')
print('Test shape:' + str(X_test.shape) )
print('Proportion of class 0 in test: ' + str( np.round( 100*np.sum(y_test==0) / len(y_test), 2 ) ) + '%' )
print('Proportion of class 1 in test: ' + str( np.round( 100*np.sum(y_test==1) / len(y_test), 2 ) ) + '%' )

Train shape:(5709,)
Proportion of class 0 in train: 56.84%
Proportion of class 1 in train: 43.16%

Test shape:(1904,)
Proportion of class 0 in train: 57.62%
Proportion of class 1 in train: 42.38%


## Models

### Tf-idf + Bernoulli Naive Bayes Classifier

In [6]:
###
### TF-IDF
###

from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the cleaned training text only, then transform the
# held-out text with that same fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform( X_train )
X_test_tfidf  = vectorizer.transform( X_test )

# Shapes of the two TF-IDF matrices
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)


(5709, 18221)
(1904, 18221)


In [7]:

##
## Fit Bernoulli Naive-Bayes on the TF-IDF features
clf_tfidf = BernoulliNB().fit( X_train_tfidf, y_train )

##
## Predict once per split and reuse the predictions for both metrics
nb_tfidf_train_preds = clf_tfidf.predict( X_train_tfidf )
nb_tfidf_test_preds  = clf_tfidf.predict( X_test_tfidf )

print( 'Train 0/1 Accuracy: ' + str( np.mean( nb_tfidf_train_preds == y_train ) ) )
print( 'Test 0/1 Accuracy: ' + str( np.mean( nb_tfidf_test_preds == y_test ) ) )
print('')
# The labels are passed as 1-D series; no column-vector reshape is needed
print( 'Train F1 Score: ' + str( metrics.f1_score( y_true = y_train,
                                                   y_pred = nb_tfidf_train_preds ) ) )
print( 'Test F1 Score: ' + str( metrics.f1_score( y_true = y_test,
                                                  y_pred = nb_tfidf_test_preds ) ) )


Train 0/1 Accuracy: 0.9164477141355754
Test 0/1 Accuracy: 0.8025210084033614

Train F1 Score: 0.895049504950495
Test F1 Score: 0.7251461988304094


### Tf-idf + Ridge Classifier

In [8]:
from sklearn.linear_model import RidgeClassifierCV

print('')
print('Ridge Classifier')
print('')

# Ridge classifier on the TF-IDF features; the regularisation strength is
# chosen from `alphas` by 5-fold cross-validation on the training split.
clf_ridge_tfidf = RidgeClassifierCV( alphas=[1e-3, 1e-2, 1e-1, 1, 5, 10], 
                              cv = 5 ).fit(X_train_tfidf, y_train)

# Predict once per split and reuse the predictions for both metrics
ridge_tfidf_train_preds = clf_ridge_tfidf.predict( X_train_tfidf )
ridge_tfidf_test_preds  = clf_ridge_tfidf.predict( X_test_tfidf )

print( 'Train 0/1 Accuracy: ' + str( np.mean( ridge_tfidf_train_preds == y_train ) ) )
print( 'Test 0/1 Accuracy: ' + str( np.mean( ridge_tfidf_test_preds == y_test ) ) )
print('')
# The labels are passed as 1-D series; no column-vector reshape is needed
print( 'Train F1 Score: ' + str( metrics.f1_score( y_true = y_train,
                                                   y_pred = ridge_tfidf_train_preds ) ) )
print( 'Test F1 Score: ' + str( metrics.f1_score( y_true = y_test,
                                                  y_pred = ridge_tfidf_test_preds ) ) )




Ridge Classifier

Train 0/1 Accuracy: 0.9719740760203188
Test 0/1 Accuracy: 0.7878151260504201

Train F1 Score: 0.966914805624483
Test F1 Score: 0.7352555701179555


### Tf-idf + Random Forest

In [119]:
# from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# from sklearn.ensemble import RandomForestClassifier

# # Number of trees in random forest
# n_estimators = [ 200, 300]#, 400, 500 ]

# # Number of features to consider at every split
# max_features = ['auto', 40, 50]

# # Create the parameter grid
# param_grid = {'n_estimators': n_estimators,
#               'max_features': max_features }

# # Use an exhaustive grid search to find the best hyperparameters
# # First create the base model to tune
# rf = RandomForestClassifier()
# rf_tuned = GridSearchCV(estimator = rf, 
#                         param_grid = param_grid, 
#                         scoring = 'f1',
#                         cv = 3, verbose= 2, n_jobs = -1 )

# # Fit the grid search model
# rf_tuned.fit( X_train_tfidf, y_train )

# # Print best parameters
# rf_tuned.best_params_

In [194]:
# print( 'Train 0/1 Accuracy: ' + str( np.mean( rf_tuned.predict(X_train_tfidf) == y_train ) ) )
# print( 'Test 0/1 Accuracy: ' + str( np.mean( rf_tuned.predict(X_test_tfidf) == y_test ) ) )
# print('')
# print( 'Train F1 Score: ' + str( metrics.f1_score( y_true = np.reshape(y_train.to_numpy(), (-1, 1)),
#                                                    y_pred = rf_tuned.predict(X_train_tfidf) ) ) )
# print( 'Test F1 Score: ' + str( metrics.f1_score( y_true = np.reshape(y_test.to_numpy(), (-1, 1)),
#                                                   y_pred = rf_tuned.predict(X_test_tfidf) ) ) )

### Universal Sentence Encoder + Fully Connected Layer

In [9]:
import tensorflow as tf

# Load the Universal Sentence Encoder (version 4) from TensorFlow Hub and
# run it on two example sentences as a quick check that the module works.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
embeddings = embed(["The quick brown fox jumps over the lazy dog.",
                    "I am a sentence for which I would like to get its embedding"])

In [10]:
# Embed the cleaned text of the training split, then of the held-out split,
# with the sentence encoder
X_train_embed, X_test_embed = embed( X_train ), embed( X_test )

In [11]:
from sklearn.linear_model import RidgeClassifierCV

print('')
print('Ridge Classifier')
print('')

# Ridge classifier on the sentence embeddings; the regularisation strength is
# chosen from `alphas` by 5-fold cross-validation on the training split.
clf_ridge = RidgeClassifierCV( alphas=[1e-3, 1e-2, 1e-1, 1, 5, 10], 
                              cv = 5 ).fit(X_train_embed, y_train)

# Predict once per split and reuse the predictions for both metrics
ridge_embed_train_preds = clf_ridge.predict( X_train_embed )
ridge_embed_test_preds  = clf_ridge.predict( X_test_embed )

print( 'Train 0/1 Accuracy: ' + str( np.mean( ridge_embed_train_preds == y_train ) ) )
print( 'Test 0/1 Accuracy: ' + str( np.mean( ridge_embed_test_preds == y_test ) ) )
print('')
# The labels are passed as 1-D series; no column-vector reshape is needed
print( 'Train F1 Score: ' + str( metrics.f1_score( y_true = y_train,
                                                   y_pred = ridge_embed_train_preds ) ) )
print( 'Test F1 Score: ' + str( metrics.f1_score( y_true = y_test,
                                                  y_pred = ridge_embed_test_preds ) ) )


##
## Fit Bernoulli Naive-Bayes on the sentence embeddings
print('')
print('Bernoulli Naive Bayes')
print('')

clf_nb = BernoulliNB().fit( X_train_embed, y_train )

##
## Predict once per split and reuse the predictions for both metrics
nb_embed_train_preds = clf_nb.predict( X_train_embed )
nb_embed_test_preds  = clf_nb.predict( X_test_embed )

print( 'Train 0/1 Accuracy: ' + str( np.mean( nb_embed_train_preds == y_train ) ) )
print( 'Test 0/1 Accuracy: ' + str( np.mean( nb_embed_test_preds == y_test ) ) )
print('')
# The labels are passed as 1-D series; no column-vector reshape is needed
print( 'Train F1 Score: ' + str( metrics.f1_score( y_true = y_train,
                                                   y_pred = nb_embed_train_preds ) ) )
print( 'Test F1 Score: ' + str( metrics.f1_score( y_true = y_test,
                                                  y_pred = nb_embed_test_preds ) ) )


Ridge Classifier

Train 0/1 Accuracy: 0.8237870029777544
Test 0/1 Accuracy: 0.8025210084033614

Train F1 Score: 0.7858663260962111
Test F1 Score: 0.7577319587628866

Bernoulli Naive Bayes

Train 0/1 Accuracy: 0.7777193904361535
Test 0/1 Accuracy: 0.7689075630252101

Train F1 Score: 0.7443078782994156
Test F1 Score: 0.7280593325092708


In [12]:
# Shape of the embedded training split; its second entry sizes the network input layer
X_train_embed.shape

TensorShape([5709, 512])

In [13]:
##
## Seed NumPy and TensorFlow before building the network so that weight
## initialisation (and, best effort, training) is repeatable between runs.
## 123 matches the seed used for the train/test split.
np.random.seed(123)
tf.random.set_seed(123)

##
## Define model's architecture:
## embedding vector -> 8 ReLU units -> 1 sigmoid unit
model = tf.keras.Sequential([
    tf.keras.layers.Dense(8, activation='relu', input_shape=(X_train_embed.shape[1],) ),
    tf.keras.layers.Dense(1, activation='sigmoid' ),
])

##
## Compile model: binary cross-entropy matches the single sigmoid output
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [14]:
## Train for 20 epochs, evaluating on the held-out split after every epoch
model.fit(
    x=X_train_embed,
    y=y_train,
    validation_data=(X_test_embed, y_test),
    epochs=20,
    verbose=1,
)

Train on 5709 samples, validate on 1904 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x10837b210>

In [15]:
##
## Evaluate the network on the training and held-out splits
print('')
print('Neural Net - Fully Connected')
print('')

# Round the sigmoid outputs to 0/1 and flatten them to one dimension
preds_train = np.round( model.predict(X_train_embed) ).reshape(-1)
preds_test  = np.round( model.predict(X_test_embed) ).reshape(-1)

# Share of predictions that equal the true label
print( 'Train 0/1 Accuracy: ' + str( (preds_train == y_train).mean() ) )
print( 'Test 0/1 Accuracy: ' + str( (preds_test == y_test).mean() ) )
print('')
print( 'Train F1 Score: ' + str( metrics.f1_score( y_train, preds_train ) ) )
print( 'Test F1 Score: ' + str( metrics.f1_score( y_test, preds_test ) ) )


Neural Net - Fully Connected

Train 0/1 Accuracy: 0.8327202662462778
Test 0/1 Accuracy: 0.8114495798319328

Train F1 Score: 0.7939590075512406
Test F1 Score: 0.7652060170045782


## Generate submission for Kaggle competition

In [16]:
# Apply the same cleaning to the tweets to be scored; `target` is left out of `cols`
newtest = textProcessing( dt = test,
                          textCol = 'text',
                          cols = ['id','keyword','text','tokenized','new_text'] )
newtest.head()

Unnamed: 0,id,keyword,text,tokenized,new_text
0,0,,Just happened a terrible car crash,"[happened, terrible, car, crash]",happened terrible car crash
1,2,,"Heard about #earthquake is different cities, s...","[heard, earthquake, different, cities, stay, s...",heard earthquake different cities stay safe ev...
2,3,,"there is a forest fire at spot pond, geese are...","[forest, fire, spot, pond, geese, fleeing, acr...",forest fire spot pond geese fleeing across str...
3,9,,Apocalypse lighting. #Spokane #wildfires,"[apocalypse, lighting, spokane, wildfires]",apocalypse lighting spokane wildfires
4,11,,Typhoon Soudelor kills 28 in China and Taiwan,"[typhoon, soudelor, kills, 28, china, taiwan]",typhoon soudelor kills 28 china taiwan


In [30]:

# Embed the cleaned test tweets and score them with the neural net,
# rounding the sigmoid outputs to 0/1 and flattening to one dimension
submission_embed = embed( newtest.new_text )
submission_preds = np.round( model.predict(submission_embed) ).reshape(-1)

# Store the predictions as integer labels and write the id/target file
newtest['target'] = submission_preds
newtest['target'] = newtest['target'].astype(int)
newtest[['id','target']].to_csv( '../data/submissions/submission_20200501.csv', index = False )