In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import re
import nltk
import numpy as np

from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import SimpleRNN
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout

%matplotlib inline

from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
import keras.backend as K

In [2]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [3]:
# setup for consolidated table, later

table = {"model 1":{"activationFn":"relu/sigmoid","max_log_length":None,"num_words":None,"embedding_param":None,"num_ltsm_lyrs":None,"num_dense_lyrs":None,"accuracy":None,"mse":None,"f1":None,"precision":None,"recall":None,"loss":None},
         "model 2":{"activationFn":"relu/sigmoid","max_log_length":None,"num_words":None,"embedding_param":None,"num_ltsm_lyrs":None,"num_dense_lyrs":None,"accuracy":None,"mse":None,"f1":None,"precision":None,"recall":None,"loss":None},
         "model 3":{"activationFn":"relu/sigmoid","max_log_length":None,"num_words":None,"embedding_param":None,"num_ltsm_lyrs":None,"num_dense_lyrs":None,"accuracy":None,"mse":None,"f1":None,"precision":None,"recall":None,"loss":None},
         "model 4":{"activationFn":"relu/sigmoid","max_log_length":None,"num_words":None,"embedding_param":None,"num_ltsm_lyrs":None,"num_dense_lyrs":None,"accuracy":None,"mse":None,"f1":None,"precision":None,"recall":None,"loss":None},
        }

pd.DataFrame(table).transpose().head(2)

Unnamed: 0,activationFn,max_log_length,num_words,embedding_param,num_ltsm_lyrs,num_dense_lyrs,accuracy,mse,f1,precision,recall,loss
model 1,relu/sigmoid,,,,,,,,,,,
model 2,relu/sigmoid,,,,,,,,,,,


In [4]:
seed = 8675309

In [5]:
tf.random.set_seed(seed)
np.random.seed(seed)

In [6]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [7]:
train_df.head(2)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1


In [8]:
test_df.head(2)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."


In [9]:
train_df.describe(include='all')

Unnamed: 0,id,keyword,location,text,target
count,7613.0,7552,5080,7613,7613.0
unique,,221,3341,7503,
top,,fatalities,USA,11-Year-Old Boy Charged With Manslaughter of T...,
freq,,45,104,10,
mean,5441.934848,,,,0.42966
std,3137.11609,,,,0.49506
min,1.0,,,,0.0
25%,2734.0,,,,0.0
50%,5408.0,,,,0.0
75%,8146.0,,,,1.0


In [10]:
train_df.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [11]:
# describe yields 10 duplicates but unique does not.
train_df["text"].unique

<bound method Series.unique of 0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object>

In [12]:
train_df.shape

(7613, 5)

In [13]:
train_df["text"][2]

"All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"

In [14]:
# leaving test set alone for now
dataset = train_df.values

In [15]:
dataset.shape

(7613, 5)

In [16]:
X = dataset[:, 3]

In [17]:
y = dataset[:, -1]

In [18]:
y

array([1, 1, 1, ..., 1, 1, 1], dtype=object)

In [19]:
X[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [20]:
y[0]

1

In [21]:
# turns words into numbers
tokenizer = Tokenizer(filters='\t\n', char_level=True)
tokenizer.fit_on_texts(X)

num_words = len(tokenizer.word_index)+1
X = tokenizer.texts_to_sequences(X)

In [22]:
X[0][0]

5

In [23]:
max_log_length = 1024
X_processed = sequence.pad_sequences(X, maxlen=max_log_length)

In [24]:
X_processed[0][0]

0

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.30, random_state=seed)

In [26]:
# necessary
X_train = np.asarray(X_train).astype('float32')
y_train = np.asarray(y_train).astype('float32')
X_test = np.asarray(X_test).astype('float32')
y_test = np.asarray(y_test).astype('float32')

In [27]:
X_train.shape

(5329, 1024)

In [28]:
X_test.shape

(2284, 1024)

In [29]:
y_train.shape

(5329,)

In [30]:
y_test.shape

(2284,)

In [31]:
X_train[0]

array([ 0.,  0.,  0., ...,  9., 10., 43.], dtype=float32)

In [32]:
X_test[0]

array([ 0.,  0.,  0., ...,  2., 16., 15.], dtype=float32)

In [33]:
y_train[0]

1.0

In [34]:
y_test[0]

0.0

# Model 1 : Simple RNN

In [35]:
rnn_model = Sequential()

In [36]:
rnn_model.add(Embedding(num_words, 32, input_length=max_log_length))

In [37]:
rnn_model.add(SimpleRNN(units=32, activation='relu'))

In [38]:
rnn_model.add(Dense(units=1, activation = 'sigmoid'))

In [39]:
rnn_model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1_m, precision_m, recall_m])

In [40]:
rnn_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1024, 32)          3072      
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 32)                2080      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 5,185
Trainable params: 5,185
Non-trainable params: 0
_________________________________________________________________


In [41]:
rnn_model.fit(x=X_train, y=y_train, epochs=3, batch_size = 128, validation_split=0.25)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fdccc28d1d0>

In [42]:
loss, accuracy, f1_score, precision, recall = rnn_model.evaluate(x=X_test, y=y_test, batch_size = 128)



In [43]:
print("loss : ", loss)
print("acc : ", accuracy)
print("f1_score : ", f1_score)
print("precision : ", precision)
print("recall : ", recall)

loss :  0.6641525626182556
acc :  0.5893169641494751
f1_score :  0.23083704710006714
precision :  0.5547247529029846
recall :  0.1484539806842804


In [44]:
table["model 1"]["loss"] = loss
table["model 1"]["accuracy"] = accuracy
table["model 1"]["f1"] = f1_score
table["model 1"]["precision"] = precision
table["model 1"]["recall"] = recall

# Model 2 : LSTM + Dropout Layers

In [45]:
lstm_model = Sequential()
lstm_model.add(Embedding(num_words, 32, input_length=max_log_length))
lstm_model.add(LSTM(units=64, recurrent_dropout=0.5))
lstm_model.add(Dropout(0.5))
lstm_model.add(Dense(units=1, activation = 'relu'))

In [46]:
lstm_model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy', f1_m, precision_m, recall_m])

In [47]:
lstm_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1024, 32)          3072      
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24832     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 27,969
Trainable params: 27,969
Non-trainable params: 0
_________________________________________________________________


In [48]:
lstm_model.fit(x=X_train, y=y_train, epochs=3, batch_size = 128, validation_split=0.25)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fdcb5d11dd0>

In [49]:
loss2, accuracy2, f1_score2, precision2, recall2 = lstm_model.evaluate(x=X_test, y=y_test, batch_size = 128)



In [50]:
print("loss : ", loss2)
print("acc : ", accuracy2)
print("f1_score : ", f1_score2)
print("precision : ", precision2)
print("recall : ", recall2)

loss :  0.6444100737571716
acc :  0.6147110462188721
f1_score :  0.42613521218299866
precision :  0.5756444931030273
recall :  0.3418384790420532


In [51]:
table["model 2"]["loss"] = loss2
table["model 2"]["accuracy"] = accuracy2
table["model 2"]["f1"] = f1_score2
table["model 2"]["precision"] = precision2
table["model 2"]["recall"] = recall2

# Model 3 : LSTM + Dropout Layers

In [52]:
my_model = Sequential()
my_model.add(Embedding(num_words, 32, input_length=max_log_length))
my_model.add(Dense(units=32, activation = 'relu'))
my_model.add(LSTM(units=64, recurrent_dropout=0.33))
my_model.add(Dropout(0.25)) # f1, recall, and precision = 0 when dropout too high
my_model.add(Dense(units=1, activation = 'relu'))

In [53]:
my_model.compile(loss = 'binary_crossentropy', optimizer = 'sgd', metrics = ['accuracy', 'mae', f1_m, precision_m, recall_m])

In [54]:
my_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1024, 32)          3072      
_________________________________________________________________
dense_2 (Dense)              (None, 1024, 32)          1056      
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 29,025
Trainable params: 29,025
Non-trainable params: 0
_________________________________________________________________


In [None]:
my_model.fit(x=X_train, y=y_train, epochs=3, batch_size = 128, validation_split=0.25)

Epoch 1/3
Epoch 2/3
Epoch 3/3

In [None]:
loss3, mae1, accuracy3, f1_score3, precision3, recall3 = my_model.evaluate(x=X_test, y=y_test, batch_size = 128)

In [None]:
print("loss : ", loss3)
print("mae : ", mae1)
print("acc : ", accuracy3)
print("f1_score : ", f1_score3)
print("precision : ", precision3)
print("recall : ", recall3)

In [None]:
table["model 3"]["loss"] = loss3
table["model 3"]["mae"] = mae1
table["model 3"]["accuracy"] = accuracy3
table["model 3"]["f1"] = f1_score3
table["model 3"]["precision"] = precision3
table["model 3"]["recall"] = recall3

In [None]:
pd.options.display.float_format = "{:,.3f}".format

In [None]:
#### summary
pd.DataFrame(table).transpose()

# Submitting Best Model

In [None]:
to_csv("submission.csv", index=False)