In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import re
import nltk
import numpy as np

from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import SimpleRNN
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout

%matplotlib inline

In [2]:
seed = 8675309

In [3]:
tf.random.set_seed(seed)
np.random.seed(seed)

In [4]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [5]:
train_df.head(2)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1


In [6]:
test_df.head(2)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."


In [7]:
train_df.describe(include='all')

Unnamed: 0,id,keyword,location,text,target
count,7613.0,7552,5080,7613,7613.0
unique,,221,3341,7503,
top,,fatalities,USA,11-Year-Old Boy Charged With Manslaughter of T...,
freq,,45,104,10,
mean,5441.934848,,,,0.42966
std,3137.11609,,,,0.49506
min,1.0,,,,0.0
25%,2734.0,,,,0.0
50%,5408.0,,,,0.0
75%,8146.0,,,,1.0


In [8]:
train_df.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [9]:
# describe yields 10 duplicates but unique does not.
train_df["text"].unique

<bound method Series.unique of 0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object>

In [10]:
train_df.shape

(7613, 5)

In [11]:
train_df["text"][2]

"All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"

In [12]:
# leaving test set alone for now
dataset = train_df.values

In [13]:
dataset.shape

(7613, 5)

In [14]:
X = dataset[:, 3]

In [15]:
y = dataset[:, -1]

In [16]:
y

array([1, 1, 1, ..., 1, 1, 1], dtype=object)

In [17]:
X[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [18]:
y[0]

1

In [19]:
# turns words into numbers
tokenizer = Tokenizer(filters='\t\n', char_level=True)
tokenizer.fit_on_texts(X)

num_words = len(tokenizer.word_index)+1
X = tokenizer.texts_to_sequences(X)

In [20]:
X[0][0]

5

In [21]:
max_log_length = 1024
X_processed = sequence.pad_sequences(X, maxlen=max_log_length)

In [22]:
X_processed[0][0]

0

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.30, random_state=seed)

In [24]:
X_train.shape

(5329, 1024)

In [25]:
X_test.shape

(2284, 1024)

In [26]:
y_train.shape

(5329,)

In [27]:
y_test.shape

(2284,)

In [28]:
X_train[0]

array([ 0,  0,  0, ...,  9, 10, 43], dtype=int32)

In [29]:
X_test[0]

array([ 0,  0,  0, ...,  2, 16, 15], dtype=int32)

In [30]:
y_train[0]

1

In [31]:
y_test[0]

0

# Model 1 : Simple RNN

In [32]:
rnn_model = Sequential()

In [33]:
rnn_model.add(Embedding(num_words, 32, input_length=max_log_length))

In [34]:
rnn_model.add(SimpleRNN(units=32, activation='relu'))

In [35]:
rnn_model.add(Dense(units=1, activation = 'sigmoid'))

In [36]:
rnn_model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [37]:
rnn_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1024, 32)          3072      
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 32)                2080      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 5,185
Trainable params: 5,185
Non-trainable params: 0
_________________________________________________________________


In [38]:
X_train = np.asarray(X_train).astype('float32')

In [39]:
y_train = np.asarray(y_train).astype('float32')

In [40]:
rnn_model.fit(x=X_train, y=y_train, epochs=3, batch_size = 128, validation_split=0.25)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fc4b04309d0>

In [41]:
X_test = np.asarray(X_test).astype('float32')

In [42]:
y_test = np.asarray(y_test).astype('float32')

In [43]:
rnn_model.evaluate(x=X_test, y=y_test, batch_size = 128)



[0.6641529202461243, 0.5893169641494751]

# Model 2 : LSTM + Dropout Layers

In [44]:
lstm_model = Sequential()
lstm_model.add(Embedding(num_words, 32, input_length=max_log_length))
lstm_model.add(LSTM(units=64, recurrent_dropout=0.5))
lstm_model.add(Dropout(0.5))
lstm_model.add(Dense(units=1, activation = 'relu'))

In [45]:
lstm_model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [46]:
lstm_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1024, 32)          3072      
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24832     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 27,969
Trainable params: 27,969
Non-trainable params: 0
_________________________________________________________________


In [47]:
lstm_model.fit(x=X_train, y=y_train, epochs=3, batch_size = 128, validation_split=0.25)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fc49c426a90>

In [48]:
lstm_model.evaluate(x=X_test, y=y_test, batch_size = 128)



[0.6444100737571716, 0.6147110462188721]

# Model 3 : LSTM + Dropout Layers

In [49]:
my_model = Sequential()
my_model.add(Embedding(num_words, 32, input_length=max_log_length))
my_model.add(Dense(units=32, activation = 'relu'))
my_model.add(LSTM(units=64, recurrent_dropout=0.5))
my_model.add(Dropout(0.5))
my_model.add(Dense(units=1, activation = 'relu'))

In [50]:
my_model.compile(loss = 'binary_crossentropy', optimizer = 'sgd', metrics = ['accuracy', 'mae'])

In [51]:
my_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1024, 32)          3072      
_________________________________________________________________
dense_2 (Dense)              (None, 1024, 32)          1056      
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 29,025
Trainable params: 29,025
Non-trainable params: 0
_________________________________________________________________


In [52]:
my_model.fit(x=X_train, y=y_train, epochs=3, batch_size = 128, validation_split=0.25)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fc46c542050>

In [53]:
my_model.evaluate(x=X_test, y=y_test, batch_size = 128)



[0.679061770439148, 0.5783712863922119, 0.4877036511898041]

# Submitting Best Model

In [54]:
to_csv("submission.csv", index=False)

NameError: name 'to_csv' is not defined