# Convolutional wordvec

## Reading the data

In [1]:
import pandas as pd
# Load the cleaned train/test tables (one row per sample, keyed by `id`).
train, test = (pd.read_csv(path) for path in ('train_cleaned.csv', 'test_cleaned.csv'))

In [2]:
# Load the precomputed word-vector features for both splits.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files you produced yourself.
wordvec_train, wordvec_test = (
    pd.read_pickle(path) for path in ('train_wordvec.pickle', 'test_wordvec.pickle')
)

In [3]:
# Attach the word-vector columns to each split by joining on `id`
# (pandas' default inner join: ids missing from either side are dropped).
# NOTE(review): re-running this cell merges again on top of the merged frames.
train = pd.merge(train, wordvec_train, on=['id'])
test = pd.merge(test, wordvec_test, on=['id'])
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,wordvec,keyword_wordvec,wordvec_concat,wordvec_tfidf
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,"[-0.26623327, 0.05843069, -0.1404636, -0.05265...","[-0.26623327, 0.05843069, -0.1404636, -0.05265...","[[-0.2820900082588196, 0.1519400030374527, -0....","[-2.0410312242232838, 0.1577752003302941, -0.8..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada,"[-0.025449565, 0.031005142, -0.15566371, -0.23...","[-0.025449565, 0.031005142, -0.15566371, -0.23...","[[0.3039900064468384, 0.20476000010967255, -0....","[-0.27185601989428204, 0.2042857458194097, -1...."
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...,"[0.0059339865, 0.016337818, -0.105279535, -0.0...","[0.0059339865, 0.016337818, -0.105279535, -0.0...","[[0.00997759960591793, -0.20995000004768372, -...","[0.07528745450756767, 0.11175614595413208, -0...."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in ...,"[-0.18147185, 0.20731743, 0.014147284, -0.2182...","[-0.18147185, 0.20731743, 0.014147284, -0.2182...","[[-0.19686000049114227, 0.1157900020480156, -0...","[-1.3403782035623277, 1.2000715562275477, 0.11..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,"[-0.06394094, -0.01423019, 0.0063574947, 0.071...","[-0.06394094, -0.01423019, 0.0063574947, 0.071...","[[-0.02556299977004528, 0.444240003824234, -0....","[-0.7245167245467504, -0.364056259393692, 0.52..."


## Train a model

In [4]:
# Length (number of rows) of the longest `wordvec_concat` matrix across both splits.
max_words = max(
    frame['wordvec_concat'].apply(lambda x: x.shape[0]).max()
    for frame in (train, test)
)
max_words

33

In [5]:
import numpy
def get_X(df, col):
    """Stack the per-row matrices of ``df[col]`` into one zero-padded 3-D array.

    Every entry of ``df[col]`` is expected to be a 2-D array with 300 columns
    and at most ``max_words`` rows. Shorter entries are padded with zero rows
    at the end.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the per-sample matrices.
    col : str
        Name of the column that contains the matrices.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(len(df), max_words, 300)``.

    Raises
    ------
    ValueError
        If an entry has more than ``max_words`` rows.
    """
    sequences = list(df[col])
    # Allocate the result once; growing it with numpy.append inside the loop
    # copies the whole tensor on every row (quadratic time and memory churn).
    X = numpy.zeros((len(sequences), max_words, 300))
    for position, matrix in enumerate(sequences):
        n_rows = matrix.shape[0]
        if n_rows > max_words:
            raise ValueError(
                f'row {position} of {col!r} has {n_rows} rows, more than max_words={max_words}'
            )
        # Rows past n_rows keep their zero padding.
        X[position, :n_rows] = matrix
    return X

In [6]:
# Padded input tensor and the matching binary targets for the training split.
X = get_X(df=train, col='wordvec_concat')
y = train['target']

In [7]:
X.shape

(7561, 33, 300)

In [8]:
import tensorflow as tf

def get_model():
    """Build a fresh, uncompiled 1-D convolutional binary classifier.

    Input is a ``(max_words, 300)`` sequence. Two Conv1D + MaxPooling1D stages
    with dropout are followed by a flattened single-unit sigmoid output.
    """
    layers = tf.keras.layers
    stack = [
        layers.Dropout(0.4, input_shape=(max_words, 300)),
        layers.Conv1D(filters=6, kernel_size=3, activation='relu'),
        layers.MaxPooling1D(pool_size=2),
        layers.Dropout(0.3),
        layers.Conv1D(filters=8, kernel_size=5, activation='relu'),
        layers.MaxPooling1D(pool_size=2),
        layers.Flatten(),
        layers.Dropout(0.2),
        layers.Dense(units=1, activation='sigmoid'),
    ]
    return tf.keras.Sequential(stack)

In [13]:
# Adam with learning rate 1e-3.
# NOTE(review): the training cells below create their own optimizer, so this one looks unused — confirm before removing.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

In [9]:
from sklearn.metrics import f1_score
from tensorflow.keras.callbacks import Callback
import numpy as np

class Metrics(Callback):
    """Keras callback that records train and validation F1 after every epoch.

    Parameters
    ----------
    train : tuple
        ``(features, targets)`` used to compute the training F1.
    validation : tuple
        ``(features, targets)`` used to compute the validation F1.

    Attributes
    ----------
    train_f1s, val_f1s : list of float
        Per-epoch F1 scores rounded to 6 decimals; reset when training begins.
    """

    def __init__(self, train, validation):
        super().__init__()
        self.validation = validation
        self.train = train
        # Created here as well so the histories exist before fit() is called.
        self.val_f1s = []
        self.train_f1s = []

    def on_train_begin(self, logs=None):
        """Start a new history for this training run."""
        self.val_f1s = []
        self.train_f1s = []

    def _f1(self, data):
        """Return the F1 of the current model on a ``(features, targets)`` pair."""
        # Predictions are rounded to the nearest integer to obtain hard labels.
        predicted = np.asarray(self.model.predict(data[0], verbose=0)).round()
        return f1_score(data[1], predicted)

    def on_epoch_end(self, epoch, logs=None):
        """Score both data sets, store the results and print them."""
        val_f1 = self._f1(self.validation)
        train_f1 = self._f1(self.train)

        self.val_f1s.append(round(val_f1, 6))
        self.train_f1s.append(round(train_f1, 6))

        # Publish the scores through Keras' log dict so History and
        # monitor-based callbacks (e.g. EarlyStopping) can see them.
        if logs is not None:
            logs['train_f1'] = float(train_f1)
            logs['val_f1'] = float(val_f1)

        print(f'— train_f1: {train_f1} — val_f1: {val_f1}')

In [None]:
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified cross-validation of the convolutional model.
# StratifiedKFold yields positional indices, while `[]` on a pandas Series
# looks up labels; indexing a plain array keeps the targets aligned with X
# even when the index of `y` is not 0..n-1.
y_values = np.asarray(y)

kfold = StratifiedKFold(5, random_state=42, shuffle=True)
metrics = []  # one Metrics callback (per-epoch F1 history) per fold
for k_fold, (tr_inds, val_inds) in enumerate(kfold.split(X=X, y=y_values)):
    print('---- Starting fold %d ----'%(k_fold+1))

    x_train, y_train = X[tr_inds], y_values[tr_inds]
    x_val, y_val = X[val_inds], y_values[val_inds]

    # A new model and optimizer are created for every fold.
    conv_model = get_model()
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    conv_model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=[])

    m = Metrics(train=(x_train, y_train), validation=(x_val, y_val))
    conv_model.fit(x=x_train, y=y_train, batch_size=32, epochs=25,
                   callbacks=[m])
    metrics.append(m)

---- Starting fold 1 ----
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
  3/189 [..............................] - ETA: 3s - loss: 0.2512

In [134]:
# Validation F1 after the last epoch of every fold. Using [-1] instead of a
# fixed index 24 keeps this correct if the number of epochs changes.
scores = [fold_metrics.val_f1s[-1] for fold_metrics in metrics]

In [112]:
scores

[0.767461, 0.748264, 0.769231, 0.75616, 0.782824]

In [113]:
np.mean(scores)

0.764788

Marginally better than the averaging approach? Train the model on all the data.

In [119]:
# Refit the convolutional model on the complete training set.
conv_model = get_model()
conv_model.compile(optimizer="adam", loss='binary_crossentropy')
conv_model.fit(X, y, epochs=25, batch_size=32)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f8f98527880>

# Prepare submission

In [50]:
# Padded input tensor for the test split, built the same way as X.
X_test = get_X(df=test, col='wordvec_concat')

In [146]:
# Predict on the test tensor, flatten to 1-D and round to hard 0/1 labels.
pred = conv_model.predict(X_test).flatten().round()
pred

array([0., 0., 1., ..., 1., 1., 0.], dtype=float32)

In [147]:
# Two-column submission file: the test ids and the integer class labels.
submission = test[['id']].assign(target=pred.flatten().round().astype(int))
submission.to_csv('conv_net.csv', index=False)

# Let's try RNNs instead

## Simple RNN

In [13]:
import tensorflow as tf

def get_rnn_model():
    """Build a fresh, uncompiled SimpleRNN binary classifier.

    A 16-unit SimpleRNN (input dropout 0.1) reads the ``(max_words, 300)``
    sequence, followed by batch normalisation and a sigmoid output unit.
    """
    layers = tf.keras.layers
    model = tf.keras.Sequential()
    model.add(layers.SimpleRNN(16, dropout=0.1, input_shape=(max_words, 300)))
    model.add(layers.BatchNormalization())
    model.add(layers.Dense(1, activation='sigmoid'))
    return model

In [14]:
get_rnn_model().summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_3 (SimpleRNN)     (None, 16)                5072      
_________________________________________________________________
batch_normalization_3 (Batch (None, 16)                64        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 5,153
Trainable params: 5,121
Non-trainable params: 32
_________________________________________________________________


In [15]:
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified cross-validation of the SimpleRNN model.
# StratifiedKFold yields positional indices, while `[]` on a pandas Series
# looks up labels; indexing a plain array keeps the targets aligned with X
# even when the index of `y` is not 0..n-1.
y_values = np.asarray(y)

kfold = StratifiedKFold(5, random_state=42, shuffle=True)
metrics = []  # one Metrics callback (per-epoch F1 history) per fold
for k_fold, (tr_inds, val_inds) in enumerate(kfold.split(X=X, y=y_values)):
    print('---- Starting fold %d ----'%(k_fold+1))

    x_train, y_train = X[tr_inds], y_values[tr_inds]
    x_val, y_val = X[val_inds], y_values[val_inds]

    # A new model and optimizer are created for every fold.
    model = get_rnn_model()
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=[])

    m = Metrics(train=(x_train, y_train), validation=(x_val, y_val))
    model.fit(x=x_train, y=y_train, batch_size=32, epochs=25,
              callbacks=[m])
    metrics.append(m)

---- Starting fold 1 ----
Epoch 1/25
— train_f1: 0.33575909661229614 — val_f1: 0.38040345821325644
Epoch 2/25
— train_f1: 0.5455941639470149 — val_f1: 0.5429403202328967
Epoch 3/25
— train_f1: 0.6517571884984025 — val_f1: 0.633855331841909
Epoch 4/25
— train_f1: 0.6924838642515095 — val_f1: 0.6771523178807948
Epoch 5/25
— train_f1: 0.7522531544161827 — val_f1: 0.726698262243286
Epoch 6/25
— train_f1: 0.7553542009884681 — val_f1: 0.7168284789644013
Epoch 7/25
— train_f1: 0.7748193229364778 — val_f1: 0.7317436661698957
Epoch 8/25
— train_f1: 0.7530095475300955 — val_f1: 0.7220394736842105
Epoch 9/25
— train_f1: 0.777080380643855 — val_f1: 0.7352472089314195
Epoch 10/25
— train_f1: 0.7738070646560629 — val_f1: 0.7386363636363635
Epoch 11/25
— train_f1: 0.7780758911460329 — val_f1: 0.7345724907063197
Epoch 12/25
— train_f1: 0.7924309712299671 — val_f1: 0.7449209932279908
Epoch 13/25
— train_f1: 0.7881694644284571 — val_f1: 0.744945567651633
Epoch 14/25
— train_f1: 0.7831768068599428 — val_

— train_f1: 0.8183062781146099 — val_f1: 0.7069767441860464
Epoch 9/25
— train_f1: 0.8199881959472752 — val_f1: 0.726144297905353
Epoch 10/25
— train_f1: 0.8362779740871613 — val_f1: 0.7165109034267912
Epoch 11/25
— train_f1: 0.8156879554222032 — val_f1: 0.6910994764397905
Epoch 12/25
— train_f1: 0.8392338943702844 — val_f1: 0.7337909992372235
Epoch 13/25
— train_f1: 0.8364544319600499 — val_f1: 0.7164685908319185
Epoch 14/25
— train_f1: 0.8519024390243901 — val_f1: 0.7281177381874516
Epoch 15/25
— train_f1: 0.8481797056545315 — val_f1: 0.727833461835004
Epoch 16/25
— train_f1: 0.8594267888477286 — val_f1: 0.7274119448698315
Epoch 17/25
— train_f1: 0.8665993945509586 — val_f1: 0.731785428342674
Epoch 18/25
— train_f1: 0.8685852284161802 — val_f1: 0.7244979919678716
Epoch 19/25
— train_f1: 0.8588088296543108 — val_f1: 0.7222222222222223
Epoch 20/25
— train_f1: 0.8754762382193705 — val_f1: 0.7335473515248797
Epoch 21/25
— train_f1: 0.8651894802236488 — val_f1: 0.7304785894206548
Epoch 22

— train_f1: 0.8009687362395419 — val_f1: 0.7193460490463215
Epoch 17/25
— train_f1: 0.8300202839756592 — val_f1: 0.746317512274959
Epoch 18/25
— train_f1: 0.8398373983739837 — val_f1: 0.7595762021189895
Epoch 19/25
— train_f1: 0.8217800131204899 — val_f1: 0.725135623869801
Epoch 20/25
— train_f1: 0.8339085418464194 — val_f1: 0.7291849255039439
Epoch 21/25
— train_f1: 0.8414143552826593 — val_f1: 0.7358490566037736
Epoch 22/25
— train_f1: 0.8505122308174786 — val_f1: 0.7404902789518174
Epoch 23/25
— train_f1: 0.8511253355358248 — val_f1: 0.7487437185929648
Epoch 24/25
— train_f1: 0.8439086294416244 — val_f1: 0.7383015597920278
Epoch 25/25
— train_f1: 0.842706131078224 — val_f1: 0.725085910652921


In [16]:
# Validation F1 after the last epoch of every fold. Using [-1] instead of a
# fixed index 24 keeps this correct if the number of epochs changes.
scores = [fold_metrics.val_f1s[-1] for fold_metrics in metrics]

In [17]:
scores

[0.748031, 0.742317, 0.733388, 0.735733, 0.725086]

In [18]:
np.mean(scores)

0.7369110000000001

## GRU

In [42]:
import tensorflow as tf

def get_gru_model():
    """Build a fresh, uncompiled GRU binary classifier.

    An 8-unit GRU (input dropout 0.35) reads the ``(max_words, 300)``
    sequence, followed by batch normalisation and a sigmoid output unit.
    """
    layers = tf.keras.layers
    model = tf.keras.Sequential()
    model.add(layers.GRU(8, dropout=0.35, input_shape=(max_words, 300)))
    model.add(layers.BatchNormalization())
    model.add(layers.Dense(1, activation='sigmoid'))
    return model

In [43]:
get_gru_model().summary()

Model: "sequential_31"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_22 (GRU)                 (None, 8)                 7440      
_________________________________________________________________
batch_normalization_31 (Batc (None, 8)                 32        
_________________________________________________________________
dense_31 (Dense)             (None, 1)                 9         
Total params: 7,481
Trainable params: 7,465
Non-trainable params: 16
_________________________________________________________________


In [44]:
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified cross-validation of the GRU model.
# StratifiedKFold yields positional indices, while `[]` on a pandas Series
# looks up labels; indexing a plain array keeps the targets aligned with X
# even when the index of `y` is not 0..n-1.
y_values = np.asarray(y)

kfold = StratifiedKFold(5, random_state=42, shuffle=True)
metrics = []  # one Metrics callback (per-epoch F1 history) per fold
for k_fold, (tr_inds, val_inds) in enumerate(kfold.split(X=X, y=y_values)):
    print('---- Starting fold %d ----'%(k_fold+1))

    x_train, y_train = X[tr_inds], y_values[tr_inds]
    x_val, y_val = X[val_inds], y_values[val_inds]

    # A new model and optimizer are created for every fold.
    model = get_gru_model()
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=[])

    m = Metrics(train=(x_train, y_train), validation=(x_val, y_val))
    model.fit(x=x_train, y=y_train, batch_size=32, epochs=25,
              callbacks=[m])
    metrics.append(m)

---- Starting fold 1 ----
Epoch 1/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 2/25
— train_f1: 0.0007716049382716049 — val_f1: 0.0
Epoch 3/25
— train_f1: 0.6020230205790024 — val_f1: 0.602510460251046
Epoch 4/25
— train_f1: 0.7750105529759392 — val_f1: 0.7397489539748955
Epoch 5/25
— train_f1: 0.7776809067131648 — val_f1: 0.7398444252376837
Epoch 6/25
— train_f1: 0.8082691528171869 — val_f1: 0.7609912070343724
Epoch 7/25
— train_f1: 0.7961921246213761 — val_f1: 0.75
Epoch 8/25
— train_f1: 0.814625850340136 — val_f1: 0.7669172932330828
Epoch 9/25
— train_f1: 0.8273748723186924 — val_f1: 0.7601593625498008
Epoch 10/25
— train_f1: 0.8257504247687371 — val_f1: 0.7602339181286549
Epoch 11/25
— train_f1: 0.8178067318132464 — val_f1: 0.7586805555555556
Epoch 12/25
— train_f1: 0.8294051627384962 — val_f1: 0.7580174927113702
Epoch 13/25
— train_f1: 0.8399615754082613 — val_f1: 0.7624060150375941
Epoch 14/25
— train_f1: 0.8465520811762269 — val_f1: 0.7698607698607699
Epoch 15/25
— train_f1: 0.8418821

— train_f1: 0.818738130407259 — val_f1: 0.7732656514382402
Epoch 10/25
— train_f1: 0.8154490799781381 — val_f1: 0.7644508670520231
Epoch 11/25
— train_f1: 0.8305256748528517 — val_f1: 0.778050778050778
Epoch 12/25
— train_f1: 0.8235546958951214 — val_f1: 0.7694974003466205
Epoch 13/25
— train_f1: 0.8370659543866859 — val_f1: 0.780608052588332
Epoch 14/25
— train_f1: 0.8366666666666668 — val_f1: 0.7723785166240409
Epoch 15/25
— train_f1: 0.8398231951168175 — val_f1: 0.7689684569479966
Epoch 16/25
— train_f1: 0.8375634517766497 — val_f1: 0.7755102040816326
Epoch 17/25
— train_f1: 0.8447048974466304 — val_f1: 0.7753378378378379
Epoch 18/25
— train_f1: 0.8597536750099325 — val_f1: 0.7697160883280757
Epoch 19/25
— train_f1: 0.8452607135317711 — val_f1: 0.7771135781383434
Epoch 20/25
— train_f1: 0.8504633529907328 — val_f1: 0.7696245733788396
Epoch 21/25
— train_f1: 0.8594539939332658 — val_f1: 0.7782290820471163
Epoch 22/25
— train_f1: 0.8546438983756767 — val_f1: 0.7750631844987363
Epoch 2

— train_f1: 0.8519953506392871 — val_f1: 0.7799227799227799
Epoch 17/25
— train_f1: 0.8466076696165192 — val_f1: 0.7786131996658314
Epoch 18/25
— train_f1: 0.845356462872858 — val_f1: 0.7774030354131535
Epoch 19/25
— train_f1: 0.8472427635748995 — val_f1: 0.7746243739565944
Epoch 20/25
— train_f1: 0.8623816240177312 — val_f1: 0.786624203821656
Epoch 21/25
— train_f1: 0.8589058000822707 — val_f1: 0.7919028340080971
Epoch 22/25
— train_f1: 0.8627209206740649 — val_f1: 0.7864077669902912
Epoch 23/25
— train_f1: 0.8630975143403441 — val_f1: 0.7785234899328858
Epoch 24/25
— train_f1: 0.868720281166012 — val_f1: 0.7779605263157895
Epoch 25/25
— train_f1: 0.8749752328115712 — val_f1: 0.7838258164852255


In [45]:
# Validation F1 after the last epoch of every fold. Using [-1] instead of a
# fixed index 24 keeps this correct if the number of epochs changes.
scores = [fold_metrics.val_f1s[-1] for fold_metrics in metrics]

In [46]:
scores

[0.753857, 0.761301, 0.772496, 0.766667, 0.783826]

In [47]:
np.mean(scores)

0.7676293999999999

In [48]:
# Refit the GRU model on the complete training set.
gru_model = get_gru_model()
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
gru_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=[])
gru_model.fit(X, y, epochs=25, batch_size=32)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f60a97a81f0>

In [51]:
# Predict on the test tensor, flatten to 1-D and round to hard 0/1 labels.
pred = gru_model.predict(X_test).flatten().round()
pred

array([1., 1., 1., ..., 1., 1., 0.], dtype=float32)

In [52]:
# Two-column submission file: the test ids and the integer class labels.
submission = test[['id']].assign(target=pred.flatten().round().astype(int))
submission.to_csv('gru.csv', index=False)

## LSTM

In [16]:
import tensorflow as tf

def get_lstm_model():
    """Build a fresh, uncompiled LSTM binary classifier.

    An 8-unit LSTM (input dropout 0.35) reads the ``(max_words, 300)``
    sequence, followed by batch normalisation and a sigmoid output unit.
    """
    layers = tf.keras.layers
    model = tf.keras.Sequential()
    model.add(layers.LSTM(8, dropout=0.35, input_shape=(max_words, 300)))
    model.add(layers.BatchNormalization())
    model.add(layers.Dense(1, activation='sigmoid'))
    return model

In [17]:
get_lstm_model().summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_6 (LSTM)                (None, 8)                 9888      
_________________________________________________________________
batch_normalization_6 (Batch (None, 8)                 32        
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 9         
Total params: 9,929
Trainable params: 9,913
Non-trainable params: 16
_________________________________________________________________


In [None]:
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified cross-validation of the LSTM model.
# StratifiedKFold yields positional indices, while `[]` on a pandas Series
# looks up labels; indexing a plain array keeps the targets aligned with X
# even when the index of `y` is not 0..n-1.
y_values = np.asarray(y)

kfold = StratifiedKFold(5, random_state=42, shuffle=True)
metrics = []  # one Metrics callback (per-epoch F1 history) per fold
for k_fold, (tr_inds, val_inds) in enumerate(kfold.split(X=X, y=y_values)):
    print('---- Starting fold %d ----'%(k_fold+1))

    x_train, y_train = X[tr_inds], y_values[tr_inds]
    x_val, y_val = X[val_inds], y_values[val_inds]

    # A new model and optimizer are created for every fold.
    model = get_lstm_model()
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=[])

    m = Metrics(train=(x_train, y_train), validation=(x_val, y_val))
    model.fit(x=x_train, y=y_train, batch_size=32, epochs=25,
              callbacks=[m])
    metrics.append(m)

---- Starting fold 1 ----
Epoch 1/25


In [None]:
# Validation F1 after the last epoch of every fold. Using [-1] instead of a
# fixed index 24 keeps this correct if the number of epochs changes.
scores = [fold_metrics.val_f1s[-1] for fold_metrics in metrics]

In [None]:
scores

In [None]:
np.mean(scores)