# Convolutional wordvec

## Reading the data

In [148]:
import pandas as pd
# Load the cleaned train and test tables from CSV files in the working directory.
train = pd.read_csv('train_cleaned.csv')
test = pd.read_csv('test_cleaned.csv')

In [149]:
# Load the precomputed word-vector tables (file names suggest GloVe twitter 27B, 50d).
# NOTE(review): unpickling executes arbitrary code — only load pickles produced by a
# trusted pipeline.
wordvec_train = pd.read_pickle('train_wordvec_glove.twitter.27B.50d.pickle')
wordvec_test = pd.read_pickle('test_wordvec_glove.twitter.27B.50d.pickle')

In [150]:
# Attach the word-vector columns to each split, matching rows on the 'id' column.
# NOTE(review): merge defaults to an inner join, so rows whose id is missing from the
# word-vector table are dropped — confirm no test rows are lost before submitting.
# NOTE(review): train/test are rebound in place, so re-running this cell merges again.
train = train.merge(wordvec_train, on=['id'])
test = test.merge(wordvec_test, on=['id'])
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,glove_cleaned_text,wordvec,keyword_wordvec,wordvec_concat,wordvec_tfidf
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,our deeds are the reason of this <hashtag> ear...,"[0.34491748, 0.22578713, 0.07250176, 0.0747826...","[0.34491748, 0.22578713, 0.07250176, 0.0747826...","[[0.15189999341964722, 0.042114000767469406, 0...","[1.5667167289980821, 1.2339818275400571, 0.247..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada,forest fire near la ronge sask. canada,"[0.012921251, -0.21105601, -0.035843372, -0.42...","[0.012921251, -0.21105601, -0.035843372, -0.42...","[[-0.18448999524116516, -0.9394599795341492, -...","[-0.3091522455215454, -0.7302671798637935, -0...."
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...,all residents asked to 'shelter in place' are ...,"[0.18257721, 0.20186086, -0.12769058, 0.055459...","[0.18257721, 0.20186086, -0.12769058, 0.055459...","[[0.3380799889564514, 0.24919000267982483, 0.2...","[1.0368728519163348, 2.0143982385369865, -1.50..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in ...,<number> people receive <hashtag> wildfires ev...,"[0.40764716, 0.24536107, -0.17184022, -0.19602...","[0.40764716, 0.24536107, -0.17184022, -0.19602...","[[0.476610004901886, 0.20970000326633453, 0.33...","[2.8280542948179774, 1.9225198494063482, -2.34..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,just got sent this photo from ruby <hashtag> a...,"[0.2010703, 0.15718287, 0.14506513, -0.1997651...","[0.2010703, 0.15718287, 0.14506513, -0.1997651...","[[0.07746600359678268, 0.37777000665664673, 0....","[1.232528381049633, 1.0207276116399204, 0.7743..."


## Train a model

In [151]:
# Longest word-vector sequence (row count of the per-tweet matrix) across both
# splits; every sample is later padded to this length.
max_words = max(
    frame['wordvec_concat'].apply(lambda vectors: vectors.shape[0]).max()
    for frame in (train, test)
)
max_words

109

In [152]:
# Word-vector width, read from the second axis of the first training row's matrix.
# Presumably every row shares this width — verify against the embedding pipeline.
wv_size = train['wordvec_concat'].iloc[0].shape[1]

In [153]:
import numpy
def get_X(df, col, wv_size, max_len=None):
    """Stack the per-row word-vector matrices of ``df[col]`` into one zero-padded array.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column ``col`` holds one 2-D array of shape
        ``(n_words, wv_size)`` per row.
    col : str
        Name of the column containing the per-row matrices.
    wv_size : int
        Width of every word vector (size of the last axis of the result).
    max_len : int, optional
        Sequence length to pad to. Defaults to the notebook-level ``max_words``.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(len(df), max_len, wv_size)``; each row's matrix
        occupies the leading ``n_words`` positions and the rest is zero.

    Raises
    ------
    ValueError
        If a row has more than ``max_len`` words or a width other than ``wv_size``.
    """
    if max_len is None:
        max_len = max_words  # global computed over train and test earlier in the notebook

    # Allocate the result once: growing it with numpy.append copies the whole
    # array for every row, which is quadratic in the number of rows.
    X = numpy.zeros((len(df), max_len, wv_size))
    for i, matrix in enumerate(df[col]):
        matrix = numpy.asarray(matrix)
        n_words = matrix.shape[0]
        if n_words > max_len:
            raise ValueError(
                f'row {i} has {n_words} word vectors, more than max_len={max_len}'
            )
        if matrix.ndim != 2 or matrix.shape[1] != wv_size:
            raise ValueError(
                f'row {i} has shape {matrix.shape}, expected (n_words, {wv_size})'
            )
        X[i, :n_words] = matrix
    return X

In [154]:
# Labels (kept as a pandas Series) and the padded 3-D feature array for training.
y = train['target']
X = get_X(train, 'wordvec_concat', wv_size)

In [155]:
X.shape

(7561, 109, 50)

In [156]:
import tensorflow as tf

def get_model():
    """Build the uncompiled 1-D convolutional binary classifier.

    Input is a (max_words, wv_size) sequence of word vectors (both are
    notebook-level globals). Two Conv1D + MaxPooling1D stages with dropout in
    between feed a flattened single-unit sigmoid output.
    """
    layers = tf.keras.layers
    stack = [
        layers.Dropout(0.4, input_shape=(max_words, wv_size)),
        layers.Conv1D(filters=6, kernel_size=3, activation='relu'),
        layers.MaxPooling1D(pool_size=2),
        layers.Dropout(0.3),
        layers.Conv1D(filters=8, kernel_size=5, activation='relu'),
        layers.MaxPooling1D(pool_size=2),
        layers.Flatten(),
        layers.Dropout(0.2),
        layers.Dense(units=1, activation='sigmoid'),
    ]
    conv_model = tf.keras.Sequential(stack)
    return conv_model

In [157]:
# Adam optimizer with learning rate 0.001.
# NOTE(review): the cross-validation cells rebind `optimizer` for every fold and the
# full-data conv model is compiled with the string "adam", so this instance looks unused.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

In [158]:
from sklearn.metrics import f1_score
from tensorflow.keras.callbacks import Callback
import numpy as np

class Metrics(Callback):
    """Keras callback that records train and validation F1 after every epoch.

    Parameters
    ----------
    train : tuple
        ``(x, y)`` pair used to compute the training F1.
    validation : tuple
        ``(x, y)`` pair used to compute the validation F1.

    Attributes
    ----------
    val_f1s, train_f1s : list of float
        One score per completed epoch, rounded to 6 decimals. They are reset at
        the start of every ``fit`` call.
    """

    def __init__(self, train, validation):
        super().__init__()
        self.validation = validation
        self.train = train
        # Created here as well so the lists exist even before training starts.
        self.val_f1s = []
        self.train_f1s = []

    def on_train_begin(self, logs=None):
        """Reset the recorded scores at the start of a training run."""
        self.val_f1s = []
        self.train_f1s = []

    def _f1_on(self, data):
        """Return the F1 score of the current model on an ``(x, y)`` pair."""
        x, y_true = data
        # predict returns probabilities of shape (n, 1): round them to 0/1 labels
        # and flatten so f1_score receives a plain 1-D label vector.
        y_pred = np.asarray(self.model.predict(x, verbose=0)).round().ravel()
        return f1_score(np.asarray(y_true).ravel(), y_pred)

    def on_epoch_end(self, epoch, logs=None):
        """Compute, store and print train/validation F1 for the finished epoch."""
        val_f1 = self._f1_on(self.validation)
        train_f1 = self._f1_on(self.train)
        self.val_f1s.append(round(val_f1, 6))
        self.train_f1s.append(round(train_f1, 6))

        print(f'— train_f1: {train_f1} — val_f1: {val_f1}')

In [159]:
from sklearn.model_selection import StratifiedKFold
# 5-fold stratified cross-validation of the convolutional model.
# A fresh model and optimizer are built per fold so no weights or optimizer state
# leak between folds; one Metrics callback per fold collects the per-epoch F1.
kfold = StratifiedKFold(5, random_state=42, shuffle=True)
metrics = []
for k_fold, (tr_inds, val_inds) in enumerate(kfold.split(X=X, y=y)):
    print('---- Starting fold %d ----'%(k_fold+1))

    # kfold.split yields positional indices; y is a pandas Series, so select with
    # .iloc (plain y[...] is label-based and only matches on a default RangeIndex).
    x_train, y_train = X[tr_inds], y.iloc[tr_inds]
    x_val, y_val = X[val_inds], y.iloc[val_inds]
    conv_model = get_model()
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    conv_model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=[])
    m = Metrics(train=(x_train, y_train), validation=(x_val, y_val))
    conv_model.fit(x=x_train, y=y_train, batch_size=32, epochs=25,
                   callbacks=[m])
    metrics.append(m)

---- Starting fold 1 ----
Epoch 1/25
— train_f1: 0.6259373621526246 — val_f1: 0.6538124452234882
Epoch 2/25
— train_f1: 0.7117539213689141 — val_f1: 0.7124394184168013
Epoch 3/25
— train_f1: 0.7324516785350966 — val_f1: 0.732258064516129
Epoch 4/25
— train_f1: 0.7463355048859934 — val_f1: 0.7390263367916999
Epoch 5/25
— train_f1: 0.7510862818125388 — val_f1: 0.739591836734694
Epoch 6/25
— train_f1: 0.7540305911533691 — val_f1: 0.7406199021207178
Epoch 7/25
— train_f1: 0.7612850752338349 — val_f1: 0.752205292702486
Epoch 8/25
— train_f1: 0.7628021302744776 — val_f1: 0.7526358475263584
Epoch 9/25
— train_f1: 0.7548278757346768 — val_f1: 0.7441860465116279
Epoch 10/25
— train_f1: 0.7726358148893361 — val_f1: 0.7605409705648369
Epoch 11/25
— train_f1: 0.7469405594405595 — val_f1: 0.7340241796200345
Epoch 12/25
— train_f1: 0.7583438952260245 — val_f1: 0.745
Epoch 13/25
— train_f1: 0.7645580226225388 — val_f1: 0.7524752475247524
Epoch 14/25
— train_f1: 0.7698004525817733 — val_f1: 0.76074614

— train_f1: 0.7678355501813785 — val_f1: 0.7595141700404858
Epoch 9/25
— train_f1: 0.7719652595435266 — val_f1: 0.7625201938610663
Epoch 10/25
— train_f1: 0.7719226856561546 — val_f1: 0.7603574329813161
Epoch 11/25
— train_f1: 0.7742461287693562 — val_f1: 0.7662337662337663
Epoch 12/25
— train_f1: 0.7705358990275192 — val_f1: 0.7597027250206442
Epoch 13/25
— train_f1: 0.7768595041322314 — val_f1: 0.7691069991954949
Epoch 14/25
— train_f1: 0.7810392701309005 — val_f1: 0.7677725118483412
Epoch 15/25
— train_f1: 0.7634522051065626 — val_f1: 0.7472527472527473
Epoch 16/25
— train_f1: 0.7551152272237777 — val_f1: 0.742127659574468
Epoch 17/25
— train_f1: 0.7781583719524481 — val_f1: 0.7586206896551724
Epoch 18/25
— train_f1: 0.7718496989827694 — val_f1: 0.7506255212677231
Epoch 19/25
— train_f1: 0.7508189561039529 — val_f1: 0.7378472222222223
Epoch 20/25
— train_f1: 0.7522336020919589 — val_f1: 0.738115816767502
Epoch 21/25
— train_f1: 0.7599480968858132 — val_f1: 0.7401032702237521
Epoch 2

— train_f1: 0.7333481844508799 — val_f1: 0.742556917688266
Epoch 17/25
— train_f1: 0.7453794303109372 — val_f1: 0.751931330472103
Epoch 18/25
— train_f1: 0.7363941769316911 — val_f1: 0.7387068201948627
Epoch 19/25
— train_f1: 0.7495674740484429 — val_f1: 0.7510692899914458
Epoch 20/25
— train_f1: 0.7306908267270668 — val_f1: 0.7357142857142858
Epoch 21/25
— train_f1: 0.7423407538020719 — val_f1: 0.7462946817785527
Epoch 22/25
— train_f1: 0.7667952100669778 — val_f1: 0.7676113360323886
Epoch 23/25
— train_f1: 0.7289550891446626 — val_f1: 0.7323187108325873
Epoch 24/25
— train_f1: 0.7416777629826897 — val_f1: 0.7421052631578948
Epoch 25/25
— train_f1: 0.7320942883046238 — val_f1: 0.7383512544802868


In [160]:
# Final-epoch validation F1 of every fold. Index -1 instead of a hard-coded 24 so
# this follows the epoch count and does not raise IndexError on a shorter run.
scores = [m.val_f1s[-1] for m in metrics]

In [161]:
scores

[0.749788, 0.75453, 0.734155, 0.756713, 0.738351]

In [162]:
np.mean(scores)

0.7467074

Marginally better than averaging? Train the model on all the data.

In [163]:
# Refit a fresh convolutional model on the complete training set (no hold-out).
conv_model = get_model()
conv_model.compile(optimizer="adam", loss='binary_crossentropy')
conv_model.fit(x=X, y=y, epochs=25, batch_size=32)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7fc7701a1820>

# Prepare submission

In [164]:
# Pad the test-set word-vector matrices the same way as the training features.
X_test = get_X(test, 'wordvec_concat', wv_size)

In [165]:
# Predicted probabilities flattened to 1-D, then rounded to 0/1 labels.
pred = conv_model.predict(X_test).flatten().round()
pred

array([1., 1., 1., ..., 1., 1., 0.], dtype=float32)

In [166]:
# Assemble the id/target table and write it out without the index column.
submission = pd.DataFrame(
    {
        "id": test['id'],
        "target": pred.flatten().round().astype(int),
    }
)
submission.to_csv('conv_net.csv', index=False)

# Let's try RNNs instead

## Simple RNN

In [167]:
import tensorflow as tf

def get_rnn_model():
    """Build the uncompiled SimpleRNN binary classifier.

    A 16-unit SimpleRNN over (max_words, wv_size) inputs, followed by batch
    normalization and a single sigmoid output unit.
    """
    layers = tf.keras.layers
    rnn_model = tf.keras.Sequential()
    rnn_model.add(layers.SimpleRNN(16, dropout=0.1, input_shape=(max_words, wv_size)))
    rnn_model.add(layers.BatchNormalization())
    rnn_model.add(layers.Dense(1, activation='sigmoid'))
    return rnn_model

In [168]:
get_rnn_model().summary()

Model: "sequential_91"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_20 (SimpleRNN)    (None, 16)                1072      
_________________________________________________________________
batch_normalization_68 (Batc (None, 16)                64        
_________________________________________________________________
dense_91 (Dense)             (None, 1)                 17        
Total params: 1,153
Trainable params: 1,121
Non-trainable params: 32
_________________________________________________________________


In [169]:
from sklearn.model_selection import StratifiedKFold
# 5-fold stratified cross-validation of the SimpleRNN model.
# A fresh model and optimizer are built per fold; one Metrics callback per fold
# collects the per-epoch F1 scores.
kfold = StratifiedKFold(5, random_state=42, shuffle=True)
metrics = []
for k_fold, (tr_inds, val_inds) in enumerate(kfold.split(X=X, y=y)):
    print('---- Starting fold %d ----'%(k_fold+1))

    # kfold.split yields positional indices; y is a pandas Series, so select with
    # .iloc (plain y[...] is label-based and only matches on a default RangeIndex).
    x_train, y_train = X[tr_inds], y.iloc[tr_inds]
    x_val, y_val = X[val_inds], y.iloc[val_inds]
    model = get_rnn_model()
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=[])
    m = Metrics(train=(x_train, y_train), validation=(x_val, y_val))
    model.fit(x=x_train, y=y_train, batch_size=32, epochs=25,
              callbacks=[m])
    metrics.append(m)

---- Starting fold 1 ----
Epoch 1/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 2/25
— train_f1: 0.0 — val_f1: 0.0030816640986132513
Epoch 3/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 4/25
— train_f1: 0.0007716049382716049 — val_f1: 0.0
Epoch 5/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 6/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 7/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 8/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 9/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 10/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 11/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 12/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 13/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 14/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 15/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 16/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 17/25
— train_f1: 0.47105075053609724 — val_f1: 0.4642604387827317
Epoch 18/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 19/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 20/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 21/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 22/25
— train_f1: 0.0 — val_

— train_f1: 0.006917755572636434 — val_f1: 0.00920245398773006
Epoch 17/25
— train_f1: 0.009984639016897083 — val_f1: 0.009216589861751152
Epoch 18/25
— train_f1: 0.21846072467601166 — val_f1: 0.2191780821917808
Epoch 19/25
— train_f1: 0.0015420200462606015 — val_f1: 0.0
Epoch 20/25
— train_f1: 0.0015420200462606015 — val_f1: 0.0
Epoch 21/25
— train_f1: 0.0015420200462606015 — val_f1: 0.0
Epoch 22/25
— train_f1: 0.0030816640986132513 — val_f1: 0.0
Epoch 23/25
— train_f1: 0.0030816640986132513 — val_f1: 0.0
Epoch 24/25
— train_f1: 0.0038476337052712585 — val_f1: 0.003072196620583717
Epoch 25/25
— train_f1: 0.0 — val_f1: 0.0
---- Starting fold 4 ----
Epoch 1/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 2/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 3/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 4/25
— train_f1: 0.0015426147319706905 — val_f1: 0.003072196620583717
Epoch 5/25
— train_f1: 0.6001853138753764 — val_f1: 0.5996292863762743
Epoch 6/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 7/25
— train_f1: 0.0 — val

InvalidArgumentError:  indices[14] = 66734 is not in [0, 1512)
	 [[{{node GatherV2}}]]
	 [[IteratorGetNext]] [Op:__inference_predict_function_1171959]

Function call stack:
predict_function


In [None]:
# Final-epoch validation F1 of every fold. Index -1 instead of a hard-coded 24 so
# this follows the epoch count and does not raise IndexError on a shorter run.
scores = [m.val_f1s[-1] for m in metrics]

In [None]:
scores

In [None]:
np.mean(scores)

## GRU

In [170]:
import tensorflow as tf

def get_gru_model():
    """Build the uncompiled GRU binary classifier.

    An 8-unit GRU over (max_words, wv_size) inputs, followed by batch
    normalization and a single sigmoid output unit.
    """
    layers = tf.keras.layers
    gru_model = tf.keras.Sequential()
    # An earlier variant used dropout=0.35 on the GRU layer instead of 0.3.
    gru_model.add(layers.GRU(8, dropout=0.3, input_shape=(max_words, wv_size)))
    gru_model.add(layers.BatchNormalization())
    gru_model.add(layers.Dense(1, activation='sigmoid'))
    return gru_model

In [171]:
get_gru_model().summary()

Model: "sequential_97"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_40 (GRU)                 (None, 8)                 1440      
_________________________________________________________________
batch_normalization_74 (Batc (None, 8)                 32        
_________________________________________________________________
dense_97 (Dense)             (None, 1)                 9         
Total params: 1,481
Trainable params: 1,465
Non-trainable params: 16
_________________________________________________________________


In [172]:
from sklearn.model_selection import StratifiedKFold
# 5-fold stratified cross-validation of the GRU model.
# A fresh model and optimizer are built per fold; one Metrics callback per fold
# collects the per-epoch F1 scores.
kfold = StratifiedKFold(5, random_state=42, shuffle=True)
metrics = []
for k_fold, (tr_inds, val_inds) in enumerate(kfold.split(X=X, y=y)):
    print('---- Starting fold %d ----'%(k_fold+1))

    # kfold.split yields positional indices; y is a pandas Series, so select with
    # .iloc (plain y[...] is label-based and only matches on a default RangeIndex).
    x_train, y_train = X[tr_inds], y.iloc[tr_inds]
    x_val, y_val = X[val_inds], y.iloc[val_inds]
    model = get_gru_model()
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=[])
    m = Metrics(train=(x_train, y_train), validation=(x_val, y_val))
    model.fit(x=x_train, y=y_train, batch_size=32, epochs=25,
              callbacks=[m])
    metrics.append(m)

---- Starting fold 1 ----
Epoch 1/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 2/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 3/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 4/25
— train_f1: 0.6000463177396943 — val_f1: 0.6
Epoch 5/25
— train_f1: 0.6000231669176417 — val_f1: 0.6002779064381658
Epoch 6/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 7/25
— train_f1: 0.6000231669176417 — val_f1: 0.6002779064381658
Epoch 8/25
— train_f1: 0.6001853138753764 — val_f1: 0.6002779064381658
Epoch 9/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 10/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 11/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 12/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 13/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 14/25
— train_f1: 0.6001853138753764 — val_f1: 0.6002779064381658
Epoch 15/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 16/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 17/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 18/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 19/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 20/25
— train_f1: 0.0 — val_f1: 0.0
Epoch 21

KeyboardInterrupt: 

In [None]:
# Final-epoch validation F1 of every fold. Index -1 instead of a hard-coded 24 so
# this follows the epoch count and does not raise IndexError on a shorter run.
scores = [m.val_f1s[-1] for m in metrics]

In [None]:
scores

In [None]:
np.mean(scores)

In [None]:
# Refit a fresh GRU model on the complete training set (no hold-out).
gru_model = get_gru_model()
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
gru_model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=[],
)
gru_model.fit(x=X, y=y, epochs=25, batch_size=32)

In [None]:
# Predicted probabilities flattened to 1-D, then rounded to 0/1 labels.
pred = gru_model.predict(X_test).flatten().round()
pred

In [None]:
# Assemble the id/target table and write it out without the index column.
submission = pd.DataFrame(
    {
        "id": test['id'],
        "target": pred.flatten().round().astype(int),
    }
)
submission.to_csv('gru.csv', index=False)

## LSTM

In [None]:
import tensorflow as tf

def get_lstm_model():
    """Build the uncompiled LSTM binary classifier.

    An 8-unit LSTM over (max_words, wv_size) inputs, followed by batch
    normalization and a single sigmoid output unit.
    """
    return tf.keras.Sequential([
        # The feature dimension comes from wv_size rather than a hard-coded 200,
        # so the model matches the arrays built by get_X and agrees with the
        # SimpleRNN and GRU builders.
        tf.keras.layers.LSTM(8, dropout=0.35, input_shape=(max_words, wv_size)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

In [None]:
get_lstm_model().summary()

In [None]:
from sklearn.model_selection import StratifiedKFold
# 5-fold stratified cross-validation of the LSTM model.
# A fresh model and optimizer are built per fold; one Metrics callback per fold
# collects the per-epoch F1 scores.
kfold = StratifiedKFold(5, random_state=42, shuffle=True)
metrics = []
for k_fold, (tr_inds, val_inds) in enumerate(kfold.split(X=X, y=y)):
    print('---- Starting fold %d ----'%(k_fold+1))

    # kfold.split yields positional indices; y is a pandas Series, so select with
    # .iloc (plain y[...] is label-based and only matches on a default RangeIndex).
    x_train, y_train = X[tr_inds], y.iloc[tr_inds]
    x_val, y_val = X[val_inds], y.iloc[val_inds]
    model = get_lstm_model()
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=[])
    m = Metrics(train=(x_train, y_train), validation=(x_val, y_val))
    model.fit(x=x_train, y=y_train, batch_size=32, epochs=25,
              callbacks=[m])
    metrics.append(m)

In [None]:
# Final-epoch validation F1 of every fold. Index -1 instead of a hard-coded 24 so
# this follows the epoch count and does not raise IndexError on a shorter run.
scores = [m.val_f1s[-1] for m in metrics]

In [None]:
scores

In [None]:
np.mean(scores)