# Convolutional wordvec

## Reading the data

In [1]:
import pandas as pd
# Read the train and test tables from their CSV files in one pass.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('train_cleaned.csv', 'test_cleaned.csv')
)

In [2]:
# Load the pickled word-vector frames for both splits.
# NOTE(review): unpickling executes arbitrary code — only load pickles you produced yourself.
wordvec_train, wordvec_test = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('train_wordvec.pickle', 'test_wordvec.pickle')
)

In [3]:
# Attach the word-vector columns to each split by joining on the shared 'id' column.
# NOTE(review): this cell rebinds `train`/`test`, so running it twice merges twice.
merge_keys = ['id']
train = train.merge(right=wordvec_train, on=merge_keys)
test = test.merge(right=wordvec_test, on=merge_keys)
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,wordvec,keyword_wordvec,wordvec_concat,wordvec_tfidf
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,"[-0.26623327, 0.05843069, -0.1404636, -0.05265...","[-0.26623327, 0.05843069, -0.1404636, -0.05265...","[[-0.2820900082588196, 0.1519400030374527, -0....","[-2.0410312242232838, 0.1577752003302941, -0.8..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada,"[-0.025449565, 0.031005142, -0.15566371, -0.23...","[-0.025449565, 0.031005142, -0.15566371, -0.23...","[[0.3039900064468384, 0.20476000010967255, -0....","[-0.27185601989428204, 0.2042857458194097, -1...."
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...,"[0.0059339865, 0.016337818, -0.105279535, -0.0...","[0.0059339865, 0.016337818, -0.105279535, -0.0...","[[0.00997759960591793, -0.20995000004768372, -...","[0.07528745450756767, 0.11175614595413208, -0...."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in ...,"[-0.18147185, 0.20731743, 0.014147284, -0.2182...","[-0.18147185, 0.20731743, 0.014147284, -0.2182...","[[-0.19686000049114227, 0.1157900020480156, -0...","[-1.3403782035623277, 1.2000715562275477, 0.11..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,"[-0.06394094, -0.01423019, 0.0063574947, 0.071...","[-0.06394094, -0.01423019, 0.0063574947, 0.071...","[[-0.02556299977004528, 0.444240003824234, -0....","[-0.7245167245467504, -0.364056259393692, 0.52..."


## Train a model

In [100]:
# Longest word-vector sequence (number of rows in 'wordvec_concat') across both
# splits; used below as the common padded sequence length.
max_words = max(
    frame['wordvec_concat'].apply(lambda vectors: vectors.shape[0]).max()
    for frame in (train, test)
)
max_words

33

In [101]:
import numpy
def get_X(df, col, max_len=None, n_features=300):
    """Stack the per-row matrices in ``df[col]`` into one zero-padded 3-D array.

    Each cell of ``df[col]`` is expected to be a 2-D array of shape
    ``(n_words, n_features)``. Every matrix is padded with trailing zero rows
    up to ``max_len`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the sequences.
    col : str
        Name of the column containing the 2-D arrays.
    max_len : int, optional
        Padded sequence length. Defaults to the notebook-level ``max_words``.
    n_features : int, optional
        Width of each word vector (300 by default).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(df), max_len, n_features)``.

    Raises
    ------
    ValueError
        If a sequence is longer than ``max_len`` or has the wrong feature width.
    """
    if max_len is None:
        # Backward-compatible default: fall back to the global padding length.
        max_len = max_words

    sequences = df[col]
    # Allocate the result once. Growing it with numpy.append in the loop copies
    # the whole array on every row, which is quadratic in the number of rows.
    X = numpy.zeros((len(sequences), max_len, n_features))
    for position, sequence in enumerate(sequences):
        sequence = numpy.asarray(sequence)
        n_words = sequence.shape[0]
        if n_words > max_len:
            raise ValueError(
                'sequence at position %d has %d rows, more than max_len=%d'
                % (position, n_words, max_len)
            )
        # Rows past n_words stay zero, which is the padding.
        X[position, :n_words] = sequence
    return X

In [102]:
# Features: one padded (max_words, 300) matrix per training row.
X = get_X(train, 'wordvec_concat')
# Labels: the 'target' column, kept as a pandas Series.
y = train['target']

In [103]:
# Sanity check: get_X returns an array shaped (rows, max_words, 300).
X.shape

(7561, 33, 300)

In [106]:
import tensorflow as tf

def get_model():
    """Build a fresh, uncompiled 1-D convolutional binary classifier.

    The network takes inputs of shape ``(max_words, 300)`` (``max_words`` is
    read from the notebook namespace) and applies, in order: input dropout,
    two Conv1D + MaxPooling1D stages with dropout in between, a flatten,
    a final dropout, and a single sigmoid output unit.

    Returns
    -------
    tf.keras.Sequential
        A new model instance; the caller is responsible for ``compile``.
    """
    layers = tf.keras.layers
    stack = [
        # Dropout applied directly to the input word vectors.
        layers.Dropout(0.4, input_shape=(max_words, 300)),
        # First convolution stage: windows of 3 consecutive positions.
        layers.Conv1D(filters=6, kernel_size=3, activation='relu'),
        layers.MaxPooling1D(pool_size=2),
        layers.Dropout(0.3),
        # Second convolution stage: wider windows over the pooled sequence.
        layers.Conv1D(filters=8, kernel_size=5, activation='relu'),
        layers.MaxPooling1D(pool_size=2),
        layers.Flatten(),
        layers.Dropout(0.2),
        # Single sigmoid unit for the binary target.
        layers.Dense(units=1, activation='sigmoid'),
    ]
    return tf.keras.Sequential(stack)

In [107]:
# Instantiate one model to print its layer output shapes and parameter counts.
# NOTE(review): the name `m` is rebound by later cells (the k-fold loop and the scores loop).
m = get_model()
m.summary()

Model: "sequential_66"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout_174 (Dropout)        (None, 33, 300)           0         
_________________________________________________________________
conv1d_127 (Conv1D)          (None, 31, 6)             5406      
_________________________________________________________________
max_pooling1d_123 (MaxPoolin (None, 15, 6)             0         
_________________________________________________________________
dropout_175 (Dropout)        (None, 15, 6)             0         
_________________________________________________________________
conv1d_128 (Conv1D)          (None, 11, 8)             248       
_________________________________________________________________
max_pooling1d_124 (MaxPoolin (None, 5, 8)              0         
_________________________________________________________________
flatten_66 (Flatten)         (None, 40)              

In [108]:
# NOTE(review): this Adam instance is not referenced by any cell shown in this notebook —
# both compile() calls pass optimizer="adam" — confirm whether it is still needed.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

In [132]:
from sklearn.metrics import f1_score
from tensorflow.keras.callbacks import Callback
import numpy as np

class Metrics(Callback):
    """Keras callback that records train and validation F1 after every epoch.

    Parameters
    ----------
    train : sequence
        ``(inputs, targets)`` pair; ``inputs`` is passed to ``model.predict``
        and ``targets`` to ``f1_score``.
    validation : sequence
        ``(inputs, targets)`` pair for the held-out data, used the same way.

    Attributes
    ----------
    train_f1s, val_f1s : list of float
        One F1 value per completed epoch, rounded to 6 decimals. Both lists
        are reset at the start of every ``fit`` call.
    """

    def __init__(self, train, validation):
        super().__init__()
        self.validation = validation
        self.train = train
        # Create the histories here so they exist before training starts.
        self.val_f1s = []
        self.train_f1s = []

    def on_train_begin(self, logs=None):
        """Reset the per-epoch histories at the start of training."""
        self.val_f1s = []
        self.train_f1s = []

    def _f1(self, data):
        """Return the F1 score of the current model on an ``(inputs, targets)`` pair."""
        # Round the model outputs to hard 0/1 labels and flatten the
        # (n, 1) prediction column so f1_score receives a 1-D array.
        predicted = np.asarray(self.model.predict(data[0])).round().ravel()
        return f1_score(data[1], predicted)

    def on_epoch_end(self, epoch, logs=None):
        """Compute, store and print the train/validation F1 for this epoch."""
        val_f1 = self._f1(self.validation)
        train_f1 = self._f1(self.train)

        self.val_f1s.append(round(val_f1, 6))
        self.train_f1s.append(round(train_f1, 6))

        print(f'— train_f1: {train_f1} — val_f1: {val_f1}')

In [133]:
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified cross-validation: train a fresh model per fold and keep
# each fold's Metrics callback so its per-epoch F1 history can be read later.
kfold = StratifiedKFold(5, random_state=42, shuffle=True)
metrics = []
for k_fold, (tr_inds, val_inds) in enumerate(kfold.split(X=X, y=y)):
    print('---- Starting fold %d ----'%(k_fold+1))

    # split() yields positional indices. `y` is a pandas Series, so select with
    # .iloc; plain y[...] is label-based and only works with a default 0..n-1 index.
    x_train, y_train = X[tr_inds], y.iloc[tr_inds]
    x_val, y_val = X[val_inds], y.iloc[val_inds]

    # New model per fold so no weights leak between folds.
    conv_model = get_model()
    conv_model.compile(loss='binary_crossentropy', optimizer= "adam", metrics=[])
    m = Metrics(train=(x_train,y_train), validation=(x_val, y_val))
    conv_model.fit(x=x_train, y=y_train, batch_size=32, epochs=25,
               callbacks=[m])
    metrics.append(m)

---- Starting fold 1 ----
Epoch 1/25
— train_f1: 0.7232365145228216 — val_f1: 0.7063621533442088
Epoch 2/25
— train_f1: 0.7550328881801873 — val_f1: 0.745736434108527
Epoch 3/25
— train_f1: 0.7679671457905545 — val_f1: 0.7536231884057971
Epoch 4/25
— train_f1: 0.7782274382840955 — val_f1: 0.7653543307086614
Epoch 5/25
— train_f1: 0.7882591093117409 — val_f1: 0.7657232704402516
Epoch 6/25
— train_f1: 0.7910478128179044 — val_f1: 0.7685039370078741
Epoch 7/25
— train_f1: 0.7886178861788617 — val_f1: 0.7537437603993343
Epoch 8/25
— train_f1: 0.8031464300121017 — val_f1: 0.7600314712824547
Epoch 9/25
— train_f1: 0.7977886455453965 — val_f1: 0.7548117154811717
Epoch 10/25
— train_f1: 0.8117719190680565 — val_f1: 0.7745019920318724
Epoch 11/25
— train_f1: 0.8086627417998318 — val_f1: 0.7635467980295566
Epoch 12/25
— train_f1: 0.8087431693989071 — val_f1: 0.7619834710743801
Epoch 13/25
— train_f1: 0.8098547062539482 — val_f1: 0.7655737704918032
Epoch 14/25
— train_f1: 0.8018154311649017 — val

— train_f1: 0.7998328458002507 — val_f1: 0.7682008368200837
Epoch 9/25
— train_f1: 0.7816245006657789 — val_f1: 0.7455197132616487
Epoch 10/25
— train_f1: 0.8095139607032058 — val_f1: 0.7739783152627189
Epoch 11/25
— train_f1: 0.8006068487212831 — val_f1: 0.7552447552447552
Epoch 12/25
— train_f1: 0.8107308829742291 — val_f1: 0.7670068027210885
Epoch 13/25
— train_f1: 0.8042396712091714 — val_f1: 0.76
Epoch 14/25
— train_f1: 0.8158449220396122 — val_f1: 0.7633587786259541
Epoch 15/25
— train_f1: 0.8023689405571396 — val_f1: 0.7526501766784451
Epoch 16/25
— train_f1: 0.8253175098896522 — val_f1: 0.7644593461860856
Epoch 17/25
— train_f1: 0.8185689948892675 — val_f1: 0.7583834909716253
Epoch 18/25
— train_f1: 0.8131062729036431 — val_f1: 0.7580225498699046
Epoch 19/25
— train_f1: 0.8186706818670682 — val_f1: 0.7556325823223571
Epoch 20/25
— train_f1: 0.8273075287111868 — val_f1: 0.7538461538461538
Epoch 21/25
— train_f1: 0.8052584670231729 — val_f1: 0.7430117222723175
Epoch 22/25
— train

— train_f1: 0.8125533731853117 — val_f1: 0.7783505154639175
Epoch 17/25
— train_f1: 0.813237774030354 — val_f1: 0.7774030354131535
Epoch 18/25
— train_f1: 0.8207024029574862 — val_f1: 0.7881773399014779
Epoch 19/25
— train_f1: 0.8213689482470785 — val_f1: 0.7939949958298582
Epoch 20/25
— train_f1: 0.8202995008319468 — val_f1: 0.7980132450331126
Epoch 21/25
— train_f1: 0.8234313112361893 — val_f1: 0.7943615257048092
Epoch 22/25
— train_f1: 0.8152920962199313 — val_f1: 0.7715996578272026
Epoch 23/25
— train_f1: 0.8194207836456557 — val_f1: 0.7751277683134582
Epoch 24/25
— train_f1: 0.8317891702697139 — val_f1: 0.7940691927512357
Epoch 25/25
— train_f1: 0.831179658190913 — val_f1: 0.7886855241264561


In [134]:
# Final-epoch validation F1 of every fold. Indexing with -1 instead of a
# hard-coded 24 keeps this correct if the number of epochs changes.
scores = [fold_metrics.val_f1s[-1] for fold_metrics in metrics]

In [112]:
# Per-fold validation F1 values collected from the Metrics callbacks.
scores

[0.767461, 0.748264, 0.769231, 0.75616, 0.782824]

In [113]:
# Mean of the per-fold validation F1 scores.
np.mean(scores)

0.764788

Only marginally better than the averaging approach? Next, train the model on all of the training data.

In [119]:
# Fit a fresh model on the complete training set, using the same
# loss, optimizer, batch size and epoch count as the cross-validation runs.
conv_model = get_model()
conv_model.compile(optimizer="adam", loss='binary_crossentropy')
conv_model.fit(X, y, epochs=25, batch_size=32)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f8f98527880>

## Prepare submission

In [122]:
# Pad the test-set sequences with get_X, the same helper used to build the training array.
X_test = get_X(test, 'wordvec_concat')

In [146]:
# Predict on the test set, flatten the output to 1-D, and round the
# sigmoid outputs to hard 0/1 labels.
pred = conv_model.predict(X_test).flatten().round()
pred

array([0., 0., 1., ..., 1., 1., 0.], dtype=float32)

In [147]:
# Build the two-column submission table (id, integer target) and write it without the index.
target_labels = pred.flatten().round().astype(int)
submission = pd.DataFrame({"id": test['id'], "target": target_labels})
submission.to_csv('conv_net.csv', index=False)