With the advent of the word2vec model and recurrent neural networks, we can use neural networks to derive more insight into the disaster level of tweets from their language than typical bag-of-words models with traditional machine learning techniques can give us. We will use Keras to implement our neural network, and train new word2vec word embeddings rather than use a prebuilt set, as we expect the use of language in tweets to be somewhat different from conventional text.

In [14]:
# Load the corpus of previously processed tweets.
import pandas as pd

tweet_df = pd.read_csv('../data/processed_kaggle_training.csv')
# read_csv parses empty fields as NaN; fill those with '' before casting so a
# missing text does not become the literal token "nan" in the vocabulary.
tweet_df['processed_text'] = tweet_df['processed_text'].fillna('').astype(str)
texts = tweet_df.processed_text.values

In [15]:
# Prepare skip-gram (word, context) training pairs for word2vec with the
# Keras preprocessing helpers.
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import skipgrams

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Word <-> integer id lookups. Tokenizer ids start at 1; 0 is never assigned.
word2id = tokenizer.word_index
id2word = {v:k for k, v in word2id.items()}

# Encode every tweet as the list of its word ids.
wids = [[word2id[w] for w in text_to_word_sequence(text)] for text in texts]

# skipgrams() expects vocabulary_size to be the maximum word index + 1 and
# draws negative samples from [1, vocabulary_size - 1]. Passing len(word2id)
# alone would mean the highest word id is never picked as a negative sample.
# Each element of skip_grams is a (couples, labels) tuple for one tweet.
skip_grams = [skipgrams(wid, vocabulary_size=len(word2id) + 1) for wid in wids]

In [3]:
# Construct the Keras skip-gram model: the target word and the context word
# each go through their own embedding, the two vectors are combined with a
# dot product, and a single sigmoid unit scores the pair.
from keras.layers import Dense, Dot, Embedding, Input, Reshape
from keras.models import Model

VOCAB_SIZE = len(word2id) + 1  # +1 because word ids start at 1, not 0
EMBED_SIZE = 100

# Target-word branch: one word id in, one EMBED_SIZE vector out.
word_input = Input(name='word_input',shape=[1])
layer = Embedding(
    VOCAB_SIZE, EMBED_SIZE, embeddings_initializer="glorot_uniform",
    input_length=1)(word_input)
word_layer = Reshape((EMBED_SIZE,))(layer)

# Context-word branch, with its own separate embedding matrix.
context_input = Input(name='context_input',shape=[1])
layer = Embedding(
    VOCAB_SIZE, EMBED_SIZE, embeddings_initializer="glorot_uniform",
    input_length=1)(context_input)
context_layer = Reshape((EMBED_SIZE,))(layer)

merge_layer = Dot(axes=1)([word_layer, context_layer])
# `kernel_initializer` is the Keras 2 name of the Keras 1 `init` argument; it
# matches the `embeddings_initializer` naming used for the embeddings above.
output = Dense(1, kernel_initializer="glorot_uniform", activation="sigmoid")(merge_layer)
sg_model = Model(inputs = [word_input, context_input], outputs = output)

# NOTE(review): mean squared error is kept so the recorded losses stay
# comparable; binary_crossentropy is the more usual choice for a sigmoid
# output trained on 0/1 labels.
sg_model.compile(loss="mean_squared_error", optimizer="adam")

Instructions for updating:
Colocations handled automatically by placer.




In [4]:
# Train the skip-gram model on the prepared pairs for five epochs, using the
# pairs generated from one tweet as one batch.
import numpy as np
for epoch in range(1, 6):
    loss = 0  # running sum of the per-batch losses for this epoch
    for couples, pair_labels in skip_grams:
        if len(couples) == 0:
            continue  # no pairs were generated for this tweet
        # Transpose the [word, context] pairs once into two parallel columns
        # instead of rebuilding the transposed list for each column.
        pair_first_elem, pair_second_elem = (
            np.array(column, dtype='int32') for column in zip(*couples))
        labels = np.array(pair_labels, dtype='int32')
        X = [pair_first_elem, pair_second_elem]
        Y = labels
        loss += sg_model.train_on_batch(X,Y)

    print('Epoch:', epoch, 'Loss:', loss)

Instructions for updating:
Use tf.cast instead.
Epoch: 1 Loss: 1482.4700885526836
Epoch: 2 Loss: 1094.4184742826037
Epoch: 3 Loss: 897.1317858066177
Epoch: 4 Loss: 662.4907645033127
Epoch: 5 Loss: 480.5593598252046


In [5]:
# Still seeing a pretty big drop in the loss at the end there, so let's run
# a few more epochs (6 through 9) before saving the embedding.
for epoch in range(6, 10):
    loss = 0  # running sum of the per-batch losses for this epoch
    for couples, pair_labels in skip_grams:
        if len(couples) == 0:
            continue  # no pairs were generated for this tweet
        # Transpose the [word, context] pairs once into two parallel columns
        # instead of rebuilding the transposed list for each column.
        pair_first_elem, pair_second_elem = (
            np.array(column, dtype='int32') for column in zip(*couples))
        labels = np.array(pair_labels, dtype='int32')
        X = [pair_first_elem, pair_second_elem]
        Y = labels
        loss += sg_model.train_on_batch(X,Y)

    print('Epoch:', epoch, 'Loss:', loss)

Epoch: 6 Loss: 387.7060118282038
Epoch: 7 Loss: 279.3309925262673
Epoch: 8 Loss: 238.44166559127513
Epoch: 9 Loss: 172.69524405942457
