In [169]:
import numpy as np
import pandas as pd
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, Dropout, add, concatenate
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import LearningRateScheduler
from keras.losses import binary_crossentropy
from keras import backend as K
from sklearn.model_selection import train_test_split

## Global variables ##
RANDOM_SEED = 100  # passed as random_state to train_test_split
TRAINNING_PATH = 'data/train.csv'  # path of the training CSV
EMBEDDING_DIMENSION = 300  # NOTE(review): not referenced in the visible cells; build_embedding_matrix derives the width from the loaded vectors
MAX_LEN = 100  # length sequences are padded to, and the model's input length
STD = 0.16  # std of embedding matrix entries
# Alternative embedding file (currently disabled):
# EMBEDDING_PATH = 'data/crawl-300d-2M.vec'
EMBEDDING_PATH = 'data/glove.twitter.27B.25d.txt'  # NOTE(review): file name suggests 25-d vectors, not 300 - confirm


def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    Example: ``get_coefs('good', '2', '4', '3')`` returns
    ``('good', array([2., 4., 3.], dtype=float32))``.

    Args:
        word: the token (first field of a parsed embedding line).
        *arr: the remaining fields, i.e. the vector components; anything
            NumPy can convert to float32 (numeric strings included).

    Returns:
        Tuple ``(word, vector)`` where ``vector`` is a float32 NumPy array
        built from ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

def build_embeddings_dict(embedding_path, encoding='utf-8'):
    """Load a text embedding file into a ``{word: vector}`` dictionary.

    Every non-empty line is expected to look like ``word v1 v2 ... vn`` with
    single spaces between the fields.

    Args:
        embedding_path: path of the embedding text file.
        encoding: text encoding used to read the file. Defaults to UTF-8 so
            that non-ASCII tokens decode identically on every platform
            instead of depending on the locale's default encoding.

    Returns:
        dict mapping each word to the vector returned by ``get_coefs``.
        When a word appears on several lines, the last line wins.
    """
    embeddings = {}
    with open(embedding_path, encoding=encoding) as f:
        for line in f:
            # Trim only the right-hand side (newline / trailing blanks).
            # A full strip() would also eat a leading token that is itself a
            # Unicode whitespace character (e.g. U+0085, U+00A0) and shift
            # the first number into the word position.
            fields = line.rstrip().split(' ')
            if fields == ['']:
                continue  # blank line: nothing to parse
            word, vector = get_coefs(*fields)
            embeddings[word] = vector
    return embeddings

def initilize_embedding_matrix(word_num, embedding_dimension, std=None):
    """Create a randomly initialised embedding matrix.

    Entries are drawn from a standard normal distribution (NumPy's global
    random state) and scaled by ``std``. They are *not* zeros: rows that are
    never overwritten with a pre-trained vector keep this random noise.

    Args:
        word_num: number of words; the matrix gets ``word_num + 1`` rows so
            that indices ``0 .. word_num`` are all valid.
        embedding_dimension: number of columns (vector width).
        std: standard deviation of the entries. ``None`` (the default) uses
            the module-level ``STD`` constant.

    Returns:
        NumPy array of shape ``(word_num + 1, embedding_dimension)``.
    """
    if std is None:
        std = STD  # looked up lazily so an explicit std never needs the global
    return np.random.randn(word_num + 1, embedding_dimension) * std

def build_embedding_matrix(word_index, embedding_path):
    """Build the embedding matrix for a tokenizer vocabulary.

    Row ``i`` holds the pre-trained vector of the word whose index in
    ``word_index`` is ``i``. Words without a usable pre-trained vector keep
    the random row produced by ``initilize_embedding_matrix``.

    Args:
        word_index: dict mapping each word to its non-negative integer index.
        embedding_path: path of the embedding text file, read with
            ``build_embeddings_dict``.

    Returns:
        NumPy array with ``max(index) + 1`` rows and one column per
        embedding component.

    Raises:
        ValueError: if no vectors could be read from ``embedding_path``.
    """
    from collections import Counter

    embeddings_dict = build_embeddings_dict(embedding_path)
    if not embeddings_dict:
        raise ValueError('no embeddings could be read from %r' % (embedding_path,))

    # Take the most common vector length instead of probing one hard-coded
    # word: that word may be missing, and a stray header or malformed line
    # must not decide the width.
    dimension_counts = Counter(len(vector) for vector in embeddings_dict.values())
    embedding_dimension = dimension_counts.most_common(1)[0][0]

    # Size the matrix from word_index itself rather than a notebook-level
    # global, so every index is guaranteed to be a valid row.
    word_num = max(word_index.values(), default=0)
    embedding_matrix = initilize_embedding_matrix(word_num, embedding_dimension)

    for word, i in word_index.items():
        vector = embeddings_dict.get(word)
        # Skip missing words (they keep their random row) and odd-length
        # vectors, which NumPy could otherwise silently broadcast into the row.
        if vector is not None and len(vector) == embedding_dimension:
            embedding_matrix[i] = vector
    return embedding_matrix

In [155]:
# Load the training set from the configured path (no duplicated literal).
trainning_data = pd.read_csv(TRAINNING_PATH)

# read_csv turns empty cells (and strings such as "NA"/"null") into NaN,
# which the tokenizer cannot lower-case; treat them as empty comments.
comment_texts = trainning_data.comment_text.fillna('')

# Fit the vocabulary on the comments, then encode and pad them to MAX_LEN.
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(comment_texts)
token_sequences = tokenizer.texts_to_sequences(comment_texts)
x_train = sequence.pad_sequences(token_sequences, maxlen=MAX_LEN)

word_num = len(tokenizer.word_index)
y_train = trainning_data.target
embedding_matrix = build_embedding_matrix(tokenizer.word_index, EMBEDDING_PATH)

Building embedding matrix


In [174]:
def build_model(
    embedding_matrix,
    LSTM_UNITS = 128,
    DENSE_HIDDEN_UNITS = 512
):
    """Assemble and compile the bidirectional-LSTM classifier.

    Pipeline: frozen embedding lookup -> spatial dropout (0.3) -> two stacked
    bidirectional CuDNN LSTMs -> concatenated global max / average pooling ->
    two residual ReLU dense blocks -> single sigmoid unit.

    Args:
        embedding_matrix: 2-D array whose shape gives the Embedding layer's
            input and output dimensions; used as its fixed (non-trainable)
            weights.
        LSTM_UNITS: units per direction in each LSTM layer.
        DENSE_HIDDEN_UNITS: width of each residual dense layer. Its output is
            added to the pooled features, so the widths must be compatible;
            with Keras' default Bidirectional merge mode the pooled width is
            presumably 4 * LSTM_UNITS (512 with the defaults) -
            NOTE(review): confirm before changing either value.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss and the
        Adam optimizer; its input is a ``MAX_LEN``-long sequence.
    """
    token_ids = Input(shape=(MAX_LEN,))

    # Pre-trained vectors are kept fixed during training.
    embedding_layer = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)
    features = embedding_layer(token_ids)
    features = SpatialDropout1D(0.3)(features)

    # Two stacked recurrent layers, both returning the full sequence.
    for _ in range(2):
        features = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(features)

    # Collapse the time axis in two ways and join the summaries.
    max_pooled = GlobalMaxPooling1D()(features)
    avg_pooled = GlobalAveragePooling1D()(features)
    pooled = concatenate([max_pooled, avg_pooled])

    # Two residual blocks: pooled + relu(Dense(pooled)).
    for _ in range(2):
        dense_out = Dense(DENSE_HIDDEN_UNITS, activation='relu')(pooled)
        pooled = add([pooled, dense_out])

    probability = Dense(1, activation='sigmoid')(pooled)

    classifier = Model(inputs=token_ids, outputs=probability)
    classifier.compile(loss='binary_crossentropy', optimizer='adam')
    return classifier

In [175]:
# Build and compile the network on top of the pre-computed embedding matrix.
model = build_model(embedding_matrix)

In [177]:
# Hold out 20% of the samples for validation; the fixed seed keeps the split reproducible.
x_tr, x_val, y_tr, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

In [None]:
BATCH_SIZE = 512
# validation_data is documented as a tuple (x_val, y_val); a list is not
# unpacked the same way by every Keras / tf.keras release.
model.fit(
    x_tr,
    y_tr,
    batch_size=BATCH_SIZE,
    validation_data=(x_val, y_val),
)

Train on 1443899 samples, validate on 360975 samples
Epoch 1/1