# Train the Model

In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import psutil
import gc
import pickle
import datetime

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer
from tensorflow.keras import Input, Model, Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, concatenate, Activation, Dropout, BatchNormalization, Reshape
from tensorflow.keras.callbacks import TensorBoard, Callback, ModelCheckpoint, EarlyStopping


In [2]:
# Run configuration -- every tunable of this notebook lives in this cell.

# Identity / persistence.
# model_name is reused for the weights file, the checkpoint files and the
# TensorBoard run directory; it spells out this run's dropout / loss-weight
# setup, which is hard-coded in build_model.
model_name = 'model_dropout0.5_no_earlystop_lossweights1.0,0.2'
model_save_path = 'models/' + model_name + '.hdf5'
load_existing_model = False  # True: restore weights from model_save_path, skip GloVe bootstrap

# Title (text) input.
embeddings_path = 'data/glove.6B.50d.txt'  # pre-trained word vectors
embedding_dims = 50      # width of each word vector
max_features = 40000     # highest token index that gets an embedding row
maxlen = 20              # number of token ids per title fed to the model

# Metadata inputs (hour, day of week, minute, day of year).
meta_embedding_dims = 64  # width of each metadata embedding

# Optimisation.
batch_size = 32
epochs = 20


In [3]:
# Load data
# The pickle is read as a mapping of variable name -> object; each entry is
# published as a notebook-level variable. Presumably this is where the
# *_train / *_val arrays and `tokenizer` used by later cells come from --
# verify against the preprocessing notebook.
#
# NOTE(review): pickle.load can execute arbitrary code embedded in the file.
# Only load pickles produced by this project's own preprocessing step.
with open('loaded_data.pickle', 'rb') as file:
    payload = pickle.load(file)

# Fail loudly on keys that cannot become variable names instead of letting
# them slip into the namespace unnoticed.
invalid_keys = [key for key in payload if not (isinstance(key, str) and key.isidentifier())]
if invalid_keys:
    raise ValueError('payload keys are not valid variable names: {!r}'.format(invalid_keys))

# Update the namespace directly rather than exec()-ing assignment statements
# assembled from the keys: same resulting variables, no code built from data.
globals().update(payload)

In [5]:
# Load glove vectors to bootstrap embeddings if not loading existing model
if not load_existing_model:
    embedding_vectors = {}
    # Row i holds the vector for token index i. Rows stay all-zero for words
    # that have no GloVe entry. The width follows embedding_dims so the matrix
    # always matches the Embedding layer it initialises in build_model.
    weights_matrix = np.zeros((max_features + 1, embedding_dims))

    # The GloVe text files are UTF-8 and contain non-ASCII tokens; relying on
    # the platform default encoding can raise UnicodeDecodeError.
    with open(embeddings_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Each line is "<word> <v1> <v2> ... <vN>", separated by single spaces.
            line_split = line.strip().split(" ")
            vec = np.array(line_split[1:], dtype=float)
            word = line_split[0]
            embedding_vectors[word] = vec

    for word, i in tokenizer.word_index.items():
        embedding_vector = embedding_vectors.get(word)
        # Skip words without a pre-trained vector and indices beyond the
        # last row of weights_matrix.
        if embedding_vector is not None and i <= max_features:
            weights_matrix[i] = embedding_vector

In [6]:
def build_model(load_existing):
    """Build and compile the two-output title/metadata classifier.

    The title token ids are embedded and average-pooled. That pooled vector
    feeds a sigmoid auxiliary output directly and is also concatenated with
    four metadata embeddings (hour, day of week, minute, day of year) before
    a 256-unit hidden layer and the sigmoid main output.

    Args:
        load_existing: When truthy, the title embedding is created without
            initial weights and the whole model's weights are loaded from
            `model_save_path`. Otherwise the title embedding is initialised
            from `weights_matrix`.

    Returns:
        A compiled Keras `Model` with outputs `[main_out, aux_out]`, trained
        with binary cross-entropy weighted 1 and 0.2 respectively.
    """
    tf.keras.backend.clear_session()

    def metadata_branch(input_name, cardinality):
        """Create one scalar-id input and its flattened embedding."""
        branch_input = Input(shape=(1,), name=input_name)
        embedded = Embedding(cardinality, meta_embedding_dims)(branch_input)
        flattened = Reshape((meta_embedding_dims,))(embedded)
        return branch_input, flattened

    # --- Title branch --------------------------------------------------------
    titles_input = Input(shape=(maxlen,), name='titles_input')
    # Seed the embedding with weights_matrix only for a fresh model; a restored
    # model gets all of its weights from the saved file further down.
    embedding_kwargs = {} if load_existing else {'weights': [weights_matrix]}
    titles_embedding = Embedding(max_features + 1, embedding_dims, **embedding_kwargs)(titles_input)
    titles_pooling = GlobalAveragePooling1D()(titles_embedding)

    # Auxiliary head that sees the title representation only.
    aux_output = Dense(1, activation='sigmoid', name='aux_out')(titles_pooling)

    # --- Metadata branches ---------------------------------------------------
    # (input name, number of embedding rows), in the order the model expects
    # its inputs after the titles.
    # NOTE(review): 366 rows cover ids 0..365; if the day-of-year feature is
    # 1-based, day 366 of a leap year would be out of range -- confirm against
    # the preprocessing code.
    metadata_specs = [
        ('hours_input', 24),
        ('dayofweeks_input', 7),
        ('minutes_input', 60),
        ('dayofyears_input', 366),
    ]
    metadata_inputs = []
    metadata_features = []
    for input_name, cardinality in metadata_specs:
        branch_input, branch_features = metadata_branch(input_name, cardinality)
        metadata_inputs.append(branch_input)
        metadata_features.append(branch_features)

    # --- Main head -----------------------------------------------------------
    merged = concatenate([titles_pooling] + metadata_features)

    hidden = Dense(256, activation='relu')(merged)
    hidden = BatchNormalization()(hidden)
    hidden = Dropout(.5)(hidden)

    main_output = Dense(1, activation='sigmoid', name='main_out')(hidden)

    model = Model(inputs=[titles_input] + metadata_inputs,
                  outputs=[main_output, aux_output])

    if load_existing:
        model.load_weights(model_save_path)

    # loss_weights follow the order of `outputs`: main_out first, aux_out second.
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'],
                  loss_weights=[1, 0.2])

    return model

In [7]:
# Build the model (restoring saved weights when load_existing_model is True)
# and print its layer-by-layer summary.
model = build_model(load_existing_model)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
titles_input (InputLayer)       (None, 20)           0                                            
__________________________________________________________________________________________________
hours_input (InputLayer)        (None, 1)            0                                            
__________________________________________________________________________________________________
dayofweeks_input (InputLayer)   (None, 1)            0                                            
__________________________________________________________________________________________________
minutes_input (InputL

In [8]:
# Callback to hopefully keep memory leaks down
class MemoryCallback(Callback):
    """Run a garbage collection and report process memory after every epoch."""

    def on_epoch_end(self, epoch, log=None):
        """Collect garbage, then print this process's resident set size.

        Args:
            epoch: Index of the epoch that just finished (unused).
            log: Metrics passed in by Keras (unused). Defaults to None rather
                than a shared mutable dict.
        """
        gc.collect()
        process = psutil.Process(os.getpid())
        # rss is the first field of psutil's memory_info() tuple (bytes).
        print('\n Memory usage: {} \n'.format(process.memory_info().rss))

In [9]:
# TensorBoard run directory: model name plus a timestamp, so repeated runs of
# the same configuration do not overwrite each other's logs.
log_dir = "logs/scalars/{}_{}".format(model_name,
                                      datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
# {epoch} and {val_loss} are filled in by ModelCheckpoint at save time.
model_checkpoint_path = 'models/checkpoints/'+ model_name +'.{epoch:02d}-{val_loss:.2f}.hdf5'

# Create the checkpoint directory up front; otherwise the first checkpoint
# write can fail only after a whole epoch has already been trained.
os.makedirs(os.path.dirname(model_checkpoint_path), exist_ok=True)

callbacks = [
    MemoryCallback(),
    TensorBoard(log_dir=log_dir),
    ModelCheckpoint(model_checkpoint_path)
    # Early stopping is switched off for this run (hence 'no_earlystop' in
    # model_name). To re-enable it, add a comma above and uncomment:
#     EarlyStopping(monitor='val_loss', patience=1)
]

In [10]:
# Make sure the final weights file has somewhere to go *before* training, so a
# missing directory cannot throw away the finished run at save time.
save_dir = os.path.dirname(model_save_path)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)

# Inputs are passed in the same order as the model's inputs (titles, hours,
# day of week, minutes, day of year). The same label array is the target for
# both outputs (main_out and aux_out).
history = model.fit(
    x=[titles_train, hours_train, weekdays_train, minutes_train, dates_train],
    y=[is_top_submission_train, is_top_submission_train],
    batch_size=batch_size,
    epochs=epochs,
    validation_data=([titles_val, hours_val, weekdays_val, minutes_val, dates_val],
                     [is_top_submission_val, is_top_submission_val]),
    callbacks=callbacks
)

# Persist the weights of the final epoch (per-epoch checkpoints are written
# separately by the ModelCheckpoint callback).
model.save_weights(model_save_path)

Train on 1118162 samples, validate on 139770 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/20
 Memory usage: 1382109184 

Epoch 2/20
 Memory usage: 1401745408 

Epoch 3/20
 Memory usage: 1405374464 

Epoch 4/20
 Memory usage: 1411325952 

Epoch 5/20
 Memory usage: 1413468160 

Epoch 6/20
 Memory usage: 1415090176 

Epoch 7/20
 Memory usage: 1418854400 

Epoch 8/20
 Memory usage: 1426423808 

Epoch 9/20
 Memory usage: 1428025344 

Epoch 10/20
 Memory usage: 1429377024 

Epoch 11/20
 Memory usage: 1430978560 

Epoch 12/20
 Memory usage: 1432334336 

Epoch 13/20
 Memory usage: 1433935872 

Epoch 14/20
 Memory usage: 1439346688 

Epoch 15/20
 Memory usage: 1449058304 

Epoch 16/20
 Memory usage: 1456631808 

Epoch 17/20
 Memory usage: 1458774016 

Epoch 18/20
 Memory usage: 1460137984 

Epoch 19/20
 Memory usage: 1461219328 

Epoch 20/20
 Memory usage: 1462571008 



In [11]:
# Marker cell: its output shows that every cell above ran to completion.
print('done')

done
