In [1]:
import numpy as np
import os
import csv
from random import random, sample, seed
from datetime import datetime
import psutil
import resource
import gc

import tensorflow as tf

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer
from tensorflow.keras import Input, Model, Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, concatenate, Activation
from tensorflow.keras.layers import Masking, Dropout, Reshape
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import CSVLogger, TensorBoard, Callback

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Constants shared by the cells below.
data_path = 'data/submissions.csv' # CSV read with csv.DictReader; columns used: title, hour, minute, dayofweek, dayofyear, is_top_submission
embeddings_path = 'data/glove.6B.50d.txt' # text file parsed as "<token> <v1> <v2> ..." per line
maxlen = 20 # max input length: titles are passed through pad_sequences with this length, and it is the model's input width
batch_size = 32 # mini-batch size passed to model.fit
embedding_dims = 50 # word embedding dim; presumably must equal the vector length in embeddings_path ("50d") - confirm
meta_embedding_dims = 64 # metadata embedding dim; only referenced by the currently commented-out metadata layers
epochs = 10 # number of epochs passed to model.fit

In [3]:
# Read at most `max_rows` submissions from the CSV into parallel per-column lists,
# then convert each list to a NumPy array (metadata and label columns are parsed as ints).
max_rows = 16000

titles = []
hours = []
minutes = []
dayofweeks = []
dayofyears = []
is_top_submission = []

# newline='' is what the csv module requires of file objects it reads, so that
# quoted fields containing line breaks are parsed as a single field.
with open(data_path, 'r', encoding="latin1", newline='') as f:
    reader = csv.DictReader(f)
    for row_idx, submission in enumerate(reader):
        if row_idx >= max_rows:
            break
        titles.append(submission['title'])
        hours.append(submission['hour'])
        minutes.append(submission['minute'])
        dayofweeks.append(submission['dayofweek'])
        dayofyears.append(submission['dayofyear'])
        is_top_submission.append(submission['is_top_submission'])

# Titles stay as strings; every other column is converted from its CSV text to int.
titles = np.array(titles)
hours = np.array(hours, dtype=int)
minutes = np.array(minutes, dtype=int)
dayofweeks = np.array(dayofweeks, dtype=int)
dayofyears = np.array(dayofyears, dtype=int)
is_top_submission = np.array(is_top_submission, dtype=int)

In [4]:
# Overwrite the labels in positions [max_rows // 2, max_rows) with 0.
# NOTE(review): this discards the real labels for those rows - it looks like a
# debugging experiment; confirm it is intended before trusting any metric.
# A slice assignment replaces the element-wise loop and is clipped to the array
# length, so it does not raise when fewer than `max_rows` rows were loaded.
is_top_submission[max_rows // 2:max_rows] = 0

In [5]:
# Vocabulary cap: handed to the tokenizer as its `num_words` limit and reused
# later to size the embedding weight matrix.
max_features = 40000

# Build the word -> integer index vocabulary from the raw title strings.
word_tokenizer = Tokenizer(num_words=max_features)
word_tokenizer.fit_on_texts(titles)


In [6]:
# Convert every title to its sequence of word indices, then run the sequences
# through pad_sequences with length `maxlen` to obtain one 2-D integer array.
titles_tf = sequence.pad_sequences(
    word_tokenizer.texts_to_sequences(titles),
    maxlen=maxlen,
)

In [7]:
# Parse the pretrained embedding text file into a {token: vector} lookup.
# Each line is "<token> <v1> <v2> ...", separated by single spaces.
embedding_vectors = {}

# The encoding is given explicitly so parsing does not depend on the platform's
# default encoding (GloVe text files are distributed as UTF-8 - confirm for other files).
with open(embeddings_path, 'r', encoding='utf-8') as f:
    for line in f:
        word, *components = line.strip().split(" ")
        if not components:
            # Blank line or a token with no vector: nothing usable to store.
            continue
        embedding_vectors[word] = np.array(components, dtype=float)

In [8]:
# Row i holds the pretrained vector of the word the tokenizer maps to index i.
# Rows for words without a pretrained vector (and row 0, which no word maps to
# in this loop unless the tokenizer assigns index 0) stay all-zero.
# The width comes from `embedding_dims` instead of a repeated literal 50, so the
# matrix stays in sync with the configured embedding size.
weights_matrix = np.zeros((max_features + 1, embedding_dims))

for word, i in word_tokenizer.word_index.items():
    if i > max_features:
        # Index falls outside the matrix; skip before doing the dict lookup.
        continue
    embedding_vector = embedding_vectors.get(word)
    if embedding_vector is not None:
        weights_matrix[i] = embedding_vector

In [9]:
# Zero-based day of year: shift every value down by one
# (assumes the CSV stores days starting at 1 - TODO confirm).
dayofyears_tf = np.subtract(dayofyears, 1)

In [11]:
# Start from a clean TF1 graph/session state so this cell can be re-run safely.
# A session left over from a previous run of this cell is closed first: otherwise
# each re-run leaks an InteractiveSession, and resetting the default graph while
# a session is still active is documented by TensorFlow as undefined behaviour.
_stale_sess = globals().get('sess')
if _stale_sess is not None:
    _stale_sess.close()

tf.reset_default_graph()
tf.keras.backend.clear_session()
sess = tf.InteractiveSession()

In [34]:
def build_model():
    """Build and compile the single-input title model.

    Only the auxiliary head is wired up: the `maxlen`-wide title input feeds one
    sigmoid unit named ``aux_out``. The fuller architecture (title embedding with
    average pooling, four metadata embeddings, a 256-unit hidden layer and a
    ``main_out`` head) is kept below as disabled reference code.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    titles_input = Input(shape=(maxlen,), name='titles_input')
    aux_output = Dense(1, activation='sigmoid', name='aux_out')(titles_input)

    # ---- Disabled reference architecture (not executed) ---------------------
    # titles_embedding = Embedding(max_features + 1, embedding_dims, weights=[weights_matrix])(titles_input)
    # titles_pooling = GlobalAveragePooling1D()(titles_embedding)
    #
    # hours_input = Input(shape=(1,), name='hours_input')
    # hours_embedding = Embedding(24, meta_embedding_dims)(hours_input)
    # hours_reshape = Reshape((meta_embedding_dims,))(hours_embedding)
    #
    # dayofweeks_input = Input(shape=(1,), name='dayofweeks_input')
    # dayofweeks_embedding = Embedding(7, meta_embedding_dims)(dayofweeks_input)
    # dayofweeks_reshape = Reshape((meta_embedding_dims,))(dayofweeks_embedding)
    #
    # minutes_input = Input(shape=(1,), name='minutes_input')
    # minutes_embedding = Embedding(60, meta_embedding_dims)(minutes_input)
    # minutes_reshape = Reshape((meta_embedding_dims,))(minutes_embedding)
    #
    # dayofyears_input = Input(shape=(1,), name='dayofyears_input')
    # dayofyears_embedding = Embedding(366, meta_embedding_dims)(dayofyears_input)
    # dayofyears_reshape = Reshape((meta_embedding_dims,))(dayofyears_embedding)
    #
    # merged = concatenate([titles_pooling, hours_reshape, dayofweeks_reshape,
    #                       minutes_reshape, dayofyears_reshape])
    #
    # hidden_1 = Dense(256, activation='relu')(merged)
    # hidden_1 = BatchNormalization()(hidden_1)
    #
    # main_output = Dense(1, activation='sigmoid', name='main_out')(hidden_1)
    #
    # model = Model(inputs=[titles_input, hours_input, dayofweeks_input,
    #                       minutes_input, dayofyears_input],
    #               outputs=[main_output, aux_output])
    # -------------------------------------------------------------------------

    model = Model(inputs=[titles_input], outputs=[aux_output])
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [35]:

# Instantiate the compiled model returned by build_model().
model = build_model()

# Print the per-layer table (output shapes and parameter counts).
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
titles_input (InputLayer)    [(None, 20)]              0         
_________________________________________________________________
aux_out (Dense)              (None, 1)                 21        
Total params: 21
Trainable params: 21
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Shuffle all model inputs and the labels with one shared permutation so the
# rows stay aligned. The RNG is seeded first, making the permutation repeatable.
seed(123)
split = 0.2  # later passed to model.fit as validation_split

# sample() over the full index range with k == population size yields every
# index exactly once, in random order.
n_samples = titles_tf.shape[0]
idx = sample(range(n_samples), n_samples)

# NOTE: `titles` and `dayofyears` are not reordered here, only the arrays below.
titles_tf, hours, dayofweeks, minutes, dayofyears_tf, is_top_submission = (
    column[idx]
    for column in (titles_tf, hours, dayofweeks, minutes, dayofyears_tf, is_top_submission)
)

In [15]:
class MemoryCallback(Callback):
    """Keras callback that prints the current process's memory info after each epoch."""

    def on_epoch_end(self, epoch, log=None):
        """Force a garbage-collection pass, then print the process memory info.

        Args:
            epoch: Epoch index supplied by the training loop (unused).
            log: Metrics mapping supplied by the training loop (unused). Defaults
                to ``None`` instead of a shared mutable ``{}``.
        """
        # Collect first so the printed figures are not inflated by garbage that
        # is merely waiting to be freed.
        gc.collect()
        process = psutil.Process(os.getpid())
        print('\n memory: {} \n'.format(process.memory_info()))

In [12]:
# Minimal single-layer baseline on the `maxlen`-wide title input.
# NOTE: this rebinds `model`, replacing any model assigned to that name earlier.
# The output unit uses a sigmoid so it emits probabilities, which is what the
# 'binary_crossentropy' loss expects by default; it also matches the `aux_out`
# head defined in build_model().
model = Sequential()
model.add(Dense(1, activation='sigmoid', input_shape=(maxlen,)))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [16]:
# Disabled: fit call for the multi-input / two-output variant of the model.
# history = model.fit([titles_tf, hours, dayofweeks, minutes, dayofyears_tf],
#                     [is_top_submission, is_top_submission],
#                     batch_size=batch_size,
#                     epochs=epochs,
#                     validation_split=split,
#                     callbacks=[MemoryCallback()])

# Train the single-input model on the title sequences; MemoryCallback prints
# the process memory info at the end of every epoch.
fit_options = dict(
    batch_size=batch_size,
    epochs=epochs,
    validation_split=split,
    callbacks=[MemoryCallback()],
)
history = model.fit([titles_tf], [is_top_submission], **fit_options)

Train on 12800 samples, validate on 3200 samples
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/10
 memory: pmem(rss=647008256, vms=4368171008, shared=122826752, text=2322432, lib=0, data=917622784, dirty=0) 

Epoch 2/10
 memory: pmem(rss=648089600, vms=4368171008, shared=122826752, text=2322432, lib=0, data=918593536, dirty=0) 

Epoch 3/10
 memory: pmem(rss=648630272, vms=4368171008, shared=122830848, text=2322432, lib=0, data=919441408, dirty=0) 

Epoch 4/10
 memory: pmem(rss=649711616, vms=4368171008, shared=122830848, text=2322432, lib=0, data=920317952, dirty=0) 

Epoch 5/10
 memory: pmem(rss=650792960, vms=4368171008, shared=122830848, text=2322432, lib=0, data=921227264, dirty=0) 

Epoch 6/10
 memory: pmem(rss=651603968, vms=4368171008, shared=122830848, text=2322432, lib=0, data=922124288, dirty=0) 

Epoch 7/10
 memory: pmem(rss=652414976, vms=4368171008, shared=122830848, text=2322432, lib=0, data=923037696, dirty=0) 

Epo