In [39]:
import numpy as np
import os
import csv
from random import random, sample, seed
from datetime import datetime
import psutil
import resource

from keras import backend as K
from keras.preprocessing import sequence
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.models import Input, Model
from keras.layers import Dense, Embedding, GlobalAveragePooling1D, concatenate, Activation
from keras.layers.core import Masking, Dropout, Reshape
from keras.layers.normalization import BatchNormalization
from keras.utils.vis_utils import model_to_dot
from keras.utils import plot_model
from keras.callbacks import CSVLogger, TensorBoard, Callback

In [44]:
def build_model():
    titles_input = Input(shape=(maxlen,), name='titles_input')
    titles_embedding = Embedding(max_features + 1, embedding_dims, weights=[weights_matrix])(titles_input)
    titles_pooling = GlobalAveragePooling1D()(titles_embedding)
    
    aux_output = Dense(1, activation='sigmoid', name='aux_out')(titles_pooling)
    
    hours_input = Input(shape=(1,), name='hours_input')
    hours_embedding = Embedding(24, meta_embedding_dims)(hours_input)
    hours_reshape = Reshape((meta_embedding_dims,))(hours_embedding)

    dayofweeks_input = Input(shape=(1,), name='dayofweeks_input')
    dayofweeks_embedding = Embedding(7, meta_embedding_dims)(dayofweeks_input)
    dayofweeks_reshape = Reshape((meta_embedding_dims,))(dayofweeks_embedding)

    minutes_input = Input(shape=(1,), name='minutes_input')
    minutes_embedding = Embedding(60, meta_embedding_dims)(minutes_input)
    minutes_reshape = Reshape((meta_embedding_dims,))(minutes_embedding)

    dayofyears_input = Input(shape=(1,), name='dayofyears_input')
    dayofyears_embedding = Embedding(366, meta_embedding_dims)(dayofyears_input)
    dayofyears_reshape = Reshape((meta_embedding_dims,))(dayofyears_embedding)
    
    merged = concatenate([titles_pooling, hours_reshape, dayofweeks_reshape, minutes_reshape, dayofyears_reshape])

    hidden_1 = Dense(256, activation='relu')(merged)
    hidden_1 = BatchNormalization()(hidden_1)

    main_output = Dense(1, activation='sigmoid', name='main_out')(hidden_1)
    
    model = Model(inputs=[titles_input,
                      hours_input,
                      dayofweeks_input,
                      minutes_input,
                      dayofyears_input], outputs=[main_output, aux_output])

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'],
                  loss_weights=[1, 0.2])
    return model

In [41]:
def get_mem_usage():                                                                                                                               
    process = psutil.Process(os.getpid())                                                                                                          
    return process.memory_info()  

In [42]:
for _ in range(10):
    model = build_model()
    print(get_mem_usage())
    K.clear_session()

pmem(rss=20657205248, vms=24420659200, shared=141750272, text=2322432, lib=0, data=20957470720, dirty=0)
pmem(rss=20668616704, vms=24507568128, shared=141750272, text=2322432, lib=0, data=20977401856, dirty=0)
pmem(rss=20672032768, vms=24510976000, shared=141750272, text=2322432, lib=0, data=20980809728, dirty=0)
pmem(rss=20672581632, vms=24511500288, shared=141750272, text=2322432, lib=0, data=20981334016, dirty=0)
pmem(rss=20672581632, vms=24511500288, shared=141750272, text=2322432, lib=0, data=20981334016, dirty=0)
pmem(rss=20675432448, vms=24514383872, shared=141750272, text=2322432, lib=0, data=20984217600, dirty=0)
pmem(rss=20679094272, vms=24518053888, shared=141750272, text=2322432, lib=0, data=20987887616, dirty=0)
pmem(rss=20682506240, vms=24521461760, shared=141750272, text=2322432, lib=0, data=20991295488, dirty=0)
pmem(rss=20672397312, vms=24511238144, shared=141750272, text=2322432, lib=0, data=20981071872, dirty=0)
pmem(rss=20672397312, vms=24511238144, shared=141750272

In [9]:
data_path = 'data/submissions.csv'
embeddings_path = 'data/glove.6B.50d.txt'

In [10]:
titles = []
hours = []
minutes = []
dayofweeks = []
dayofyears = []
is_top_submission = []

# max_rows = 16000 # my crappy computer can't handle too many rows

with open(data_path, 'r', encoding="latin1") as f:
    reader = csv.DictReader(f)
    i = 0
    for submission in reader:
#         if i >= max_rows:
#             break
        i += 1
        titles.append(submission['title'])
        hours.append(submission['hour'])
        minutes.append(submission['minute'])
        dayofweeks.append(submission['dayofweek'])
        dayofyears.append(submission['dayofyear'])
        is_top_submission.append(submission['is_top_submission'])
            
titles = np.array(titles)
hours = np.array(hours, dtype=int)
minutes = np.array(minutes, dtype=int)
dayofweeks = np.array(dayofweeks, dtype=int)
dayofyears = np.array(dayofyears, dtype=int)
is_top_submission = np.array(is_top_submission, dtype=int)

In [11]:
print(titles[0:2])
print(titles.shape)
print(hours[0:2])
print(minutes[0:2])
print(dayofweeks[0:2])
print(dayofyears[0:2])
print(is_top_submission[0:2])

['People who downloaded their Google data and went through it, what were the most unsettling things you found out they had stored about you?'
 "Have you ever felt you don't know/have forgotten who you really are? That you've spent years just adapting to surroundings to make life easier and don't know what's the real you anymore? If so, how did you overcome this?"]
(1397703,)
[22  3]
[45 53]
[3 4]
[219 206]
[1 1]


In [12]:
1 - np.mean(is_top_submission)

0.6742076106297261

In [13]:
max_features = 40000

word_tokenizer = Tokenizer(max_features)
word_tokenizer.fit_on_texts(titles)

print(str(word_tokenizer.word_counts)[0:100])
print(str(word_tokenizer.word_index)[0:100])
print(len(word_tokenizer.word_counts))   # true word count

OrderedDict([('people', 148888), ('who', 120348), ('downloaded', 187), ('their', 35401), ('google', 
{'you': 1, 'what': 2, 'the': 3, 'to': 4, 'a': 5, 'of': 6, 'your': 7, 'is': 8, 'do': 9, 'and': 10, 'i
127351


In [14]:
titles_tf = word_tokenizer.texts_to_sequences(titles)

print(titles_tf[0])

[20, 26, 5020, 73, 736, 1626, 10, 392, 224, 15, 2, 54, 3, 30, 3319, 95, 1, 187, 59, 62, 47, 6354, 35, 1]


In [15]:
maxlen = 20
titles_tf = sequence.pad_sequences(titles_tf, maxlen=maxlen)

print(titles_tf[0])

[ 736 1626   10  392  224   15    2   54    3   30 3319   95    1  187
   59   62   47 6354   35    1]


In [16]:
embedding_vectors = {}

with open(embeddings_path, 'r') as f:
    for line in f:
        line_split = line.strip().split(" ")
        vec = np.array(line_split[1:], dtype=float)
        word = line_split[0]
        embedding_vectors[word] = vec
        
print(embedding_vectors['you'])

[-1.0919e-03  3.3324e-01  3.5743e-01 -5.4041e-01  8.2032e-01 -4.9391e-01
 -3.2588e-01  1.9972e-03 -2.3829e-01  3.5554e-01 -6.0655e-01  9.8932e-01
 -2.1786e-01  1.1236e-01  1.1494e+00  7.3284e-01  5.1182e-01  2.9287e-01
  2.8388e-01 -1.3590e+00 -3.7951e-01  5.0943e-01  7.0710e-01  6.2941e-01
  1.0534e+00 -2.1756e+00 -1.3204e+00  4.0001e-01  1.5741e+00 -1.6600e+00
  3.7721e+00  8.6949e-01 -8.0439e-01  1.8390e-01 -3.4332e-01  1.0714e-02
  2.3969e-01  6.6748e-02  7.0117e-01 -7.3702e-01  2.0877e-01  1.1564e-01
 -1.5190e-01  8.5908e-01  2.2620e-01  1.6519e-01  3.6309e-01 -4.5697e-01
 -4.8969e-02  1.1316e+00]


In [17]:
weights_matrix = np.zeros((max_features + 1, 50))

for word, i in word_tokenizer.word_index.items():

    embedding_vector = embedding_vectors.get(word)
    if embedding_vector is not None and i <= max_features:
        weights_matrix[i] = embedding_vector

# index 0 vector should be all zeroes, index 1 vector should be the same one as above
print(weights_matrix[0:2,:])

[[ 0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00
   0.0000e+00  0.0000e+00]
 [-1.0919e-03  3.3324e-01  3.5743e-01 -5.4041e-01  8.2032e-01 -4.9391e-01
  -3.2588e-01  1.9972e-03 -2.3829e-01  3.5554e-01 -6.0655e-01  9.8932e-01
  -2.1786e-01  1.1236e-01  1.1494e+00  7.3284e-01  5.1182e-01  2.9287e-01
   2.8388e-01 -1.3590e+00 -3.7951e-01  5.0943e-01  7.0710e-01  6.2941e-01
   1.0534e+00 -2.1756e+00 -1.3204e+00  4.0001e-01  1.5741e+00 -1.6600e+00
   3.7721e+

In [18]:
dayofyears_tf = dayofyears - 1

print(dayofyears_tf[0:10])

[218 205 220 209 234 190 233 224 206 151]


In [19]:
batch_size = 32
embedding_dims = 50
epochs = 10

In [20]:
K.clear_session()

W0926 03:13:14.257246 140597914457920 deprecation_wrapper.py:119] From /home/david/anaconda3/envs/capstone/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:95: The name tf.reset_default_graph is deprecated. Please use tf.compat.v1.reset_default_graph instead.

W0926 03:13:14.258311 140597914457920 deprecation_wrapper.py:119] From /home/david/anaconda3/envs/capstone/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:98: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0926 03:13:14.409564 140597914457920 deprecation_wrapper.py:119] From /home/david/anaconda3/envs/capstone/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:102: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.



In [21]:
titles_input = Input(shape=(maxlen,), name='titles_input')
titles_embedding = Embedding(max_features + 1, embedding_dims, weights=[weights_matrix])(titles_input)
titles_pooling = GlobalAveragePooling1D()(titles_embedding)

W0926 03:13:14.415494 140597914457920 deprecation_wrapper.py:119] From /home/david/anaconda3/envs/capstone/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0926 03:13:14.418584 140597914457920 deprecation_wrapper.py:119] From /home/david/anaconda3/envs/capstone/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.



In [22]:
aux_output = Dense(1, activation='sigmoid', name='aux_out')(titles_pooling)

In [23]:
meta_embedding_dims = 64

hours_input = Input(shape=(1,), name='hours_input')
hours_embedding = Embedding(24, meta_embedding_dims)(hours_input)
hours_reshape = Reshape((meta_embedding_dims,))(hours_embedding)

dayofweeks_input = Input(shape=(1,), name='dayofweeks_input')
dayofweeks_embedding = Embedding(7, meta_embedding_dims)(dayofweeks_input)
dayofweeks_reshape = Reshape((meta_embedding_dims,))(dayofweeks_embedding)

minutes_input = Input(shape=(1,), name='minutes_input')
minutes_embedding = Embedding(60, meta_embedding_dims)(minutes_input)
minutes_reshape = Reshape((meta_embedding_dims,))(minutes_embedding)

dayofyears_input = Input(shape=(1,), name='dayofyears_input')
dayofyears_embedding = Embedding(366, meta_embedding_dims)(dayofyears_input)
dayofyears_reshape = Reshape((meta_embedding_dims,))(dayofyears_embedding)

In [24]:
merged = concatenate([titles_pooling, hours_reshape, dayofweeks_reshape, minutes_reshape, dayofyears_reshape])

hidden_1 = Dense(256, activation='relu')(merged)
hidden_1 = BatchNormalization()(hidden_1)

main_output = Dense(1, activation='sigmoid', name='main_out')(hidden_1)

In [25]:
model = Model(inputs=[titles_input,
                      hours_input,
                      dayofweeks_input,
                      minutes_input,
                      dayofyears_input], outputs=[main_output, aux_output])

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'],
              loss_weights=[1, 0.2])

model.summary()

W0926 03:13:15.757168 140597914457920 deprecation_wrapper.py:119] From /home/david/anaconda3/envs/capstone/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0926 03:13:15.781638 140597914457920 deprecation.py:323] From /home/david/anaconda3/envs/capstone/lib/python3.7/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
titles_input (InputLayer)       (None, 20)           0                                            
__________________________________________________________________________________________________
hours_input (InputLayer)        (None, 1)            0                                            
__________________________________________________________________________________________________
dayofweeks_input (InputLayer)   (None, 1)            0                                            
__________________________________________________________________________________________________
minutes_input (InputLayer)      (None, 1)            0                                            
__________________________________________________________________________________________________
dayofyears

In [27]:
seed(123)
split = 0.2

# returns randomized indices with no repeats
idx = sample(range(titles_tf.shape[0]), titles_tf.shape[0])

titles_tf = titles_tf[idx, :]
hours = hours[idx]
dayofweeks = dayofweeks[idx]
minutes = minutes[idx]
dayofyears_tf = dayofyears_tf[idx]
is_top_submission = is_top_submission[idx]

In [28]:
print(1 - np.mean(is_top_submission[:(int(titles_tf.shape[0] * split))]))

0.6732667954496674


In [29]:
# csv_logger = CSVLogger('training.csv')
logdir = "logs/scalars/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard = TensorBoard(log_dir=logdir)

In [30]:
model.load_weights('model.h5')

In [31]:
class MemoryCallback(Callback):
    def on_epoch_end(self, epoch, log={}):
        print(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

In [45]:
for _ in range(5):
    model = build_model()
    model.load_weights('model.h5')
    model.fit([titles_tf, hours, dayofweeks, minutes, dayofyears_tf], [is_top_submission, is_top_submission],
              batch_size=batch_size,
              epochs=1,
              validation_split=split,
              callbacks=[tensorboard, MemoryCallback()])
    model.save_weights('model.h5')
    print(get_mem_usage())
    K.clear_session()


Train on 1118162 samples, validate on 279541 samples
Epoch 1/1

KeyboardInterrupt: 

In [33]:
model.save_weights("model.h5")

In [34]:
def encode_text(text, maxlen):
    encoded = word_tokenizer.texts_to_sequences([text])
    return sequence.pad_sequences(encoded, maxlen=maxlen)

In [35]:
input_text = "Which movie's plot would drastically change if you removed a letter from its title?"
encoded_text = encode_text(input_text, maxlen)
input_hour = np.array([15])
input_minute = np.array([46])
input_dayofweek = np.array([1])
input_dayofyear = np.array([16 - 1])

model.predict([encoded_text, input_hour, input_dayofweek, input_minute, input_dayofyear])

[array([[0.63942415]], dtype=float32), array([[0.54909927]], dtype=float32)]

In [36]:
import gc

In [37]:
gc.collect()

116