In [1]:
from keras import callbacks
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers.recurrent import LSTM 
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Embedding, TimeDistributed
from keras import optimizers
import matplotlib.pyplot as plt
import pickle
import re
import pandas as pd
import numpy as np
import zipfile
import gzip

Using TensorFlow backend.


In [3]:
# df has dimensions (506548, 75)
# df.to_pickle('data/3days_df.gzip','gzip')
temp= gzip.open('data/3days_df.gzip')
df=pd.read_pickle(temp)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,66,67,68,69,70,71,72,73,74,75
0,b'32B0630488554327,1509559350,93d8663c-404f-4877-a665-c8797d8ae8e5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,b'32B0630488568972,1509572912,9ab4430a-5a6b-4d00-804e-74634be17a74,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,b'32B0630488568974,1509462399,57f7fffe-1c02-4a73-90ed-1c46bde8c3e6,1509540077,edbf9d69-80ac-4b62-95a8-665792899168,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,b'32C0050489325572,1509394813,25163878-7573-4779-9b2e-63fda24ac838,1509574002,c1d1dcef-9552-4fb8-b812-ec7455b67c7e,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,b'32C0060489462801,1509554398,b79710af-5540-49f5-993f-35c3235740e1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
only_uuids = df.loc[:,::2] #select only uuid 
only_timestamps = df.loc[:,1::2]  #select only timestamp 

Create a dictionary so each uuid has a unique id and a reverse dictionary so we can give an id and get the uuid back

In [182]:
# max_steps=76
# set_uuid = set([0])

# for i in range(2,max_steps,2): #start from 2 because 1 is the device id 
#     set_uuid.update(only_uuids.loc[:,i])

# vocab_size = len(set_uuid) #length of vocabulary
# dictionary={}  # uuid:int
# reversed_dictionary={}  #int:uuid
# # convert uuid to numbers 
# for i,value in enumerate(set_uuid):
#     reversed_dictionary[i] = value
#     dictionary[value]=i


In [209]:
# pickle.dump(dictionary,open('data/dictionary.pkl','wb'))
# pickle.dump(dictionary,open('data/rev_dictionary.pkl','wb'))

dictionary=pickle.load(open('data/dictionary.pkl','rb'))
reversed_dictionary=pickle.load(open('data/rev_dictionary.pkl','rb'))
vocab_size =len(dictionary)

8472

In [183]:
list(reversed_dictionary.keys())[:5]

[0, 1, 2, 3, 4]

##################### Data Filtering  #####################

Here we subset the data by how many movies a particular customer has watched and convert it to a matrix.
We don't use the device id currently

In [184]:
watched_more = 4
watched_less = 10 #len(only_uuids.columns)
four_or_more = only_uuids.loc[(np.sum(only_uuids!=0,axis=1)>watched_more) & (np.sum(only_uuids!=0,axis=1)<watched_less),:]
four_or_more=four_or_more.loc[:,2:].as_matrix() #0 is device id

In [187]:
# integer encode input data
input_encoded_all=[]
for i in range(len(four_or_more)):
    input_encoded_all.append(list(filter(lambda x: x!=0 ,[dictionary[uuid] for uuid in four_or_more[i]])))

 
#pad sequences
max_len = max([len(seq) for seq in input_encoded_all])
padded_data = pad_sequences(input_encoded_all,maxlen=max_len,padding='pre')

In [188]:
padded_data[:3]

array([[   0,    0,    0,    0, 6294, 6294, 6294, 6294],
       [   0,    0,    0,    0, 8065, 8052, 7451, 3683],
       [   0,    0,    0,    0,  925, 3125, 7776, 3433]], dtype=int32)

In [189]:
window_size = watched_more +1 
new_data=[]
new_target=[]
def data_gen(padded_data):
    for i in range(len(padded_data)):
        offset=0
        if window_size - sum(padded_data[0]==0) ==1:  #this means that we always have atleast two 
            offset=1
        while window_size + offset < max_len:
            new_data.append(padded_data[i][offset:window_size+offset])
            new_target.append(padded_data[i][offset+1:window_size+offset+1])
            offset +=1
    return new_data,new_target

inputs,targets= data_gen(padded_data)

In [191]:
inputs[:5]
# len(inputs)

[array([   0,    0,    0, 6294, 6294], dtype=int32),
 array([   0,    0, 6294, 6294, 6294], dtype=int32),
 array([   0,    0,    0, 8065, 8052], dtype=int32),
 array([   0,    0, 8065, 8052, 7451], dtype=int32),
 array([   0,    0,    0,  925, 3125], dtype=int32)]

In [192]:
targets[:5]

[array([   0,    0, 6294, 6294, 6294], dtype=int32),
 array([   0, 6294, 6294, 6294, 6294], dtype=int32),
 array([   0,    0, 8065, 8052, 7451], dtype=int32),
 array([   0, 8065, 8052, 7451, 3683], dtype=int32),
 array([   0,    0,  925, 3125, 7776], dtype=int32)]

In [193]:
# batch generation
num_steps=window_size
def generate(inputs,targets,batch_size,num_steps):
    x = np.zeros((batch_size, num_steps))
    y = np.zeros((batch_size,num_steps,vocab_size))
    data_len= len(inputs[0])
    while True:
        for i in range(batch_size):
            x[i, :] = inputs[i]
            temp_y = targets[i]
            # convert all of temp_y into a one hot representation
            y[i, :,:] = to_categorical(temp_y, num_classes=vocab_size)
        yield x, y


##################### LSTM model #####################

In [201]:
# I have subset the data even more to just 5000 samples just for quicker iterations
# and validation is 200 samples 
n=round(0.95 * len(inputs))
np.random.seed(1)
idx = np.random.choice(len(inputs),size=len(inputs))
training_idx = idx[:n]
val_idx = idx[n:]

val_data = [inputs[i] for i in val_idx]
val_targets = [targets[i] for i in val_idx]
train_data = [inputs[i] for i in training_idx]
train_targets = [targets[i] for i in training_idx]


In [213]:
# train_data[0]
len(train_data)

128269

In [196]:
train_targets[0]

array([   0, 4485, 2151, 2247,  212], dtype=int32)

In [218]:
hidden_size=400 #number of hidden nodes in LSTM blocks (i.e. number of nodes for the input, forget and output gates )
batch_size =100

model = Sequential() #initialise a sequential model
model.add(Embedding(vocab_size, hidden_size, input_length=num_steps)) #put the uuid into a vector representation with length equal to the number of hidden nodes
model.add(LSTM(hidden_size, return_sequences=True)) # first lstm layer, we return all nodes output
model.add(LSTM(hidden_size,return_sequences=True)) 
model.add(Dropout(0.2)) 
model.add(LSTM(hidden_size,return_sequences=True)) 
model.add(Dropout(0.3))
model.add(LSTM(hidden_size, return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(hidden_size, return_sequences=True))# last lstm layer, we return (all nodes output) this is optionally but I think its benefial. We can discus?
model.add(TimeDistributed(Dense(vocab_size))) # apply a fully connected nn to each output 
model.add(Activation('softmax')) #logits --> probs 



In [219]:
# compile the model 
model.compile(loss='categorical_crossentropy', optimizer='adam' ,metrics=['categorical_accuracy'])

In [220]:
# train the model 

model.fit_generator(generate(inputs=train_data,targets=train_targets,batch_size=batch_size,num_steps=num_steps), n//(batch_size), 10000,
                    validation_data=generate(val_data,targets=val_targets,batch_size=batch_size,num_steps=num_steps),
                        validation_steps=len(val_data)//batch_size)

Epoch 1/10000
  69/1282 [>.............................] - ETA: 19:56 - loss: 5.6900 - categorical_accuracy: 0.1564

KeyboardInterrupt: 

In [200]:
# number of predictions wanted 
num_predict = 2

true_print_out = []
pred_print_out = []
for i in range(num_predict):
    data = next(generate(inputs=inputs,targets=targets,batch_size=1,num_steps=num_steps), 1)
    prediction = model.predict(data[0])
    actual = data[1][0]
    predict_idx = np.argmax(prediction[:, -1, :])
    actual = np.argmax(actual[num_steps-1, :])
    true_print_out.append(reversed_dictionary[actual])
    pred_print_out.append(reversed_dictionary[predict_idx])
print('actual :',true_print_out)
print('predicted :',pred_print_out)

actual : ['ea1f7105-f2a6-444d-998c-c72dd8f76fa0', 'ea1f7105-f2a6-444d-998c-c72dd8f76fa0']
predicted : ['139c7233-19a9-420a-9b33-04fc28ab8052', '139c7233-19a9-420a-9b33-04fc28ab8052']


In [181]:
data[1].shape

(1, 5, 8472)

In [180]:
targets[0]

array([ 0.        ,  0.        ,  0.74291784,  0.74291784,  0.74291784], dtype=float32)