In [1]:
from __future__ import absolute_import, division, print_function

import tensorflow as tf
from tensorflow import keras

import numpy as np

import scipy.io
from datetime import datetime, timedelta
import time
import sys, os
import itertools
import numpy
from collections import deque

In [2]:
matlab_filename = 'realitymining.mat'
print("Loading in matlab data - this takes a while and about 2gb memory")
matlab_obj = scipy.io.loadmat(matlab_filename)
print("Done loading matlab data.")

Loading in matlab data - this takes a while and about 2gb memory
Done loading matlab data.


In [3]:
def validSubjects(allSubjects):
    return [s for s in allSubjects if hasNumeric(s,'mac') and hasNumeric(s,'my_hashedNumber')]


# idDicts: subjects -> {int: subject}, {float: (int, subject)}, {int: (int, subject)}
# First hash is contiguousId: subjectObject
# second hash is macAddress: contiguousId, subjectObject
# third hash is hashedNumber: contiguousId, subjectObject
# because the id dictionaries reference the subject object, we can replace
# the array of subject objects with these dictionaries.

def idDicts(subjects):
    return (dict((i, s) for (i,s) in enumerate(subjects)),
        dict((getNumeric(s,'mac'), (i, s)) for (i,s) in enumerate(subjects)),
        dict((getNumeric(s, 'my_hashedNumber'), (i, s)) for (i,s) in enumerate(subjects)))

def hasNumeric(obj, field):
    try:
        obj[field][0][0]
        return True
    except:
        return False
    
def getNumeric(obj, field):
    return obj[field][0][0]

def hasArray(obj, field):
    try:
        obj[field][0]
        return True
    except:
        return False


def getArray(obj, field):
    return obj[field][0]

def convertDatetime(dt):
    return datetime.fromordinal(int(dt)) + timedelta(days=dt%1) - timedelta(days=366) - timedelta(hours=5)

In [4]:
print('Extracting valid subjects and creating id dictionaries.')
subjects = validSubjects(matlab_obj['s'][0])
idDictionaries = idDicts(subjects)
idDict, macDict, hashNumDict = idDictionaries

Extracting valid subjects and creating id dictionaries.


## Create dataset

In [5]:
# datetime, area.cell -> userID
adversaryData = []

for subjectID, subject in idDict.items():
    if hasArray(subject, 'locs'):
        for event in subject['locs']:
            try:
                timeplace = list(event)
#                 time = convertDatetime(timeplace[0])
                time = timeplace[0]
                place = timeplace[1]
                # assumes two people aren't texting at the exact same time from the same place
                if place != 0.0:
                    adversaryData.append([time, place, subjectID])
            except:
                pass
RNNinput = np.array(adversaryData)

In [11]:
RNNinput = RNNinput[RNNinput[:,0].argsort()]
extendedData = RNNinput
LSTMData = RNNinput

In [9]:
RNNinput[0:10]

array([[7.31947157e+05, 5.11940811e+03, 8.70000000e+01],
       [7.31947157e+05, 5.11940811e+03, 8.70000000e+01],
       [7.31947157e+05, 5.11940811e+03, 8.70000000e+01],
       [7.31964715e+05, 5.18860241e+03, 6.70000000e+01],
       [7.31964715e+05, 5.18840811e+03, 6.70000000e+01],
       [7.31964716e+05, 5.18840813e+03, 6.70000000e+01],
       [7.31964716e+05, 5.18840811e+03, 6.70000000e+01],
       [7.31964716e+05, 5.18842171e+03, 6.70000000e+01],
       [7.31964717e+05, 5.18840811e+03, 6.70000000e+01],
       [7.31964717e+05, 5.18840332e+03, 6.70000000e+01]])

## Vanilla Classifier

In [20]:
data = RNNinput[:,0:2]
labels = RNNinput[:,2]

In [21]:
len(RNNinput)

2847313

In [24]:
N = 6000
n = 5800
train_data = data[:][0:n]
train_labels = labels[:][0:n]
test_data = data[:][n:N]
test_labels = labels[:][n:N]
print("Training set: {}".format(train_data.shape))
print("Testing set:  {}".format(test_data.shape))

Training set: (5800, 2)
Testing set:  (200, 2)


In [25]:
model = keras.Sequential()
model.add(keras.layers.Dense(3, activation=tf.nn.relu))
model.add(keras.layers.Dense(3, activation=tf.nn.relu))
model.add(keras.layers.Dense(90, activation=tf.nn.softmax))
model.compile(optimizer=tf.train.AdamOptimizer(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(train_data, train_labels, epochs=300, verbose=0, batch_size=100)
test_loss, test_acc = model.evaluate(test_data, test_labels)
print(test_acc)

0.0


## Previous Location as a Feature

In [26]:
last_loc = np.append(0, extendedData[0:len(extendedData)-1,1])

In [27]:
extendedData = np.column_stack((extendedData, last_loc))

In [28]:
for i in range(len(extendedData)):
    if i != 0:
        if extendedData[i,2] != extendedData[i-1,2]:
            extendedData[i, 3] = 0

In [29]:
data = np.column_stack((extendedData[:,0:2], extendedData[:,3]))
labels = extendedData[:,2]

In [30]:
N = 5000
n = 4800
train_data = data[:][0:n]
train_labels = labels[:][0:n]
test_data = data[:][n:N]
test_labels = labels[:][n:N]
print("Training set: {}".format(train_data.shape))
print("Testing set:  {}".format(test_data.shape))

Training set: (4800, 3)
Testing set:  (200, 3)


In [99]:
model = keras.Sequential()
model.add(keras.layers.Dense(8, activation=tf.nn.relu))
model.add(keras.layers.Dense(8, activation=tf.nn.relu))
model.add(keras.layers.Dense(8, activation=tf.nn.relu))
model.add(keras.layers.Dense(90, activation=tf.nn.softmax))
model.compile(optimizer=tf.train.AdamOptimizer(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(train_data, train_labels, epochs=2000, verbose=0, batch_size=100)
test_loss, test_acc = model.evaluate(test_data, test_labels)
print(test_acc)

0.025


## RNN

In [32]:
data = LSTMData[:,0:2]
labels = LSTMData[:,2]

In [67]:
N = 5000
n = 4800
train_data = data[:][0:n]
train_labels = labels[:][0:n]
test_data = data[:][n:N]
test_labels = labels[:][n:N]
print("Training set: {}".format(train_data.shape))
print("Testing set:  {}".format(test_data.shape))

Training set: (4800, 2)
Testing set:  (200, 2)


In [68]:
train_data[6], train_labels[6]

(array([731964.71611111,   5188.40811   ]), 67.0)

In [69]:
model = keras.Sequential()
model.add(keras.layers.Embedding(90, 10, input_length=2))
model.add(keras.layers.LSTM(10, return_sequences=True))
model.add(keras.layers.LSTM(10, return_sequences=True))
model.add(keras.layers.TimeDistributed(keras.layers.Dense(90)))
model.add(keras.layers.Activation('softmax'))

In [71]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
# model.fit(train_data, train_labels, epochs=200, verbose=0, batch_size=100)
# test_loss, test_acc = model.evaluate(test_data, test_labels)
# print(test_acc)

### Not working RNN

In [None]:
model.fit(data, labels)

In [106]:
def create_model(seq_input_len, n_input_nodes, n_embedding_nodes, 
                 n_hidden_nodes, stateful=False, batch_size=None):
    
    # Layer 1
    input_layer = keras.layers.Input(batch_shape=(batch_size, seq_input_len), name='input_layer')

    # Layer 2
    embedding_layer = keras.layers.Embedding(input_dim=n_input_nodes, 
                                output_dim=n_embedding_nodes, 
                                mask_zero=True, name='embedding_layer')(input_layer) #mask_zero=True will ignore padding
    # Output shape = (batch_size, seq_input_len, n_embedding_nodes)

    #Layer 3
    lstm_layer1 = keras.layers.LSTM(n_hidden_nodes,
                     return_sequences=True, #return hidden state for each word, not just last one
                     stateful=stateful, name='hidden_layer1')(embedding_layer)
    # Output shape = (batch_size, seq_input_len, n_hidden_nodes)

    #Layer 4
    lstm_layer2 = keras.layers.LSTM(n_hidden_nodes,
                     return_sequences=True,
                     stateful=stateful, name='hidden_layer2')(lstm_layer1)
    # Output shape = (batch_size, seq_input_len, n_hidden_nodes)

    #Layer 5
    output_layer = keras.layers.TimeDistributed(keras.layers.Dense(n_input_nodes, activation="softmax"), 
                                   name='output_layer')(lstm_layer2)
    # Output shape = (batch_size, seq_input_len, n_input_nodes)
    
    model = keras.Model(inputs=input_layer, outputs=output_layer)

    #Specify loss function and optimization algorithm, compile model
    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer='adam')
    
    return model

In [107]:
data = RNNinput[:,0:2]
labels = RNNinput[:,2]
N = 4000
order = np.argsort(np.random.random(labels.shape))
n = 3900
train_data = data[order][0:n]
train_labels = labels[order][0:n]
train_labels = np.reshape(train_labels, (n, 1, 1))
test_data = data[order][n:N]
test_labels = labels[order][n:N]
print("Training set: {}".format(train_data.shape))
print("Testing set:  {}".format(test_data.shape))

model = create_model(seq_input_len=2,
                     n_input_nodes = 90,
                     n_embedding_nodes = 300,
                     n_hidden_nodes = 500)

Training set: (3900, 2)
Testing set:  (100, 2)


In [109]:
# model.fit(x=train_data, y=train_labels, epochs=5, batch_size=20)

## True Adversary, Notes

### Theoretically what I think the ML model should do
<code>
userdict : userid -> [[time, location]]
userdict[0] = [train_data[0]]

max_dist = estimate of how far away points ever are
</code>

#### Prediction
<code>
for point in train_data:
    likelihood vector = zeros
    for user in userdict:
        ## This is the nonlinearity that needs to train
        p = ((point.location - user.location) - max_dist/2)/(point.time - user.time)
    if the max value of the likelihood vector is below a threshhold, and there are less than 90 users:
        user[new] = [point]
    else
        guess = argmax(likelihood vector)
        user[guess] = user[guess] + point
</code>
        
#### Loss
<code>
for point in train_data:
    if [point.time, point.location] not in userdict[point.user]:
            loss += 1
</code>

In [72]:
size = 10000
inputdata = RNNinput[0:size]

userdict = {}
userdict[0] = [inputdata[0]]
max_dist = 1
threshold = -10000

for point in inputdata:
    l = np.zeros(len(userdict))
    for u in userdict.keys():
        deltaloc = abs(point[1] - userdict[u][-1][1])
        deltat = point[0] - userdict[u][-1][0]
        if deltat == 0:
            p = 0
        else:
            p = -(deltaloc - max_dist/2)/(deltat)
        l[u] = p
    if max(l) < threshold and len(userdict) < 90:
        userdict[max(userdict.keys())+1] = [point]
    else:
        guess = np.argmax(l)
        userdict[guess] = np.vstack([userdict[guess], point])

In [73]:
for u in userdict.keys():
    print(u, len(userdict[u]))

0 969
1 2449
2 1576
3 4684
4 323


In [274]:
loss = 0
for point in inputdata:
    if point.tolist() not in userdict[point[2]].tolist():
        loss += 1

In [275]:
loss/size

0.6066

## Notes

In [34]:
# words = tf.placeholder(tf.float32, [time_steps, batch_size, num_features])
# lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
# # initial state of memory
# state = lstm.zero_state(batch_size, dtype=tf.float32)
# probabilities = []
# loss = 0.0
# output, state = lstm(words, state)
# logits = lstm.matmul(output, softmax_w) + softmax_b
# probabilities.append(tf.nn.softmax(logits))
# loss += loss_function(probabilities, target_words)

To use the tensorflow lstm used in the language processing example,
I could recreate the data so that it's in the form
[location, userID, location, userID, ....]
where inputs are time ordered by the location time stamps.
This way to estimate a user, input the location to the RNN and predict the next value.

I should remove any loc=0.0 events since those are errors anyway.

in an unsupervised setting, the adversary would start by assigning userID 0 to the first location data, and then userID 1 to the next location event that is more than some distance d away, etc, and feed that data into a recurrent nn that has "other" as one of the options. When "other" is the most likely, add a new userID and restart the RNN.

In [74]:
j = np.array([[1],[2],[3]])

In [75]:
j[0]

array([1])

In [77]:
j[1:]

array([[2],
       [3]])