In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from scipy.special import rel_entr

In [2]:
################## DATA FRAME ##################
# Location of the daily-activity CSV, relative to the working directory.
datafile = './dailyActivity_merged.csv'

# Columns requested from the CSV (passed to read_csv via usecols).
all_fields = [
    'Id',
    'ActivityDate',
    'TotalSteps',
    'TotalDistance',
    'TrackerDistance',
    'VeryActiveDistance',
    'ModeratelyActiveDistance',
    'LightActiveDistance',
    'SedentaryActiveDistance',
    'VeryActiveMinutes',
    'FairlyActiveMinutes',
    'LightlyActiveMinutes',
    'SedentaryMinutes',
    'Calories',
]

#pd.set_option('display.max_rows', 100)
df = pd.read_csv(
    datafile,
    usecols=all_fields,
    skipinitialspace=True,
)

# Name of the single column paired with 'Id' in the selection below.
fields = 'FairlyActiveMinutes'
selected_data = df[['Id', fields]]

# Overall dimensions of the loaded frame.
n_rows = df.shape[0]
n_cols = df.shape[1]

In [3]:
# Preview the first rows of the selected columns.
selected_data.head()

Unnamed: 0,Id,FairlyActiveMinutes
0,1503960366,13
1,1503960366,19
2,1503960366,11
3,1503960366,34
4,1503960366,10


In [4]:
# Distinct user IDs, in order of first appearance in the frame.
all_IDs = df['Id'].unique()
num_users = all_IDs.size

# Map each user ID to the array of that user's values in the `fields` column.
user_dict = {}
for idx, user in enumerate(all_IDs):
    temp_df = df[df['Id'] == user]
    user_dict[user] = temp_df[fields].to_numpy()

In [23]:
# Users at these positions of all_IDs form the test set.
test_IDs = all_IDs[[0, 1]]  # [0, 5, 6, 10, 11, 15, 16, 20, 25, 30]

# Every remaining user, in the original order, is a training user.
training_IDs = list(all_IDs[~np.isin(all_IDs, test_IDs)])

# Attack set: all test users followed by the first two training users.
attack_IDs = [*test_IDs, *training_IDs[:2]]

# Thresholds iterated over by the comparison loops.
ths = [0]

In [6]:
# Show the user IDs chosen as the test set.
print(test_IDs)

[1503960366 1624580081]


In [7]:
# Values stored for the first test user.
print(user_dict[test_IDs[0]])

[13 19 11 34 10 20 16 31 12  8 27 21  5 14 23 11 28 12 34 35 15 24 22 24
  6 46  8 11 31 23  0]


In [8]:
# Values stored for the second training user.
print(user_dict[training_IDs[1]])

[ 0  0  0  0  0  8 12  0  0 13  0  0  0  0  0  0  0  0  0  7  0  0  0  0
  0  0  0  0  0  0  0]


In [18]:
# For each training user print, one per line: the value vector, its length, the ID.
for ID in training_IDs:
    values = user_dict[ID]
    print(values, len(values), ID, sep="\n")

[51 16 58  4 42 13 33 58  0  0  0 15  0  1 41  0 53  0 71 24  7 94  0 12
  6 17  0  6 19  0]
30
1644430081
[ 0  0  0  0  0  8 12  0  0 13  0  0  0  0  0  0  0  0  0  7  0  0  0  0
  0  0  0  0  0  0  0]
31
1844505072
[0 0 9 0 0 0 0 0 0 0 0 0 6 0 0 0 0 0 0 1 8 0 0 0 0 0 0 0 0 0 0]
31
1927972279
[14  5  3  9 11 29  3  7 63 53 10  0 26  8 24 20 20 40 23 28  8 14 27 20
 17  2  0 47 28 25 16]
31
2022484408
[8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
31
2026352035
[ 9  0  0  0  0  0  7 11 11 10  0  0  0  5  0  0  0  0  0  0  6 10  0 11
  0  0  0  0  0  0  0]
31
2320127002
[13 32 48 24 72  7 16  7 43 26 27 35  0 11  0  9  0  0]
18
2347167796
[21  8  0  0  1  8  6  0  5 10  0  5  1  0  5  0  7 23 20 18  7  6  0 23
  5 11  0  0  0  0  0]
31
2873212765
[ 0  8 25  0  0  2  7  0  3  9  0  2  7  0  2 13  0  0  4  0]
20
3372868164
[ 19  46  23  42  83  58  95  67  98   0  12  92  95   9  95  10   8  32
  52  40 143  41  96  88  55  86 116 122 115   0]
30
3977333714
[15  0  0 18 2

In [25]:
# Size of the last axis of data_storage: the longest per-user vector in the
# data set.  Deriving it from user_dict (instead of hard-coding 31) guarantees
# that every position of every attack vector has a slot, so indexing
# data_storage by position can never run past the end of the axis.
max_elements = max((len(values) for values in user_dict.values()), default=0)

# data_storage[i, j, l, m] is the 0/1 flag for (attack user i, training user j,
# threshold l, position m in the attack vector); positions beyond a shorter
# attack vector simply stay 0.
data_storage = np.zeros((len(attack_IDs), len(training_IDs), len(ths), max_elements))

# accuracy_matrix[i, j, l] holds one score per (attack user, training user, threshold).
accuracy_matrix = np.zeros((len(attack_IDs), len(training_IDs), len(ths)))

In [26]:
# For every (attack user, training user, threshold) triple, flag each attack
# value that lies within `th` of at least one training value, then store the
# fraction of flagged attack values as the accuracy score.
for i, attack_ID in enumerate(attack_IDs):
    attack_values = np.asarray(user_dict[attack_ID])
    n_attack = len(attack_values)

    for j, training_ID in enumerate(training_IDs):
        training_values = np.asarray(user_dict[training_ID])

        # gaps[m, t] = |training_values[t] - attack_values[m]|
        gaps = np.abs(training_values[np.newaxis, :] - attack_values[:, np.newaxis])

        for l, th in enumerate(ths):
            # found[m] is True when attack value m is within th of some training value.
            found = (gaps <= th).any(axis=1)

            # Only matched positions are written; unmatched ones are left untouched.
            data_storage[i, j, l, np.flatnonzero(found)] = 1

            # Share of this attacker's values that were found in the training vector.
            accuracy_matrix[i, j, l] = data_storage[i, j, l, :].sum() / n_attack
        #print("--------------------------------------------")
            

In [55]:
# NOTE: this cell calls isMatch(), which is defined in a later cell of the
# notebook, so that cell has to be executed before this one.
matching_shape = (len(attack_IDs), len(training_IDs), len(ths))
matching_matrix = np.zeros(matching_shape)

for i, attack_ID in enumerate(attack_IDs):
    v1 = user_dict[attack_ID]
    # Window length handed to isMatch; len(v1) means the whole attack vector must match.
    k = len(v1)

    # Only the first two training users are compared here; the other columns
    # of matching_matrix keep their initial zeros.
    for j, training_ID in enumerate(training_IDs[:2]):
        v2 = user_dict[training_ID]

        for l, th in enumerate(ths):
            if not isMatch(v1, v2, k, th):
                continue
            print(f"{attack_ID} vs {training_ID} is a Match")
            matching_matrix[i, j, l] = 1
           
                    
             

1644430081 vs 1644430081 is a Match
1844505072 vs 1844505072 is a Match


In [56]:
# Match flags for the first threshold (index 0 on the last axis);
# rows follow attack_IDs, columns follow training_IDs.
matching_matrix[:,:,0]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [40]:
def isMatch(v1,v2,k,th):
    '''
    Tell whether v1 and v2 share a k-long window within L1 distance th.

    Every contiguous window of k elements of v1 is compared with every
    contiguous window of k elements of v2.  Two windows match when the sum
    of the absolute element-wise differences (the L1 distance) is <= th.

    Parameters
    ----------
    v1, v2 : 1-D sequences of numbers (lists or numpy arrays)
    k      : length of the windows to compare
    th     : largest L1 distance that still counts as a match

    Returns
    -------
    True as soon as one matching pair of windows is found, False otherwise
    (including when either vector is shorter than k).
    '''
    # Work on flat float arrays so the distance is a plain vector L1 distance;
    # wrapping the windows in an extra dimension would make
    # np.linalg.norm(..., ord=1) compute a matrix norm instead.
    first = np.asarray(v1, dtype=float)
    second = np.asarray(v2, dtype=float)

    for i in range(0, len(first) - k + 1):
        a = first[i:i + k]
        for j in range(0, len(second) - k + 1):
            # The window of v2 must be taken at offset j, not i, so that all
            # relative alignments of the two vectors are examined.
            b = second[j:j + k]
            if np.abs(a - b).sum() <= th:
                return True

    return False