In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from scipy.special import rel_entr

In [2]:
################## DATA FRAME ##################
# Source CSV and the columns that are loaded from it.
datafile = 'Data/Fitbit_Kaggle/dailyActivity_merged.csv'
all_fields = [
    'Id', 'ActivityDate',
    'TotalSteps', 'TotalDistance', 'TrackerDistance',
    'VeryActiveDistance', 'ModeratelyActiveDistance',
    'LightActiveDistance', 'SedentaryActiveDistance',
    'VeryActiveMinutes', 'FairlyActiveMinutes',
    'LightlyActiveMinutes', 'SedentaryMinutes',
    'Calories',
]

# Uncomment to show more rows when displaying frames:
#pd.set_option('display.max_rows', 100)
df = pd.read_csv(datafile, usecols=all_fields, skipinitialspace=True)

# The single column under study; kept next to the user Id for inspection.
fields = 'FairlyActiveMinutes'
selected_data = df.loc[:, ['Id', fields]]

# Overall size of the loaded frame.
n_rows = df.shape[0]
n_cols = df.shape[1]

In [3]:
# Preview the first five rows of the (Id, field) selection.
selected_data.head(5)

Unnamed: 0,Id,FairlyActiveMinutes
0,1503960366,13
1,1503960366,19
2,1503960366,11
3,1503960366,34
4,1503960366,10


## Generate attack vectors

In [4]:
# Columns that can be assigned to `fields`:
#['TotalSteps', 'TotalDistance', 'TrackerDistance', 
#          'VeryActiveDistance', 'ModeratelyActiveDistance', 'LightActiveDistance', 
#          'VeryActiveMinutes', 'FairlyActiveMinutes', 'LightlyActiveMinutes', 'SedentaryMinutes', 'Calories']

# Distinct user Ids, in order of first appearance in the frame.
all_IDs = df['Id'].unique()
num_users = all_IDs.size

# Map each user Id to the array of that user's values for the selected field.
user_dict = {
    user: df.loc[df['Id'] == user, fields].values
    for user in all_IDs
}

In [5]:
# Users at these positions of all_IDs are held out of the training set.
# Alternative positions tried: [0, 5, 6, 10, 11, 15, 16, 20, 25, 30]
test_IDs = all_IDs[[0, 1]]

# Every remaining user, in the original order, forms the training set.
is_test_user = np.isin(all_IDs, test_IDs)
training_IDs = list(all_IDs[~is_test_user])

# Attack with the held-out users plus the first two training users.
attack_IDs = [*test_IDs, *training_IDs[:2]]

# Matching thresholds to evaluate.
ths = [0]

In [6]:
# Optional manual overrides for a quick experiment:
#attack_IDs = [1503960366]
#training_IDs = [5577150313, 6117666160, 6290855005, 6775888955]

# Max number of elements for any field.
# Derived from the data instead of being hard-coded, so that a user with more
# records than expected cannot overflow the last axis of data_storage.
max_elements = max((len(values) for values in user_dict.values()), default=0)

# data_storage[i, j, l, m] is the hit flag for attack user i, training user j,
# threshold l and the m-th attack value; accuracy_matrix[i, j, l] is the
# corresponding per-vector score.
data_storage = np.zeros((len(attack_IDs), len(training_IDs), len(ths), max_elements))
accuracy_matrix = np.zeros((len(attack_IDs), len(training_IDs), len(ths)))

In [8]:
# For every (attack user, training user, threshold) combination, record which
# attack values have at least one training value within `th` of them, and the
# fraction of attack values for which that is the case.
for i, attack_ID in enumerate(attack_IDs):
    attack_values = np.asarray(user_dict[attack_ID])
    n_attack = len(attack_values)

    for j, training_ID in enumerate(training_IDs):
        training_values = np.asarray(user_dict[training_ID])

        # Pairwise |attack - training|: one row per attack value, one column
        # per training value. Independent of the threshold, so compute it once.
        abs_diff = np.abs(attack_values[:, None] - training_values[None, :])

        for l, th in enumerate(ths):
            # An attack value is a hit if any training value lies within th.
            hits = (abs_diff <= th).any(axis=1)

            # Overwrite the whole row (not just the hits) so that re-running
            # this cell with different thresholds or IDs leaves no stale 1s.
            data_storage[i, j, l, :] = 0
            data_storage[i, j, l, :n_attack] = hits

            # Share of attack values that were found in this training vector;
            # undefined (NaN) for an empty attack vector.
            accuracy_matrix[i, j, l] = hits.sum() / n_attack if n_attack else np.nan

In [11]:
# Dimensions: (attack users, training users, thresholds).
np.shape(accuracy_matrix)

(4, 31, 1)

In [10]:
# Scores for the first threshold: one row per attack user, one column per training user.
accuracy_matrix[..., 0]

array([[0.32258065, 0.19354839, 0.12903226, 0.5483871 , 0.09677419,
        0.22580645, 0.32258065, 0.41935484, 0.12903226, 0.32258065,
        0.32258065, 0.06451613, 0.5483871 , 0.51612903, 0.16129032,
        0.58064516, 0.38709677, 0.38709677, 0.4516129 , 0.16129032,
        0.09677419, 0.32258065, 0.51612903, 0.58064516, 0.38709677,
        0.41935484, 0.25806452, 0.5483871 , 0.29032258, 0.19354839,
        0.4516129 ],
       [0.80645161, 0.77419355, 0.77419355, 0.87096774, 0.74193548,
        0.83870968, 0.77419355, 0.90322581, 0.77419355, 0.77419355,
        0.80645161, 0.77419355, 0.80645161, 0.83870968, 0.80645161,
        0.83870968, 0.80645161, 0.83870968, 0.80645161, 0.80645161,
        0.74193548, 0.80645161, 0.83870968, 0.83870968, 0.83870968,
        0.87096774, 0.80645161, 0.90322581, 0.93548387, 0.74193548,
        0.87096774],
       [1.        , 0.4       , 0.4       , 0.46666667, 0.3       ,
        0.4       , 0.43333333, 0.43333333, 0.4       , 0.5       ,
      

In [20]:
# Ground-truth matrix with the same shape as accuracy_matrix, initially all zeros.
label_matrix = np.zeros_like(accuracy_matrix)

In [23]:
# Mark the cells where the attack user and the training user are the same
# person, and print the score obtained next to the label for comparison.
for attack_idx, attack_ID in enumerate(attack_IDs):
    if attack_ID not in training_IDs:
        # This attack user is not part of the training set: nothing to label.
        continue

    label_idx = training_IDs.index(attack_ID)
    for j, _ in enumerate(ths):
        label_matrix[attack_idx, label_idx, j] = 1
        print(f"Accuracy Matrix: {accuracy_matrix[attack_idx, label_idx, j]} VS Label Matrix: {label_matrix[attack_idx, label_idx, j]}")

Accuracy Matrix: 1.0 VS Label Matrix: 1.0
Accuracy Matrix: 1.0 VS Label Matrix: 1.0


As the output above shows, whenever the attack vector was itself one of the vectors in the training set, the accuracy matrix gives a score of 1 (i.e. 100% of the elements of the attack vector were found in the corresponding training vector), as would be expected.  However, the accuracy matrix also reports scores of 1 for a few other pairs, namely ones where every value of the attack vector was found in a training vector that is not the attack vector's own.
<br>
<br>
I have not implemented multiple threshold values yet.
<br>
<br>
The code works as follows: it subtracts the current attack value (i.e. the current value taken from the given attack vector) from every element of the training vector, and if the absolute value of any of the resulting differences is less than or equal to the threshold (for now, th=0 only), it counts the attack value as found in the training vector.  This process is repeated for each training vector, in order to generate a boolean matrix of "hits" and "misses" recording whether or not each attack value was found in each training vector.
> (abs(np.array(user_dict[training_ID]) - attack_value) <= th).any()
