In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from scipy.special import rel_entr

In [2]:
################## DATA FRAME ##################
datafile = 'Data/Fitbit_Kaggle/dailyActivity_merged.csv'
all_fields = ['Id', 'ActivityDate', 'TotalSteps', 'TotalDistance', 'TrackerDistance', 
          'VeryActiveDistance', 'ModeratelyActiveDistance', 'LightActiveDistance', 'SedentaryActiveDistance', 
          'VeryActiveMinutes', 'FairlyActiveMinutes', 'LightlyActiveMinutes', 'SedentaryMinutes', 'Calories']

#pd.set_option('display.max_rows', 100)
df = pd.read_csv(datafile, skipinitialspace=True, usecols=all_fields)
fields = 'FairlyActiveMinutes'
selected_data = df[['Id',fields]]
n_rows, n_cols = df.shape

selected_data.head()

Unnamed: 0,Id,FairlyActiveMinutes
0,1503960366,13
1,1503960366,19
2,1503960366,11
3,1503960366,34
4,1503960366,10


## Generate attack vectors
> Using the full vectors, only 1 field

In [3]:
all_IDs = df.Id.unique()
num_users = len(all_IDs)

user_dict = dict()
for idx, user in enumerate(all_IDs):
    temp_df = df.loc[df['Id'] == user]
    user_dict[user] = temp_df[fields].values

In [4]:
#test_IDs = all_IDs[[0, 1]]
test_IDs = all_IDs[[0, 6, 11, 15, 16, 25]]  #5, 10, 20, 30
n_attack_training = len(test_IDs)

training_IDs = [ID for ID in all_IDs if ID not in test_IDs]
attack_IDs = list(test_IDs) + list(training_IDs[0:n_attack_training])

#ths = [0]
ths = [0, 2, 5]

In [5]:
#attack_IDs = [1503960366]
#training_IDs = [5577150313, 6117666160, 6290855005, 6775888955]

# Max number of elements for any field
max_elements = 31

data_storage = np.zeros((len(attack_IDs), len(training_IDs), len(ths), max_elements))
accuracy_matrix = np.zeros((len(attack_IDs), len(training_IDs), len(ths)))

In [6]:
for i, attack_ID in enumerate(attack_IDs):
    for j, training_ID in enumerate(training_IDs):
        #for k, training_value in enumerate(user_dict[training_ID]):
        for l, th in enumerate(ths):
            for m, attack_value in enumerate(user_dict[attack_ID]):
                if (abs(np.array(user_dict[training_ID]) - attack_value) <= th).any():
                    # Then this given attack value is in the given person's vector
                    #print(f"{attack_value} found in training array: {attack_value in user_dict[training_ID]}")
                    data_storage[i,j,l,m] = 1
                else:
                    #print(f"{attack_value} NOT found in training array: {not (attack_value in user_dict[training_ID])}")
                    pass
                    
            # Once we have established membership for all attack values:
            # Calculate the accuracy of that given vector
            # E.g. percentage score of how many of attack values were present in given person
            accuracy_matrix[i,j,l] = sum(data_storage[i,j,l,:]) / len(user_dict[attack_ID])
        #print("--------------------------------------------")
            

In [7]:
accuracy_matrix.shape
#accuracy_matrix[:,:,0]

(12, 27, 3)

In [8]:
true_positives = [0] * len(ths)
false_negatives = [0] * len(ths)
#label_matrix = np.zeros((accuracy_matrix.shape))

for attack_idx, attack_ID in enumerate(attack_IDs):
    if attack_ID in training_IDs:
        label_idx = training_IDs.index(attack_ID)
        for j in range(len(ths)):
            #label_matrix[attack_idx, label_idx, j] = 1
            if accuracy_matrix[attack_idx, label_idx, j] == 1:
                true_positives[j] += 1
            else:
                false_negatives[j] += 1
            #print(f"Accuracy Matrix: {accuracy_matrix[attack_idx, label_idx, j]} VS Label Matrix: {label_matrix[attack_idx, label_idx, j]}")

for j in range(len(ths)):
    if true_positives[j] == n_attack_training:
        # Note that this is not necessarily mean all of them were true positives
        print(f"Th={ths[j]}: Correct number of positives found ({true_positives[j]})!")
    else:
        print(f"Th={ths[j]}: Incorrect number of positives... found {true_positives[j]} vs expected {n_attack_training[j]}")

Th=0: Correct number of positives found (6)!
Th=2: Correct number of positives found (6)!
Th=5: Correct number of positives found (6)!


In [9]:
precision = [0] * len(ths)
recall = [0] * len(ths)
accuracy = [0] * len(ths)

for j in range(len(ths)):
    all_positives = np.count_nonzero(accuracy_matrix[:,:,j] == 1) 
    false_positives = all_positives - true_positives[j]
    true_negatives = np.product(accuracy_matrix.shape) - all_positives - false_negatives[j]
        
    precision[j] = true_positives[j] / (true_positives[j] + false_positives)
    print(f"Th={ths[j]}: Precision of {precision[j]*100:.2f}")
    recall[j] = true_positives[j] / (true_positives[j] + false_negatives[j])
    print(f"Th={ths[j]}: Recall of {recall[j]*100:.2f}")
    accuracy[j] = (true_positives[j] + true_negatives) / np.product(accuracy_matrix.shape)
    print(f"Th={ths[j]}: Accuracy of {accuracy[j]*100:.2f}")
    print()

Th=0: Precision of 24.00
Th=0: Recall of 100.00
Th=0: Accuracy of 98.05

Th=2: Precision of 5.31
Th=2: Recall of 100.00
Th=2: Accuracy of 88.99

Th=5: Precision of 3.68
Th=5: Recall of 100.00
Th=5: Accuracy of 83.85



## Conclusions
As you can see from the above, when the attack vector was one of the vectors taken from the training set, the accuracy matrix generated scores of 1 (100%, e.g. 100% of the elements from the attack vector matched with its training vector correspondent), as would be expected.  However, the accuracy matrix also reports scores of 1 for a few other vectors, namely ones where the attack vector matched perfectly to the training vector, but it was not that training vector.
<br>
<br>
The way this code was executed is that it simply subtracts the current attack value (e.g. the current value pulled from the given attack vector) from the full training vector, and if any of the resulting values in the new array are less than or equal to the threshold (for now, th=0 only), then it counts the attack value as found in the training vector.  This process is repeated for each training vector, in order to generate a boolean matrix of "hits" and "misses" depending on whether or not each attack value was found in each training vector.
> (abs(np.array(user_dict[training_ID]) - attack_value) <= th).any()

## Next Steps
> An improved version of this code can be found in NB 061

<br>

The next steps would be to functionalize the above, and simulate a real attacker, going through multiple runs, where each run consists of randomly sampled values (e.g. random users selected for testing sets and random users pulled from the training set)