In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from scipy.special import rel_entr

In [2]:
################## DATA FRAME ##################
datafile = 'Data/Fitbit_Kaggle/dailyActivity_merged.csv'
fields = ['Id', 'ActivityDate', 'TotalSteps', 'TotalDistance', 'TrackerDistance', 
          'VeryActiveDistance', 'ModeratelyActiveDistance', 'LightActiveDistance', 'SedentaryActiveDistance', 
          'VeryActiveMinutes', 'FairlyActiveMinutes', 'LightlyActiveMinutes', 'SedentaryMinutes', 'Calories']

#pd.set_option('display.max_rows', 100)
df = pd.read_csv(datafile, skipinitialspace=True, usecols=fields)
field = 'FairlyActiveMinutes'
selected_data = df[['Id',field]]
n_rows, n_cols = df.shape

In [3]:
selected_data.head()

Unnamed: 0,Id,FairlyActiveMinutes
0,1503960366,13
1,1503960366,19
2,1503960366,11
3,1503960366,34
4,1503960366,10


## Generate attack vectors

In [4]:
def id_long_enough_entries(IDs,  selected_data, j):
    persons_list = []
    for person in IDs:
        if len(set(selected_data.loc[person, field])) > j:
            persons_list.append(person)
    return persons_list

In [5]:
IDs = selected_data['Id'].unique()
individualAct = dict()
for Id in IDs:
    act = selected_data[selected_data['Id']==Id]
    individualAct[Id] = act['FairlyActiveMinutes'].tolist()
    
## Get the activity in the data
data_act = list(selected_data['FairlyActiveMinutes'].unique())

In [6]:
mu = np.mean(data_act)
sigma = np.std(data_act)
round(random.gauss(mu, sigma))

10

In [7]:
selected_data = selected_data.set_index('Id')

In [8]:
attack_lengths_list = [1, 5, 10, 15, 20]
my_true_attack_vectors = [0]*len(attack_lengths_list)
my_noisy_attack_vectors = [0]*len(attack_lengths_list)
selected_person = [0]*len(attack_lengths_list)

for i, attack_length in enumerate(attack_lengths_list):
    # Select a random person with at least attack_length entries 
    selected_person[i] = random.choice(id_long_enough_entries(IDs,  selected_data, attack_length))
    selected_individual_dataset = set(selected_data.loc[selected_person[i], field].values)
    for j in range(attack_length):
        rs = random.sample(selected_individual_dataset, j+1)
        
        my_true_attack_vectors[i] = np.array(rs)
        for k in range(j):
            rs[k] += round(random.gauss(mu, sigma))
        my_noisy_attack_vectors[i]  = np.array(rs)

In [9]:
my_true_attack_vectors

[array([15], dtype=int64),
 array([ 9,  4,  0, 13,  2], dtype=int64),
 array([ 7, 22, 54, 91, 44,  6,  2,  5, 28,  0], dtype=int64),
 array([65, 36, 56, 54, 32, 24,  9, 11, 72, 30, 21, 15, 38,  0, 37],
       dtype=int64),
 array([21,  7, 14,  8, 22, 36, 38, 33, 27, 34, 41, 39, 16,  4, 42, 15,  0,
        13, 24, 31], dtype=int64)]

In [10]:
my_noisy_attack_vectors

[array([15], dtype=int64),
 array([44, 99, 98, 98,  2], dtype=int64),
 array([ 58, 119, 107, 121,  82, 111,  36,  -1,  83,   0], dtype=int64),
 array([117, -20, 124,  96,  94,  48, 108,  79, 136, 163,  87,  80, 154,
         34,  37], dtype=int64),
 array([146,  35,  33,  43,  36, 141,  92,  76,  94, 111, 109, 117,  34,
         31, 138,  83, 103,  50,  95,  31], dtype=int64)]

## Bonomi's Suggested Euclidean
> He says iterate through all values, index based matching...

## Now compute KL-divergence to establish matching

In [33]:
my_attack_vectors = my_true_attack_vectors
my_attack_vectors.extend(my_noisy_attack_vectors)

In [56]:
arbitrarily_high_constant = 999999999

In [57]:
KL_matches = [0]*len(my_attack_vectors)

In [59]:
for i, av in enumerate(my_attack_vectors):
    min_KL = arbitrarily_high_constant
    min_user = ""
    
    for j, user in enumerate(all_IDs):        
        temp_user_values0 = selected_data.loc[user].values
        temp_user_values = [val[0] for val in temp_user_values0]
        
        # av needs to be the same length as temp_user_values...
        if len(temp_user_values) < len(av):
            #print("TOSSED ONE")
            current_KL = arbitrarily_high_constant
        else:
            av_padded = list(av)
            # Not sure if setting the padding to 1 is the best...
            # ^0s cause INF for KL divergence
            av_padded.extend([1]*(len(temp_user_values)-len(av)))
            current_KL = sum(rel_entr(temp_user_values, av_padded))
        
        if current_KL < min_KL:
            min_KL = current_KL
            min_user = user
            
        '''
        if np.isinf(current_KL):
            print("INFINITY FOUND")
        elif np.isnan(current_KL):
            print("NAN FOUND")
        else:
            print("SUCCESS")
        '''
    
    KL_matches[i] = [min_KL, min_user]
        

In [60]:
KL_matches

[[-5.028869275378994, 2026352035],
 [-0.9422642852510681, 2026352035],
 [1.0682511409961806, 2026352035],
 [-64.62290724224111, 6117666160],
 [-48.77965785448977, 1844505072],
 [-5.028869275378994, 2026352035],
 [-13.637984737907402, 2026352035],
 [999999999, ''],
 [999999999, ''],
 [-159.8771877782993, 8792009665]]

In [48]:
temp_user_values

[7,
 18,
 13,
 1,
 29,
 15,
 5,
 13,
 19,
 13,
 2,
 6,
 1,
 8,
 3,
 8,
 8,
 5,
 4,
 12,
 0,
 3,
 8,
 15,
 17,
 28,
 4,
 19,
 11,
 12,
 1]

In [49]:
av_padded

[146,
 35,
 33,
 43,
 36,
 141,
 92,
 76,
 94,
 111,
 109,
 117,
 34,
 31,
 138,
 83,
 103,
 50,
 95,
 31,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [52]:
rel_entr(temp_user_values, av_padded)

array([-21.26387531, -11.96957346, -12.11025665,  -3.76120012,
        -6.27047015, -33.61064534, -14.56175332, -22.95519178,
       -30.37826026, -27.87955097,  -7.9964014 , -17.82248679,
        -3.52636052, -10.8363653 , -11.48592419, -18.71519253,
       -20.44229957, -11.51292546, -12.67033012, -11.38896666,
         0.        ,          inf,          inf,          inf,
                inf,          inf,          inf,          inf,
                inf,          inf,          inf])

In [30]:
me = [1,2,3,4,5]
you = [1,2,3,4,211]

In [31]:
rel_entr(me, you)

array([  0.        ,   0.        ,   0.        ,   0.        ,
       -18.71210111])

In [None]:
def isHit(activity, query, ths):
    
    hits = list()
    for th in ths:
        hit = False
        i = 0
        for j in range(0,len(activity)):
            #print(query[j])
            if (abs(query[i] - activity[j])<=th):
                i = i + 1
            
                if i == len(query):
                    hit = True
                    break
        
        hits.append(hit)
    return hits


In [None]:
def uniqueness(queries, domain, k, l, uniq_score, individuals, ths):
    '''
    queries:
    domain:
    k: limit 
    l: current iteration I think, current number of entries more like
    individuals: individualAct[Id] = act['FairlyActiveMinutes'].tolist()
    '''
    
    print(queries[0])
    print(len(queries))
    print(len(individuals.keys()))
    
    if l > k:
        print("L > K!!!")
        return uniq_score
    
    present_queries = []
    query_matches = dict()
    
    for query in queries:
        match = dict()
        for th in ths:
            match[th] = []
            
        inData = False
        #print(query)
        for Id in individuals:
            ind_act = individuals[Id]
                
            hits = isHit(ind_act, query, ths)
                
            for i in range(0,len(ths)):
                hit = hits[i]
                
                if hit:
                    match[ths[i]].append(Id)
                    inData = True
                    #print(ths[i])
                    #print(Id)
                    #print(match)
        if inData:
            present_queries.append(query)
            query_matches[str(query)] = match
            #print(match)
    
    for i in range(0, len(ths)):
        uniquesId = []
        th = ths[i]
        #print(th)
        
        for keyquery in query_matches:
            match = query_matches[keyquery]
            
            matchTH = match[th]
            #print(keyquery)
            #print(matchTH)
            if len(list(set(matchTH))) == 1:
                #print("Unique")
                uniquesId.append(matchTH[0]) 
        
        score = (1.0*len(list(set(uniquesId))))/len(individuals.keys())
        uniq_score.append((l,th,score))
                        
    l = l + 1
    newqueries = []
    for query in present_queries:  
        for val in domain:
            newquery = query.copy()
            newquery.append(val)
            newqueries.append(newquery)
                   
    return uniqueness(newqueries, domain, k, l, uniq_score, individuals, ths)
            

In [None]:
#uniqueness up to 3 readings with th=0 and th=2
uniqueScores = list()
start_query = [[a] for a in data_act]
uniqueScores = uniqueness(start_query, data_act, 3, 1, uniqueScores, individualAct, [0, 2, 5])