In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

In [2]:
################## DATA FRAME ##################
datafile = 'Data/Fitbit_Kaggle/dailyActivity_merged.csv'
fields = ['Id', 'ActivityDate', 'TotalSteps', 'TotalDistance', 'TrackerDistance', 
          'VeryActiveDistance', 'ModeratelyActiveDistance', 'LightActiveDistance', 'SedentaryActiveDistance', 
          'VeryActiveMinutes', 'FairlyActiveMinutes', 'LightlyActiveMinutes', 'SedentaryMinutes', 'Calories']

pd.set_option('display.max_rows', 100)
df = pd.read_csv(datafile, skipinitialspace=True, usecols=fields)
n_rows, n_cols = df.shape
df = df.loc[:, ["Id", "FairlyActiveMinutes", "VeryActiveDistance", "LightlyActiveMinutes"]]
df = df.set_index('Id')
display(df.head(32))

Unnamed: 0_level_0,FairlyActiveMinutes,VeryActiveDistance,LightlyActiveMinutes
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1503960366,13,1.88,328
1503960366,19,1.57,217
1503960366,11,2.44,181
1503960366,34,2.14,209
1503960366,10,2.71,221
1503960366,20,3.19,164
1503960366,16,3.25,233
1503960366,31,3.53,264
1503960366,12,1.96,205
1503960366,8,1.34,211


In [14]:
fam_domain = set(df.loc[:, 'FairlyActiveMinutes'])

x=20 # NUMBER OF VALUES IN A SUBSET, THAT MUST ALL MATCH THE TRAINING SET
#^81 is the max because that's how many unique numbers exist in the fam_domain
n=500 #len(fam_domain) # NUMBER OF SUBSETS
thresh_vec=list(range(0, 6))
field='FairlyActiveMinutes'
#def determine_uniqueness(x, n, df, thresh_vec=list(range(1, 6)), field='FairlyActiveMinutes'):

# VECTOR INITIALIZATION
all_IDs = set(df.index)
num_IDs = len(all_IDs)

'''We are looking to see that each of the n subsets had COMPLETE OVERLAP with the 
user dataset (e.g. all values in the subset were in the data set, or within the threshold)'''
occurence_dict = dict(zip(all_IDs, [np.zeros((len(thresh_vec), n))]*len(all_IDs)))

'''Here we are looking to see that across ALL of the subsets for a given threshold, 
there were AT LEAST x subsets that had 100% matching with the dataset'''
compatible_dict = dict(zip(all_IDs, [np.zeros(len(thresh_vec))]*len(all_IDs)))

# This is the full testing dataset
x_fam = [0 for ele in range(n)]
for i in range(n):
    # Each subset n consists of x randomly sampled values 
    x_fam[i] = random.sample(fam_domain, x)

# CHECKS
for i, user_i in enumerate(all_IDs):
    user_data = list(df.loc[user_i, field])
    for j, subset in enumerate(x_fam):
        for k, thresh in enumerate(thresh_vec):
            counter = 0
            for l, val in enumerate(subset):
                if (abs(np.array(user_data) - val) <= thresh).any():
                    # Given value was within the specified threshold
                    counter += 1
            # If all values in this subset are represented
            if counter == x:  # E.g. counter matches the number of values present in the subset
                # Then this subset is represented fully, in the dataset
                occurence_dict[user_i][k, j] = 1
                
# CONVERT OCCURENCE TO COMPATIBILITY
for i, user_i in enumerate(all_IDs):
    for j, thresh in enumerate(thresh_vec):
        compatible_dict[user_i][j] = (sum(occurence_dict[user_i][j]) >= x)
        
# CONVERT COMPATIBILITY TO UNIQUENESS
unique_ID = np.zeros((num_IDs, len(thresh_vec), 1))

for i in range(len(thresh_vec)):
    running_sum = 0
    for j, user_j in enumerate(all_IDs):
        if compatible_dict[user_j][i] == 1:
            running_sum += 1
    if running_sum == 1:
        unique_ID[i] = 1

In [11]:
occurence_dict

{8053475328: array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 1., 0., 0.],
        [1., 1., 0., ..., 1., 0., 1.],
        [1., 1., 1., ..., 1., 1., 1.]]),
 1644430081: array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 1., 0., 0.],
        [1., 1., 0., ..., 1., 0., 1.],
        [1., 1., 1., ..., 1., 1., 1.]]),
 8378563200: array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 1., 0., 0.],
        [1., 1., 0., ..., 1., 0., 1.],
        [1., 1., 1., ..., 1., 1., 1.]]),
 4558609924: array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 1., 0., 0.],
        [1., 1., 0., ..., 1., 0., 1.],
        [1., 1., 1., ..., 1., 1., 1.]]),
 8583815

In [12]:
compatible_dict

{8053475328: array([0., 0., 0., 1., 1., 1.]),
 1644430081: array([0., 0., 0., 1., 1., 1.]),
 8378563200: array([0., 0., 0., 1., 1., 1.]),
 4558609924: array([0., 0., 0., 1., 1., 1.]),
 8583815059: array([0., 0., 0., 1., 1., 1.]),
 4319703577: array([0., 0., 0., 1., 1., 1.]),
 2320127002: array([0., 0., 0., 1., 1., 1.]),
 2026352035: array([0., 0., 0., 1., 1., 1.]),
 7007744171: array([0., 0., 0., 1., 1., 1.]),
 1503960366: array([0., 0., 0., 1., 1., 1.]),
 8877689391: array([0., 0., 0., 1., 1., 1.]),
 2347167796: array([0., 0., 0., 1., 1., 1.]),
 4388161847: array([0., 0., 0., 1., 1., 1.]),
 1927972279: array([0., 0., 0., 1., 1., 1.]),
 2022484408: array([0., 0., 0., 1., 1., 1.]),
 6775888955: array([0., 0., 0., 1., 1., 1.]),
 8792009665: array([0., 0., 0., 1., 1., 1.]),
 5553957443: array([0., 0., 0., 1., 1., 1.]),
 3372868164: array([0., 0., 0., 1., 1., 1.]),
 7086361926: array([0., 0., 0., 1., 1., 1.]),
 6962181067: array([0., 0., 0., 1., 1., 1.]),
 4057192912: array([0., 0., 0., 1.

In [13]:
unique_ID

array([[[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]],

       [[0.],
        [0.],
        [0.],
        [0.]

In [None]:
'''
x=30
n=len(fam_domain)
thresh_vec=list(range(0, 6))
field='FairlyActiveMinutes'
#def determine_uniqueness(x, n, df, thresh_vec=list(range(1, 6)), field='FairlyActiveMinutes'):

# VECTOR INITIALIZATION
all_IDs = set(df.index)
num_IDs = len(all_IDs)

occurence_dict = dict(zip(all_IDs, [np.zeros((len(thresh_vec), n))]*len(all_IDs)))
compatible_dict = dict(zip(all_IDs, [np.zeros(len(thresh_vec))]*len(all_IDs)))

x_fam = [0 for ele in range(n)]
for i in range(n):
    x_fam[i] = random.sample(fam_domain, x)

# CHECKS
for i, user_i in enumerate(all_IDs):
    user_data = list(df.loc[user_i, field])
    for j, subset in enumerate(x_fam):
        for k, thresh in enumerate(thresh_vec):
            counter = 0
            for val in subset:
                for og_val in user_data:
                    if abs(og_val - val) <= thresh:
                        occurence_dict[user_i][k, j] = 1
                        break
            #if counter > x:
            #    occurence_dict[user_i][k, j] = 1
            #occurence_dict[user_i][k, j] = counter
                
# CONVERT OCCURENCE TO COMPATIBILITY
for i, user_i in enumerate(all_IDs):
    for j, thresh in enumerate(thresh_vec):
        compatible_dict[user_i][j] = (sum(occurence_dict[user_i][j]) > x)
        
# CONVERT COMPATIBILITY TO UNIQUENESS
unique_ID = np.zeros((num_IDs, len(thresh_vec), 1))

for i in range(len(thresh_vec)):
    running_sum = 0
    for j, user_j in enumerate(all_IDs):
        if compatible_dict[user_j][i] == 1:
            running_sum += 1
    if running_sum == 1:
        unique_ID[i] = 1
'''
0