# 3/28
1. Measure the unique features across individuals.
2. Matching: tell how many people are hit by the RNG fields (most informative), divided by the total number of records.
3. How much information do we really need to identify individuals? --> So far we only know which features are informative.  The next step is to determine how much we need to know to uniquely identify an individual.
4. Step through 1 RNG reading, then try 2, etc. We should see higher uniqueness in the data as we increase the number of readings.
5. Need to understand how we design the test set for the patient linking.  It should also include some patients that were not in the original dataset in order to determine the precision of said target matching.

## This NB:
> **Conclusion From Previous Notebook:** The highest values are 0.289, 0.273, and 0.266, for the following fields, respectively: 'FairlyActiveMinutes', 'VeryActiveDistance', and 'LightlyActiveMinutes'.  Calories and total steps were both relatively uninformative.

## Goals
1. Conduct the uniqueness analysis on each activity separately. In other words, we compute a uniqueness score for each activity. Here, we can start by considering the top-3 activities that were already identified using the KL-divergence.
2. Consider a threshold value to match the reading

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

In [2]:
# ---------------- Load the Fitbit daily-activity table ----------------
datafile = 'Data/Fitbit_Kaggle/dailyActivity_merged.csv'
fields = [
    'Id', 'ActivityDate', 'TotalSteps', 'TotalDistance', 'TrackerDistance',
    'VeryActiveDistance', 'ModeratelyActiveDistance', 'LightActiveDistance',
    'SedentaryActiveDistance',
    'VeryActiveMinutes', 'FairlyActiveMinutes', 'LightlyActiveMinutes',
    'SedentaryMinutes', 'Calories',
]

pd.set_option('display.max_rows', 100)
df = pd.read_csv(datafile, usecols=fields, skipinitialspace=True)

# Shape of the full table, recorded before any columns are dropped.
n_rows, n_cols = df.shape

# Keep only the three fields reported as most informative in the previous
# notebook and index the rows by user Id (one row per user per day).
df = df[["Id", "FairlyActiveMinutes", "VeryActiveDistance", "LightlyActiveMinutes"]].set_index('Id')
display(df.head(32))

Unnamed: 0_level_0,FairlyActiveMinutes,VeryActiveDistance,LightlyActiveMinutes
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1503960366,13,1.88,328
1503960366,19,1.57,217
1503960366,11,2.44,181
1503960366,34,2.14,209
1503960366,10,2.71,221
1503960366,20,3.19,164
1503960366,16,3.25,233
1503960366,31,3.53,264
1503960366,12,1.96,205
1503960366,8,1.34,211


In [3]:
def determine_uniqueness(x, n, df, thresh_vec=None, field='FairlyActiveMinutes'):
    """Test n random x-reading samples against every user's readings.

    For each of ``n`` samples, ``x`` integer values are drawn uniformly from
    ``[round(min(df[field])), round(max(df[field]))]``.  A sample is marked
    compatible with a user at threshold ``t`` when the number of
    (sampled value, user reading) pairs with ``abs(reading - value) < t``
    is strictly greater than ``x``.

    Parameters
    ----------
    x : int
        Number of values in each random sample.
    n : int
        Number of random samples to draw.
    df : pandas.DataFrame
        Readings indexed by user ID (the index may repeat, one row per
        reading); must contain the column ``field``.
    thresh_vec : sequence of numbers, optional
        Matching thresholds.  Defaults to ``[1, 2, 3, 4, 5]``.
    field : str
        Column of ``df`` holding the readings to match against.

    Returns
    -------
    compatible_dict : dict
        Maps each user ID to its own ``(len(thresh_vec), n)`` array of 0/1
        flags; entry ``[k, j]`` is 1 when sample ``j`` is compatible with
        that user at ``thresh_vec[k]``.
    num_unique_IDs : numpy.ndarray
        ``(len(thresh_vec), 1)`` array; row ``k`` counts the users that are
        compatible with exactly one of the ``n`` samples at ``thresh_vec[k]``.

    Notes
    -----
    NOTE(review): the ``count > x`` rule counts matching pairs, so one sampled
    value lying close to several of a user's readings can satisfy it on its
    own -- confirm this is the intended matching criterion.
    NOTE(review): uniqueness is tallied per user (exactly one compatible
    sample), not per sample (exactly one compatible user) -- confirm.
    """
    # A fresh list per call instead of a mutable default argument.
    if thresh_vec is None:
        thresh_vec = list(range(1, 6))
    num_thresh = len(thresh_vec)

    # VECTOR INITIALIZATION
    all_IDs = set(df.index)

    # One independent array per user.  Building this with ``[array] * k``
    # would make every key alias a single array, so a match for one user
    # would show up for all of them.
    compatible_dict = {user_i: np.zeros((num_thresh, n)) for user_i in all_IDs}

    readings = df.loc[:, field]
    low = int(round(min(readings)))
    high = int(round(max(readings)))
    # Draw order: all x values of sample 0, then sample 1, and so on.
    x_fam = [[random.randint(low, high) for _ in range(x)] for _ in range(n)]
    samples = np.asarray(x_fam, dtype=float).reshape(n, x)

    # CHECKS
    for user_i in all_IDs:
        # ``df.loc`` yields a scalar when the user has a single row;
        # ``atleast_1d`` keeps that case working.
        user_data = np.atleast_1d(np.asarray(df.loc[user_i, field], dtype=float))
        # Absolute difference for every (sample, sampled value, reading) triple.
        diffs = np.abs(samples[:, :, None] - user_data[None, None, :])
        for k, thresh in enumerate(thresh_vec):
            pair_counts = (diffs < thresh).sum(axis=(1, 2))
            compatible_dict[user_i][k, :] = pair_counts > x

    # UNIQUENESS
    num_unique_IDs = np.zeros((num_thresh, 1))
    for user_i in all_IDs:
        num_unique_IDs[:, 0] += compatible_dict[user_i].sum(axis=1) == 1

    return compatible_dict, num_unique_IDs

## Testing Functionalized Version
> I keep getting only 0s or 33s... Implies either no one is unique or everyone is...


In [8]:
n = 1000

In [9]:
cdict_1, nuniqIDs_1 = determine_uniqueness(1, n, df)
#print(cdict_1)
print(nuniqIDs_1)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [10]:
cdict_2, nuniqIDs_2 = determine_uniqueness(2, n, df)
#print(cdict_2)
print(nuniqIDs_2)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [11]:
cdict_3, nuniqIDs_3 = determine_uniqueness(3, n, df)
#print(cdict_3)
print(nuniqIDs_3)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [12]:
cdict_4, nuniqIDs_4 = determine_uniqueness(4, n, df)
#print(cdict_4)
print(nuniqIDs_4)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [13]:
cdict_5, nuniqIDs_5 = determine_uniqueness(5, n, df)
#print(cdict_5)
print(nuniqIDs_5)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [14]:
cdict_10, nuniqIDs_10 = determine_uniqueness(10, n, df)
#print(cdict_10)
print(nuniqIDs_10)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [15]:
cdict_50, nuniqIDs_50 = determine_uniqueness(50, n, df)
#print(cdict_50)
print(nuniqIDs_50)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]


## Testing VeryActiveDistance (2)

In [16]:
cdict_1, nuniqIDs_1 = determine_uniqueness(1, n, df, field='VeryActiveDistance')
print(nuniqIDs_1)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [17]:
cdict_2, nuniqIDs_2 = determine_uniqueness(2, n, df, field='VeryActiveDistance')
print(nuniqIDs_2)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [18]:
cdict_3, nuniqIDs_3 = determine_uniqueness(3, n, df, field='VeryActiveDistance')
print(nuniqIDs_3)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [19]:
cdict_4, nuniqIDs_4 = determine_uniqueness(4, n, df, field='VeryActiveDistance')
print(nuniqIDs_4)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [20]:
cdict_5, nuniqIDs_5 = determine_uniqueness(5, n, df, field='VeryActiveDistance')
print(nuniqIDs_5)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [21]:
cdict_50, nuniqIDs_50 = determine_uniqueness(50, n, df, field='VeryActiveDistance')
print(nuniqIDs_50)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]


## Testing LightlyActiveMinutes (3)

In [22]:
cdict_1, nuniqIDs_1 = determine_uniqueness(1, n, df, field='LightlyActiveMinutes')
print(nuniqIDs_1)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [23]:
cdict_2, nuniqIDs_2 = determine_uniqueness(2, n, df, field='LightlyActiveMinutes')
print(nuniqIDs_2)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [24]:
cdict_3, nuniqIDs_3 = determine_uniqueness(3, n, df, field='LightlyActiveMinutes')
print(nuniqIDs_3)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [25]:
cdict_4, nuniqIDs_4 = determine_uniqueness(4, n, df, field='LightlyActiveMinutes')
print(nuniqIDs_4)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [26]:
cdict_5, nuniqIDs_5 = determine_uniqueness(5, n, df, field='LightlyActiveMinutes')
print(nuniqIDs_5)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [27]:
cdict_50, nuniqIDs_50 = determine_uniqueness(50, n, df, field='LightlyActiveMinutes')
print(nuniqIDs_50)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [24]:
raise

RuntimeError: No active exception to reraise

# OLD VERSIONS
## x = 1

In [None]:
# Draw x random integer readings spanning the observed range of
# FairlyActiveMinutes.
x = 1

min_fam = min(df["FairlyActiveMinutes"])
max_fam = max(df["FairlyActiveMinutes"])

x_fam = []
for i in range(x):
    x_fam.append(random.randint(min_fam, max_fam))

In [None]:
# Old draft: sweep signed offsets -thresh..+thresh and, per user ID, count
# rows whose signed difference from a sampled value falls below each offset.
# NOTE(review): occurence_list is not used anywhere else in this cell.
occurence_list = list(np.zeros(6))

thresh = 5
# Filled below with the integers -thresh .. +thresh (2*thresh+1 entries).
thresh_vec = [0] * (2*thresh+1)
for i, val in enumerate(thresh_vec):
    thresh_vec[i] = i - thresh

# NOTE(review): this reads an "Id" column, but the data-loading cell moves Id
# into the index via set_index('Id'); presumably this cell was written for an
# earlier layout where Id was column 0 -- confirm before re-running.
all_IDs = set(df.loc[:, "Id"])
# np.array(...) builds a 2-D array, so each ID is paired with its own row.
occurence_dict = dict(zip(all_IDs, np.array([np.zeros(len(thresh_vec))]*len(all_IDs))))
compatible_dict = dict(zip(all_IDs, np.array([np.zeros(len(thresh_vec))]*len(all_IDs))))

for i, thresh_val in enumerate(thresh_vec):
    for j in range(n_rows):
        for xfam_val in x_fam: 
            # Signed comparison (no abs): any reading far enough below the
            # sampled value also passes.
            # presumably df.iloc[j,1] is the reading and df.iloc[j,0] the Id -- TODO confirm
            if (df.iloc[j,1]-xfam_val < thresh_val):
                occurence_dict[df.iloc[j,0]][i] += 1
        # Checked after every row, using the running count accumulated so far.
        if occurence_dict[df.iloc[j,0]][i] > x:
            compatible_dict[df.iloc[j,0]][i] = 1

In [None]:
occurence_dict

In [None]:
compatible_dict

## x = 1 but with newer method

In [None]:
# Old draft ("newer method"): n random samples of x values each, matched
# against the rows of df with an absolute-difference threshold.
x = 1
# NOTE(review): rebinds the notebook-level n (set to 1000 in an earlier cell).
n = 5

max_fam = max(df.loc[:, "FairlyActiveMinutes"])
min_fam = min(df.loc[:, "FairlyActiveMinutes"])

# This flat x_fam is overwritten by the nested-list version further down.
x_fam = [0]*x
for i in range(x):
    x_fam[i] = random.randint(min_fam, max_fam)

thresh = 5
#thresh_vec = [0] * (2*thresh+1)
#for i, val in enumerate(thresh_vec):
#    thresh_vec[i] = i - thresh
thresh_vec = list(range(1, 6))

# NOTE(review): reads an "Id" column, but the data-loading cell moves Id into
# the index; presumably written for an earlier layout -- confirm.
all_IDs = set(df.loc[:, "Id"])
# np.array(...) builds a 2-D array, so each ID is paired with its own row.
occurence_dict = dict(zip(all_IDs, np.array([np.zeros(len(thresh_vec))]*len(all_IDs))))
compatible_dict = dict(zip(all_IDs, np.array([np.zeros(len(thresh_vec))]*len(all_IDs))))

# n samples, each holding x random integers in [min_fam, max_fam].
x_fam = [[0]*x for ele in range(n)]
for i in range(n):
    for j in range(x):
        x_fam[i][j] = random.randint(min_fam, max_fam)

for i, thresh_val in enumerate(thresh_vec):
    for j in range(n_rows):
        for subset in x_fam:
            for k, val in enumerate(subset):
                # presumably df.iloc[j,1] is the reading and df.iloc[j,0] the Id -- TODO confirm
                if (abs(df.iloc[j,1]-val) < thresh_val):
                    # Counts only when the next row also matches the next
                    # sampled value; row j+1 is not checked to belong to the
                    # same user.
                    if (j+1<n_rows) and (k<(len(subset)-1)) and (abs(df.iloc[j+1,1]-subset[k+1]) < thresh_val):
                        occurence_dict[df.iloc[j,0]][i] += 1
                    # With x == 1 a subset has no "next" value, so the branch
                    # above can never fire; count the single match instead.
                    elif x==1:
                        occurence_dict[df.iloc[j,0]][i] += 1
        # Checked after every row, using the running count accumulated so far.
        if occurence_dict[df.iloc[j,0]][i] > x:
            compatible_dict[df.iloc[j,0]][i] = 1

In [None]:
occurence_dict

In [None]:
compatible_dict

## x = 2

In [None]:
# Old draft, x = 2: set up thresholds, per-ID counters and n random samples of
# x values each; the matching loop lives in the next cell.
x = 2
# NOTE(review): rebinds the notebook-level n (set to 1000 in an earlier cell).
n = 5

max_fam = max(df.loc[:, "FairlyActiveMinutes"])
min_fam = min(df.loc[:, "FairlyActiveMinutes"])

# This flat x_fam is overwritten by the nested-list version further down.
x_fam = [0]*x
for i in range(x):
    x_fam[i] = random.randint(min_fam, max_fam)

thresh = 5
#thresh_vec = [0] * (2*thresh+1)
#for i, val in enumerate(thresh_vec):
#    thresh_vec[i] = i - thresh
thresh_vec = list(range(1, 6))

# NOTE(review): reads an "Id" column, but the data-loading cell moves Id into
# the index; presumably written for an earlier layout -- confirm.
all_IDs = set(df.loc[:, "Id"])
# np.array(...) builds a 2-D array, so each ID is paired with its own row.
occurence_dict = dict(zip(all_IDs, np.array([np.zeros(len(thresh_vec))]*len(all_IDs))))
compatible_dict = dict(zip(all_IDs, np.array([np.zeros(len(thresh_vec))]*len(all_IDs))))

# n samples, each holding x random integers in [min_fam, max_fam].
x_fam = [[0]*x for ele in range(n)]
for i in range(n):
    for j in range(x):
        x_fam[i][j] = random.randint(min_fam, max_fam)
# Bare expression: displays the sampled values as the cell output.
x_fam

In [None]:
# Old draft, x = 2 matching loop; relies on x, x_fam, thresh_vec, n_rows,
# occurence_dict and compatible_dict defined in earlier cells.
for i, thresh_val in enumerate(thresh_vec):
    for j in range(n_rows):
        for subset in x_fam:
            for k, val in enumerate(subset):
                # presumably df.iloc[j,1] is the reading and df.iloc[j,0] the Id -- TODO confirm
                if (abs(df.iloc[j,1]-val) < thresh_val):
                    # Counts only when the next row also matches the next
                    # sampled value; row j+1 is not checked to belong to the
                    # same user.
                    if (j+1<n_rows) and (k<(len(subset)-1)) and (abs(df.iloc[j+1,1]-subset[k+1]) < thresh_val):
                        occurence_dict[df.iloc[j,0]][i] += 1
                    # Fallback for single-value samples, which have no "next" value.
                    elif x==1:
                        occurence_dict[df.iloc[j,0]][i] += 1
        # Checked after every row, using the running count accumulated so far.
        if occurence_dict[df.iloc[j,0]][i] > x:
            compatible_dict[df.iloc[j,0]][i] = 1

In [None]:
occurence_dict

In [None]:
compatible_dict

## x = 3

In [None]:
# Old draft, x = 3.
# NOTE(review): unfinished -- the `if counter =` line below is an incomplete
# statement, so this cell does not parse as written.
x = 3
# NOTE(review): rebinds the notebook-level n (set to 1000 in an earlier cell).
n = 5

max_fam = max(df.loc[:, "FairlyActiveMinutes"])
min_fam = min(df.loc[:, "FairlyActiveMinutes"])

# This flat x_fam is overwritten by the nested-list version further down.
x_fam = [0]*x
for i in range(x):
    x_fam[i] = random.randint(min_fam, max_fam)

thresh_vec = list(range(1, 6))

# NOTE(review): reads an "Id" column, but the data-loading cell moves Id into
# the index; presumably written for an earlier layout -- confirm.
all_IDs = set(df.loc[:, "Id"])
occurence_dict = dict(zip(all_IDs, np.array([np.zeros(len(thresh_vec))]*len(all_IDs))))
compatible_dict = dict(zip(all_IDs, np.array([np.zeros(len(thresh_vec))]*len(all_IDs))))

# n samples, each holding x random integers in [min_fam, max_fam].
x_fam = [[0]*x for ele in range(n)]
for i in range(n):
    for j in range(x):
        x_fam[i][j] = random.randint(min_fam, max_fam)

for i, thresh_val in enumerate(thresh_vec):
    for j in range(n_rows):
        for subset in x_fam:
            for k, val in enumerate(subset):
                # counter is reset on every value, so it can never exceed 1 here.
                counter = 0
                if (abs(df.iloc[j,1]-val) < thresh_val):
                    counter += 1
            # Incomplete condition left over from the unfinished rewrite.
            if counter =
                '''
                if (abs(df.iloc[j,1]-val) < thresh_val):
                    if (j+1<n_rows) and (k<(len(subset)-1)) and (abs(df.iloc[j+1,1]-subset[k+1]) < thresh_val):
                        occurence_dict[df.iloc[j,0]][i] += 1
                    elif x==1:
                        occurence_dict[df.iloc[j,0]][i] += 1
                '''
        if occurence_dict[df.iloc[j,0]][i] > x:
            compatible_dict[df.iloc[j,0]][i] = 1

In [None]:
occurence_dict

In [None]:
compatible_dict

In [None]:
"""
# CHECKS
for k, thresh_val in enumerate(thresh_vec):
    for j, ID in enumerate(num_IDs):
        for i,  in emuerate(subsets):
        
        
    for j in range(n_rows):
        for subset in x_fam:
            for k, val in enumerate(subset):
                counter = 0
                if (abs(df.iloc[j,1]-val) < thresh_val):
                    counter += 1
            if counter =
                '''
                if (abs(df.iloc[j,1]-val) < thresh_val):
                    if (j+1<n_rows) and (k<(len(subset)-1)) and (abs(df.iloc[j+1,1]-subset[k+1]) < thresh_val):
                        occurence_dict[df.iloc[j,0]][i] += 1
                    elif x==1:
                        occurence_dict[df.iloc[j,0]][i] += 1
                '''
        if occurence_dict[df.iloc[j,0]][i] > x:
            compatible_dict[df.iloc[j,0]][i] = 1
"""
0

In [None]:
'''
occurence_list = list(np.zeros(6))

for idx, thresh in enumerate(range(1,6)):
    thresh_vec = [0] * (2*thresh+1)
    for i, val in enumerate(thresh_vec):
        thresh_vec[i] = i - thresh

    all_IDs = set(df.loc[:, "Id"])
    occurence_list[idx] = dict(zip(all_IDs, np.array([np.zeros(len(thresh_vec))]*len(all_IDs))))
    
    for i, thresh_val in enumerate(thresh_vec):
        for j in range(n_rows):
            for xfam_val in x_fam: 
                if (df.iloc[j,1]-xfam_val < thresh_val):
                    occurence_list[idx][df.iloc[j,0]][i] += 1
'''
0