# 3/21
1. Find entropy in order to characterize attributes by most informative
2. Experiment with uniqueness

## Finding Most Informative Metrics

In [119]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.special import rel_entr
from math import isinf

In [120]:
################## DATA FRAME ##################
datafile = 'Data/Fitbit_Kaggle/dailyActivity_merged.csv'
fields = ['Id', 'ActivityDate', 'TotalSteps', 'TotalDistance', 'TrackerDistance', 
          'VeryActiveDistance', 'ModeratelyActiveDistance', 'LightActiveDistance', 'SedentaryActiveDistance', 
          'VeryActiveMinutes', 'FairlyActiveMinutes', 'LightlyActiveMinutes', 'SedentaryMinutes', 'Calories']

df = pd.read_csv(datafile, skipinitialspace=True, usecols=fields)
df.head()

Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,TrackerDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories
0,1503960366,4/12/2016,13162,8.5,8.5,1.88,0.55,6.06,0.0,25,13,328,728,1985
1,1503960366,4/13/2016,10735,6.97,6.97,1.57,0.69,4.71,0.0,21,19,217,776,1797
2,1503960366,4/14/2016,10460,6.74,6.74,2.44,0.4,3.91,0.0,30,11,181,1218,1776
3,1503960366,4/15/2016,9762,6.28,6.28,2.14,1.26,2.83,0.0,29,34,209,726,1745
4,1503960366,4/16/2016,12669,8.16,8.16,2.71,0.41,5.04,0.0,36,10,221,773,1863


In [121]:
all_IDs = df.Id.unique()
num_users = int(len(all_IDs))
my_cols = df.columns[2:]

In [122]:
my_hist_bins = [0] * len(my_cols)
for i in range(len(my_hist_bins)):
    my_hist_bins[i] = [0] * num_users

for i in range(len(my_cols)):
    for j, user in enumerate(all_IDs):
        temp_df = df.loc[df['Id'] == user]
        edited_df = temp_df[temp_df[my_cols[i]] !=0]

        my_series = edited_df[my_cols[i]]
        count, division = np.histogram(my_series)
        my_hist_bins[i][j] = count/sum(count)
        #caloric_series.hist(bins=division)

  if sys.path[0] == '':


In [131]:
distance_matrix = np.zeros((len(my_cols), num_users, num_users))
dms = distance_matrix.shape
avg_dist_lst = np.zeros((1, dms[0]))

for z in range(len(my_cols)):
    for i in range(num_users-1):
        for j in range(num_users-1):
            distance_matrix[z, i, j] = sum(rel_entr(np.array(my_hist_bins[z][i]),np.array(my_hist_bins[z][j])))
            if np.isinf(distance_matrix[z, i, j]):
                distance_matrix[z, i, j] = 0
            if np.isnan(distance_matrix[z, i, j]):
                distance_matrix[z, i, j] = 0
                #print(f"NAN @ {z, i, j}")
                #^Looks like just every value for my_col 6, so col 8
    avg_dist_lst[0, z] = np.average(distance_matrix[z])

In [132]:
avg_dist_lst

array([[0.17557752, 0.19994318, 0.19918966, 0.27313604, 0.23293366,
        0.2423137 , 0.01011405, 0.28902699, 0.26617139, 0.14467511,
        0.13966971, 0.21871801]])

In [133]:
dist_std = np.std(distance_matrix)
print(dist_std)
print(dist_std*3)

0.44989156438357863
1.349674693150736


In [134]:
np.average(np.array(avg_dist_lst))

0.19928908464731668

As can be seen above, the highest values are 0.273, 0.289, and 0.266, for the following fields, respectively: 'VeryActiveDistance', 'VeryActiveMinutes', and 'FairlyActiveMinutes'.  Calories were fairly information-dense, although total steps was not.

## Judging Uniqueness

In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.special import rel_entr
from math import isinf

In [74]:
################## DATA FRAME ##################
datafile = 'Data/Fitbit_Kaggle/dailyActivity_merged.csv'
fields = ['Id', 'ActivityDate', 'TotalSteps', 'TotalDistance', 'TrackerDistance', 
          'VeryActiveDistance', 'ModeratelyActiveDistance', 'LightActiveDistance', 
          'VeryActiveMinutes', 'FairlyActiveMinutes', 'LightlyActiveMinutes', 'SedentaryMinutes', 'Calories']

df = pd.read_csv(datafile, skipinitialspace=True, usecols=fields)
df.head()

Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,TrackerDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories
0,1503960366,4/12/2016,13162,8.5,8.5,1.88,0.55,6.06,25,13,328,728,1985
1,1503960366,4/13/2016,10735,6.97,6.97,1.57,0.69,4.71,21,19,217,776,1797
2,1503960366,4/14/2016,10460,6.74,6.74,2.44,0.4,3.91,30,11,181,1218,1776
3,1503960366,4/15/2016,9762,6.28,6.28,2.14,1.26,2.83,29,34,209,726,1745
4,1503960366,4/16/2016,12669,8.16,8.16,2.71,0.41,5.04,36,10,221,773,1863


In [75]:
unknown_df = pd.DataFrame()
index_list = [0,1,6,12,14,19,22,23,28]

for my_index in index_list:
    unknown_df = unknown_df.append(df.iloc[my_index],ignore_index=True)
    df = df.drop(my_index)
    
unknown_df.head(10)

Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,TrackerDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories
0,1503960000.0,4/12/2016,13162.0,8.5,8.5,1.88,0.55,6.06,25.0,13.0,328.0,728.0,1985.0
1,1503960000.0,4/14/2016,10460.0,6.74,6.74,2.44,0.4,3.91,30.0,11.0,181.0,1218.0,1776.0
2,1503960000.0,4/20/2016,10544.0,6.68,6.68,1.96,0.48,4.24,28.0,12.0,205.0,818.0,1786.0
3,1503960000.0,4/27/2016,18134.0,12.21,12.21,6.4,0.41,5.41,78.0,11.0,243.0,1108.0,2159.0
4,1503960000.0,4/30/2016,14673.0,9.25,9.25,3.56,1.42,4.27,52.0,34.0,217.0,712.0,1947.0
5,1503960000.0,5/6/2016,12159.0,8.03,8.03,1.97,0.25,5.81,24.0,6.0,289.0,754.0,1896.0
6,1503960000.0,5/10/2016,12207.0,7.77,7.77,3.35,1.16,3.26,46.0,31.0,214.0,746.0,1859.0
7,1503960000.0,5/12/2016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1440.0,0.0
8,1624580000.0,4/17/2016,6175.0,4.06,4.06,1.03,1.52,1.49,15.0,22.0,127.0,1276.0,1554.0


In [76]:
df.head(10)

Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,TrackerDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories
2,1503960366,4/14/2016,10460,6.74,6.74,2.44,0.4,3.91,30,11,181,1218,1776
3,1503960366,4/15/2016,9762,6.28,6.28,2.14,1.26,2.83,29,34,209,726,1745
4,1503960366,4/16/2016,12669,8.16,8.16,2.71,0.41,5.04,36,10,221,773,1863
5,1503960366,4/17/2016,9705,6.48,6.48,3.19,0.78,2.51,38,20,164,539,1728
7,1503960366,4/19/2016,15506,9.88,9.88,3.53,1.32,5.03,50,31,264,775,2035
8,1503960366,4/20/2016,10544,6.68,6.68,1.96,0.48,4.24,28,12,205,818,1786
9,1503960366,4/21/2016,9819,6.34,6.34,1.34,0.35,4.65,19,8,211,838,1775
10,1503960366,4/22/2016,12764,8.13,8.13,4.76,1.12,2.24,66,27,130,1217,1827
11,1503960366,4/23/2016,14371,9.04,9.04,2.81,0.87,5.36,41,21,262,732,1949
13,1503960366,4/25/2016,15355,9.8,9.8,5.29,0.57,3.94,73,14,216,814,2013


In [77]:
avg_fields = ['TotalSteps', 'TotalDistance', 'TrackerDistance', 
          'VeryActiveDistance', 'ModeratelyActiveDistance', 'LightActiveDistance', 
          'VeryActiveMinutes', 'FairlyActiveMinutes', 'LightlyActiveMinutes', 'SedentaryMinutes', 'Calories']

all_IDs = df.Id.unique()
num_users = len(all_IDs)

user_dict = dict()
for idx, user in enumerate(all_IDs):
    temp_df = df.loc[df['Id'] == user]
    user_dict[user] = np.round(temp_df[avg_fields].mean(), 3)

avg_df = pd.DataFrame(user_dict.values(), index=user_dict.keys())
print("AVERAGE OF EACH FIELD FOR EACH USER")
avg_df.head()

AVERAGE OF EACH FIELD FOR EACH USER


Unnamed: 0,TotalSteps,TotalDistance,TrackerDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories
1503960366,12133.182,7.828,7.828,2.984,0.768,4.071,40.091,18.455,209.545,844.818,1790.5
1624580081,5743.903,3.915,3.915,0.939,0.361,2.607,8.677,5.806,153.484,1257.742,1483.355
1644430081,7282.967,5.295,5.295,0.73,0.951,3.609,9.567,21.367,178.467,1161.867,2811.3
1844505072,2580.065,1.706,1.706,0.008,0.049,1.647,0.129,1.29,115.452,1206.613,1573.484
1927972279,916.129,0.635,0.635,0.096,0.031,0.507,1.323,0.774,38.581,1317.419,2172.806


In [78]:
# Wouldn't actually know the user num, but it doesn't really matter here, it's just the index
# unknown_user = 1503960366

avg_unknown_df = pd.DataFrame()
avg_unknown_df = avg_unknown_df.append(np.round(unknown_df[avg_fields].mean(), 3), ignore_index=True)
print("AVERAGE OF UNKNOWN FIELDS (ASSUMING IT IS ONE PERSON)")
avg_unknown_df.head()

AVERAGE OF UNKNOWN FIELDS (ASSUMING IT IS ONE PERSON)


Unnamed: 0,TotalSteps,TotalDistance,TrackerDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories
0,10834.889,7.027,7.027,2.51,0.688,3.828,33.111,15.556,200.444,977.778,1662.444


As can be seen above, the highest values are 0.273, 0.289, and 0.266, for the following fields, respectively: 'VeryActiveDistance', 'VeryActiveMinutes', and 'FairlyActiveMinutes'.  Calories were fairly information-dense, although total steps was not.

In [94]:
difference = np.zeros((len(avg_df.index), len(avg_fields)))
L2_norm = np.zeros((len(avg_df.index), len(avg_fields)))

for idx1, user in enumerate(avg_df.index):
    for idx2, field in enumerate(avg_fields):
        base_attr = avg_df.loc[user, field]
        unknown_attr = avg_unknown_df[field][0]

        difference[idx1, idx2] = abs(unknown_attr - base_attr)/unknown_attr
        L2_norm[idx1, idx2] = np.sqrt((base_attr**2) + (unknown_attr**2))

In [104]:
my_diff_mins = difference.min(axis=1)

my_diff_min = my_diff_mins.min()
print(my_diff_min)

pred_index_dif = list(my_diff_mins).index(my_diff_min)
print(pred_index_dif)

0.0019339376711657798
15


In [105]:
my_diff_mins[29] = 100
my_diff_min = my_diff_mins.min()
print(my_diff_min)
pred_index_dif = list(my_diff_mins).index(my_diff_min)
print(pred_index_dif)

my_diff_mins[15] = 100
my_diff_min = my_diff_mins.min()
print(my_diff_min)
pred_index_dif = list(my_diff_mins).index(my_diff_min)
print(pred_index_dif)

my_diff_mins[17] = 100
my_diff_min = my_diff_mins.min()
print(my_diff_min)
pred_index_dif = list(my_diff_mins).index(my_diff_min)
print(pred_index_dif)

0.0019339376711657798
15
0.005224660397074195
17
0.01024619325458945
18


In [111]:
my_L2_mins = L2_norm.min(axis=1)

my_L2_min = my_L2_mins.min()
print(my_L2_min)

pred_index_L2 = list(my_L2_mins).index(my_L2_min)
print(pred_index_L2)

0.6880879304274998
6


In [115]:
col_used = [0]*len(my_L2_mins)
for idx, row_min in enumerate(my_L2_mins):
    col_used[idx] = list(L2_norm[idx]).index(row_min)

counts = [0]*len(col_used)
for idx in range(len(col_used)):
    if col_used[idx]==idx:
        counts[idx] += 1

print(counts.index(max(counts)))

4


In [90]:
my_L2_mins[6] = 100
my_L2_min = my_L2_mins.min()
print(my_L2_min)
pred_index_L2 = list(my_L2_mins).index(my_L2_min)
print(pred_index_L2)

my_L2_mins[4] = 100
my_L2_min = my_L2_mins.min()
print(my_L2_min)
pred_index_L2 = list(my_L2_mins).index(my_L2_min)
print(pred_index_L2)

my_L2_mins[3] = 100
my_L2_min = my_L2_mins.min()
print(my_L2_min)
pred_index_L2 = list(my_L2_mins).index(my_L2_min)
print(pred_index_L2)

0.6886980470423885
4
0.6897427056518973
3
0.6904404391401187
31


For L2 norms, the current approach shows that the following field is being used to match: ModeratelyActiveDistance.  Also, note that all the user estimates are incorrect (correct user is user 0).  Thus, let's try using KL-divergence like we did earlier to see if that works better.  In theory this should work perfectly since they do actually come from the same distribution.

In [116]:
df.head()

Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,TrackerDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories
2,1503960366,4/14/2016,10460,6.74,6.74,2.44,0.4,3.91,30,11,181,1218,1776
3,1503960366,4/15/2016,9762,6.28,6.28,2.14,1.26,2.83,29,34,209,726,1745
4,1503960366,4/16/2016,12669,8.16,8.16,2.71,0.41,5.04,36,10,221,773,1863
5,1503960366,4/17/2016,9705,6.48,6.48,3.19,0.78,2.51,38,20,164,539,1728
7,1503960366,4/19/2016,15506,9.88,9.88,3.53,1.32,5.03,50,31,264,775,2035


In [117]:
unknown_df.head()

Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,TrackerDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories
0,1503960000.0,4/12/2016,13162.0,8.5,8.5,1.88,0.55,6.06,25.0,13.0,328.0,728.0,1985.0
1,1503960000.0,4/14/2016,10460.0,6.74,6.74,2.44,0.4,3.91,30.0,11.0,181.0,1218.0,1776.0
2,1503960000.0,4/20/2016,10544.0,6.68,6.68,1.96,0.48,4.24,28.0,12.0,205.0,818.0,1786.0
3,1503960000.0,4/27/2016,18134.0,12.21,12.21,6.4,0.41,5.41,78.0,11.0,243.0,1108.0,2159.0
4,1503960000.0,4/30/2016,14673.0,9.25,9.25,3.56,1.42,4.27,52.0,34.0,217.0,712.0,1947.0


In [None]:
# Taken from above

all_IDs = df.Id.unique()
num_users = int(len(all_IDs))
my_cols = df.columns[2:]

my_hist_bins = [0] * len(my_cols)
for i in range(len(my_hist_bins)):
    my_hist_bins[i] = [0] * num_users

for i in range(len(my_cols)):
    for j, user in enumerate(all_IDs):
        temp_df = df.loc[df['Id'] == user]
        edited_df = temp_df[temp_df[my_cols[i]] !=0]

        my_series = edited_df[my_cols[i]]
        count, division = np.histogram(my_series)
        my_hist_bins[i][j] = count/sum(count)
        #caloric_series.hist(bins=division)

In [None]:
num_unknown_users = 1

In [None]:
# Make hist_bins for unknown
my_unknown_hist_bins = [0] * len(my_cols)
for i in range(len(my_unknown_hist_bins)):
    my_unknown_hist_bins[i] = [0] * num_unknown_users

for i in range(len(my_cols)):
    for j, user in enumerate(all_IDs):
        temp_df = df.loc[df['Id'] == user]
        edited_df = temp_df[temp_df[my_cols[i]] !=0]

        my_series = edited_df[my_cols[i]]
        count, division = np.histogram(my_series)
        my_unknown_hist_bins[i][j] = count/sum(count)
        #caloric_series.hist(bins=division)

In [None]:
similarity_matrix = np.zeros((len(my_cols), num_users, num_unknown_users))
sms = similarity_matrix.shape
avg_sim_lst = np.zeros((1, sms[0]))

for z in range(len(my_cols)):
    for i in range(num_users-1):
        for j in range(num_unknown_users):
            similarity_matrix[z, i, j] = sum(rel_entr(np.array(my_hist_bins[z][i]),np.array(my_hist_bins[z][j])))
            if np.isinf(distance_matrix[z, i, j]):
                distance_matrix[z, i, j] = 0
            if np.isnan(distance_matrix[z, i, j]):
                distance_matrix[z, i, j] = 0
    avg_dist_lst[0, z] = np.average(distance_matrix[z])