In [1]:
import datetime
import glob
import itertools

import numpy as np
import pandas as pd

In [2]:
# Folder that holds one CSV per monitor location. The trailing slash matters:
# file names are appended with plain string concatenation.
path = "E:/Data/Monitor Data/data_frames/"

# The wrist and hip monitors are left commented out; only chest and thigh are loaded.
# df_wrist = pd.read_csv(path + "wrist.csv")
# df_hip = pd.read_csv(path + "hip.csv")
df_chest, df_thigh = (
    pd.read_csv(path + file_name) for file_name in ("chest.csv", "thigh.csv")
)

In [3]:
# Derive the weekday number (pandas convention: Monday=0 ... Sunday=6) from the
# "time" column of each monitor frame and store it as a new column.
for frame in (df_chest, df_thigh):
    frame["day_of_week"] = pd.to_datetime(frame["time"]).dt.dayofweek

In [4]:
df_chest.dtypes

Accelerometer X          float64
Accelerometer Y          float64
Accelerometer Z          float64
actual_datetime           object
file                      object
time                      object
primary_behavior          object
primary_posture           object
primary_upperbody         object
primary_intensity         object
secondary_behavior        object
secondary_posture         object
secondary_upperbody       object
secondary_intensity       object
num_postures               int64
transition                 int64
actual_time               object
type                      object
start.time                object
broad_activity            object
detailed_activity         object
updated_activity          object
act_type                  object
act_type_broad            object
magnitude_of_change_X    float64
magnitude_of_change_Y    float64
magnitude_of_change_Z    float64
time_diff_seconds          int64
year                       int64
month                      int64
day_of_wee

In [5]:
df_chest["updated_activity"].unique()

array(['stand still', 'mixed-activity', 'sit/lie', 'running', 'walking',
       'vehicle', 'bicycling'], dtype=object)

In [6]:
print(len(df_chest))
print(len(df_thigh))

353710
351279


In [7]:
# Column that holds the activity label the models try to recover.
response_activity = "updated_activity"

# Columns grouped by the kind of value they hold.
categorical_variables = [
    "primary_behavior",
    "primary_posture",
    "primary_upperbody",
    "primary_intensity",
    "secondary_behavior",
    "secondary_posture",
    "secondary_upperbody",
    "secondary_intensity",
    "type",
    "broad_activity",
    "detailed_activity",
    "act_type",
    "act_type_broad",
]

float_variables = [
    "Accelerometer X",
    "Accelerometer Y",
    "Accelerometer Z",
    "magnitude_of_change_X",
    "magnitude_of_change_Y",
    "magnitude_of_change_Z",
]

# NOTE(review): no visible cell creates an "hour" column -- confirm it exists in the CSVs.
int_variables = [
    "num_postures",
    "transition",
    "time_diff_seconds",
    "day_of_week",
    "hour",
]

datetime_variables = [
    "actual_datetime",
    "time",
    "actual_time",
    "start.time",
]

# Features fed to the models: currently exactly the float columns (independent copy).
# Candidates that are switched off for now:
#   "time_diff_seconds", "day_of_week", "hour"
model_variables = list(float_variables)

In [8]:
df_chest[model_variables].describe()

Unnamed: 0,Accelerometer X,Accelerometer Y,Accelerometer Z,magnitude_of_change_X,magnitude_of_change_Y,magnitude_of_change_Z
count,353710.0,353710.0,353710.0,353710.0,353710.0,353710.0
mean,0.035532,0.825666,0.324303,-9.565819e-07,1.440595e-07,-3.803438e-07
std,0.149071,0.227845,0.389453,0.07017598,0.1943561,0.2972622
min,-1.16549,-0.341416,-1.32494,-0.9874773,-1.061177,-1.568698
25%,-0.042127,0.791177,0.131676,-0.01326255,-0.007560599,-0.01395561
50%,0.034582,0.915029,0.352631,0.0,0.0,0.0
75%,0.122301,0.968851,0.570392,0.0132153,0.007387336,0.0138926
max,0.967276,1.368665,1.040457,0.8921665,1.108636,1.566336


In [9]:
from sklearn.feature_selection import SelectKBest, f_classif

#dropped_vars = [response_activity] + datetime_variables + categorical_variables + int_variables

# Score every model feature of the chest monitor against the activity label.
# k="all" keeps all features, so the selector is only used for its scores_.
X, y = df_chest[model_variables], df_chest[response_activity]

best_features = SelectKBest(score_func=f_classif, k="all").fit(X, y)

In [10]:
# Label each feature score with the column it belongs to.
features_results = pd.Series(best_features.scores_, index=model_variables)
features_results

Accelerometer X           443.184922
Accelerometer Y          2468.448940
Accelerometer Z          9882.928664
magnitude_of_change_X       0.006124
magnitude_of_change_Y       0.007638
magnitude_of_change_Z       0.019744
dtype: float64

In [11]:
# Every column that describes the annotated activity in some form.
activity_vars = [
    "primary_behavior",
    "primary_posture",
    "primary_upperbody",
    "primary_intensity",
    "secondary_behavior",
    "secondary_posture",
    "secondary_upperbody",
    "secondary_intensity",
    "num_postures",
    "transition",
    "type",
    "broad_activity",
    "detailed_activity",
    "updated_activity",
    "act_type",
    "act_type_broad",
]

# Feature columns and target column used by the cells that follow.
y_var = response_activity
X_vars = model_variables

In [12]:
def cv_split(X, y, num_splits=4):
    """Split ``X`` and ``y`` into ``num_splits`` contiguous, unshuffled folds.

    Fold ``i`` uses the rows at positions ``n*i//num_splits`` up to (but not
    including) ``n*(i+1)//num_splits`` as the test set and all remaining rows,
    in their original order, as the training set.

    Rows are selected purely by position for both the test and the training
    part, so the result does not depend on the index labels (the data does not
    need a default ``RangeIndex``).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows.
    y : pandas.Series
        Labels, one per row of ``X`` and in the same order.
    num_splits : int, default 4
        Number of folds.

    Returns
    -------
    tuple of four lists
        ``(X_trains, X_tests, y_trains, y_tests)``; element ``i`` of each list
        belongs to fold ``i``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` have different lengths.
    """
    n = len(X)
    if len(y) != n:
        raise ValueError(
            "X and y must have the same number of rows, got %d and %d" % (n, len(y))
        )

    X_trains, X_tests, y_trains, y_tests = [], [], [], []

    for i in range(num_splits):
        start = n * i // num_splits
        stop = n * (i + 1) // num_splits

        # Positional boolean mask: True for the rows of the current test fold.
        in_test = np.zeros(n, dtype=bool)
        in_test[start:stop] = True

        X_trains.append(X.iloc[~in_test])
        X_tests.append(X.iloc[in_test])
        y_trains.append(y.iloc[~in_test])
        y_tests.append(y.iloc[in_test])

    return (X_trains, X_tests, y_trains, y_tests)

In [13]:
# bodypart -> {"X_trains", "X_tests", "y_trains", "y_tests"}, each a list with one entry per fold.
train_test_dict = {}

dfs = [#df_wrist, df_hip, 
       df_chest, df_thigh]

bodyparts = [#"wrist", "hip",
             "chest", "thigh"]

# dfs and bodyparts are parallel lists: entry k of one belongs to entry k of the other.
for bodypart, df in zip(bodyparts, dfs):
    X = df[X_vars]
    y = df[y_var]

    X_trains, X_tests, y_trains, y_tests = cv_split(X, y)

    train_test_dict[bodypart] = {
        "X_trains": X_trains,
        "X_tests": X_tests,
        "y_trains": y_trains,
        "y_tests": y_tests,
    }

In [14]:
print(len(df_chest))
print(len(df_thigh))

353710
351279


In [15]:
display(train_test_dict["chest"]["X_trains"][0].head())
display(train_test_dict["chest"]["X_tests"][0].head())
display(train_test_dict["chest"]["y_trains"][0].head())
display(train_test_dict["chest"]["y_tests"][0].head())

Unnamed: 0,Accelerometer X,Accelerometer Y,Accelerometer Z,magnitude_of_change_X,magnitude_of_change_Y,magnitude_of_change_Z
88427,-0.179619,0.986887,0.113732,-0.174508,-0.012317,-0.001055
88428,-0.164073,0.989076,0.09765,0.015546,0.002189,-0.016082
88429,-0.124584,0.990998,0.13561,0.039488,0.001922,0.037961
88430,-0.002103,0.99796,0.134051,0.122482,0.006962,-0.001559
88431,0.042394,0.995566,0.14347,0.044497,-0.002394,0.009419


Unnamed: 0,Accelerometer X,Accelerometer Y,Accelerometer Z,magnitude_of_change_X,magnitude_of_change_Y,magnitude_of_change_Z
0,0.067628,0.870894,0.493904,0.0,0.0,0.0
1,0.01012,0.876218,0.485257,-0.057508,0.005324,-0.008647
2,0.027951,0.922685,0.398483,0.01783,0.046466,-0.086774
3,0.026076,0.905878,0.429324,-0.001874,-0.016807,0.030841
4,0.061737,0.872926,0.485052,0.035661,-0.032952,0.055728


88427    stand still
88428    stand still
88429    stand still
88430    stand still
88431    stand still
Name: updated_activity, dtype: object

0    stand still
1    stand still
2    stand still
3    stand still
4    stand still
Name: updated_activity, dtype: object

In [16]:
display(train_test_dict["thigh"]["X_trains"][2].head())
display(train_test_dict["thigh"]["X_tests"][2].head())
display(train_test_dict["thigh"]["y_trains"][2].head())
display(train_test_dict["thigh"]["y_tests"][2].head())

Unnamed: 0,Accelerometer X,Accelerometer Y,Accelerometer Z,magnitude_of_change_X,magnitude_of_change_Y,magnitude_of_change_Z
0,-0.920621,-0.155646,-0.318593,0.0,0.0,0.0
1,-0.928607,-0.150999,-0.310559,-0.007986,0.004647,0.008033
2,-0.962693,-0.028313,-0.205231,-0.034086,0.122686,0.105329
3,-0.967954,-0.025304,-0.19945,-0.005261,0.003008,0.005781
4,-0.953352,-0.088372,-0.257887,0.014601,-0.063068,-0.058437


Unnamed: 0,Accelerometer X,Accelerometer Y,Accelerometer Z,magnitude_of_change_X,magnitude_of_change_Y,magnitude_of_change_Z
175639,-0.090578,0.065754,1.009694,-0.001307,-0.001055,0.004017
175640,-0.110487,0.066195,1.00467,-0.01991,0.000441,-0.005025
175641,-0.033857,0.060414,1.013868,0.07663,-0.005781,0.009199
175642,-0.068053,0.047057,1.010435,-0.034196,-0.013357,-0.003434
175643,-0.056445,0.05257,1.006355,0.011609,0.005513,-0.00408


0    stand still
1    stand still
2    stand still
3    stand still
4    stand still
Name: updated_activity, dtype: object

175639    sit/lie
175640    sit/lie
175641    sit/lie
175642    sit/lie
175643    sit/lie
Name: updated_activity, dtype: object

In [17]:
from hmmlearn import hmm

In [18]:
list(df_chest[model_variables].mean())

[0.0355324794860464,
 0.8256661101315416,
 0.32430282502962965,
 -9.565818526033681e-07,
 1.4405950808498273e-07,
 -3.803438202632952e-07]

In [19]:
def compute_parameters(X, y):
    """Estimate per-state means, a transition matrix and covariances from labelled rows.

    States are numbered in order of first appearance in ``y`` (the same order
    ``y.unique()`` returns). Rows of ``X`` and ``y`` are matched by position,
    and ``y`` is treated as one sequence ordered as given.

    The transition matrix is estimated from every consecutive pair
    ``(y[t], y[t+1])``: entry ``[i][j]`` is the fraction of steps leaving
    state ``i`` that land in state ``j``. A state that is never left (it only
    occurs as the very last row) gets a self-transition probability of 1 so
    that every row still sums to 1.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows.
    y : pandas.Series
        State label of each row of ``X``; must not contain missing values.

    Returns
    -------
    list
        ``[means, transmat, covars]`` where ``means`` is a list with one list
        of feature means per state, ``transmat`` is a list of per-state lists
        of transition probabilities, and ``covars`` is a list with one
        feature-covariance DataFrame (``DataFrame.cov()``) per state.
        A state with a single row yields an all-NaN covariance.

    Raises
    ------
    ValueError
        If ``y`` contains missing labels.
    """
    # Integer code per row plus the distinct labels in order of appearance.
    state_codes, act_names = pd.factorize(y)
    if (state_codes < 0).any():
        raise ValueError("y contains missing labels; every row needs a state")
    n_states = len(act_names)

    # Rows of X belonging to each state, selected by position.
    list_acts = [X[state_codes == s] for s in range(n_states)]

    means = [list(act.mean()) for act in list_acts]

    # Count every observed step "state at t" -> "state at t+1" in one pass.
    counts = np.zeros((n_states, n_states))
    np.add.at(counts, (state_codes[:-1], state_codes[1:]), 1)

    # A state without any outgoing step would give an all-zero row (0/0);
    # make it absorbing so the row is a valid probability distribution.
    for s in range(n_states):
        if counts[s].sum() == 0:
            counts[s, s] = 1

    transmat = (counts / counts.sum(axis=1, keepdims=True)).tolist()

    covars = [act.cov() for act in list_acts]

    return [means, transmat, covars]

In [20]:
# First runthrough: build a Gaussian HMM whose parameters are computed directly
# from the labelled chest data, then decode the most likely state sequence.

#time will have to be made into int or float variable
X = df_chest[X_vars]
y = df_chest[y_var]
# Distinct activity labels in order of first appearance; hidden state i stands for states[i].
states = list(y.unique())

# params="" tells hmmlearn not to update any parameter during fit(); all of
# them are assigned by hand below.
hm_model = hmm.GaussianHMM(n_components = len(states), covariance_type = "full", params = "")

# All starting probability mass on states[0], i.e. the label of the first row.
# Sized from len(states) so it stays valid for any number of activities.
startprob = np.zeros(len(states))
startprob[0] = 1.0

means, transmat, covars = compute_parameters(X, y)

hm_model.startprob_ = startprob
hm_model.means_ = means
hm_model.transmat_ = transmat
hm_model.covars_ = covars

# hm_model.fit(X)
predictions = hm_model.predict(X)

In [21]:
def print_highest_accuracy(pred, y):
    """Print, per true state, how often each hidden state was predicted, then score the best mapping.

    For every distinct label in ``y`` (in order of first appearance) the
    predictions made on the rows carrying that label are counted per hidden
    state id ``0 .. k-1``, where ``k`` is the number of distinct labels in
    ``y``. Each count table is printed, and the list of tables is handed to
    ``highest_accuracy`` to find the label-to-hidden-state assignment with the
    most matches.

    ``pred`` and ``y`` are matched by position, so their index labels do not
    need to agree.

    Parameters
    ----------
    pred : array-like of int
        Predicted hidden state id for every row.
    y : pandas.Series
        True label for every row, same length and order as ``pred``.

    Returns
    -------
    Whatever ``highest_accuracy`` returns for the count tables
    (the highest match count and the corresponding assignment).
    """
    labels = y.unique()
    n_states = len(labels)
    pred_values = np.asarray(pred)

    num_list = []
    for state in labels:
        # Positional mask of the rows whose true label is `state`.
        in_state = (y == state).to_numpy()

        # Count predictions on those rows; hidden states never predicted get 0.
        numbers = (
            pd.Series(pred_values[in_state])
            .value_counts()
            .reindex(range(n_states), fill_value=0)
        )

        print("State:", state)
        print(numbers)
        print("\n")

        num_list.append(list(numbers))

    return highest_accuracy(num_list)

# With n as the number of states the search behind this call is
# O(n!) -- incredibly inefficient
def highest_accuracy(numbers):
    """Return the best match count and assignment for a table of counts.

    ``numbers[i][j]`` is the count for true state ``i`` and hidden state ``j``.
    The work is delegated to ``lazy_correct_count``.
    """
    # Earlier recursive attempt, kept for reference:
    #     highest_count, best_sequence = correct_count(numbers, [], 0)
    best = lazy_correct_count(numbers)
    highest_count, best_sequence = best
    return highest_count, best_sequence
    
# def correct_count(numbers, illegal_nums, count):
#     n = len(numbers)
#     results = {}
    
#     #state
#     for i in range(n):
#         if i not in illegal_nums:
#         #numbers
#             for j in range(n):
#                 print(i, j)
#                 count += numbers[i][j]
#                 print(count)
#                 illegal_nums.append(i)

#                 if len(illegal_nums) == n:
#                     sequence = ""
#                     for number in illegal_nums:
#                         sequence = sequence + str(number)
#                     results[sequence] = count
#                 else:
#                     correct_count(numbers, illegal_nums, count)
#     return results

def lazy_correct_count(numbers):
    """Find the one-to-one row-to-column assignment with the largest total count.

    ``numbers`` is a square table (list of ``n`` lists with at least ``n``
    entries each). Every way of giving each row ``i`` its own distinct column
    ``seq[i]`` is tried, and the assignment maximising
    ``sum(numbers[i][seq[i]])`` is returned. Works for any ``n`` (the search is
    exhaustive, O(n!), so it is only practical for a handful of states).

    Ties are resolved in favour of the assignment that comes last in
    lexicographic order.

    Parameters
    ----------
    numbers : list of list of int
        ``numbers[i][j]`` is the count for row ``i`` and column ``j``.

    Returns
    -------
    tuple
        ``(highest_count, best_sequence)`` where ``best_sequence[i]`` is the
        column assigned to row ``i``. For an empty table this is ``(0, [])``.
    """
    n = len(numbers)

    highest_count = None
    best_sequence = None

    # permutations() yields assignments in lexicographic order.
    for assignment in itertools.permutations(range(n)):
        count = sum(numbers[row][col] for row, col in enumerate(assignment))

        # ">=" lets a later assignment with an equal total replace an earlier one.
        if highest_count is None or count >= highest_count:
            highest_count = count
            best_sequence = list(assignment)

    return highest_count, best_sequence

In [22]:
highest_count, best_sequence = print_highest_accuracy(pd.Series(predictions), y)
best_sequence

State: stand still
0      610
1    15769
2    18245
3     5339
4        9
5    52073
6     2098
dtype: int64


State: mixed-activity
0      149
1    12046
2     2923
3     4606
4      306
5    22225
6      282
dtype: int64


State: sit/lie
0      467
1     3753
2    72861
3     2412
4        0
5    70642
6     7860
dtype: int64


State: running
0       0
1      81
2       0
3    9221
4       0
5      17
6       0
dtype: int64


State: walking
0        0
1     1473
2     1238
3      733
4     3955
5    23846
6        0
dtype: int64


State: vehicle
0        5
1      223
2       86
3       74
4        0
5    11909
6        0
dtype: int64


State: bicycling
0       6
1     107
2      41
3     124
4       0
5      24
6    5872
dtype: int64




[5, 1, 2, 3, 4, 0, 6]

In [23]:
highest_count

156033

In [24]:
best_sequence

[5, 1, 2, 3, 4, 0, 6]

In [25]:
def assign_hidden_states(val, states, results):
    """Translate a hidden state id into its label.

    Walks through ``states`` and returns the first ``states[k]`` for which
    ``results[k]`` equals ``val``. Returns ``None`` when no entry matches.
    """
    for position, label in enumerate(states):
        if val == results[position]:
            return label
    return None

# Choose which hidden state id stands for which label: sequence[i] is the
# hidden state id that is reported as states[i].
# Alternative (disabled): use the assignment found by the accuracy search.
# sequence = best_sequence
# Identity mapping: hidden state i is reported as states[i].
# NOTE(review): hard-coded for seven states -- confirm it matches len(states).
sequence = [0,1,2,3,4,5,6]
# Convert every predicted hidden state id into its activity label.
y_pred = pd.Series(predictions).apply(lambda x: assign_hidden_states(x, states, sequence))

In [26]:
correct = y_pred == y
print("Accuracy: " + str(sum(correct)/len(y)*100) + "%")

Accuracy: 32.92923581465042%


In [27]:
print(sum(correct))
print(highest_count)

116474
156033


In [28]:
display(y.value_counts())
display(y[correct].value_counts())

sit/lie           157995
stand still        94143
mixed-activity     42537
walking            31245
vehicle            12297
running             9319
bicycling           6174
Name: updated_activity, dtype: int64

sit/lie           72861
mixed-activity    12046
vehicle           11909
running            9221
bicycling          5872
walking            3955
stand still         610
Name: updated_activity, dtype: int64

In [29]:
pd.DataFrame(means)

Unnamed: 0,0,1,2,3,4,5
0,0.039278,0.832263,0.257036,-3.2e-05,1.2e-05,-0.000146
1,0.016796,0.841992,0.191034,-3e-06,-0.000157,0.000147
2,0.040338,0.785853,0.436661,1.8e-05,3.4e-05,0.000106
3,0.025933,0.964556,-0.020609,-4.1e-05,-0.000103,-0.000133
4,0.027238,0.91486,0.328653,-2e-06,4.8e-05,4.6e-05
5,0.0685,0.870219,0.466663,3.4e-05,6.9e-05,-0.000396
6,-0.024669,0.881666,-0.392044,-1.1e-05,-0.000199,-0.000764


In [30]:
print(len(X[X["Accelerometer Z"] > 0.46])/len(X))

0.36056939300556956


In [31]:
for var in activity_vars:
    print(var)
    print(df_chest[var].unique())
    print("\n")

primary_behavior
['LES- socializing, communicating, leisure time not screen'
 'HA- food prep and cleanup'
 'LES- screen basedentary leisure time (TV, video game, computer)'
 'HA- household management/other household activities'
 'HA- animals and pets' 'EAT- eating and drinking, waiting' 'EX- jogging'
 'HA- housework' 'TRAV- walking' 'HA- lawn, garden and houseplants'
 'HA- interior maintenance, repair, & decoration'
 'WRK- general - Education and Health Services'
 'WRK- screen basedentary - Education and Health Services' 'EX- walking'
 'PC- groom, health-related' 'PUR- purchasing goods and services'
 'CA- caring for and helping children'
 'TRAV- driver (car/truck/motorcycle)' 'EX- other'
 'EDU- taking class, research, homework' 'TRAV- passenger bus or train'
 'HA- exterior maintenance, repair, & decoration'
 'WRK- screen basedentary - Office (business, professional services, finance, info)'
 'WRK- general - Office (business, professional services, finance, info)'
 'ORG- organizational 

In [32]:
#Accuracies:
#updated_activity: ~33%
#act_type_broad: ~44% - only one where true order also had the highest accuracy
#broad_activity: ~36%

#updated and broad activity predict vehicle way too often