In [1]:
import pandas as pd
import numpy as np
import glob
import datetime

In [2]:
path = "E:/Data/Monitor Data/data_frames/"

# df_wrist = pd.read_csv(path + "wrist.csv")
# df_hip = pd.read_csv(path + "hip.csv")
df_chest = pd.read_csv(path + "chest.csv")
df_thigh = pd.read_csv(path + "thigh.csv")

In [3]:
df_chest.dtypes

Accelerometer X        float64
Accelerometer Y        float64
Accelerometer Z        float64
actual_datetime         object
time                    object
primary_behavior        object
primary_posture         object
primary_upperbody       object
primary_intensity       object
secondary_behavior      object
secondary_posture       object
secondary_upperbody     object
secondary_intensity     object
num_postures             int64
transition               int64
actual_time             object
type                    object
start.time              object
broad_activity          object
detailed_activity       object
updated_activity        object
act_type                object
act_type_broad          object
time_int                 int64
dtype: object

In [4]:
df_chest["broad_activity"].unique()

array(['sit/stand', 'mixed-activity', 'walking', 'vehicle', 'bicycling'],
      dtype=object)

In [5]:
print(len(df_chest))
print(len(df_thigh))

353710
351279


In [6]:
response_activity = "broad_activity"

categorical_variables = ["primary_behavior", "primary_posture", "primary_upperbody", "primary_intensity", 
                         "secondary_behavior", "secondary_posture", "secondary_upperbody", "secondary_intensity",
                         "type", "broad_activity", "detailed_activity", "act_type", "act_type_broad"]

float_variables = ["Accelerometer X", "Accelerometer Y", "Accelerometer Z"]

int_variables = ["num_postures", "transition", "time_int"]

datetime_variables = ["actual_datetime", "time", "actual_time", "start.time"]

In [7]:
df_chest[float_variables + int_variables].describe()

Unnamed: 0,Accelerometer X,Accelerometer Y,Accelerometer Z,num_postures,transition,time_int
count,353710.0,353710.0,353710.0,353710.0,353710.0,353710.0
mean,0.035532,0.825666,0.324303,1.024642,0.024619,26063520.0
std,0.149071,0.227845,0.389453,0.155268,0.154961,6767063.0
min,-1.16549,-0.341416,-1.32494,1.0,0.0,17669830.0
25%,-0.042127,0.791177,0.131676,1.0,0.0,21987310.0
50%,0.034582,0.915029,0.352631,1.0,0.0,24170770.0
75%,0.122301,0.968851,0.570392,1.0,0.0,32953400.0
max,0.967276,1.368665,1.040457,3.0,1.0,43438500.0


In [8]:
from sklearn.feature_selection import SelectKBest, f_classif

#dropped_vars = [response_activity] + datetime_variables + categorical_variables + int_variables

X = df_chest[float_variables + ["time_int"]]
y = df_chest[response_activity]

best_features = SelectKBest(f_classif, k="all").fit(X,y)

In [9]:
features_results = pd.Series(best_features.scores_)
features_results.index = float_variables + ["time_int"]
features_results

Accelerometer X     660.982425
Accelerometer Y    3279.608722
Accelerometer Z    9561.485042
time_int           1276.379056
dtype: float64

In [10]:
activity_vars = ["primary_behavior", "primary_posture", "primary_upperbody", "primary_intensity", 
                 "secondary_behavior", "secondary_posture", "secondary_upperbody", "secondary_intensity",
                 "num_postures", "transition", "type", "broad_activity", "detailed_activity", "act_type", "act_type_broad"]

X_vars = ["Accelerometer X", "Accelerometer Y", "Accelerometer Z", "time_int"]
y_var = "broad_activity"

In [11]:
def cv_split(X, y, num_splits=4):
    n = len(X)
    X_trains, X_tests, y_trains, y_tests = [], [], [], []
    
    for i in range(num_splits):
        current_range = list(range(n*i//num_splits, n*(i+1)//num_splits))
        
        X_train = X.drop(current_range, axis=0)
        X_test = X.iloc[current_range]
        y_train = y.drop(current_range)
        y_test = y.iloc[current_range]
        
        X_trains.append(X_train)
        X_tests.append(X_test)
        y_trains.append(y_train)
        y_tests.append(y_test)
        
    return (X_trains, X_tests, y_trains, y_tests)

In [12]:
train_test_dict = {}

dfs = [#df_wrist, df_hip, 
       df_chest, df_thigh]

bodyparts = [#"wrist", "hip",
             "chest", "thigh"]

for i in range(len(dfs)):
    df = dfs[i]
    bodypart = bodyparts[i]
    
    X = df[X_vars]
    y = df[y_var]
    
    X_trains, X_tests, y_trains, y_tests = cv_split(X, y)
    
    train_test_dict[bodypart] = {}
    
    train_test_dict[bodypart]["X_trains"] = X_trains
    train_test_dict[bodypart]["X_tests"] = X_tests
    train_test_dict[bodypart]["y_trains"] = y_trains
    train_test_dict[bodypart]["y_tests"] = y_tests

In [13]:
print(len(df_chest))
print(len(df_thigh))

353710
351279


In [14]:
display(train_test_dict["chest"]["X_trains"][0].head())
display(train_test_dict["chest"]["X_tests"][0].head())
display(train_test_dict["chest"]["y_trains"][0].head())
display(train_test_dict["chest"]["y_tests"][0].head())

Unnamed: 0,Accelerometer X,Accelerometer Y,Accelerometer Z,time_int
88427,-0.179619,0.986887,0.113732,18446380
88428,-0.164073,0.989076,0.09765,18446381
88429,-0.124584,0.990998,0.13561,18446382
88430,-0.002103,0.99796,0.134051,18446383
88431,0.042394,0.995566,0.14347,18446384


Unnamed: 0,Accelerometer X,Accelerometer Y,Accelerometer Z,time_int
0,0.067628,0.870894,0.493904,23823885
1,0.01012,0.876218,0.485257,23823886
2,0.027951,0.922685,0.398483,23823887
3,0.026076,0.905878,0.429324,23823888
4,0.061737,0.872926,0.485052,23823889


88427    sit/stand
88428    sit/stand
88429    sit/stand
88430    sit/stand
88431    sit/stand
Name: broad_activity, dtype: object

0    sit/stand
1    sit/stand
2    sit/stand
3    sit/stand
4    sit/stand
Name: broad_activity, dtype: object

In [15]:
display(train_test_dict["thigh"]["X_trains"][2].head())
display(train_test_dict["thigh"]["X_tests"][2].head())
display(train_test_dict["thigh"]["y_trains"][2].head())
display(train_test_dict["thigh"]["y_tests"][2].head())

Unnamed: 0,Accelerometer X,Accelerometer Y,Accelerometer Z,time_int
0,-0.920621,-0.155646,-0.318593,23823885
1,-0.928607,-0.150999,-0.310559,23823886
2,-0.962693,-0.028313,-0.205231,23823887
3,-0.967954,-0.025304,-0.19945,23823888
4,-0.953352,-0.088372,-0.257887,23823889


Unnamed: 0,Accelerometer X,Accelerometer Y,Accelerometer Z,time_int
175639,-0.090578,0.065754,1.009694,24660595
175640,-0.110487,0.066195,1.00467,24660596
175641,-0.033857,0.060414,1.013868,24660597
175642,-0.068053,0.047057,1.010435,24660598
175643,-0.056445,0.05257,1.006355,24660599


0    sit/stand
1    sit/stand
2    sit/stand
3    sit/stand
4    sit/stand
Name: broad_activity, dtype: object

175639    sit/stand
175640    sit/stand
175641    sit/stand
175642    sit/stand
175643    sit/stand
Name: broad_activity, dtype: object

In [16]:
# Created a function to split with cross-validation in mind
# Since time is a variable, I did not randomly assign indexes to train and test, but rather grouped them together

In [17]:
# Any other explanatory features we should use besides the 4 above? All the others seem too closely related to activity
# Are we going to try using different response variables? There are so many activity variables we could try several

In [18]:
from hmmlearn import hmm

In [19]:
# First runthrough

#time will have to be made into int or float variable
X = df_chest[X_vars]
y = df_chest[y_var]
states = list(y.unique())

hm_model = hmm.GaussianHMM(n_components = len(states))


hm_model.fit(X)
predictions = hm_model.predict(X)

In [20]:
states

['sit/stand', 'mixed-activity', 'walking', 'vehicle', 'bicycling']

In [21]:
def print_highest_accuracy(pred, y):
    num_list = []
    for state in y.unique():
        subset = y[y == state]
        indexes = list(subset.index)
        
        numbers = pred.iloc[indexes].value_counts()

        for i in range(len(states)):
            if i not in numbers.index:
                numbers[i] = 0
        
        numbers.sort_index(inplace=True)
        
        print("State:", state)
        print(pd.Series(numbers))
        print("\n")
        
        num_list.append(list(numbers))

    return highest_accuracy(num_list)

#With n as number of states:
# O(n!) -- incredibly inefficient
def highest_accuracy(numbers):
    n = len(numbers)
    
#     highest_count, best_sequence = correct_count(numbers, [], 0)
    highest_count, best_sequence = lazy_correct_count(numbers)
    
    return highest_count, best_sequence
    
# def correct_count(numbers, illegal_nums, count):
#     n = len(numbers)
#     results = {}
    
#     #state
#     for i in range(n):
#         if i not in illegal_nums:
#         #numbers
#             for j in range(n):
#                 print(i, j)
#                 count += numbers[i][j]
#                 print(count)
#                 illegal_nums.append(i)

#                 if len(illegal_nums) == n:
#                     sequence = ""
#                     for number in illegal_nums:
#                         sequence = sequence + str(number)
#                     results[sequence] = count
#                 else:
#                     correct_count(numbers, illegal_nums, count)
#     return results

def lazy_correct_count(numbers):
    results = {}
    n = len(numbers)
    
    for a in range(n):
        a_count = numbers[0][a]
        
        for b in range(n):
            if b == a:
                continue
                
            b_count = a_count + numbers[1][b]
            
            for c in range(n):
                if c in [a,b]:
                    continue
                    
                c_count = b_count + numbers[2][c]
                
                for d in range(n):
                    if d in [a,b,c]:
                        continue
                        
                    d_count = c_count + numbers[3][d]
                                        
                    for e in range(n):
                        if e in [a,b,c,d]:
                            continue

                        e_count = d_count + numbers[4][e]
                        results[e_count] = [a,b,c,d,e]
    
    highest_count = max(results.keys())
    best_sequence = results[highest_count]
    return highest_count, best_sequence

In [22]:
highest_count, best_sequence = print_highest_accuracy(pd.Series(predictions), y)
best_sequence

State: sit/stand
0    28463
1    64792
2    40973
3    44858
4    73052
dtype: int64


State: mixed-activity
0    1226
1    7496
2    7455
3    5462
4    7056
dtype: int64


State: walking
0     2812
1     2154
2    16148
3    15510
4    17782
dtype: int64


State: vehicle
0    3041
1     144
2    4323
3    3416
4    1373
dtype: int64


State: bicycling
0       0
1       4
2     670
3    5500
4       0
dtype: int64




[4, 1, 2, 0, 3]

In [23]:
highest_count

105237

In [24]:
def assign_hidden_states(val, states, results):
    for i in range(len(states)):
        if val == results[i]:
            return states[i]

y_pred = pd.Series(predictions).apply(lambda x: assign_hidden_states(x, states, best_sequence))

In [25]:
correct = y_pred == y
print("Accuracy: " + str(sum(correct)/len(y)*100) + "%")

Accuracy: 29.752339487150493%


In [26]:
sum(correct) == highest_count

True

In [27]:
display(y.value_counts())
display(y[correct].value_counts())

sit/stand         252138
walking            54406
mixed-activity     28695
vehicle            12297
bicycling           6174
Name: broad_activity, dtype: int64

sit/stand         73052
walking           16148
mixed-activity     7496
bicycling          5500
vehicle            3041
Name: broad_activity, dtype: int64