# Preparing data for input into the `hmm["activity"]` models

In [2]:
from __future__ import division

%matplotlib inline
import pandas as pd
import thinkdsp
import thinkplot
import numpy as np

from magnitude import magnitude
from pipeline import preprocess, extract_features_with_sliding_window, learn

In [3]:
# Ryan's compact way of loading every recording.
# One CSV per (person, activity) pair, stored as data/<name>_<activity>_long.csv.
names = ['meg','ryan','dennis']
acts = ['walking', 'jogging', 'upstairs', 'downstairs']

# data_dict[activity][name] -> DataFrame read from that person's CSV.
# The keys are derived from `acts` so the two can never drift apart.
data_dict = {activity: {} for activity in acts}
for name in names:
    for activity in acts:
        # `csv_path` rather than `file`: the latter shadows a Python 2 builtin.
        csv_path = 'data/{}_{}_long.csv'.format(name, activity)
        data_dict[activity][name] = pd.read_csv(csv_path)

# NOTE(review): in the saved output of the feature-extraction cell, the first
# rows of the "walking" recordings for ryan and dennis are identical -- confirm
# that the two CSV files really are distinct recordings.

In [4]:
# Peek at the first five samples of one recording to sanity-check the columns.
data_dict["walking"]["meg"].head(5)

Unnamed: 0,x,y,z,time
0,0.493804,2.130241,8.994417,1430067490092
1,0.135272,1.395221,7.765593,1430067490265
2,-2.08535,2.178125,9.363723,1430067490445
3,-2.765303,1.742979,9.216479,1430067490625
4,-1.693299,-0.641047,10.671555,1430067490805


In [5]:
# Build feature_dict[activity][person] from the raw recordings.
# Each recording goes through `preprocess` and then through
# `extract_features_with_sliding_window` (both imported from pipeline.py);
# the inline output below shows the result is a list of 2-column arrays.
feature_dict = {}
for activity in data_dict:
    print("Activity: {}".format(activity))
    feature_dict[activity] = {}
    for person in data_dict[activity]:
        person_data = data_dict[activity][person]
        print("Person: {}".format(person))
        print("Person Data: \n {}".format(person_data.head()))

        # presumably the normalised acceleration magnitude -- TODO confirm
        # against pipeline.preprocess
        a_norm = preprocess(person_data)
        feature_dict[activity][person] = extract_features_with_sliding_window(
            a_norm, n_windows=10)
    # blank line between activities
    print("")

Activity: walking
Person: meg
Person Data: 
           x         y          z           time
0  0.493804  2.130241   8.994417  1430067490092
1  0.135272  1.395221   7.765593  1430067490265
2 -2.085350  2.178125   9.363723  1430067490445
3 -2.765303  1.742979   9.216479  1430067490625
4 -1.693299 -0.641047  10.671555  1430067490805
Person: dennis
Person Data: 
           x         y         z           time
0 -4.590283 -8.205527  1.548450  1430066134259
1 -4.669292 -8.217499  2.177526  1430066134439
2 -4.509479 -8.155848  2.326566  1430066134620
3 -4.317344 -8.166023  2.051831  1430066134800
4 -4.507683 -8.309077  2.343325  1430066134980
Person: ryan
Person Data: 
           x         y         z           time
0 -4.590283 -8.205527  1.548450  1430066134259
1 -4.669292 -8.217499  2.177526  1430066134439
2 -4.509479 -8.155848  2.326566  1430066134620
3 -4.317344 -8.166023  2.051831  1430066134800
4 -4.507683 -8.309077  2.343325  1430066134980

Activity: downstairs
Person: meg
Person Data

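Here are the features extracted for one recording: a list of matrices, one per sliding sequence of 10 windows. Each matrix has one row per window and one column per feature, and consecutive matrices overlap, shifted by one window.  Let's call one such matrix $X$.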

In [6]:
# Show only the first few window sequences; dumping the whole list floods the
# notebook with hundreds of near-identical arrays.
feature_dict['walking']['dennis'][:3]

[array([[  0.25      ,  10.42094763],
        [  0.33333333,  41.44359986],
        [  0.33333333,  24.71381943],
        [  0.16666667,  22.16337529],
        [  0.2       ,  34.92973755],
        [  0.2       ,  18.55468375],
        [  1.        ,  17.40342937],
        [  0.25      ,  23.87317402],
        [  0.25      ,  11.30793136],
        [  1.        ,   0.40634913]]), array([[  0.33333333,  41.44359986],
        [  0.33333333,  24.71381943],
        [  0.16666667,  22.16337529],
        [  0.2       ,  34.92973755],
        [  0.2       ,  18.55468375],
        [  1.        ,  17.40342937],
        [  0.25      ,  23.87317402],
        [  0.25      ,  11.30793136],
        [  1.        ,   0.40634913],
        [  1.        ,   0.07156826]]), array([[  0.33333333,  24.71381943],
        [  0.16666667,  22.16337529],
        [  0.2       ,  34.92973755],
        [  0.2       ,  18.55468375],
        [  1.        ,  17.40342937],
        [  0.25      ,  23.87317402],
        [ 

Check out the `learn` function in `pipeline.py`: the real magic takes place there.  After training on the features, a dictionary of hidden Markov models is returned (`hidden_markov_models`), one per activity.  Each of these four models can give us a (log-)likelihood that a new sequence belongs to the activity it models. The model with the maximum likelihood tells us which activity is happening.

In [7]:
# Train the activity models on the extracted features. `learn` is imported
# from pipeline.py; its result is indexed by activity name further down
# (hidden_markov_models[activity].score(...)).
hidden_markov_models = learn(feature_dict)

Plug different person and activity strings into `compare_to_model` to score how likely each sequence, for a particular activity and user, is to be represented by each activity model.  Right now, we are training and testing on the same dataset.

In [42]:
def compare_to_model(actual_person, actual_activity, feature_dict, models=None, activities=None):
    """Count how often the best-scoring model matches the true activity.

    Every observation sequence stored in
    ``feature_dict[actual_activity][actual_person]`` is scored by each
    activity model; the activity whose model returns the highest score is
    taken as the prediction for that sequence.

    Parameters
    ----------
    actual_person : key of the person inside ``feature_dict[actual_activity]``.
    actual_activity : the true activity label of the sequences.
    feature_dict : mapping ``activity -> person -> sequence of observations``.
    models : optional mapping ``activity -> object with a score(obs) method``.
        Defaults to the notebook-level ``hidden_markov_models``.
    activities : optional iterable of candidate activity labels.
        Defaults to the notebook-level ``acts``.

    Returns
    -------
    str
        A one-line summary with the number and fraction of sequences that
        were classified correctly. When there are no sequences, the fraction
        is reported as 0.0 instead of raising ZeroDivisionError.
    """
    if models is None:
        models = hidden_markov_models
    if activities is None:
        activities = acts

    window_sets = feature_dict[actual_activity][actual_person]
    max_win_set = len(window_sets)

    correct = 0
    for observations in window_sets:
        max_val = -np.inf  # lowest possible value, so any real score wins
        max_activity = "none"
        for activity in activities:
            predicted_score = models[activity].score(observations)
            # `>=` means that on an exact tie the later activity wins.
            if predicted_score >= max_val:
                max_val = predicted_score
                max_activity = activity
        if str(max_activity) == str(actual_activity):
            correct += 1

    # Explicit float conversion: correct even without `from __future__ import
    # division`; guard against a recording that produced no window sets.
    accuracy = float(correct) / max_win_set if max_win_set else 0.0

    return "For {} {}, the model predicted {} out of {} ({}) activities correctly!".format(actual_person,actual_activity,correct,max_win_set, accuracy)

In [45]:
# Evaluate on the training recordings: one summary line per person/activity
# pair, people in the outer position and activities in the inner one.
for person, activity in ((p, a) for p in names for a in acts):
    print(compare_to_model(person, activity, feature_dict))

For meg walking, the model predicted 47 out of 274 (0.171532846715) activities correctly!
For meg jogging, the model predicted 289 out of 324 (0.891975308642) activities correctly!
For meg upstairs, the model predicted 98 out of 103 (0.95145631068) activities correctly!
For meg downstairs, the model predicted 115 out of 124 (0.927419354839) activities correctly!
For ryan walking, the model predicted 142 out of 177 (0.802259887006) activities correctly!
For ryan jogging, the model predicted 10 out of 50 (0.2) activities correctly!
For ryan upstairs, the model predicted 75 out of 75 (1.0) activities correctly!
For ryan downstairs, the model predicted 6 out of 8 (0.75) activities correctly!
For dennis walking, the model predicted 162 out of 204 (0.794117647059) activities correctly!
For dennis jogging, the model predicted 7 out of 14 (0.5) activities correctly!
For dennis upstairs, the model predicted 20 out of 223 (0.0896860986547) activities correctly!
For dennis downstairs, the model p

Now that we've trained the model and verified that it works (to some extent) on the same dataset, we'll plug in new, unseen data.

In [51]:
# Load recordings from 'someone' and 'jen', who were not part of training.
# Same file layout as the training data: data/<name>_<activity>_long.csv.
unseen_names = ['someone','jen']

# unseen_data_dict[activity][name] -> DataFrame read from that person's CSV.
# The keys are derived from `acts` (defined with the training data) so the
# activity labels stay in sync.
unseen_data_dict = {activity: {} for activity in acts}
for name in unseen_names:
    for activity in acts:
        # `unseen_csv_path` rather than `file`: the latter shadows a Python 2 builtin.
        unseen_csv_path = 'data/{}_{}_long.csv'.format(name, activity)
        unseen_data_dict[activity][name] = pd.read_csv(unseen_csv_path)

In [52]:
# Build unseen_feature_dict[activity][person] for the held-out recordings by
# running each one through `preprocess` and then
# `extract_features_with_sliding_window` (both from pipeline.py).
# Progress printing is intentionally left out of this cell.
unseen_feature_dict = {}
for activity in unseen_data_dict:
    unseen_feature_dict[activity] = {}
    for person in unseen_data_dict[activity]:
        person_data = unseen_data_dict[activity][person]

        a_norm = preprocess(person_data)
        unseen_feature_dict[activity][person] = extract_features_with_sliding_window(
            a_norm, n_windows=10)

Note - Hiding print statements for now to make it easier to reference code.

Now, plug in different person and activity strings to score how likely each sequence, for a particular unseen activity and user, is to be represented by each activity model.

In [55]:
# Evaluate on the held-out recordings: one summary line per person/activity
# pair, using the features extracted from the unseen data.
for person, activity in ((p, a) for p in unseen_names for a in acts):
    print(compare_to_model(person, activity, unseen_feature_dict))

For someone walking, the model predicted 96 out of 96 (1.0) activities correctly!
For someone jogging, the model predicted 0 out of 28 (0.0) activities correctly!
For someone upstairs, the model predicted 0 out of 26 (0.0) activities correctly!
For someone downstairs, the model predicted 0 out of 28 (0.0) activities correctly!
For jen walking, the model predicted 32 out of 32 (1.0) activities correctly!
For jen jogging, the model predicted 0 out of 25 (0.0) activities correctly!
For jen upstairs, the model predicted 0 out of 66 (0.0) activities correctly!
For jen downstairs, the model predicted 0 out of 4 (0.0) activities correctly!


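This inaccuracy is likely due to a number of factors, such as limited datasets, different phones, and possibly different phone positionings (though using the magnitude should have taken that factor out).

Note the pattern in the results: every unseen walking sequence is classified correctly, while every other activity is classified incorrectly. This suggests the models are not separating the activities for new users, rather than being slightly off.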