# Preparing data for input into the hmm["activity"] models

In [1]:
from __future__ import division

%matplotlib inline
import pandas as pd
import thinkdsp
import thinkplot
import numpy as np

from magnitude import magnitude
from pipeline import preprocess, extract_features_with_sliding_window, learn

In [2]:
# Activity labels; per-activity results in this notebook follow this order.
acts = [
    'walking',
    'jogging',
    'upstairs',
    'downstairs',
]

In [3]:
def import_data(names, activities=None, data_dir='data'):
    """
    Load the recording of every person for every activity.

    One CSV file is read per (person, activity) pair from
    ``<data_dir>/<name>_<activity>_long.csv``.

    Arguments
    ---------
    names: array of strings
        people whose recordings should be loaded

    activities: array of strings, defaults to the notebook-level ``acts``
        activities to load for each person

    data_dir: string, defaults 'data'
        directory that holds the CSV files

    Returns
    -------
    data_dict: a dictionary
        nested dictionary, data_dict[activity][name], holding the pandas
        DataFrame read from that person's file for that activity
        (x, y, z accelerometer components as well as time)
    """
    if activities is None:
        # Fall back to the activity list defined earlier in the notebook.
        activities = acts

    # Derive the outer keys from the activity list itself so the keys and the
    # files that get loaded can never drift apart.
    data_dict = {activity: {} for activity in activities}
    for name in names:
        for activity in activities:
            path = '{}/{}_{}_long.csv'.format(data_dir, name, activity)
            data_dict[activity][name] = pd.read_csv(path)
    return data_dict

In [4]:
# People whose recordings are loaded first and used to train the models.
names = [
    'ryan',
    'dennis',
]

In [5]:
# Load one DataFrame per (activity, person) pair for the people in `names`.
data_dict = import_data(names)

In [6]:
# Peek at the first rows of one recording to check which columns were loaded.
data_dict["walking"]["ryan"].head()

Unnamed: 0,x,y,z,time
0,-4.590283,-8.205527,1.54845,1430066134259
1,-4.669292,-8.217499,2.177526,1430066134439
2,-4.509479,-8.155848,2.326566,1430066134620
3,-4.317344,-8.166023,2.051831,1430066134800
4,-4.507683,-8.309077,2.343325,1430066134980


In [7]:
def get_features(data_dict, n_windows=10, verbose=False):
    """
    Extract sliding-window features for every recording in data_dict.

    Each recording is passed through ``preprocess`` and then through
    ``extract_features_with_sliding_window`` (both imported from pipeline).

    Arguments
    ---------
    data_dict: a dictionary
        nested dictionary, data_dict[activity][person], of recordings
        (x, y, z accelerometer components as well as time)

    n_windows: a integer, defaults 10
        number of windows to include in the queue

    verbose: boolean, default False
        whether to do print statements while running the function

    Returns
    -------
    feature_dict: a dictionary
        nested dictionary with the same keys as data_dict;
        feature_dict[activity][person] holds whatever
        extract_features_with_sliding_window returns for that recording
    """
    feature_dict = {}
    # .items() and single-argument print(...) behave identically on
    # Python 2 and Python 3, unlike .iteritems() and the print statement.
    for activity, activity_data_dict in data_dict.items():
        if verbose:
            print("Activity: {}".format(activity))
        feature_dict[activity] = {}
        for person, person_data in activity_data_dict.items():
            if verbose:
                print("Person: {}".format(person))
                print("Person Data: \n {}".format(person_data.head()))

            a_norm = preprocess(person_data)
            obs = extract_features_with_sliding_window(a_norm, n_windows=n_windows)
            feature_dict[activity][person] = obs
        if verbose:
            # Blank separator line between activities.
            print("")
    return feature_dict

In [8]:
# Features for the people in `names`, using the default n_windows=10.
feature_dict = get_features(data_dict)

Each element of the list below is a matrix of features extracted for one sequence of windows (one row per window).  Let's call such a matrix $X$.

In [9]:
# NOTE(review): this displays every feature array for one recording; consider
# slicing (e.g. [:3]) to keep the rendered output short.
feature_dict['walking']['dennis']

[array([[  0.25      ,  10.42094763],
        [  0.33333333,  41.44359986],
        [  0.33333333,  24.71381943],
        [  0.16666667,  22.16337529],
        [  0.2       ,  34.92973755],
        [  0.2       ,  18.55468375],
        [  1.        ,  17.40342937],
        [  0.25      ,  23.87317402],
        [  0.25      ,  11.30793136],
        [  1.        ,   0.40634913]]), array([[  0.33333333,  41.44359986],
        [  0.33333333,  24.71381943],
        [  0.16666667,  22.16337529],
        [  0.2       ,  34.92973755],
        [  0.2       ,  18.55468375],
        [  1.        ,  17.40342937],
        [  0.25      ,  23.87317402],
        [  0.25      ,  11.30793136],
        [  1.        ,   0.40634913],
        [  1.        ,   0.07156826]]), array([[  0.33333333,  24.71381943],
        [  0.16666667,  22.16337529],
        [  0.2       ,  34.92973755],
        [  0.2       ,  18.55468375],
        [  1.        ,  17.40342937],
        [  0.25      ,  23.87317402],
        [ 

Check out the `learn` function in `pipeline.py`: the real magic takes place there.  After training on the features, a dictionary of hidden Markov models is returned (`hidden_markov_models`).  These four models can each give us a (log-)likelihood that a new sequence belongs to the activity they model. The model with the maximum likelihood tells us which activity is happening.

In [10]:
# `learn` is imported from pipeline.py; the models are trained on the features
# of the people in `names`.
hidden_markov_models = learn(feature_dict)

Plug in different person and activity strings to pick a sequence out of the feature dict and score how likely it is that the sequence, for that particular activity and user, is represented by each activity model.  Right now, we are training and testing on the same dataset.

In [11]:
def compare_to_model(actual_person, actual_activity, feature_dict, metric="likelihood",
                     models=None, activities=None):
    """
    Score one person's recording of one activity against every activity model.

    Arguments
    --------
    actual_person: string
        key of the person inside feature_dict[actual_activity]
    actual_activity: string
        key of the activity inside feature_dict (the true label)
    feature_dict: a dictionary
        feature_dict[activity][person] is a sequence of window sets; each
        window set is passed to every model's ``score`` method
    metric: string
        likelihood, returns an array
        accuracy, returns a string for the number of correct predictions
    models: a dictionary, defaults to the notebook-level hidden_markov_models
        maps each activity name to a trained model exposing ``score``
    activities: array of strings, defaults to the notebook-level acts
        activity names, in the order used for the returned likelihoods

    Returns
    -------
    likelihoods: array-like, shape (n_activities)
        when metric is "likelihood": the score returned by each activity
        model, averaged over all window sets (all NaN when there are none)
    summary: string
        when metric is "accuracy": a sentence reporting for how many window
        sets the best-scoring model was the one for actual_activity

    Raises
    ------
    ValueError
        if metric is neither "likelihood" nor "accuracy"
    """
    if metric not in ("likelihood", "accuracy"):
        # Previously an unknown metric silently returned None.
        raise ValueError(
            'metric must be "likelihood" or "accuracy", got {!r}'.format(metric))
    if models is None:
        models = hidden_markov_models
    if activities is None:
        activities = acts

    window_sets = feature_dict[actual_activity][actual_person]
    max_win_set = len(window_sets)

    correct = 0
    # One accumulator per activity model (not a hard-coded 4).
    likelihoods = np.zeros(len(activities))
    for window_set in window_sets:
        max_val = -np.inf  # lowest possible value, so any real score beats it
        max_activity = "none"
        for j, activity in enumerate(activities):
            predicted_score = models[activity].score(window_set)
            likelihoods[j] += predicted_score

            # ">=" means that on a tie the activity listed later wins.
            if predicted_score >= max_val:
                max_val = predicted_score
                max_activity = activity

        # A window set counts as correct when the best-scoring model is the
        # one for the true activity.
        if str(max_activity) == str(actual_activity):
            correct += 1

    if metric == "likelihood":
        if max_win_set == 0:
            # The mean over zero window sets is undefined; say so explicitly
            # instead of relying on a 0/0 division warning.
            return np.full(len(activities), np.nan)
        return likelihoods / max_win_set

    # float() keeps this a true division even without
    # `from __future__ import division`; guard against an empty recording.
    ratio = float(correct) / max_win_set if max_win_set else 0.0
    return "For {} {}, the model predicted {} out of {} ({}) activities correctly!".format(
        actual_person, actual_activity, correct, max_win_set, ratio)

In [18]:
# Column order of the likelihood arrays printed below.
print(acts)
for person in names:
    print(person)
    for activity in acts:
        # Default metric: average score under each activity model.
        # Pass metric="accuracy" to get the hit-count summary instead.
        print(compare_to_model(person, activity, feature_dict))

['walking', 'jogging', 'upstairs', 'downstairs']
ryan
[ 73.94455904 -26.78372891  39.57825271  67.48306013]
[-41.65809903 -31.51399358 -28.12406508 -56.38646755]
[ -50.47494105  -46.41274532  -15.31172007 -132.10504711]
[-30.84849724 -35.26170539 -39.32240428 -29.04296031]
dennis
[ 64.45221521 -27.56787649  33.63560617  57.90572187]
[-43.64748998 -29.53933415 -43.61213009 -82.93844147]
[ 57.49604059 -28.54718012  30.06935155  55.19756598]
[ 53.24205512 -28.69084009  26.37967946  51.96033729]


Now that we've trained the model and verified that it works (to some extent) on the same dataset, we'll plug in new, unseen data.

In [13]:
# People whose recordings are loaded separately and only scored against the models.
unseen_names = [
    "meg",
    "jen",
]

In [14]:
# Same loader as before, now for the people in `unseen_names`.
unseen_data_dict = import_data(unseen_names)

In [15]:
# Same feature extraction (default n_windows=10), applied to the unseen recordings.
unseen_feature_dict = get_features(unseen_data_dict)

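Now plug different person and activity strings into the unseen feature dict to score how likely it is that the sequence, for a particular unseen activity and user, is represented by each activity model.  Here we report accuracy: the number of window sequences for which the model of the true activity scored highest.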

In [19]:
print(acts)
for person in unseen_names:
    print(person)
    for activity in acts:
        # Accuracy summary: for how many window sets the best-scoring model
        # was the one for the true activity.
        print(compare_to_model(person, activity, unseen_feature_dict, metric="accuracy"))

['walking', 'jogging', 'upstairs', 'downstairs']
meg
For meg walking, the model predicted 0 out of 274 (0.0) activities correctly!
For meg jogging, the model predicted 6 out of 324 (0.0185185185185) activities correctly!
For meg upstairs, the model predicted 103 out of 103 (1.0) activities correctly!
For meg downstairs, the model predicted 0 out of 124 (0.0) activities correctly!
jen
For jen walking, the model predicted 0 out of 32 (0.0) activities correctly!
For jen jogging, the model predicted 0 out of 25 (0.0) activities correctly!
For jen upstairs, the model predicted 0 out of 66 (0.0) activities correctly!
For jen downstairs, the model predicted 4 out of 4 (1.0) activities correctly!
