#Preparing data for input into the hmm["activity"] models

In [2]:
from __future__ import division

%matplotlib inline
import pandas as pd
import thinkdsp
import thinkplot
import numpy as np

from magnitude import magnitude
from pipeline import preprocess, extract_features

In [3]:
#Ryan's fancy way of inputting data in an easier way
data_dict = {'walking':{},'jogging':{},'upstairs':{},'downstairs':{}}
names = ['meg','ryan','dennis']
acts = ['walking', 'jogging', 'upstairs', 'downstairs']
for name in names:
    data_file_names = ['data/{}_{}_long.csv'.format(name, activity) for activity in acts]
    for i,file in enumerate(data_file_names):
        df = pd.read_csv(file)
        data_dict[acts[i]][name] = df

In [4]:
#dictionary of dictionaries where the activity is the key and the inner dictionary is each person
print data_dict['walking']['meg'].head()

          x         y          z           time
0  0.493804  2.130241   8.994417  1430067490092
1  0.135272  1.395221   7.765593  1430067490265
2 -2.085350  2.178125   9.363723  1430067490445
3 -2.765303  1.742979   9.216479  1430067490625
4 -1.693299 -0.641047  10.671555  1430067490805


In [12]:
a_norm

array([ 9.62647508,  9.62804358,  9.62961206, ...,  9.75674002,
        9.79842415,  9.84010843])

In [9]:
feature_dict = {}
for activity, activity_data_dict in data_dict.iteritems():
    print "Activity: {}".format(activity)
    feature_dict[activity] = {}
    for person, person_data in activity_data_dict.iteritems():
        print "Person: {}".format(person)
        print "Person Data: \n {}".format(person_data.head())
        
        a_norm = preprocess(person_data)
        print "Acceleration Magnitude Vector Shape: {}".format(a_norm.shape)
        
        obs = extract_features(a_norm, n_windows=10, dom_freq_method="spectrum")
        print "Extracted Feature Sequence Shape: {}".format(obs.shape)
        
        feature_dict[activity][person] = obs
    print

Activity: walking
Person: meg
Person Data: 
           x         y          z           time
0  0.493804  2.130241   8.994417  1430067490092
1  0.135272  1.395221   7.765593  1430067490265
2 -2.085350  2.178125   9.363723  1430067490445
3 -2.765303  1.742979   9.216479  1430067490625
4 -1.693299 -0.641047  10.671555  1430067490805
Acceleration Magnitude Vector Shape: (4265L,)
Extracted Feature Sequence Shape: (10L, 2L)
Person: dennis
Person Data: 
           x         y         z           time
0 -4.590283 -8.205527  1.548450  1430066134259
1 -4.669292 -8.217499  2.177526  1430066134439
2 -4.509479 -8.155848  2.326566  1430066134620
3 -4.317344 -8.166023  2.051831  1430066134800
4 -4.507683 -8.309077  2.343325  1430066134980
Acceleration Magnitude Vector Shape: (3215L,)
Extracted Feature Sequence Shape: (10L, 2L)
Person: ryan
Person Data: 
           x         y         z           time
0 -4.590283 -8.205527  1.548450  1430066134259
1 -4.669292 -8.217499  2.177526  1430066134439
2 -4.5

In [13]:
feature_dict['jogging']['dennis']

array([[  1.66666667e-01,   8.99725954e+00],
       [  3.33333333e-02,   2.63595196e+01],
       [  4.66666667e-01,   6.12780879e+01],
       [  2.00000000e-01,   5.79367855e+01],
       [  2.00000000e-01,   7.35252300e+01],
       [  2.00000000e-01,   7.08618795e+01],
       [  4.66666667e-01,   5.58378545e+01],
       [  5.00000000e-01,   4.69881384e+01],
       [  5.00000000e-01,   5.59797097e+01],
       [  2.33333333e-01,   6.81809998e+01]])

In [8]:
from hmmlearn.hmm import GaussianHMM

# dict of gaussian hidden markov models for each activity
ghmm = {}

# dict of train data for each activity
X_train = {}

# dict of test data for each activity
X_test = {}

# number of hidden states
n_components = 3

# number of samples in the training set
train_size = 60

# Train a separate GHMM for each activity
for activity, amps in amp_dist.iteritems():
    # Create GHMM
    ghmm[activity] = GaussianHMM(n_components, covariance_type="diag", n_iter=1000)
    
    # Split into Train and Test Data (No Random Shuffling Now)
    # If we wanted to add more features:
    # X_train[activity] = np.column_stack([amp_dist[activity][:train_size], next_feature[activity][:train_size])
    
    features = (amp_dist, domfreq_dist2)

    X_train[activity] = np.column_stack([feature[activity][:train_size] for feature in features])
    X_test[activity] = np.column_stack([feature[activity][train_size:] for feature in features])
    
    # Fit on Training Data
    # Confused about .fit([X])
    ghmm[activity].fit([X_train[activity]])

# For each Test Set
for activity, X in X_test.iteritems():
    print "actual activity: " + activity
    
    # logprobs for each activity_model
    # the log-likelihood that the given sequence of observations looks like things this model could produce
    logprobs = {}

    # Try Out the 4 models
    for model_activity, model in ghmm.iteritems():
        # model.score returns log likelihood of the observation
        logprobs[model_activity] = model.score(X)
    

    # Which ever has the highest probability will be the model
    max_idx = np.argmax(np.array([logprobs[activity] for activity in activities]))
    print max_idx
    pred_activity = activities[max_idx]
    print "predicted activity: " + pred_activity
    
    print "logprobs: "
    print logprobs
    
    print "\n"


NameError: name 'amp_dist' is not defined