In [6]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_style("white")

import time
import timeit

import scipy.stats 
import pandas as pd
import pymc as pm

import re
import numpy as np

import string
import itertools

import time
import json

## Exactly the same code from NB

* define house numbers and x types
* read data
* define functions to evaluate prediction

In [2]:
# House Number and X Types
houses = ['A', 'B', 'C']
xtype_dict = {'r':'raw', 'c':'change','l':'last'}

def read_data(house, xtype):
    assert house in houses
    assert xtype in xtype_dict
    
    act_df = pd.read_csv("data/house{}_act.csv".format(house))
    sensor_df = pd.read_csv("data/house{}_sensor.csv".format(house))
    X = np.load("data/X_{}_house{}.npy".format(xtype_dict[xtype], house))
    Y = np.load("data/Y_house{}.npy".format(house))
    miu = np.load("data/mu{}_{}.npy".format(house, xtype))
    prior = np.load('data/Prior_{}.npy'.format(house))
    return act_df, sensor_df, X, Y, miu, prior

# Prediction Evaluation Functions
def precision(pred_label, Y):
    all_label = list(set(Y))
    N = len(all_label)
    res = 0
    for y in list(set(Y)):
        TP = np.sum(pred_label[Y==y]==y)
        TI = np.sum(pred_label==y)
        if TI != 0:
            res += (float(TP)/TI)
    return float(res)/N
def recall(pred_label, Y):
    all_label = list(set(Y))
    N = len(all_label)
    res = 0
    for y in list(set(Y)):
        TP = np.sum(pred_label[Y==y]==y)
        TT = np.sum(Y==y)
        if TT != 0:
            res += float(TP)/TT
    return float(res)/N
def f_score(pred_label, Y):
    p = precision(pred_label, Y)
    r = recall(pred_label, Y)
    return 2*p*r/(p+r)
def accuracy(pred_label, Y):
    res = 0
    all_label = list(set(Y))
    for y in list(set(Y)):
        TP = np.sum(pred_label[Y==y]==y)
        res += TP
    return float(res)/len(Y)

def evaluation(house,res_label, Y):
    print 'Precision of house {} is {}'.format(house,precision(res_label, Y))
    print 'recall of house {} is {}'.format(house,recall(res_label, Y))
    print 'F score of house {} is {}'.format(house,f_score(res_label, Y))
    print 'Accuracy of house {} is {}'.format(house,accuracy(res_label, Y))
    return precision(res_label, Y), recall(res_label, Y), f_score(res_label, Y), accuracy(res_label, Y)

## Experiment with HMM

For debugging purposes. Function written as the next step.

In [3]:
# load data
act_df, sensor_df, X, Y, miu, prior = read_data("A", "c")

In [4]:
print X.shape
print Y.shape
print miu.shape

(40006, 14)
(40006,)
(17, 14)


In [5]:
order = 2
A = miu.shape[0]
S = miu.shape[1]
K = A ** order

activity_single = [int(x) for x in list(set(act_df.label))] + [0]
activity_higher = [prod for prod in itertools.product(*np.tile(activity_single, (order,1)))]

index_to_single = dict(zip(range(A), activity_single)) #ind to activity_single dict
single_to_index = dict(zip(activity_single, range(A))) #activity_single to ind dict
index_to_higher = dict(zip(range(K), activity_higher)) #ind to activity_higher dict
higher_to_index = dict(zip(activity_higher, range(K))) #activity_higher to ind dict

In [6]:
X_train = X
Y_train = Y
X_test = X
Y_test = Y
N_train = len(X)
N_test = len(X)

In [7]:
# construct transition and emission matrix
transition_count = np.zeros((K, K)) + 0.00001
emission_count = np.zeros((K, S, 2)) + 0.00001

for i in range(order-1, N_train-1):
    yi = higher_to_index[tuple(Y_train[i-order+1:i+1])]
    ynext = higher_to_index[tuple(Y_train[i-order+2:i+2])]
    transition_count[yi, ynext] += 1
    
    xi = X[i]
    
    for j in range(S): 
        emission_count[yi, j, int(xi[j])] += 1

higher_last = higher_to_index[tuple(Y_train[-order:])]
for j in range(S): 
    emission_count[higher_last, j, int(X[-1][j])] += 1

In [10]:
# transition_count = np.zeros((K,K)) + 0.00001
# emission_count2 = np.zeros((K, S)) + 0.00001

# for i in range(order-1, N_train-1):
#     yi = higher_to_index[tuple(Y_train[i-order+1:i+1])]
#     ynext = higher_to_index[tuple(Y_train[i-order+2:i+2])]
#     transition_count[yi, ynext] += 1
#     
#     
#     emission_count2[yi, :] += X_train[i]
# 
# higher_last = higher_to_index[tuple(Y_train[-order:])]
# emission_count2[higher_last, :] += X_train[-1]

In [8]:
transition = transition_count/np.sum(transition_count, axis=1).reshape(K,1)
log_transition = np.log(np.nan_to_num(transition))

emission = emission_count/np.sum(emission_count, axis=2).reshape(K,S,1)
log_emission = np.log(np.nan_to_num(emission))
log_emission2 = log_emission[:,:,0].reshape(K,S)

In [9]:
# initial probability
initial = np.zeros(K)
initial[higher_to_index[tuple(Y_test[:order])]] = 1
log_initial = np.log(initial)

In [10]:
# initialize T1 and T2
T1 = np.zeros((K,N_test-order+1))
T2 = np.zeros((K,N_test-order+1))

calc_emission = lambda p, x:  np.power(p,(1-x))*np.power((np.log(1-np.exp(p))),x)
T1[:,0] = log_initial + np.sum(calc_emission(log_emission2, X_train[order-1]), axis=1)

In [11]:
%%time
# iterate through time to update T1 and T2
for i in range(1, N_test-order+1):
    obj = T1[:, i-1].reshape(K,1) + log_transition + np.sum(calc_emission(log_emission2, X_train[i+order-1]), axis=1)
    T1[:,i] = np.max(obj, axis=0)
    T2[:,i] = np.argmax(obj, axis=0)

CPU times: user 27.9 s, sys: 548 ms, total: 28.5 s
Wall time: 28.7 s


In [32]:
%%time
# back-fill the MLE state
Z = np.zeros(N_test-order+1)
Z[-1] = np.argmax(T1[:,-1])

for i in range(-1, -(N_test-order), -1):
    Z[i-1] = T2[int(Z[i]), i]

Y_HMM = np.array([index_to_higher[i][0] for i in Z])
Y_HMM[0] = Y_test[:order][0]

CPU times: user 54.9 ms, sys: 13.4 ms, total: 68.3 ms
Wall time: 60.9 ms


In [33]:
print Y_HMM[:20]
print Y_train[:20]

[ 6  6  0 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10]
[  6.   6.   0.  10.  10.  10.  10.  10.  10.  10.  10.  10.  10.  10.  10.
  10.  10.  10.  10.  10.]


In [35]:
evaluation("A", Y_HMM, Y_train[:N_test-order+1])

Precision of house A is 0.805394415528
recall of house A is 0.769201489369
F score of house A is 0.786881995599
Accuracy of house A is 0.850768653918


In [13]:
count = 0
for i in range(N_train-1):
    if Y_test[i] == Y_HMM[i]:
        count += 1
print count/float(N_train)

0.850747387892


## Nice and Clean function

In [56]:
# define function to fit HMM model

def HMM(house, X_train, Y_train, X_test, Y_test, order):

    # Initialization
    activity_single = [int(x) for x in list(set(Y_train))] + [0]
    activity_higher = [prod for prod in itertools.product(*np.tile(activity_single, (order,1)))]

    A = len(activity_single)
    K = len(activity_higher)
    S = X_train.shape[1]
    N_train = len(X_train)
    N_test = len(Y_test)
    
    index_to_single = dict(zip(range(A), activity_single)) #ind to activity_single dict
    single_to_index = dict(zip(activity_single, range(A))) #activity_single to ind dict
    index_to_higher = dict(zip(range(K), activity_higher)) #ind to activity_higher dict
    higher_to_index = dict(zip(activity_higher, range(K))) #activity_higher to ind dict

    
    # Training transition and emission
    transition_count = np.zeros((K, K)) + 0.00001
    emission_count = np.zeros((K, S, 2)) + 0.00001

    for i in range(order-1, N_train-1):
        yi = higher_to_index[tuple(Y_train[i-order+1:i+1])]
        ynext = higher_to_index[tuple(Y_train[i-order+2:i+2])]
        transition_count[yi, ynext] += 1

        xi = X[i]
        for j in range(S): 
            emission_count[yi, j, int(xi[j])] += 1

    higher_last = higher_to_index[tuple(Y_train[-order:])]
    for j in range(S): 
        emission_count[higher_last, j, int(X[-1][j])] += 1
        
    transition = transition_count/np.sum(transition_count, axis=1).reshape(K,1)
    log_transition = np.log(np.nan_to_num(transition))

    emission = emission_count/np.sum(emission_count, axis=2).reshape(K,S,1)
    log_emission = np.log(np.nan_to_num(emission))[:,:,0].reshape(K,S)

    
    # Viterbi
    ### Initial probability
    initial = np.zeros(K)
    initial[higher_to_index[tuple(Y_test[:order])]] = 1
    log_initial = np.log(initial)
    
    ### Initialize T1 and T2
    T1 = np.zeros((K,N_test-order+1))
    T2 = np.zeros((K,N_test-order+1))

    calc_emission = lambda p, x:  np.power(p,(1-x))*np.power((np.log(1-np.exp(p))),x)
    T1[:,0] = log_initial + np.sum(calc_emission(log_emission, X_train[order-1]), axis=1)

    ### Update T1 and T2
    for i in range(1, N_test-order+1):
        obj = T1[:, i-1].reshape(K,1) + log_transition + np.sum(calc_emission(log_emission, X_train[i+order-1]), axis=1)
        T1[:,i] = np.max(obj, axis=0)
        T2[:,i] = np.argmax(obj, axis=0)
        
    ### Back-fill the most likely state
    Z = np.zeros(N_test-order+1)
    Z[-1] = np.argmax(T1[:,-1])

    for i in range(-1, -(N_test-order), -1):
        Z[i-1] = T2[int(Z[i]), i]

    Y_HMM = np.array([index_to_higher[i][0] for i in Z])
    Y_HMM[0] = Y_test[:order][0]
    
    
    # Output results
    result = evaluation(house, Y_HMM, Y_train[:N_test-order+1])
    result_dict = dict(zip(["precision", "recall", "f", "accuracy"], result))
    return result_dict

In [57]:
act_df, sensor_df, X, Y, miu, prior = read_data("B", "r")
X_train = X
Y_train = Y
X_test = X
Y_test = Y
order = 1
test_result = HMM("B", X_train, Y_train, X_test, Y_test, order)

Precision of house B is 0.302677639835
recall of house B is 0.302677639835
F score of house B is 0.410162668687
Accuracy of house B is 0.636426735219


In [None]:
test_result

# Wrapping all houses and feature representations

In [64]:
# House Number and X Types
houses = ['A', 'B', 'C']
xtype_dict = {'r':'raw', 'c':'change','l':'last'}
orders = [1, 2]


all_result = []
for order in orders:
    print "Order: ", order
    by_xtype = []
    for xtype in xtype_dict.keys():
        print xtype_dict[xtype]
        by_house = []
        for house in houses:
            act_df, sensor_df, X, Y, miu, prior = read_data(house, xtype)
            X_train = X
            Y_train = Y
            X_test = X
            Y_test = Y
            result = HMM(house, X_train, Y_train, X_test, Y_test, order)
            by_house.append(result)
        dict_by_house = dict(zip(houses, by_house))
        by_xtype.append(dict_by_house)
    dict_by_xtype = dict(zip(xtype_dict.keys(), by_xtype))
    all_result.append(dict_by_xtype)
result_by_order = dict(zip(orders, all_result))

Order:  1
change
Precision of house A is 0.572370613118
recall of house A is 0.572370613118
F score of house A is 0.578691859251
Accuracy of house A is 0.722391641254
Precision of house B is 0.414139176825
recall of house B is 0.414139176825
F score of house B is 0.499433964237
Accuracy of house B is 0.558174807198
Precision of house C is 0.476879915639
recall of house C is 0.476879915639
F score of house C is 0.492969831069
Accuracy of house C is 0.787941709453
raw
Precision of house A is 0.401679493576
recall of house A is 0.401679493576
F score of house A is 0.461803506496
Accuracy of house A is 0.557441383792
Precision of house B is 0.302677639835
recall of house B is 0.302677639835
F score of house B is 0.410162668687
Accuracy of house B is 0.636426735219
Precision of house C is 0.186197160245
recall of house C is 0.186197160245
F score of house C is 0.250452247337
Accuracy of house C is 0.312594382362
last
Precision of house A is 0.21753304903
recall of house A is 0.21753304903
F

In [69]:
with open('data/result_hmm.json', 'w') as f:
     json.dump(result_by_order, f)

In [3]:
act_df = pd.read_csv("data/houseC_act.csv")

In [10]:
act_df[["start_time", "end_time", "label", "meaning"]].head(10)

Unnamed: 0,start_time,end_time,label,meaning
0,2008-11-19 17:49:00,2008-11-19 17:49:59,1,leave house
1,2008-11-19 17:50:40,2008-11-19 17:51:45,4,use toilet downstairs
2,2008-11-19 17:59:25,2008-11-19 18:00:00,17,get drink
3,2008-11-19 18:00:50,2008-11-19 20:24:59,28,relax
4,2008-11-19 19:14:50,2008-11-19 19:15:19,17,get drink
5,2008-11-19 19:39:00,2008-11-19 19:39:59,16,get snack
6,2008-11-19 20:30:29,2008-11-19 20:31:10,7,use toilet upstairs
7,2008-11-19 20:31:30,2008-11-19 20:36:20,6,brush teeth
8,2008-11-19 20:36:59,2008-11-20 04:31:40,10,go to bed
9,2008-11-20 03:48:39,2008-11-20 03:49:40,7,use toilet upstairs


In [11]:
sensor_df = pd.read_csv("data/houseC_sensor.csv")

In [13]:
sensor_df[["start_time", "end_time", "on", "label", "meaning"]].head(10)

Unnamed: 0,start_time,end_time,on,label,meaning
0,2008-11-19 17:47:46,2008-11-19 17:49:17,1,28,"voordeur, reed"
1,2008-11-19 17:49:20,2008-11-19 17:49:22,1,28,"voordeur, reed"
2,2008-11-19 17:49:24,2008-11-19 17:50:14,1,28,"voordeur, reed"
3,2008-11-19 17:50:18,2008-11-20 06:14:11,1,28,"voordeur, reed"
4,2008-11-19 17:51:02,2008-11-19 17:51:04,1,25,deur toilet beneden
5,2008-11-19 17:51:04,2008-11-19 17:51:45,1,25,deur toilet beneden
6,2008-11-19 17:51:49,2008-11-20 06:03:26,1,25,deur toilet beneden
7,2008-11-19 17:52:17,2008-11-20 06:04:01,1,10,toilet flush beneden. flush
8,2008-11-19 17:59:12,2008-11-19 17:59:40,1,23,"kastje borden/kruiden,reed"
9,2008-11-19 17:59:53,2008-11-19 18:00:04,1,30,"koelkast, reed"
