In [197]:
# Read each folder in the ADL dataset and create a dictionary with keys = folder/activity name and values = nested list of
# flattened file values. Prints the number of files in each folder after reading the data. This snippet assumes that the current
# directory is ADL_Dataset/HMP_Dataset

import os
import numpy as np
#os.chdir('ADL_Dataset/HMP_Dataset')
# Collect the activity folders from the current directory. An entry is kept only
# if it is a real directory, is not hidden (e.g. a notebook checkpoint folder),
# and does not end in one of the excluded suffixes. Sorting makes the load order
# deterministic, since os.listdir() returns entries in arbitrary order.
excluded_suffixes = ('_MODEL','.m','.txt')
folders = sorted(
    x for x in os.listdir()
    if os.path.isdir(x)
    and not x.startswith('.')
    and not x.endswith(excluded_suffixes)
)

# all_dict maps folder (activity) name -> list of signals, where each signal is
# the flattened content of one file as a plain Python list.
all_dict = {}
for foldername in folders:
    for filename in sorted(os.listdir(foldername)):
        filepath = os.path.join(foldername, filename)
        # Skip sub-directories and hidden files; np.loadtxt cannot parse them.
        if filename.startswith('.') or not os.path.isfile(filepath):
            continue
        with open(filepath,'r') as f:
            # setdefault creates the per-folder list on first use, so a folder
            # without any readable file never gets a key.
            all_dict.setdefault(foldername, []).append(np.loadtxt(f).flatten().tolist())

# Report how many files were read for each folder.
for key,val in all_dict.items():
    print(key, len(val))
print('done')

Brush_teeth 12
Climb_stairs 102
Comb_hair 31
Descend_stairs 42
Drink_glass 100
Eat_meat 5
Eat_soup 3
Getup_bed 101
Liedown_bed 28
Pour_water 100
Sitdown_chair 100
Standup_chair 102
Use_telephone 13
Walk 100
done


In [241]:
# Split the all_dict data into two dictionaries - train and test. The split is 80:20 (train:test), made separately for each category.

import random
import math
# Seed for the train/test split. A private, seeded generator makes the split
# repeatable for a given all_dict; change the seed to draw a different split.
SPLIT_SEED = 0
split_rng = random.Random(SPLIT_SEED)

# test/train map each category name to a list of signals.
test,train = {},{}
for key,value in all_dict.items():
    # Shuffle a copy so the lists stored in all_dict are not reordered in place;
    # re-running this cell then always produces the same split.
    shuffled = list(value)
    split_rng.shuffle(shuffled)
    # 20% of each category (rounded up) goes to test, the remainder to train.
    testcount = math.ceil(.2*len(shuffled))
    test[key] = shuffled[:testcount]
    train[key] = shuffled[testcount:]
print('done')

done


In [242]:
# Accepts a signal and divides into segments of size seg_size (each segment is a vector of length seg_size*3 since the 
# observations are in 3 dimensions)

def split_signal(signal,seg_size):
    """Cut a flattened signal into consecutive, non-overlapping segments.

    Each segment holds seg_size*3 consecutive values of `signal`. Trailing
    values that do not fill a complete segment are dropped.

    Parameters:
        signal: sliceable sequence of values (supports len() and slicing).
        seg_size: number of observations per segment; must be at least 1.

    Returns:
        A list of slices of `signal`, each of length seg_size*3. The list is
        empty when `signal` is shorter than one segment.

    Raises:
        ValueError: if seg_size is less than 1 (a non-positive size would
            otherwise never advance through the signal).
    """
    if seg_size < 1:
        raise ValueError('seg_size must be at least 1, got %r' % (seg_size,))
    seg_len = seg_size*3
    # Last index at which a complete segment can still start.
    last_start = len(signal) - seg_len
    return [signal[start:start + seg_len] for start in range(0, last_start + 1, seg_len)]

In [244]:
# Create an array of segments for all training signals and cluster the segments using k-means.

from sklearn.cluster import KMeans

def build_dict(k,seg_size,train,random_state=None):
    """Cluster the segments of all training signals with k-means.

    Parameters:
        k: number of clusters passed to KMeans.
        seg_size: segment size handed to split_signal for every signal.
        train: dictionary whose values are lists of signals.
        random_state: optional seed forwarded to KMeans; the default None
            leaves the initialisation unseeded.

    Returns:
        The fitted KMeans object.

    Raises:
        ValueError: if no signal yields a complete segment, so there is
            nothing to cluster.
    """
    seg_arr = []
    for signals in train.values():
        for signal in signals:
            seg_arr.extend(split_signal(signal,seg_size))
    if not seg_arr:
        raise ValueError('no segments of size %r could be extracted from the training data' % (seg_size,))
    # Fit on the list built above (seg_arr).
    return KMeans(n_clusters=k, random_state=random_state).fit(seg_arr)

In [245]:
# vector quantize each signal to obtain k length vectors. Accepts a dictionary (train/test) and kmeans object. Returns two lists-
# list of vectors and list of their true labels.
# Referred https://piazza.com/class/jchzguhsowz6n9?cid=781 for bincount usage

def vecquantize(data, kmeans, seg_size=None, k=None):
    """Vector quantize every signal into a histogram of cluster ids.

    Each signal is split into segments, every segment is assigned to a
    cluster by `kmeans`, and the assignments are counted with np.bincount.

    Parameters:
        data: dictionary mapping a label to a list of signals (train or test).
        kmeans: fitted clustering object providing predict(), n_clusters and
            cluster_centers_.
        seg_size: segment size used to split the signals. When omitted it is
            derived from the model: a cluster centre has seg_size*3 values.
        k: length of each output vector. When omitted, kmeans.n_clusters.

    Returns:
        (vectors, labels): a list with one count vector per signal and a
        parallel list with the dictionary key each signal came from.
    """
    # Take both sizes from the fitted model rather than from module-level
    # variables, so the function does not depend on notebook state.
    if k is None:
        k = kmeans.n_clusters
    if seg_size is None:
        seg_size = kmeans.cluster_centers_.shape[1] // 3

    vectors = []
    labels = []
    for key,value in data.items():
        for signal in value:
            signal_segs = split_signal(signal,seg_size)
            if signal_segs:
                cluster_ids = kmeans.predict(signal_segs)
                vector = np.bincount(cluster_ids,minlength=k)
            else:
                # A signal shorter than one segment has nothing to predict on;
                # represent it as an all-zero histogram so vectors and labels
                # stay aligned.
                vector = np.zeros(k, dtype=np.intp)
            vectors.append(vector)
            labels.append(key)
    return vectors, labels

In [266]:
# Function classify_train trains a classifier using Random Forest Classifier
# Function classify_measure predicts the labels for input vectors and computes the error rate and confusion matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

def classify_train(vectors,labels):
    """Fit a random forest on the given vectors and labels.

    The forest is built with 50 estimators and n_jobs=-1. The value returned
    is whatever the classifier's fit() call returns.
    """
    forest_settings = {'n_estimators': 50, 'n_jobs': -1}
    forest = RandomForestClassifier(**forest_settings)
    return forest.fit(vectors, labels)
    
def classify_measure(rfc, vectors, true_labels):
    """Predict labels for `vectors` and report the error rate.

    Prints the error rate followed by the confusion matrix of true versus
    predicted labels.

    Parameters:
        rfc: trained classifier providing predict().
        vectors: input vectors to classify.
        true_labels: expected label for each vector, in the same order.

    Returns:
        The error rate, i.e. 1 minus the fraction of correctly predicted labels.
    """
    pred_labels = rfc.predict(vectors)
    # Derive the accuracy from the predictions already computed instead of
    # calling rfc.score(), which would run the classifier over the vectors again.
    accuracy = float(np.mean(np.asarray(pred_labels) == np.asarray(true_labels)))
    error_rate = 1 - accuracy
    print(error_rate)
    print(confusion_matrix(true_labels,pred_labels,labels=None))
    return error_rate

In [272]:
# For a given set of test and train data, quantize the data to obtain vectors, train the classifier using training vectors and
# compute the average error rate (from 10 runs).  

from matplotlib import pyplot as plt
N_RUNS = 10                                                # number of train/evaluate repetitions per configuration
error_rates = []                                           # average error rate of each (segment size, k) configuration, in loop order
error_rate_sum = 0                                         # sum of the error rates of every run executed
run_count = 0                                              # number of runs executed
# 1. seg_list = [16,25,32,50]                              # Uncomment to run for segment sizes 16,25,32 and 50
seg_list = [16]                                            # comment if the above line is uncommented
# 2. k_list = [100,200,400,600,800,1000]                   # Uncomment to run for k = 100,200,400,600,800,1000    
k_list = [600]                                             # Comment if the above line is uncommented  
for segment_size in seg_list:
    # Keep the module-level name seg_size in step with the loop; vecquantize
    # may look seg_size (and k) up as globals.
    seg_size = segment_size
    for k in k_list:
        config_error_sum = 0
        for i in range(N_RUNS):
            kmeans = build_dict(k=k,seg_size=segment_size,train=train)
            train_vecs, train_labels = vecquantize(train,kmeans)
            test_vecs, test_labels = vecquantize(test,kmeans)
            rfc = classify_train(train_vecs,train_labels)
            error_rate = classify_measure(rfc,test_vecs,test_labels)
            config_error_sum += error_rate
        error_rates.append(config_error_sum/N_RUNS)
        error_rate_sum += config_error_sum
        run_count += N_RUNS
# Divide by the number of runs actually executed so the average stays correct
# when several segment sizes or k values are enabled.
if run_count:
    print('average error rate ',error_rate_sum/run_count)

# The below code creates a plot of error rates against segment sizes and number of clusters.
# error_rates holds one average per configuration, so vary only one of seg_list / k_list
# when plotting against it.

#fig1 = plt.figure()
#plt.plot(seg_list,error_rates)
#plt.xlabel('Segment Size')
#plt.ylabel('Error Rate')
#plt.xticks(seg_list)
#plt.yticks(error_rates)
#plt.show()
#fig1.savefig('Errorrate_seg.png')
#fig2 = plt.figure()
#plt.plot(k_list,error_rates)
#plt.xlabel('K')
#plt.ylabel('Error Rate')
#plt.xticks(k_list)
#plt.yticks(error_rates)
#plt.show()
#fig2.savefig('Errorrate_k.png')

print('done')

0.202312138728
[[ 2  0  0  0  0  0  0  0  0  0  1  0  0  0]
 [ 0 19  0  0  0  0  0  0  0  0  1  0  0  1]
 [ 0  0  6  0  0  0  0  0  0  1  0  0  0  0]
 [ 0  1  0  8  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 18  0  0  0  0  1  1  0  0  0]
 [ 0  0  0  0  0  1  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  1  0  0  0  0]
 [ 0  0  0  0  0  0  0 15  0  2  0  4  0  0]
 [ 0  0  0  0  0  0  0  1  0  3  2  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 20  0  0  0  0]
 [ 0  0  0  0  0  0  0  1  0  1 16  2  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  5 16  0  0]
 [ 0  0  0  0  2  0  0  0  0  1  0  0  0  0]
 [ 0  1  0  0  0  0  0  0  0  0  2  0  0 17]]
0.306358381503
[[ 2  0  0  0  0  0  0  0  0  1  0  0  0  0]
 [ 0 19  0  0  0  0  0  0  0  0  0  1  0  1]
 [ 0  0  4  0  0  0  0  0  0  1  1  1  0  0]
 [ 0  1  0  8  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 19  0  0  0  0  1  0  0  0  0]
 [ 0  0  0  0  0  1  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  1  0  0  0  0]
 [ 0  0  0  0  0  0  0 1