In [18]:
import pandas
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import os
%matplotlib inline

In [19]:
# Fraction of the smallest per-axis value range (max - min over X, Y, Z) that a
# sample must differ from the first/last sample by before parse_data treats it
# as a significant change when computing its trim bounds.
threshold_epsilon = 0.1

In [20]:
def parse_data(filename, epsilon=None):
    """Parse one recording file and find where its signal starts/stops changing.

    The file is consumed in this order:
      * 8 lines that are skipped,
      * 4 lines whose text after the first space is stored (as a string) under
        the keys 'Age', 'Height', 'Weight', 'Gender', in that order,
      * 4 more lines that are skipped,
      * every remaining non-blank line: comma-separated numbers, where column 0
        is used as the time stamp and columns 1-3 as the X, Y and Z readings.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    epsilon : float, optional
        Fraction of the smallest per-axis range (max - min) used as the
        significance threshold. When omitted, the notebook-level
        ``threshold_epsilon`` is used, so existing calls behave as before.

    Returns
    -------
    result : dict
        The four header values, 'duration' (the number of samples),
        'X_median' / 'Y_median' / 'Z_median', and 'data': a list of
        ``[t, x, y, z]`` rows in which ``t`` is shifted so the first sample is 0
        and each axis is shifted by its median. The rows are NOT trimmed.
    bounds : tuple of (int, int)
        ``(l_trim, r_trim)``. ``l_trim`` is the index of the first sample
        (scanning forward, last sample excluded) whose X, Y or Z differs from
        the first sample by more than the threshold, or ``n - 1`` if there is
        none. ``r_trim`` is the index of the first sample found scanning
        backward (first sample excluded) that differs from the last sample by
        more than the threshold, or ``0`` if there is none.

    Raises
    ------
    ValueError
        If the file contains no samples, or a value cannot be parsed as a number.
    """
    if epsilon is None:
        # Fall back to the notebook-level setting so callers need not pass it.
        epsilon = threshold_epsilon

    keys = ['Age', 'Height', 'Weight', 'Gender']
    result = {}
    data = []
    with open(filename) as f:
        for _ in range(8):
            next(f)
        for key in keys:
            line = f.readline()
            # Keep everything after the first space, minus the line terminator.
            result[key] = line[(line.index(' ') + 1):].rstrip('\n')
        for _ in range(4):
            next(f)
        for row in f:
            # strip() instead of row[:-1]: a final line without a trailing
            # newline must not lose its last digit.
            row = row.strip()
            if not row:
                continue  # tolerate blank lines instead of failing on float('')
            data.append([float(value) for value in row.split(',')])

    n = len(data)
    if n == 0:
        raise ValueError('no samples found in %r' % (filename,))
    result['duration'] = n

    x_data = [d[1] for d in data]
    y_data = [d[2] for d in data]
    z_data = [d[3] for d in data]

    x_median = result['X_median'] = np.median(x_data)
    y_median = result['Y_median'] = np.median(y_data)
    z_median = result['Z_median'] = np.median(z_data)

    # Centre every axis on its median and shift time so the first sample is at 0.
    t_start = data[0][0]
    data = [[d[0] - t_start, d[1] - x_median, d[2] - y_median, d[3] - z_median]
            for d in data]

    # A change is "significant" once it exceeds a fraction of the smallest
    # per-axis range; the comparison uses the raw (un-centred) readings.
    threshold = min(max(x_data) - min(x_data),
                    max(y_data) - min(y_data),
                    max(z_data) - min(z_data)) * epsilon

    def deviates(index, reference):
        """True if any axis at `index` is more than `threshold` from `reference`."""
        return (abs(x_data[index] - reference[0]) > threshold
                or abs(y_data[index] - reference[1]) > threshold
                or abs(z_data[index] - reference[2]) > threshold)

    first = (x_data[0], y_data[0], z_data[0])
    last = (x_data[-1], y_data[-1], z_data[-1])

    # Left bound: scan forward (last sample excluded); n - 1 when nothing deviates.
    l_trim = next((i for i in range(n - 1) if deviates(i, first)), n - 1)
    # Right bound: scan backward (first sample excluded); 0 when nothing deviates.
    r_trim = next((i for i in range(n - 1, 0, -1) if deviates(i, last)), 0)

    # The full series is returned; callers can slice with the bounds themselves.
    result['data'] = data
    return result, (l_trim, r_trim)

def plot_data(data, bounds=None):
    """Draw columns 1-3 of ``data['data']`` against column 0 and show the figure.

    Parameters
    ----------
    data : dict
        Must hold a 'data' entry: a sequence of rows where index 0 is used for
        the horizontal axis and indices 1, 2, 3 are drawn in red, green and blue.
    bounds : sequence of two ints, optional
        Row indices; when given, a black vertical line spanning the current
        y-limits is drawn at the column-0 value of each of those rows.
    """
    samples = data['data']
    horizontal = [row[0] for row in samples]

    # One curve per value column, in the fixed colour order r, g, b.
    for column, colour in ((1, "r"), (2, "g"), (3, "b")):
        plt.plot(horizontal, [row[column] for row in samples], c=colour)

    if bounds is not None:
        # Read the limits once, before the markers are added.
        y_bounds = list(plt.gca().get_ylim())
        for index in (bounds[0], bounds[1]):
            marker_x = samples[index][0]
            plt.plot([marker_x, marker_x], y_bounds, c='black')

    plt.show()

In [None]:
root_dir = './MobiFall/'
# Per-prefix statistics: {prefix: {'durations': [...], 'starts': [...], 'ends': [...]}}
categories = {}
# NOTE(review): these three lists are never filled or read in this cell; they are
# kept only in case another cell references them — confirm and remove if not.
durations = []
starts = []
ends = []

# os.listdir returns entries in arbitrary order; sort so reruns are reproducible.
for file_name in sorted(os.listdir(root_dir)):
    path = os.path.join(root_dir, file_name)
    if not os.path.isfile(path):
        continue  # sub-directories cannot be parsed as recordings
    # The first three characters of the file name select the category.
    prefix = file_name[0:3]
    stats = categories.setdefault(prefix, {'durations': [], 'starts': [], 'ends': []})
    data, domain = parse_data(path)
    stats['durations'].append(data['duration'])
    # Express the trim bounds as a fraction of the recording's sample count.
    stats['starts'].append(domain[0] / data['duration'])
    stats['ends'].append(domain[1] / data['duration'])

# For every category: histograms of the start fractions, end fractions and durations.
for prefix, stats in categories.items():
    for key, label, bins in (('starts', ' start', 40),
                             ('ends', ' end', 40),
                             ('durations', ' duration', 30)):
        plt.hist(stats[key], bins=bins)
        plt.title(prefix + label)
        plt.show()