In [None]:
import os

import numpy
import pandas

In [None]:

def load_data(dataset_path):

    """
    Load every recording below ``dataset_path`` into a single DataFrame.

    Expects dataset to have layout of the following form:
    
    ROOT
    ├── Read Me.txt
    ├── S1-S10-S1-M-R-AW-30-E-3-AG
    │   ├── S1-S10-S1-M-R-A-30-E-3-A.csv
    ....

    Every file found inside a sub-directory of ROOT is read with
    ``pandas.read_csv`` and tagged with a 'filename' column holding the
    file's base name. Directories and files are visited in sorted order so
    that the row order does not depend on the filesystem listing order.

    Parameters
    ----------
    dataset_path : str
        Path of the ROOT directory.

    Returns
    -------
    pandas.DataFrame
        All recordings concatenated (the per-file row index is kept, so index
        values repeat across files). Has a 'time' column derived from
        'epoc (ms)', the unit suffixes stripped from the sensor column names,
        and the raw timestamp columns removed.

    Raises
    ------
    KeyError
        If the concatenated data has no 'epoc (ms)' column.
    ValueError
        Raised by ``pandas.concat`` when no file was found at all.
    """

    dfs = []
    for d in sorted(os.listdir(dataset_path)):
        session_dir = os.path.join(dataset_path, d)
        # Skip the documentation file and any other stray non-directory entry
        # at the root instead of failing when trying to list it
        if d in ('Read Me.txt', ) or not os.path.isdir(session_dir):
            continue
        for f in sorted(os.listdir(session_dir)):
            p = os.path.join(session_dir, f)
            df = pandas.read_csv(p)
            # The filename carries the recording metadata, keep it per row
            df['filename'] = f
            dfs.append(df)

    out = pandas.concat(dfs)

    # Get out time information
    out['time'] = pandas.to_datetime(out['epoc (ms)'], unit='ms')
    # Redundant wrt 'time'. Which timestamp column is present depends on the
    # files that were loaded, so a missing one is not an error.
    redundant = ['timestamp (+1100)', 'timestamp (+1000)', 'epoc (ms)']
    out = out.drop(columns=redundant, errors='ignore')

    # Drop units from column names
    out = out.rename(columns={
        'elapsed (s)': 'elapsed',
        'x-axis (g)': 'acc_x',
        'y-axis (g)': 'acc_y',
        'z-axis (g)': 'acc_z',
        'x-axis (T)': 'mag_x',
        'y-axis (T)': 'mag_y',
        'z-axis (T)': 'mag_z',
        'x-axis (deg/s)': 'gyro_x',
        'y-axis (deg/s)': 'gyro_y',
        'z-axis (deg/s)': 'gyro_z',
    })
    return out

# Root directory of the dataset, relative to the notebook's working directory
dataset_path = './data/hx5kkkbr3j-1'
# Read all recordings found below the root into one DataFrame
data = load_data(dataset_path)

# Show the loaded frame as the cell output
data

In [None]:

def parse_filename(filename):
    """
    Extract the metadata fields encoded in a recording's filename.

    The naming scheme is described in "Read me.txt":

    S{setting}-S{subject}-S{session}-{gender}-{hand}-{sensor-location}-{age}-{brush type}-{location}-{sensor}

    Returns a pandas.Series with the entries setting, subject, session (as
    int), gender, hand, sensor_location, brush, location, sensor (with the
    '.csv' suffix removed) and filename. The age field is parsed but not kept.

    Raises ValueError when the name does not split into 9 or 10 dash-separated
    fields.
    """

    field_names = (
        'setting', 'subject', 'session', 'gender', 'hand',
        'sensor_loc', 'age', 'brush', 'location', 'sensor',
    )
    parts = filename.split('-')

    if len(parts) not in (9, 10):
        raise ValueError(f'Unexpected filename format: {filename}')

    if len(parts) == 9:
        # NOTE: there are 4 instances where the filename lacks one of the
        # leading Sx values, and it is not documented which one.
        # The directory names do not seem to have this problem.
        # XXX: the missing field is assumed to be the setting, which is left as None
        parts = [None] + parts

    fields = dict(zip(field_names, parts))

    record = {
        'setting': fields['setting'],
        'subject': fields['subject'],
        'session': int(fields['session'].replace('S', '')),
        'gender': fields['gender'],
        'hand': fields['hand'],
        'sensor_location': fields['sensor_loc'],
        'brush': fields['brush'],
        'location': fields['location'],
        'sensor': fields['sensor'].replace('.csv', ''),
        'filename': filename,
    }
    return pandas.Series(record)


def load_meta(data):
    """
    Build a per-file metadata table from the filenames present in ``data``.

    Each unique value of ``data.filename`` is parsed with ``parse_filename``.
    The result is indexed by filename; files for which no setting could be
    parsed are dropped. Every column except 'session' is converted to the
    pandas 'category' dtype.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'filename' column.

    Returns
    -------
    pandas.DataFrame
        One row per remaining file, indexed by 'filename'.
    """
    filenames = pandas.Series(data.filename.unique())
    meta = filenames.apply(parse_filename).set_index('filename')

    # drop irregular data
    meta = meta[~meta.setting.isna()]

    # Convert with a single astype() call, which returns a new frame.
    # Assigning columns one by one on the filtered frame above triggers
    # SettingWithCopyWarning.
    categoricals = [c for c in meta.columns if c != 'session']
    meta = meta.astype({c: 'category' for c in categoricals})

    return meta


# Per-file metadata parsed from the filenames found in the loaded data
meta = load_meta(data)
meta

In [None]:
# Accelerometer-only view: remove the magnetometer and gyroscope columns and
# keep just the rows that have a value on all three accelerometer axes
acc = (
    data
    .drop(columns=['mag_x', 'mag_y', 'mag_z', 'gyro_x', 'gyro_y', 'gyro_z'])
    .dropna(subset=['acc_x', 'acc_y', 'acc_z'])
)
acc

In [None]:
def vector_magnitude(vectors, axis=None):
    """Compute the Euclidean (L2) magnitude of vectors.

    Thin wrapper around ``numpy.linalg.norm``.

    Parameters
    ----------
    vectors : array-like
        The vector(s) to measure.
    axis : int or None
        Axis holding the vector components, passed on to
        ``numpy.linalg.norm``. With the default ``None`` a single norm is
        computed over the whole input, so pass e.g. ``axis=1`` to get one
        magnitude per row of a 2-D input.

    Returns
    -------
    float or numpy.ndarray
        The magnitude(s).
    """
    return numpy.linalg.norm(vectors, axis=axis)


In [None]:
def resample(df, freq='1min', func='median', group='device_id', time='end_time', numeric_only=True):
    """
    Aggregate ``df`` into regular time bins, separately for each group.

    The existing index of ``df`` is first moved into a column. The frame is
    then indexed by the ``time`` column, grouped by ``group``, resampled at
    ``freq`` and aggregated with ``func`` (``numeric_only`` is forwarded to
    the aggregation).

    Returns a DataFrame indexed by (``group``, ``time``).
    """
    time_indexed = df.reset_index().set_index(time)
    per_group = time_indexed.groupby(group, observed=True)
    aggregated = per_group.resample(freq).agg(func, numeric_only=numeric_only)

    return aggregated.reset_index().set_index([group, time])


# Average each recording into 1-second bins, then attach the per-file metadata
acc_lp = pandas.merge(
    resample(acc, freq='1s', func='mean', group='filename', time='time').reset_index(),
    meta,
    left_on='filename',
    right_on='filename',
)

# Setting 2 has more specific protocol
# Pause for a few seconds in between different regions and bring the brush to a reference point
# Keep only those recordings, and of them only sensor location 'A'
acc_lp = acc_lp[(acc_lp.setting == 'S2') & (acc_lp.sensor_location == 'A')]

acc_lp

In [None]:
import plotly.express

# Plot a random subset of the recordings, one facet row per file.
# The sample is drawn from the unique filenames (not from the rows), so it
# picks up to 20 distinct files. The fixed seed keeps the figure identical
# across re-runs, and min() avoids an error when fewer than 20 files exist.
sel = acc_lp.reset_index()
all_files = pandas.Series(sel.filename.unique())
sel_files = all_files.sample(n=min(20, len(all_files)), random_state=0)
sel = sel[sel.filename.isin(sel_files)]

plotly.express.line(sel, 
                    x='elapsed',
                    y='acc_x',
                    facet_row='filename',
                    height=1000,
                    width=1000,
                   )
                    


In [None]:
# acc_x against acc_y for the selected recordings, one colour per file
fig = plotly.express.scatter(
    sel,
    x='acc_x',
    y='acc_y',
    color='filename',
    width=800,
    height=800,
    opacity=0.5,
)
# Small markers, and no legend
fig = fig.update_traces(marker={'size': 5.0}).update_layout(showlegend=False)
fig

In [None]:
# acc_x against acc_y for the selected recordings, one colour per subject
fig = plotly.express.scatter(
    sel,
    x='acc_x',
    y='acc_y',
    color='subject',
    width=800,
    height=800,
    opacity=0.5,
)
# Small markers, and no legend
fig = fig.update_traces(marker={'size': 5.0}).update_layout(showlegend=False)
fig

In [None]:
# acc_y against acc_z for the selected recordings, one colour per subject
fig = plotly.express.scatter(
    sel,
    x='acc_y',
    y='acc_z',
    color='subject',
    width=800,
    height=800,
    opacity=0.5,
)
# Small markers, and no legend
fig = fig.update_traces(marker={'size': 5.0}).update_layout(showlegend=False)
fig