In [1]:
import tsfresh as tsf
import glob
import os
import numpy as np
import pandas as pd
import seaborn as sns

from importlib import reload
from datetime import timedelta
from scipy import signal
from tqdm.auto import tqdm
from matplotlib import pyplot as plt
from tsfresh.utilities.distribution import ClusterDaskDistributor
from src import main, feature_model, extract_tsf_features_by_window as extract
from dask_jobqueue import SLURMCluster
from distributed import Client, as_completed

# Extract features from data

In [2]:
try:
    cluster.close()
    client.close()
except NameError:
    pass
finally:
    cluster = SLURMCluster(queue='short', cores=2, memory='7gb', walltime='3:00:00', death_timeout=60)
    client = Client(cluster)
    cluster.adapt(minimum=1, maximum=100)

In [3]:
cis_colnames = {'t_colname': 'Timestamp', 'xyz_colnames': ['X', 'Y', 'Z']}
smartwatch_colnames = {'devid_colnames': ['device_id']}

In [4]:
def extract_tsf_features(input_fp, 
                         samp_rate='100ms',
                         rms_g_constant=1,
                         colnames=dict()):
    seq = main.read_seq(input_fp, use_time_index=True, resample=samp_rate, **colnames)
    # some slight interpolation for missing values
    seq = seq.interpolate(axis=0, limit=1, method='linear')

    # subtract constant for gravity
    rms = pd.DataFrame({'rms': np.sqrt(np.square(seq).sum(axis=1, skipna=False)) - 9.8})
    df = rms

    if 'devid_colnames' in colnames:
        df.reset_index(level=colnames['devid_colnames'], inplace=True)
        df.rename(columns={colnames['devid_colnames'][0]: 'id'}, inplace=True)
    else: 
        df['id'] = 1
        
    # drop timesteps with nulls (tsfresh says timesteps don't have to be equidistant)
    df = df.reset_index(level='t').dropna()
    
    tsf_df = tsf.extract_features(df, column_sort='t', column_id="id", disable_progressbar=False, n_jobs=0)
    samp_id = os.path.splitext(os.path.basename(input_fp))[0]
    tsf_df['samp_id'] = samp_id

    return tsf_df

In [6]:
window_size = 10
window_offset = 5
futures = []

In [44]:
# Training data for cis_pd
fps = glob.glob('data/cis-pd/training_data/*.csv')
futures = client.map(extract_tsf_features, fps, 
                          rms_g_constant=1, 
                          colnames=cis_colnames)

In [50]:
# real_pd smartphone accelerometer
fps = glob.glob('data/real-pd/training_data/smartphone_accelerometer/*.csv')
futures = client.map(extract_tsf_features, fps, 
                          rms_g_constant=9.81)

In [5]:
# real_pd smartwatch accelerometer
fps = glob.glob('data/real-pd/training_data/smartwatch_accelerometer/*.csv')
futures = client.map(extract_tsf_features, fps, 
                          rms_g_constant=9.81, 
                          colnames=smartwatch_colnames)

In [19]:
# real_pd smartwatch gyroscope
fps = glob.glob('data/real-pd/training_data/smartwatch_gyroscope/*.csv')
futures = client.map(extract_tsf_features, fps, 
                          rms_g_constant=0, 
                          colnames=smartwatch_colnames)

In [20]:
# Write to disk directly since too much to store in mem
iterator = as_completed(futures)
future = next(iterator)
while future.status == 'error': 
    future = next(iterator)
result = future.result()
result.to_csv('extracted_features/real_watch_gyro-tsfeatures.csv', header=True)

# Write remaining dfs in append mode 
for future in tqdm(iterator, total=len(futures)-1):
    if future.status == 'finished':
        result = future.result()
        result.to_csv('extracted_features/real_watch_gyro-tsfeatures.csv', header=False, mode='a')

HBox(children=(FloatProgress(value=0.0, max=534.0), HTML(value='')))


