In [108]:
import pandas as pd
import numpy as np
from os.path import join
import tsfresh as tsf
from sklearn.linear_model import LogisticRegression

from utils import filter_signals

In [109]:
import altair as alt

##  Do endpoints stratify along particular signal summary statistic values?

Endpoints:
- Medication on/off
- Dyskinesia severity
- Tremor severity

How do the signal summary statistics change after signal filtering?

In [110]:
# Toggle read by filter_df: when True, the X/Y/Z columns are passed through
# utils.filter_signals before the summary statistics are computed.
SHOULD_FILTER = False

In [111]:
def filter_df(df):
    """Optionally filter the accelerometer axes of one measurement.

    When the module-level SHOULD_FILTER flag is falsy, ``df`` is returned
    untouched (same object). Otherwise the "X", "Y" and "Z" columns are
    handed to ``filter_signals`` as a 2-D array and the result is wrapped in
    a new DataFrame whose first three columns are named "X", "Y", "Z".
    Any other columns of ``df`` and its index are not carried over.
    """
    if not SHOULD_FILTER:
        return df

    axes = ["X", "Y", "Z"]
    filtered = filter_signals(df[axes].values)
    # Positional column labels 0..2 are mapped back onto the axis names.
    return pd.DataFrame(data=filtered).rename(columns=dict(enumerate(axes)))

In [112]:
# Root of the data directory, relative to the current working directory.
DATA_DIR = join("..", "..", "data")

# Label table for the CIS-PD training data; the first CSV column is used as the index.
labels_df = pd.read_csv(
    join(DATA_DIR, "cis-pd", "data_labels", "CIS-PD_Training_Data_IDs_Labels.csv"),
    index_col=0,
)
labels_df.head()

Unnamed: 0_level_0,subject_id,on_off,dyskinesia,tremor
measurement_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cc7b822c-e310-46f0-a8ea-98c95fdb67a1,1004,1.0,1.0,1.0
5163afe8-a6b0-4ea4-b2ba-9b4501dd5912,1004,0.0,0.0,0.0
5cf68c8e-0b7a-4b73-ad4f-015c7a20fb5a,1004,1.0,1.0,1.0
fb188ae2-2173-4137-9236-19a137a402c2,1004,3.0,3.0,3.0
19a3e9ea-fce1-40b7-9457-2618970beb7b,1004,1.0,1.0,1.0


In [113]:
# Size of the label table as (rows, columns).
labels_df.shape

(1858, 4)

In [120]:
# Summary statistics to compute per axis. Each entry is a pair of
# (name of a function in tsfresh.feature_extraction.feature_calculators,
#  tuple of extra positional arguments passed after the signal array).
tsf_funcs = [
    ("kurtosis", ()),
    ("mean_abs_change", ()),
    ("mean_second_derivative_central", ()),
    ("cid_ce", (False,))  # False is cid_ce's `normalize` argument (see tsfresh docs)
]
# note: may be easier to use tsf.extract_features()
# https://tsfresh.readthedocs.io/en/latest/api/tsfresh.feature_extraction.html#tsfresh.feature_extraction.extraction.extract_features

In [121]:
def get_measurement_stats(m_id):
    """Compute the configured tsfresh statistics for one measurement.

    Reads ``<DATA_DIR>/cis-pd/training_data/<m_id>.csv``, passes it through
    ``filter_df`` and applies every calculator listed in ``tsf_funcs`` to the
    "X", "Y" and "Z" columns separately.

    Returns a dict with the key "measurement_id" plus one
    "<calculator>_<axis>" entry per calculator/axis combination.
    """
    csv_path = join(DATA_DIR, "cis-pd", "training_data", f"{m_id}.csv")
    signals = filter_df(pd.read_csv(csv_path))

    calculators = tsf.feature_extraction.feature_calculators
    row = {"measurement_id": m_id}
    for name, extra_args in tsf_funcs:
        calc = getattr(calculators, name)
        for axis in ("X", "Y", "Z"):
            row[f"{name}_{axis}"] = calc(signals[axis].values, *extra_args)
    return row


In [122]:
# One dict of summary statistics per labelled measurement id.
stats = list(map(get_measurement_stats, labels_df.index.values))
stats_df = pd.DataFrame(data=stats).set_index("measurement_id")
# Names of the statistic columns, reused later to select the model features.
stats_cols = stats_df.columns.tolist()
# Attach the statistics to the labels by measurement id (index-on-index join).
df = labels_df.join(stats_df)
df

Unnamed: 0_level_0,subject_id,on_off,dyskinesia,tremor,kurtosis_X,kurtosis_Y,kurtosis_Z,mean_abs_change_X,mean_abs_change_Y,mean_abs_change_Z,mean_second_derivative_central_X,mean_second_derivative_central_Y,mean_second_derivative_central_Z,cid_ce_X,cid_ce_Y,cid_ce_Z
measurement_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
cc7b822c-e310-46f0-a8ea-98c95fdb67a1,1004,1.0,1.0,1.0,6.401526,1.716375,2.642564,0.036092,0.046818,0.044889,1.974835e-06,-1.488596e-06,2.466505e-06,23.233583,26.745867,27.669513
5163afe8-a6b0-4ea4-b2ba-9b4501dd5912,1004,0.0,0.0,0.0,9.977173,3.981855,3.415476,0.009803,0.008626,0.009969,4.079392e-08,9.178681e-08,1.631751e-08,18.858196,12.717907,16.960421
5cf68c8e-0b7a-4b73-ad4f-015c7a20fb5a,1004,1.0,1.0,1.0,33.207771,-1.445827,11.624063,0.013078,0.012145,0.011264,-3.265906e-08,-3.061807e-08,8.164841e-09,21.585875,14.788632,18.440142
fb188ae2-2173-4137-9236-19a137a402c2,1004,3.0,3.0,3.0,0.689776,0.772884,0.417105,0.050478,0.050062,0.034963,-3.621117e-07,-1.383351e-07,7.649100e-07,21.872837,20.457632,14.392826
19a3e9ea-fce1-40b7-9457-2618970beb7b,1004,1.0,1.0,1.0,0.042242,-0.591996,-0.327463,0.031835,0.041878,0.035411,-1.892972e-07,4.600124e-07,-5.495729e-08,15.456509,20.206204,18.834040
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ab618d1f-eb27-442b-a3b7-e438fde70db1,1051,0.0,,,-0.524017,2.152334,-1.426171,0.003826,0.006999,0.005561,-3.263233e-08,-1.794791e-07,2.794165e-07,2.399479,4.205286,3.329643
25e8bca2-051f-4216-826b-810bbddfdb2e,1051,0.0,,,6.989518,7.579684,8.227764,0.044178,0.070682,0.064636,-2.508043e-07,6.586163e-07,-8.564048e-08,23.028351,37.520100,45.015427
7f7bb7b9-8656-40dd-94f9-9d546ac75722,1051,0.0,,,2.712011,3.187169,22.594224,0.031757,0.048710,0.041679,6.937650e-08,6.121540e-09,4.080979e-08,17.005984,26.977887,32.308186
c29c2d91-c294-4655-a2a7-d4c1f456c3a2,1051,1.0,,,-1.191098,2.939791,-0.140699,0.032105,0.044557,0.035015,-3.655594e-07,1.194706e-06,-1.110974e-06,19.222793,25.528173,19.308131


In [123]:
# Keep only measurements that have an on/off label AND a complete set of
# summary statistics. Checking every statistic column (not just the first)
# guarantees no NaN reaches the classifier, which rejects missing values.
df = df.dropna(subset=["on_off", *stats_cols])
df.shape

(1767, 16)

In [124]:
# Check how well the endpoints can be explained by the tsfresh outputs
X = df[stats_cols].values
y = df["on_off"].values
clf = LogisticRegression(random_state=0, multi_class='multinomial', max_iter=100000)
clf.fit(X, y)
clf.score(X, y)

0.47198641765704585