# Goal: a quick script that pulls all the neural network input data into a single file for fast access
## CIS-PD dataset setup first

We resample to 25 Hz because the data appears to be irregularly sampled. We will take 1-minute windows with 15 seconds of overlap (4x the original data, which takes up 9 GB of space).

In [1]:
# NOTE(review): joblib is imported but not referenced in any of the visible cells.
import joblib
import sys, os
# Add the parent of the working directory to the module search path; presumably this is
# what lets `import src.main` in the next cell resolve.
sys.path.append(os.path.realpath(".."))

In [2]:
import src.main as main
import pandas as pd
import numpy as np
from importlib import reload
# Re-import src.main so that edits made to the module on disk are picked up
# without restarting the kernel.
reload(main)

<module 'src.main' from '/home/ms994/beat_pd2/src/main.py'>

In [6]:
# Scratch check: FFT bin centre frequencies for a 50-sample window with 1/50 s sample spacing.
import numpy as np
np.fft.fftfreq(50, 1/50)

array([  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,
        11.,  12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,
        22.,  23.,  24., -25., -24., -23., -22., -21., -20., -19., -18.,
       -17., -16., -15., -14., -13., -12., -11., -10.,  -9.,  -8.,  -7.,
        -6.,  -5.,  -4.,  -3.,  -2.,  -1.])

In [95]:
# Load the two label tables (training and ancillary); they are concatenated into a
# single `labels` frame a few cells further down.
labels = pd.read_csv("/home/ms994/beat_pd/data/cis-pd/data_labels/CIS-PD_Training_Data_IDs_Labels.csv")
additional_labels = pd.read_csv("/home/ms994/beat_pd/data/cis-pd/data_labels/CIS-PD_Ancillary_Data_IDs_Labels.csv")

In [96]:
# Load the per-subject clinical tables: demographics, UPDRS parts 1/2/4, and UPDRS part 3.
demo_data = pd.read_csv("/home/ms994/beat_pd/data/cis-pd/clinical_data/CIS-PD_Demographics.csv")
updrs_1 = pd.read_csv("/home/ms994/beat_pd/data/cis-pd/clinical_data/CIS-PD_UPDRS_Part1_2_4.csv")
updrs_2 = pd.read_csv("/home/ms994/beat_pd/data/cis-pd/clinical_data/CIS-PD_UPDRS_Part3.csv")

In [97]:
# Display the UPDRS part 1/2/4 table to check its columns.
updrs_1

Unnamed: 0,subject_id,Visit,UPDRS_PartI_Total,UPDRS_PartII_Total,UPDRS_4.1,UPDRS_4.2,UPDRS_4.3,UPDRS_4.4,UPDRS_4.5,UPDRS_4.6
0,1000,Baseline,4,5,0,0,1,3,1,1
1,1004,Baseline,13,6,1,3,1,4,3,0
2,1006,Baseline,10,19,0,0,1,0,2,0
3,1007,Baseline,18,9,0,0,2,3,1,2
4,1016,Baseline,15,2,0,0,0,0,0,0
5,1018,Baseline,6,2,0,0,0,0,0,0
6,1019,Baseline,13,10,0,0,0,0,0,0
7,1020,Baseline,9,6,0,0,0,0,0,0
8,1023,Baseline,10,10,1,0,1,1,1,0
9,1030,Baseline,6,9,0,0,0,0,0,0


In [98]:
# Build one clinical table: demographics joined with UPDRS part 1/2/4 on subject_id,
# then joined with UPDRS part 3 (minus its Visit column).
# NOTE(review): the second .join() has no `on="subject_id"`, so it aligns updrs_2's
# subject_id index against the left frame's own row index rather than its subject_id
# column. The null-fraction output further down lists no part 3 columns, which is
# consistent with them coming back all-NaN and being removed by dropna(axis=1).
# Confirm whether part 3 was meant to be included before adding `on=`; updrs_2 may
# hold several rows per subject, which would multiply the joined rows.
all_data = demo_data.join(updrs_1.set_index("subject_id"), on="subject_id").join(updrs_2.set_index("subject_id").drop(["Visit"], axis=1))

In [99]:
# Stack the ancillary label rows under the training label rows.
# NOTE: this overwrites `labels`, so re-running only this cell appends the ancillary rows again.
labels = pd.concat([labels, additional_labels])

In [100]:
# Replace the post-concat index with a fresh 0..n-1 index. Without drop=True the old
# index is kept as a new "index" column (visible in the outputs below).
labels = labels.reset_index()

In [101]:
# Attach the clinical columns to every measurement row; the inner join keeps only
# measurements whose subject_id is present in all_data.
labels = labels.join(all_data.set_index("subject_id"), how="inner", on="subject_id")

In [102]:
# Drop rows, then columns, that are entirely NaN.
labels = labels.dropna(how="all")
labels = labels.dropna(how="all", axis=1)
# Fraction of missing values in each remaining column.
pd.isnull(labels).sum(axis=0)/labels.shape[0]

index                 0.000000
measurement_id        0.000000
subject_id            0.000000
on_off                0.041176
dyskinesia            0.303167
tremor                0.179186
Age                   0.000000
Gender                0.000000
Race                  0.020362
Ethnicity             0.000000
Visit                 0.000000
UPDRS_PartI_Total     0.000000
UPDRS_PartII_Total    0.000000
UPDRS_4.1             0.000000
UPDRS_4.2             0.000000
UPDRS_4.3             0.000000
UPDRS_4.4             0.000000
UPDRS_4.5             0.000000
UPDRS_4.6             0.000000
dtype: float64

In [103]:
# Visit takes a single value, so it carries no information; it is dropped in the next cell.
labels["Visit"].unique() #useless label

array(['Baseline'], dtype=object)

In [104]:
# Remove the constant Visit column together with Race and Ethnicity, none of which
# are read by the later visible cells.
labels = labels.drop(["Race", "Ethnicity", "Visit"], axis=1)

In [105]:
# Per-column maximum, as a quick range check.
labels.max(axis=0)

index                                                 1857
measurement_id        ffd64945-4be2-47d1-a706-bf3e03cbf3b3
subject_id                                            1051
on_off                                                   4
dyskinesia                                               4
tremor                                                   4
Age                                                     81
Gender                                                Male
UPDRS_PartI_Total                                       25
UPDRS_PartII_Total                                      19
UPDRS_4.1                                                2
UPDRS_4.2                                                3
UPDRS_4.3                                                2
UPDRS_4.4                                                4
UPDRS_4.5                                                3
UPDRS_4.6                                                3
dtype: object

In [106]:
# Per-column minimum, as a quick range check.
labels.min(axis=0)

index                                                    0
measurement_id        004ed441-24db-4839-8b5d-7465e4ea2a0a
subject_id                                            1000
on_off                                                   0
dyskinesia                                               0
tremor                                                   0
Age                                                     36
Gender                                              Female
UPDRS_PartI_Total                                        4
UPDRS_PartII_Total                                       2
UPDRS_4.1                                                0
UPDRS_4.2                                                0
UPDRS_4.3                                                0
UPDRS_4.4                                                0
UPDRS_4.5                                                0
UPDRS_4.6                                                0
dtype: object

In [107]:
# Which columns still contain at least one missing value.
pd.isna(labels).any(axis=0)

index                 False
measurement_id        False
subject_id            False
on_off                 True
dyskinesia             True
tremor                 True
Age                   False
Gender                False
UPDRS_PartI_Total     False
UPDRS_PartII_Total    False
UPDRS_4.1             False
UPDRS_4.2             False
UPDRS_4.3             False
UPDRS_4.4             False
UPDRS_4.5             False
UPDRS_4.6             False
dtype: bool

In [108]:
# The three label columns that the check above reports as containing NaNs.
# NOTE(review): this list is not referenced by any later visible cell.
poss_missing_labels = ["on_off", "dyskinesia", "tremor"]

In [109]:
# Spot-check the joined row for one specific measurement.
labels[labels.measurement_id=="014aa655-5dbf-448b-99ad-09fd7776a682"]

Unnamed: 0,index,measurement_id,subject_id,on_off,dyskinesia,tremor,Age,Gender,UPDRS_PartI_Total,UPDRS_PartII_Total,UPDRS_4.1,UPDRS_4.2,UPDRS_4.3,UPDRS_4.4,UPDRS_4.5,UPDRS_4.6
1962,104,014aa655-5dbf-448b-99ad-09fd7776a682,1018,0.0,0.0,0.0,58,Male,6,2,0,0,0,0,0,0


In [110]:
# Series of every measurement id that survived the joins above.
all_m_id = labels.measurement_id
# NOTE: with an integer index, `all_m_id[0]` looks up the index *label* 0, which is not
# necessarily the first row. test_m_id is not used by any later visible cell.
test_m_id = all_m_id[0]

In [111]:
#lets go and filter data with some utility funcs
from scipy.signal import butter, lfilter

# https://scipy-cookbook.readthedocs.io/items/ButterworthBandpass.html

# grabs some filter constants for making bandpass filter
#    order is kinda like strength of filter... higher leads to more ideal filter but has weird interactions near the edges of the filter
#    lower is less ideal but results in less artifacts being generated
#    this is a bit of a cheat anyways, using a lowpass and highpass together, there may be less sketch filter designs applicable
def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a digital Butterworth band-pass filter.

    Parameters
    ----------
    lowcut, highcut : float
        Lower and upper band edges, in the same unit as `fs`.
    fs : float
        Sampling rate of the signal the filter will be applied to.
    order : int
        Order passed to `scipy.signal.butter`.

    Returns
    -------
    tuple
        The `(b, a)` numerator/denominator coefficient arrays.
    """
    nyquist = 0.5 * fs
    # butter() takes band edges as fractions of the Nyquist frequency.
    band_edges = [lowcut / nyquist, highcut / nyquist]
    return butter(order, band_edges, btype='band')

def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Band-pass `data` between `lowcut` and `highcut` with a Butterworth filter.

    The coefficients come from `butter_bandpass`; the signal is then run through
    `scipy.signal.lfilter`, which filters along the LAST axis of `data` by default.
    Callers holding a (time, channel) array therefore need to transpose it first.

    Returns the filtered values as returned by `lfilter`.
    """
    numerator, denominator = butter_bandpass(lowcut, highcut, fs, order=order)
    return lfilter(numerator, denominator, data)
def butter_lp_filter(data, lowcut, fs, order=5):
    """Band-pass `data` from `lowcut` up to 90% of the Nyquist frequency.

    Parameters
    ----------
    data : array-like
        Signal passed straight through to `butter_bandpass_filter`.
    lowcut : float
        Lower band edge, in the same unit as `fs`.
    fs : float
        Sampling rate of `data`.
    order : int
        Butterworth filter order.

    NOTE(review): despite the "lp" name, the returned signal keeps the content between
    `lowcut` and 0.9 * Nyquist, i.e. everything ABOVE `lowcut`. Confirm this is the
    intended behaviour.
    """
    nyq = 0.5 * fs #just get the highest freq possible (nyquist, and bandgap it!)
    hc = nyq * 0.9 #can't accept exactly nyq
    print("hc:", hc)
    # Fixed: this used to pass the undefined name `hyc`, so every call raised NameError.
    return butter_bandpass_filter(data, lowcut, hc, fs, order)
def butter_bandgap_filter(data, lowcut, highcut, fs, order=5):
    """Remove the band [lowcut, highcut] from `data` by subtracting its band-passed copy.

    Parameters
    ----------
    data : array-like
        Signal to filter; must support subtraction with the filtered array.
    lowcut : float
        Lower edge of the band to remove, in the same unit as `fs`.
    highcut : float or None
        Upper edge of the band to remove. None means "no upper edge": the band then
        extends to 90% of the Nyquist frequency (the same ceiling butter_lp_filter
        uses, because the filter design cannot accept Nyquist itself).
    fs : float
        Sampling rate of `data`.
    order : int
        Butterworth filter order.

    Returns
    -------
    `data` minus its band-passed component.
    """
    if highcut is None:
        # Fixed: this branch used to recurse with (data, lowcut, fs, order), which
        # shifted every argument by one position (fs was taken as highcut and order
        # as fs) and produced invalid band edges. Filling in the upper edge and
        # falling through keeps the "remove this band" contract.
        highcut = 0.9 * (0.5 * fs)
    toRemove = butter_bandpass_filter(data, lowcut, highcut, fs, order)
    return data - toRemove

In [112]:
def get_preprocessed_data(measurement_id, low_f=1, high_f=10, fs=25):
    """Load one CIS-PD training recording, resample it to `fs` Hz and band-pass it.

    Parameters
    ----------
    measurement_id : str
        File stem (without ".csv") of the recording inside the training_data directory.
    low_f, high_f : float
        Band-pass edges in Hz. `high_f` must stay below `fs / 2`.
    fs : float
        Target sampling rate in Hz. It sets both the resampling period handed to
        `main.read_seq` and the sampling rate used for the filter design, so the two
        cannot drift apart.

    Returns
    -------
    pandas.DataFrame
        Filtered samples carrying the same index and columns that `main.read_seq` returned.
    """
    data = main.read_seq(
        f"/home/ms994/beat_pd/data/cis-pd/training_data/{measurement_id}.csv",
        t_colname="Timestamp",
        xyz_colnames=["X", "Y", "Z"],
        use_time_index=True,
        resample=pd.Timedelta(seconds=1/fs)
    )
    # Fixed (1): the filter used to be designed for fs=50 although the data is resampled
    # to 25 Hz, which halved the effective pass band (0.5-5 Hz instead of 1-10 Hz).
    # Fixed (2): lfilter works along the last axis. The frame has time on its rows and
    # one column per axis, so passing it as-is filtered ACROSS X/Y/Z within each
    # timestamp. Transposing puts time on the last axis; the result is transposed back.
    filtered = butter_bandpass_filter(data.values.T, low_f, high_f, fs, 5).T
    return pd.DataFrame(filtered, index=data.index, columns=data.columns)

In [113]:
def read_mid_and_split(m_id, max_window=pd.Timedelta(seconds=60), overlap=pd.Timedelta(seconds=10)):
    """Cut one preprocessed recording into fixed-length, overlapping windows.

    Parameters
    ----------
    m_id : str
        Measurement id handed to `get_preprocessed_data`.
    max_window : pandas.Timedelta
        Length of each window.
    overlap : pandas.Timedelta
        Step between consecutive window starts. Despite the name this is the stride,
        not the shared portion: the defaults give 60 s windows starting every 10 s.

    Returns
    -------
    list of numpy.ndarray
        One array per window (`.values` of the slice). Label-based `.loc` slicing
        includes both endpoints. The list is empty when the recording is shorter than
        one window or when loading/filtering fails.

    Raises
    ------
    ValueError
        If `overlap` is not positive (the window loop would never advance).
    """
    if overlap <= pd.Timedelta(0):
        raise ValueError("overlap (the step between window starts) must be positive")
    try:
        data = get_preprocessed_data(m_id)
        last_timestamp = data.index.max()  # loop-invariant, so compute it once
        all_samples = []
        currentIndex = pd.Timedelta(seconds=0)
        while currentIndex + max_window < last_timestamp:
            all_samples.append(data.loc[currentIndex:currentIndex+max_window].values)
            currentIndex += overlap
        return all_samples
    except Exception as exc:
        # Still best-effort (a bad recording must not stop the whole run), but no longer
        # silent: a bare `except:` also swallowed KeyboardInterrupt and hid real bugs.
        import warnings
        warnings.warn(f"skipping measurement {m_id!r}: {type(exc).__name__}: {exc}")
        return []

In [114]:
# Turn the Series into a sorted list; parse_datum uses all_m_id.index(...) to map each
# measurement id to an integer, so the order has to be stable.
all_m_id = sorted(all_m_id) #keep a constant m_id

In [115]:
def write_data(to_run_q, to_return_q):
    """Worker loop: turn queued measurement ids into (window, label) items.

    Ids are read from `to_run_q` until a None sentinel arrives. Each id is split into
    windows with `read_mid_and_split`, and every window is put on `to_return_q`
    together with the matching row(s) of the module-level `labels` frame.
    """
    while True:
        m_id = to_run_q.get()
        if m_id is None:
            # Sentinel: no more work for this worker.
            break
        windows = read_mid_and_split(m_id)
        label = labels[labels.measurement_id == m_id]
        for window in windows:
            to_return_q.put((window, label))

In [116]:
from multiprocessing import Process, Manager
# Manager-backed queues are proxies that can be handed to the worker processes.
m = Manager()
# Number of worker processes to create.
n_process = 17
# toRunQ carries measurement ids to the workers; toReturnQ carries their results back.
toRunQ = m.Queue()
toReturnQ = m.Queue()
p = [Process(target=write_data, args=(toRunQ, toReturnQ)) for i in range(n_process)]
# Queue every measurement id, followed by one None per worker: write_data stops
# as soon as it reads a None.
[toRunQ.put(m_id) for m_id in all_m_id]
[toRunQ.put(None) for i in range(n_process)]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [117]:
# Launch every worker; the list of None below is just the return values of start().
[process.start() for process in p]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [118]:
# Block until every worker has consumed its None sentinel and exited.
[process.join() for process in p]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [124]:
# Number of (window, label) items currently waiting in the result queue.
toReturnQ.qsize()

204097

In [120]:
# Pop one (window, label) item off the result queue for inspection.
# NOTE(review): get() removes the item and no visible cell puts it back, so this
# sample is missing from anything drained from the queue afterwards.
data = toReturnQ.get()

In [121]:
# Sorted unique subject ids; parse_datum stores a subject's position in this list
# as the integer "subjects" feature.
all_subjects = sorted(labels.subject_id.unique())

In [122]:
# Preview the label row of the popped item with NaNs replaced by -1, as parse_datum does.
data[1].fillna(-1)

Unnamed: 0,index,measurement_id,subject_id,on_off,dyskinesia,tremor,Age,Gender,UPDRS_PartI_Total,UPDRS_PartII_Total,UPDRS_4.1,UPDRS_4.2,UPDRS_4.3,UPDRS_4.4,UPDRS_4.5,UPDRS_4.6
1632,1632,00544f67-c07c-4a07-9c17-a7aee51d8b96,1049,2.0,1.0,2.0,54,Female,25,14,1,1,1,2,3,3


In [138]:
import tensorflow as tf
# Helperfunctions to make your feature definition more readable
def _int64_feature(value):
    """Wrap a single integer in a one-element int64 `tf.train.Feature`."""
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)

def _int64_feature_list(value):
    """Wrap an iterable of integers in an int64 `tf.train.Feature`."""
    int_list = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=int_list)
# Helperfunctions to make your feature definition more readable
def _float_feature(value):
    """Wrap a single float in a one-element float `tf.train.Feature`."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _float_feature_list(value):
    """Wrap an iterable of floats in a float `tf.train.Feature`."""
    float_list = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=float_list)
def parse_datum(datum):
    """Convert one (window, label_frame) pair into a `tf.train.Example`.

    `datum[0]` is the signal window and `datum[1]` the matching label row(s).
    Missing label values are replaced by -1 so they can be masked later. The first
    1500 rows of the window are flattened into the "data" feature (presumably 60 s at
    25 Hz -- confirm against the windowing parameters). Measurement and subject ids
    are stored as their positions in the module-level `all_m_id` / `all_subjects` lists.
    """
    signal = datum[0]
    label_row = datum[1].fillna(-1) #using -1 as a masking value

    def int_column(name):
        # Integer feature holding every value of one label column.
        return _int64_feature_list(label_row[name].values.astype(int))

    feature = {
        'data': _float_feature_list(signal[:1500].reshape(-1)),
        'on_off': int_column("on_off"),
        'dyskinesia': int_column("dyskinesia"),
        'measurement_id': _int64_feature(all_m_id.index(label_row.measurement_id.values[0])),
        'tremor': int_column("tremor"),
        'age': int_column("Age"),
        'subjects': _int64_feature(all_subjects.index(label_row.subject_id.values[0])),
        'gender': _int64_feature_list(label_row.Gender.apply(lambda x: x=="Male").values.astype(int)),
    }
    # For the UPDRS scores the feature key is the column name itself.
    updrs_columns = (
        "UPDRS_PartI_Total",
        "UPDRS_PartII_Total",
        "UPDRS_4.1",
        "UPDRS_4.2",
        "UPDRS_4.3",
        "UPDRS_4.4",
        "UPDRS_4.5",
        "UPDRS_4.6",
    )
    for column in updrs_columns:
        feature[column] = int_column(column)
    return tf.train.Example(features=tf.train.Features(feature=feature))

In [139]:
def gen():
    """Yield serialized `tf.train.Example` strings straight off the result queue.

    Items are taken from the module-level `toReturnQ` until it reports empty, and each
    one is converted with `parse_datum`.
    """
    # Fixed: the loop used to also require `not toRunQ.empty()`. The workers have
    # already drained toRunQ by the time this runs, so the generator stopped at once
    # and yielded nothing.
    while not toReturnQ.empty():
        datum = toReturnQ.get()
        yield parse_datum(datum).SerializeToString()

In [127]:
# Drain the result queue into an in-memory list so the items can be shuffled.
allResults = []
while not toReturnQ.empty():
    allResults.append(toReturnQ.get())

In [128]:
import random
from random import shuffle

# Seed the shuffle so the record order written to the TFRecord file is reproducible.
# (Worker results arrive in a nondeterministic order, so sort allResults first if
# run-to-run identical files are required.)
SHUFFLE_SEED = 0
random.seed(SHUFFLE_SEED)
shuffle(allResults)

In [137]:
# Subject id of the first item after shuffling (sanity check).
allResults[0][1].subject_id.iloc[0]

1023

In [140]:
#cuz im having issues with pulling directly off queue
def gen():
    """Yield one serialized `tf.train.Example` string per item of `allResults`, in list order."""
    yield from (parse_datum(item).SerializeToString() for item in allResults)

In [141]:
# Wrap the generator as a dataset of scalar strings (one serialized Example per element).
dataset = tf.data.Dataset.from_generator(gen, output_types=tf.string,  output_shapes=(tf.TensorShape([])))
# Write every element to a single TFRecord file at this path.
writer = tf.data.experimental.TFRecordWriter("/n/scratch2/beat_pd_ms_tmp/all_data.tfr")
writer.write(dataset)

In [37]:
# %%bash
# scancel 2919535