# Idea is to create a quick script to pull all the neural network data into a single file for quick access
## CIS-PD Datasetup first

We are going to resample to 25 Hz since the data appears to be irregularly sampled. We will take 1 minute time samples with 15 seconds of overlap (4x the original data which takes up 9 gb of space)

In [6]:
import joblib
import sys, os
sys.path.append(os.path.realpath(".."))

In [7]:
import src.main as main
import pandas as pd
import numpy as np
from importlib import reload
reload(main)

<module 'src.main' from '/home/ms994/beat_pd2/src/main.py'>

In [4]:
labels = pd.read_csv("/home/ms994/beat_pd/data/cis-pd/data_labels/CIS-PD_Training_Data_IDs_Labels.csv")
additional_labels = pd.read_csv("/home/ms994/beat_pd/data/cis-pd/data_labels/CIS-PD_Ancillary_Data_IDs_Labels.csv")

In [5]:
demo_data = pd.read_csv("/home/ms994/beat_pd/data/cis-pd/clinical_data/CIS-PD_Demographics.csv")
updrs_1 = pd.read_csv("/home/ms994/beat_pd/data/cis-pd/clinical_data/CIS-PD_UPDRS_Part1_2_4.csv")
updrs_2 = pd.read_csv("/home/ms994/beat_pd/data/cis-pd/clinical_data/CIS-PD_UPDRS_Part3.csv")

In [6]:
updrs_1.head()

Unnamed: 0,subject_id,Visit,UPDRS_PartI_Total,UPDRS_PartII_Total,UPDRS_4.1,UPDRS_4.2,UPDRS_4.3,UPDRS_4.4,UPDRS_4.5,UPDRS_4.6
0,1000,Baseline,4,5,0,0,1,3,1,1
1,1004,Baseline,13,6,1,3,1,4,3,0
2,1006,Baseline,10,19,0,0,1,0,2,0
3,1007,Baseline,18,9,0,0,2,3,1,2
4,1016,Baseline,15,2,0,0,0,0,0,0


In [7]:
all_data = demo_data.join(updrs_1.set_index("subject_id"), on="subject_id").join(updrs_2.set_index("subject_id").drop(["Visit"], axis=1))

In [8]:
labels = pd.concat([labels, additional_labels])

In [10]:
labels = labels.reset_index()

In [11]:
labels = labels.join(all_data.set_index("subject_id"), how="inner", on="subject_id")

In [14]:
labels = labels.dropna(how="all")
labels = labels.dropna(how="all", axis=1)
pd.isnull(labels).sum(axis=0)/labels.shape[0]

level_0               0.000000
index                 0.000000
measurement_id        0.000000
subject_id            0.000000
on_off                0.041176
dyskinesia            0.303167
tremor                0.179186
Age                   0.000000
Gender                0.000000
Race                  0.020362
Ethnicity             0.000000
Visit                 0.000000
UPDRS_PartI_Total     0.000000
UPDRS_PartII_Total    0.000000
UPDRS_4.1             0.000000
UPDRS_4.2             0.000000
UPDRS_4.3             0.000000
UPDRS_4.4             0.000000
UPDRS_4.5             0.000000
UPDRS_4.6             0.000000
dtype: float64

In [15]:
labels["Visit"].unique() #useless label

array(['Baseline'], dtype=object)

In [16]:
labels = labels.drop(["Race", "Ethnicity", "Visit"], axis=1)

In [48]:
labels.max(axis=0)

index                                                 1857
measurement_id        ffd64945-4be2-47d1-a706-bf3e03cbf3b3
subject_id                                            1051
on_off                                                   4
dyskinesia                                               4
tremor                                                   4
Age                                                     81
Gender                                                Male
UPDRS_PartI_Total                                       25
UPDRS_PartII_Total                                      19
UPDRS_4.1                                                2
UPDRS_4.2                                                3
UPDRS_4.3                                                2
UPDRS_4.4                                                4
UPDRS_4.5                                                3
UPDRS_4.6                                                3
dtype: object

In [49]:
labels.min(axis=0)

index                                                    0
measurement_id        004ed441-24db-4839-8b5d-7465e4ea2a0a
subject_id                                            1000
on_off                                                   0
dyskinesia                                               0
tremor                                                   0
Age                                                     36
Gender                                              Female
UPDRS_PartI_Total                                        4
UPDRS_PartII_Total                                       2
UPDRS_4.1                                                0
UPDRS_4.2                                                0
UPDRS_4.3                                                0
UPDRS_4.4                                                0
UPDRS_4.5                                                0
UPDRS_4.6                                                0
dtype: object

In [50]:
pd.isna(labels).any(axis=0)

index                 False
measurement_id        False
subject_id            False
on_off                 True
dyskinesia             True
tremor                 True
Age                   False
Gender                False
UPDRS_PartI_Total     False
UPDRS_PartII_Total    False
UPDRS_4.1             False
UPDRS_4.2             False
UPDRS_4.3             False
UPDRS_4.4             False
UPDRS_4.5             False
UPDRS_4.6             False
dtype: bool

In [19]:
poss_missing_labels = ["on_off", "dyskinesia", "tremor"]

In [20]:
labels[labels.measurement_id=="014aa655-5dbf-448b-99ad-09fd7776a682"]

Unnamed: 0,level_0,index,measurement_id,subject_id,on_off,dyskinesia,tremor,Age,Gender,UPDRS_PartI_Total,UPDRS_PartII_Total,UPDRS_4.1,UPDRS_4.2,UPDRS_4.3,UPDRS_4.4,UPDRS_4.5,UPDRS_4.6
1962,1962,104,014aa655-5dbf-448b-99ad-09fd7776a682,1018,0.0,0.0,0.0,58,Male,6,2,0,0,0,0,0,0


In [21]:
all_m_id = labels.measurement_id
test_m_id = all_m_id[0]

In [22]:
#lets go and filter data with some utility funcs
from scipy.signal import butter, lfilter

# https://scipy-cookbook.readthedocs.io/items/ButterworthBandpass.html

# grabs some filter constants for making bandpass filter
#    order is kinda like strength of filter... higher leads to more ideal filter but has weird interactions near the edges of the filter
#    lower is less ideal but results in less artifacts being generated
#    this is a bit of a cheat anyways, using a lowpass and highpass together, there may be less sketch filter designs applicable
def butter_bandpass(lowcut, highcut, fs, order=5):
    nyq = 0.5 * fs
    low = lowcut / nyq
    high = highcut / nyq
    b, a = butter(order, [low, high], btype='band')
    return b, a

def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    b, a = butter_bandpass(lowcut, highcut, fs, order=order)
    y = lfilter(b, a, data)
    return y
def butter_lp_filter(data, lowcut, fs, order=5):
    nyq = 0.5 * fs #just get the highest freq possible (nyquist, and bandgap it!)
    hc = nyq * 0.9 #can't accept exactly nyq
    print("hc:", hc)
    return butter_bandpass_filter(data, lowcut, hyc, fs, order)
def butter_bandgap_filter(data, lowcut, highcut, fs, order=5):
    if highcut is None:
        return butter_bandgap_filter(data, lowcut, fs, order)
    toRemove = butter_bandpass_filter(data, lowcut, highcut, fs, order)
    return data - toRemove

In [23]:
def get_data(m_id):
    return main.read_seq(
        f"/home/ms994/beat_pd/data/cis-pd/training_data/{m_id}.csv",
        t_colname="Timestamp",
        xyz_colnames=["X", "Y", "Z"],
        use_time_index=True,
        resample=pd.Timedelta(seconds=1/25)
    )
def get_preprocessed_data(measurement_id, low_f=1, high_f=10):
    data = main.read_seq(
        f"/home/ms994/beat_pd/data/cis-pd/training_data/{measurement_id}.csv",
        t_colname="Timestamp",
        xyz_colnames=["X", "Y", "Z"],
        use_time_index=True,
        resample=pd.Timedelta(seconds=1/25)
    )
    timeIndex = data.index
    dataCol = data.columns
    #50 hz, make a bandpass between 1 and 10 hz, with order of 5
    data = butter_bandpass_filter(data, low_f, high_f, 50, 5)
    data = pd.DataFrame(data, index=timeIndex, columns=dataCol)
    return data

In [24]:
def read_mid_and_split(m_id, max_window=pd.Timedelta(seconds=60), overlap=pd.Timedelta(seconds=10)):
    try:
        data = get_data(m_id)
        all_samples = []
        currentIndex = pd.Timedelta(seconds=0)
        while (currentIndex + max_window < data.index.max()):
            all_samples.append(data.loc[currentIndex:currentIndex+max_window].values)
            currentIndex += overlap
        return all_samples
    except:
        return []

In [25]:
all_m_id = sorted(all_m_id) #keep a constant m_id

In [26]:
all_m_id[0]

'004ed441-24db-4839-8b5d-7465e4ea2a0a'

In [27]:
def write_data(to_run_q, to_return_q):
    for m_id in iter(to_run_q.get, None):
        data_samples = read_mid_and_split(m_id) 
        label = labels[labels.measurement_id == m_id]
        for data_sample in data_samples:
            to_return_q.put((data_sample, label))

In [28]:
all_m_id[0]

'004ed441-24db-4839-8b5d-7465e4ea2a0a'

In [29]:
from multiprocessing import Process, Manager
m = Manager()
n_process = 9
toRunQ = m.Queue()
toReturnQ = m.Queue()
p = [Process(target=write_data, args=(toRunQ, toReturnQ)) for i in range(n_process)]
[toRunQ.put(m_id) for m_id in all_m_id]
[toRunQ.put(None) for i in range(n_process)]

[None, None, None, None, None, None, None, None, None]

In [30]:
[process.start() for process in p]

[None, None, None, None, None, None, None, None, None]

In [31]:
[process.join() for process in p]

[None, None, None, None, None, None, None, None, None]

In [32]:
toReturnQ.qsize()

204098

In [34]:
all_subjects = sorted(labels.subject_id.unique())

In [36]:
def gen():
    while not toRunQ.empty() and not toReturnQ.empty():
        data = toReturnQ.get()
        yield parse_datum(data).SerializeToString()

In [37]:
allResults = []
while not toReturnQ.empty():
    allResults.append(toReturnQ.get())

In [79]:
from random import shuffle
shuffle(allResults)

In [None]:
allRes

In [39]:
allLabels = [res[1] for res in allResults]

In [46]:
df = pd.concat(allLabels)
df.head()

Unnamed: 0,level_0,index,measurement_id,subject_id,on_off,dyskinesia,tremor,Age,Gender,UPDRS_PartI_Total,UPDRS_PartII_Total,UPDRS_4.1,UPDRS_4.2,UPDRS_4.3,UPDRS_4.4,UPDRS_4.5,UPDRS_4.6
1632,1632,1632,00544f67-c07c-4a07-9c17-a7aee51d8b96,1049,2.0,1.0,2.0,54,Female,25,14,1,1,1,2,3,3
1632,1632,1632,00544f67-c07c-4a07-9c17-a7aee51d8b96,1049,2.0,1.0,2.0,54,Female,25,14,1,1,1,2,3,3
1632,1632,1632,00544f67-c07c-4a07-9c17-a7aee51d8b96,1049,2.0,1.0,2.0,54,Female,25,14,1,1,1,2,3,3
1632,1632,1632,00544f67-c07c-4a07-9c17-a7aee51d8b96,1049,2.0,1.0,2.0,54,Female,25,14,1,1,1,2,3,3
1632,1632,1632,00544f67-c07c-4a07-9c17-a7aee51d8b96,1049,2.0,1.0,2.0,54,Female,25,14,1,1,1,2,3,3


In [35]:
import pickle as pkl
pkl.dump(allResults, open("/n/scratch2/ms994/allZeData.pkl", "wb"))

NameError: name 'allResults' is not defined

## set up the eval set

In [5]:
import glob
import pandas as pd

In [2]:
eval_set = glob.glob("/home/ms994/beat_pd/data/test_set/cis-pd/testing_data/*.csv")

In [25]:
def get_data(m_id):
    return main.read_seq(
        f"{m_id}",
        t_colname="Timestamp",
        xyz_colnames=["X", "Y", "Z"],
        use_time_index=True,
        resample=pd.Timedelta(seconds=1/25)
    )
def read_mid_and_split(m_id, max_window=pd.Timedelta(seconds=60), overlap=pd.Timedelta(seconds=10)):
#     try:
        data = get_data(m_id)
        all_samples = []
        currentIndex = pd.Timedelta(seconds=0)
        while (currentIndex + max_window < data.index.max()):
            all_samples.append(data.loc[currentIndex:currentIndex+max_window].values)
            currentIndex += overlap
        return all_samples
#     except:
#         return []

In [38]:
def write_data(to_run_q, to_return_q):
    for m_id in iter(to_run_q.get, None):
        data_samples = read_mid_and_split(m_id) 
        for data_sample in data_samples:
            to_return_q.put((data_sample, m_id))

In [28]:
all_m_id[0]

'004ed441-24db-4839-8b5d-7465e4ea2a0a'

In [39]:
from multiprocessing import Process, Manager
m = Manager()
n_process = 12
toRunQ = m.Queue()
toReturnQ = m.Queue()
p = [Process(target=write_data, args=(toRunQ, toReturnQ)) for i in range(n_process)]
[toRunQ.put(m_id) for m_id in eval_set]
[toRunQ.put(None) for i in range(n_process)]

[None, None, None, None, None, None, None, None, None, None, None, None]

In [40]:
[process.start() for process in p]

[None, None, None, None, None, None, None, None, None, None, None, None]

In [41]:
[process.join() for process in p]

[None, None, None, None, None, None, None, None, None, None, None, None]

In [34]:
toRunQ.qsize()

0

In [34]:
all_subjects = sorted(labels.subject_id.unique())

In [36]:
def gen():
    while not toRunQ.empty() and not toReturnQ.empty():
        data = toReturnQ.get()
        yield parse_datum(data).SerializeToString()

In [37]:
allResults = []
while not toReturnQ.empty():
    allResults.append(toReturnQ.get())

In [79]:
from random import shuffle
shuffle(allResults)

In [42]:
allEvalResults = []
while not toReturnQ.empty():
    allEvalResults.append(toReturnQ.get())

In [44]:
import pickle as pkl
pkl.dump(allEvalResults, open("/n/scratch2/ms994/cispdEvalset.pkl", "wb"))

# TFRecords setup
reduces memory usage to maintain slurm sshare fair usage credits

In [3]:
import tensorflow as tf
# Helperfunctions to make your feature definition more readable
def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _int64_feature_list(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
# Helperfunctions to make your feature definition more readable
def _float_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _float_feature_list(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))
def parse_datum(datum):
    xData = datum[0]
    yData = datum[1].fillna(-1) #using -1 as a masking value
    
    
    
    feature = { \
               'data': _float_feature_list(xData[:1500].reshape(-1)), \
               'on_off': _int64_feature_list(yData.on_off.values.astype(int)), \
               'dyskinesia': _int64_feature_list(yData.dyskinesia.values.astype(int)), \
               'measurement_id': _int64_feature(all_m_id.index(yData.measurement_id.values[0])), \
               'tremor': _int64_feature_list(yData.tremor.values.astype(int)), \
               'age': _int64_feature_list(yData.Age.values.astype(int)), \
               'subjects': _int64_feature(all_subjects.index(yData.subject_id.values[0])),
               'gender': _int64_feature_list(yData.Gender.apply(lambda x: x=="Male").values.astype(int)), \
               'UPDRS_PartI_Total': _int64_feature_list(yData.UPDRS_PartI_Total.values.astype(int)), \
               'UPDRS_PartII_Total': _int64_feature_list(yData.UPDRS_PartII_Total.values.astype(int)), \
               'UPDRS_4.1': _int64_feature_list(yData["UPDRS_4.1"].values.astype(int)), \
               'UPDRS_4.2': _int64_feature_list(yData["UPDRS_4.2"].values.astype(int)), \
               'UPDRS_4.3': _int64_feature_list(yData["UPDRS_4.3"].values.astype(int)), \
               'UPDRS_4.4': _int64_feature_list(yData["UPDRS_4.4"].values.astype(int)), \
               'UPDRS_4.5': _int64_feature_list(yData["UPDRS_4.5"].values.astype(int)), \
               'UPDRS_4.6': _int64_feature_list(yData["UPDRS_4.6"].values.astype(int)),
              }
    return tf.train.Example(features=tf.train.Features(feature=feature))

In [81]:
#cuz im having issues with pulling directly off queue
def gen():
    for datum in allResults:
        yield parse_datum(datum).SerializeToString()

In [82]:
dataset = tf.data.Dataset.from_generator(gen, output_types=tf.string,  output_shapes=(tf.TensorShape([])))
writer = tf.data.experimental.TFRecordWriter("/n/scratch2/beat_pd_ms_tmp/all_data.tfr")
writer.write(dataset)

In [None]:
# %%bash
# scancel 2919535

## Confirm TFRecords data correctly wrote down
Confirmation that data is correct

In [3]:
import tensorflow as tf
tf.enable_eager_execution()
# Helperfunctions to make your feature definition more readable

def read_tfrecord(example):
    features = { \
                'data':  tf.io.FixedLenFeature([1500*3], tf.float32,),\
                'on_off':  tf.io.FixedLenFeature([1], tf.int64,),\
                'dyskinesia':  tf.io.FixedLenFeature([1], tf.int64,),
                'measurement_id':  tf.io.FixedLenFeature([1], tf.int64,),\
                'tremor':  tf.io.FixedLenFeature([1], tf.int64,), \
                'age':  tf.io.FixedLenFeature([1], tf.int64,), \
                "subjects": tf.io.FixedLenFeature([1], tf.int64), \
                "gender": tf.io.FixedLenFeature([1], tf.int64), \
                "UPDRS_PartI_Total": tf.io.FixedLenFeature([1], tf.int64), \
                "UPDRS_PartII_Total": tf.io.FixedLenFeature([1], tf.int64), \
                "UPDRS_4.1": tf.io.FixedLenFeature([1], tf.int64), \
                "UPDRS_4.2": tf.io.FixedLenFeature([1], tf.int64), \
                "UPDRS_4.3": tf.io.FixedLenFeature([1], tf.int64), \
                "UPDRS_4.4": tf.io.FixedLenFeature([1], tf.int64), \
                "UPDRS_4.5": tf.io.FixedLenFeature([1], tf.int64), \
                "UPDRS_4.6": tf.io.FixedLenFeature([1], tf.int64)
               }

    example = tf.io.parse_single_example(example, features)
    return example
def map_example_to_simple(example):
    data = example['data']
    data = tf.reshape(data, (1500,3))
    return data, (example['on_off'][0],)
dataset = tf.data.Dataset.list_files("/n/scratch2/beat_pd_ms_tmp/all_data.tfr")
option_no_order = tf.data.Options()
option_no_order.experimental_deterministic = False
dataset = dataset.with_options(option_no_order)
dataset = dataset.interleave(tf.data.TFRecordDataset, cycle_length=1, num_parallel_calls=1)

dataset = dataset.map(read_tfrecord, num_parallel_calls=1)

In [11]:
iterator = dataset.take(20000).make_one_shot_iterator()

In [12]:
m_id = []
on_off = []

for i in iterator:
    m_id.append(i["measurement_id"][0].numpy())
    on_off.append(i["on_off"][0].numpy())    

In [14]:
df = pd.DataFrame([m_id, on_off])

# RealPD
## RealPD data setup

In [45]:
labels = pd.read_csv("/home/ms994/beat_pd/data/real-pd/data_labels/REAL-PD_Training_Data_IDs_Labels.csv")
additional_labels = pd.read_csv("/home/ms994/beat_pd/data/real-pd/data_labels/REAL-PD_Ancillary_Data_IDs_Labels.csv")

In [47]:
demo_data = pd.read_csv("/home/ms994/beat_pd/data/real-pd/clinical_data/REAL-PD_Demographics.csv")
updrs_1 = pd.read_csv("/home/ms994/beat_pd/data/real-pd/clinical_data/REAL-PD_UPDRS_Part1_2_4.csv")
updrs_2 = pd.read_csv("/home/ms994/beat_pd/data/real-pd/clinical_data/REAL-PD_UPDRS_Part3.csv")

In [48]:
all_data = demo_data.join(updrs_1.set_index("subject_id"), on="subject_id").join(updrs_2.set_index("subject_id"))

In [49]:
labels = pd.concat([labels, additional_labels])

In [50]:
labels = labels.reset_index()

In [51]:
labels = labels.join(all_data.set_index("subject_id"), how="inner", on="subject_id")

In [66]:
def get_data(m_id):
    return main.read_seq(
        f"/home/ms994/beat_pd/data/real-pd/training_data/smartwatch_accelerometer/{m_id}.csv",
        
        use_time_index=True,
        resample=pd.Timedelta(seconds=1/25)
    )
def get_preprocessed_data(measurement_id, low_f=1, high_f=10):
    data = main.read_seq(
        f"/home/ms994/beat_pd/data/real-pd/training_data/{measurement_id}.csv",
        t_colname="Timestamp",
        xyz_colnames=["X", "Y", "Z"],
        use_time_index=True,
        resample=pd.Timedelta(seconds=1/25)
    )
    timeIndex = data.index
    dataCol = data.columns
    #50 hz, make a bandpass between 1 and 10 hz, with order of 5
    data = butter_bandpass_filter(data, low_f, high_f, 50, 5)
    data = pd.DataFrame(data, index=timeIndex, columns=dataCol)
    return data

In [67]:
def read_mid_and_split(m_id, max_window=pd.Timedelta(seconds=60), overlap=pd.Timedelta(seconds=10)):
        data = get_data(m_id)
        all_samples = []
        currentIndex = pd.Timedelta(seconds=0)
        while (currentIndex + max_window < data.index.max()):
            all_samples.append(data.loc[currentIndex:currentIndex+max_window].values)
            currentIndex += overlap
        return all_samples

In [68]:
all_m_id = labels.measurement_id
test_m_id = all_m_id[0]
all_m_id = sorted(all_m_id) #keep a constant m_id

In [62]:
all_m_id[0]

'00a49337-386c-4de3-a220-4cf3c0d20a7d'

In [75]:
def write_data(to_run_q, to_return_q):
        for m_id in iter(to_run_q.get, None):
            try:
                data_samples = read_mid_and_split(m_id) 
                label = labels[labels.measurement_id == m_id]
                for data_sample in data_samples:
                    to_return_q.put((data_sample, label))
            except:
                print("fail")
        

In [76]:
from multiprocessing import Process, Manager
m = Manager()
n_process = 12
toRunQ = m.Queue()
toReturnQ = m.Queue()
p = [Process(target=write_data, args=(toRunQ, toReturnQ)) for i in range(n_process)]
[toRunQ.put(m_id) for m_id in all_m_id]
[toRunQ.put(None) for i in range(n_process)]

[None, None, None, None, None, None, None, None, None, None, None, None]

In [77]:
[process.start() for process in p]

fail
fail
fail
fail
fail


[None, None, None, None, None, None, None, None, None, None, None, None]

fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail
fail


In [81]:
toReturnQ.qsize()

19565

In [82]:
[process.join() for process in p]

[None, None, None, None, None, None, None, None, None, None, None, None]

In [30]:
toRunQ.qsize()

0

In [34]:
all_subjects = sorted(labels.subject_id.unique())

In [36]:
def gen():
    while not toRunQ.empty() and not toReturnQ.empty():
        data = toReturnQ.get()
        yield parse_datum(data).SerializeToString()

In [83]:
allResults = []
while not toReturnQ.empty():
    allResults.append(toReturnQ.get())

In [79]:
from random import shuffle
shuffle(allResults)

In [84]:
import pickle as pkl
pkl.dump(allResults, open("/n/scratch2/ms994/realPDdata.pkl","wb"))

## Eval

In [85]:
import glob
import pandas as pd

In [88]:
eval_set = glob.glob("/home/ms994/beat_pd/data/test_set/real-pd/testing_data/smartwatch_accelerometer/*.csv")

In [92]:
def get_data(m_id):
    return main.read_seq(
        f"{m_id}",
        use_time_index=True,
        resample=pd.Timedelta(seconds=1/25)
    )
def read_mid_and_split(m_id, max_window=pd.Timedelta(seconds=60), overlap=pd.Timedelta(seconds=10)):
#     try:
        data = get_data(m_id)
        all_samples = []
        currentIndex = pd.Timedelta(seconds=0)
        while (currentIndex + max_window < data.index.max()):
            all_samples.append(data.loc[currentIndex:currentIndex+max_window].values)
            currentIndex += overlap
        return all_samples
#     except:
#         return []

In [93]:
def write_data(to_run_q, to_return_q):
    for m_id in iter(to_run_q.get, None):
        data_samples = read_mid_and_split(m_id) 
        for data_sample in data_samples:
            to_return_q.put((data_sample, m_id))

In [28]:
all_m_id[0]

'004ed441-24db-4839-8b5d-7465e4ea2a0a'

In [99]:
from multiprocessing import Process, Manager
m = Manager()
n_process = 12
toRunQ = m.Queue()
toReturnQ = m.Queue()
p = [Process(target=write_data, args=(toRunQ, toReturnQ)) for i in range(n_process)]
[toRunQ.put(m_id) for m_id in eval_set]
[toRunQ.put(None) for i in range(n_process)]

[None, None, None, None, None, None, None, None, None, None, None, None]

In [100]:
[process.start() for process in p]

[None, None, None, None, None, None, None, None, None, None, None, None]

In [101]:
[process.join() for process in p]

[None, None, None, None, None, None, None, None, None, None, None, None]

In [97]:
toRunQ.qsize()

0

In [34]:
all_subjects = sorted(labels.subject_id.unique())

In [36]:
def gen():
    while not toRunQ.empty() and not toReturnQ.empty():
        data = toReturnQ.get()
        yield parse_datum(data).SerializeToString()

In [79]:
from random import shuffle
shuffle(allResults)

In [102]:
allEvalResults = []
while not toReturnQ.empty():
    allEvalResults.append(toReturnQ.get())

In [103]:
import pickle as pkl
pkl.dump(allEvalResults, open("/n/scratch2/ms994/realpdEvalset.pkl", "wb"))