In [6]:
import numpy as np
import glob
import pandas as pd
from datetime import datetime, timedelta
import math

# create a map between the subject_deiden_id and the patient id
patient_map = {}
patient_enrollment = pd.read_excel('/data/daily_data/patient_id_mapping.xlsx')

for row in patient_enrollment.itertuples():
    patient_map[row.subject_deiden_id] = row.patient_id

# Medication information

In [14]:
files = glob.glob('/data/daily_data/*/meds*.csv',
                       recursive = True)
meds = []
taken_dates = []
patient = []
for file in files:
    df = pd.read_csv(file)
    med_info = np.char.lower(df["med_generic_name"].to_numpy(dtype=str))
    date = df["taken_datetime"].to_numpy()
    patient_id = df["patient_deiden_id"].to_numpy()
    for m, d, p in zip(med_info, date, patient_id):
        if not pd.isna(d):
            try:
                timestamp = datetime.strptime(d, '%Y-%m-%d')
            except Exception as e:
                timestamp = datetime.strptime(d, '%Y-%m-%d %H:%M:%S')


            try:
                patient.append(patient_map[p])
                meds.append(m)
                taken_dates.append(timestamp)
            except:
                pass

meds = np.asarray(meds)[:, np.newaxis]
taken_dates = np.asarray(taken_dates)[:, np.newaxis]
patient = np.asarray(patient)[:, np.newaxis]


array = np.concatenate([meds, taken_dates, patient], axis=1)
df_meds = pd.DataFrame(array, columns=["med_generic_name", "taken_dates", "patient_id"])
df_meds["taken_dates"] = pd.to_datetime(df_meds["taken_dates"])
print(df_meds.head())

  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)


         med_generic_name taken_dates patient_id
0  hydromorphone hcl-nacl  2021-11-19       P069
1  hydromorphone hcl-nacl  2021-11-19       P069
2  hydromorphone hcl-nacl  2021-11-19       P069
3  hydromorphone hcl-nacl  2021-11-19       P069
4  hydromorphone hcl-nacl  2021-11-19       P069


# Pain information

In [None]:
def process_dvprs(value):
    try:
        score = int(value.split(' ')[0])
    except:
        if value == 'Patient Asleep':
            score = -1
        elif np.isnan(value):
            score = -1
        else:
            sys.exit('error in dvprs')

    return score

files = glob.glob('/data/daily_data/*/pain*.csv',
                      recursive=True)

pain = []
timestamps = []
patient = []
for file in files:
    df = pd.read_csv(file)

    for index, row in df.iterrows():
        try:
            # standardize the timestamp
            # timestamp_patient_id
            timestamp = datetime.strptime(row['pain_datetime'], '%Y-%m-%d %H:%M:%S')
            timestamp = timestamp.strftime('%m-%d-%Y')
            patient_id = patient_map[row.patient_deiden_id]
            key = f"{timestamp}_{patient_id}"
            if row.measurement_name == "pain_uf_dvprs":
                try:
                    score = process_dvprs(row.measurement_value)
                except:
                    score = -1

                pain.append(score)
                patient.append(patient_id)
                timestamps.append(timestamp)
        except KeyError as e:
            #print(e)
            pass

pain = np.asarray(pain)[:, np.newaxis]
timestamps = np.asarray(timestamps)[:, np.newaxis]
patient = np.asarray(patient)[:, np.newaxis]
        
array = np.concatenate([pain, timestamps, patient], axis=1)
df_pain_scores = pd.DataFrame(array, columns=["pain_score", "datetime", "patient_id"])
df_pain_scores["datetime"] = pd.to_datetime(df_pain_scores["datetime"])

print(df_pain_scores.head())

# Accelerometer information

In [9]:
import os
import fnmatch
from tqdm import tqdm

def reduce_sampling_rate(data, df_timestamps, reduce_rate):
    data_ts = df_timestamps
    if reduce_rate != 0:
        number_samp = data.shape[0]
        samples_slct = list(range(0, number_samp, int(reduce_rate)))
        new_data = data[samples_slct]
        data_ts = data_ts[samples_slct]
        return new_data, data_ts
    else:
        return data, df_timestamps


def read_acc_file_pain_adapt(file_name):
    df_acc = pd.read_csv(file_name)
    timestamp_col = None
    for col in df_acc.columns:
        if "timestamp" in col.lower():
            timestamp_col = col
        if "timestamp" not in col.lower() and "accel" not in col.lower():
            df_acc.drop(col, inplace=True, axis=1)
        if "emg" in col.lower():
            raise Exception("EMG data found in accelerometer file")
    if timestamp_col and len(df_acc) >= 4:
        # in case there is more than one accelerometer, drop the others and keep the first one
        columns_to_keep = df_acc.columns[:4].to_numpy()
        diff = set(df_acc.columns.to_numpy()).difference(columns_to_keep)
        for col in diff:
            df_acc.drop(columns=[col], inplace=True)

        # curation already converted the timestamp to EST, so we dont need to convert it again
        # I had to convert timestamps to pandas due to cudf not supporting milliseconds
        # -  that is needed to calculate the frequency of the sensor
        # pain and adapt are in EST time zone. But the function bellow convert it to GMT by default
        #timestamps = df_acc[timestamp_col].to_numpy(dtype="datetime64[ns]")
        #return timestamps, df_acc.drop(columns=[timestamp_col]).to_cupy()
        df_acc[timestamp_col] =  pd.to_datetime(df_acc[timestamp_col])
        return df_acc
    else:
        #debug
        if not timestamp_col:
            print("File {}, message: No timestamp ", file_name)
        if len(df_acc) <= 4:
            print("File {}, message: Not enough columns ", file_name)
        if len(df_acc) == 0:
            print("File {}, message: Empty file ", file_name)
        return None


def read_acc_file_intelligenticu(file_name):
    df_acc = cudf.read_csv(file_name, header=10)
    if "Timestamp" not in df_acc.columns[0]:
        print("File {} has no timestamp column".format(file_name))
        return None
    # we are supposing Intelligent ICU is already on GMT timezone, so no need to convert it
    timestamps = cudf.Series(df_acc["Timestamp"], dtype="datetime64[ms]").to_numpy()
    return timestamps, df_acc.drop(columns=["Timestamp"]).to_cupy()

def get_accs_files(dir_dataset, dataset_name):
        accs = []
        patients = {}
        for root, dirs, files in os.walk(dir_dataset):
            for file in files:
                ends_string = "RAW.csv" if dataset_name == 'intelligent_icu' else "SD.csv"
                if file.endswith(ends_string):
                    if dataset_name == "intelligent_icu":
                        patient = root.split('/')[5].split("_")[1]
                    else:
                        patient = root.split('/')[5]
                    acc_csv = os.path.join(root, file)
                    if patient not in patients:
                        patients[patient] = False
                    # just get csv files from Accelerometer directories
                    if dataset_name == "intelligent_icu":
                        path = f'{dir_dataset}*/Accelerometer/*'
                    else:
                        path = f'{dir_dataset}*/*_Accel/Curated_file/*'

                    if fnmatch.fnmatch(root, path):
                        if 'wrist' in file.lower() or 'arm' in file.lower() or 'emg' in file.lower():
                        #if 'wrist' in file.lower() or 'arm' in file.lower():
                            if 'emg' not in file.lower():
                                patients[patient] = True
                                if acc_csv not in accs:
                                    accs.append(acc_csv)

        for pat, acc_flag in patients.items():
            if not acc_flag:
                print("Patient {}, message: no accelerometer data in directory", pat)

        return accs

### Pain project

In [10]:
dir_dataset = "/home/jsenadesouza/DA-healthy2patient/354_Sensor_data/"
accs_files = get_accs_files(dir_dataset, "pain")

#for file in tqdm(accs_files):
file = "/home/jsenadesouza/DA-healthy2patient/354_Sensor_data/P013/P013_Accel/Curated_file/2021-07-28_08.59.02_P013_arm3_SD_Session1/P013_arm3_Session1_P013_arm3_Calibrated_SD.csv"
samples = []
#try:
if 'wrist' not in file and 'arm' not in file:
    print(f"\nDiscarding: {file}")
else:
    print(f'\nKeeping: {file}')
    df = read_acc_file_pain_adapt(file)
    patient_id = file.split('/')[5]
    init_ts = df[df.columns[0]].min()
    last_ts = df[df.columns[0]].max()
    date_list = pd.date_range(start=init_ts, end=last_ts, inclusive="both", normalize=True)
    daily_acc = []
    for day in date_list:
        mask = (df[df.columns[0]] > day) & (df[df.columns[0]] < day+pd.Timedelta(days=1))
        acc_sequence = df[mask]
        if len(acc_sequence) > 0:
            daily_acc.append(df[mask])
            pain_mask = (df_pain_scores["datetime"] > day) & (df_pain_scores["datetime"] < day+pd.Timedelta(days=1))
            pain_scores = df_pain_scores[pain_mask]
            
            med_mask = (df_meds["taken_date"] > day) & (df_meds["taken_date"] < day+pd.Timedelta(days=1))
            med_list = df_meds[med_mask]
            
            if len(pain_scores) >0 and len(med_list) > 0:
                sample = {"patient_id": patient_id, 
                          "pain_scores": pain_scores, 
                          "meds": med_list, 
                          "acc":acc_sequence}
                samples.append(sample)
                
    

Patient {}, message: no accelerometer data in directory P001
Patient {}, message: no accelerometer data in directory P060
Patient {}, message: no accelerometer data in directory P030
Patient {}, message: no accelerometer data in directory P008
Patient {}, message: no accelerometer data in directory P035
Patient {}, message: no accelerometer data in directory P061
Patient {}, message: no accelerometer data in directory P022
Patient {}, message: no accelerometer data in directory P069
Patient {}, message: no accelerometer data in directory P056
Patient {}, message: no accelerometer data in directory P040
Patient {}, message: no accelerometer data in directory P071
Patient {}, message: no accelerometer data in directory P020
Patient {}, message: no accelerometer data in directory P045
Patient {}, message: no accelerometer data in directory P026
Patient {}, message: no accelerometer data in directory P066
Patient {}, message: no accelerometer data in directory P002
Patient {}, message: no 

<class 'TypeError'>: '>' not supported between instances of 'str' and 'Timestamp'

In [11]:
um_acc_raw = pd.read_csv("/data/datasets/ICU_Data/354_Sensor_Data/P057/Accel/2021-11-05_09.12.40_P057_arm3_SD_Session1/P057_arm3_Session1_P057_arm3_Calibrated_SD.csv", header=2, delimiter='\t')

In [12]:
um_acc_raw.head()

Unnamed: 0,ms,m/(s^2),m/(s^2).1,m/(s^2).2,m/(s^2).3,m/(s^2).4,m/(s^2).5,deg/s,deg/s.1,deg/s.2,Unnamed: 10
0,1636119000000.0,14.130435,8.782609,13.293478,-3.664871,6.420108,7.444644,48.320611,-53.740458,42.763359,
1,1636119000000.0,12.358696,12.402174,8.51087,-3.222023,5.17295,6.920407,30.931298,-59.145038,36.70229,
2,1636119000000.0,11.663043,12.206522,8.695652,-4.337522,5.247157,7.265111,18.671756,-71.816794,77.725191,
3,1636119000000.0,12.01087,11.315217,7.793478,-3.669659,4.689408,5.998803,-0.305344,-67.389313,110.793893,
4,1636119000000.0,9.934783,16.119565,8.0,-5.484141,7.425494,6.408139,-18.030534,-86.824427,150.045802,
