# CONFIG

In [None]:
# Bounds on the number of samples per patient. In this cell they only select
# the matching pre-scrubbed input file and tag the output file name.
MIN_LENGTH_OF_STAY = 8  # 4 or 8
MAX_LENGTH_OF_STAY = 60

# Per-patient (time-independent) variables.
STATIC_PARAM = [
    'WardID',
    'onO2',
    'Unconscious',
    'Gender',
    'StaticImputed',
    'WardChange',
]
# Time-varying measurements.
DYNAMIC_PARAM = [
    'Systolic',
    'Diastolic',
    'O2',
    'Pulse',
    'Temp',
    'Rf',
    'EwsScore',
]

# Ward selection; alternatives {'Ward2','Ward3','WardALL','WardNO'}
WARD_TO_USE = 'WardALL'
# '_dayMean' takes the mean of the values for each day; '' disables it.
USE_DAY_MEAN = ''
# 'ImputeGender' replaces a missing gender with a random one drawn at the
# same rate as in the data; '' disables it.
IMPUTE_GENDER = 'ImputeGender'
# 'noMissGender' removes patients whose gender is missing; '' disables it.
REMOVE_MISSING_GENDER = ''
GET_WARD_CHANGES = True

PATH_DATA = '../data/'

# Shared "O<min>U<max>" tag used by both the input and the output file name.
_stay_tag = f'O{MIN_LENGTH_OF_STAY}U{MAX_LENGTH_OF_STAY}'
_option_tag = f'{IMPUTE_GENDER}{REMOVE_MISSING_GENDER}{USE_DAY_MEAN}'

LOAD_DATA = f'EWS_0122-0423_scrubbed_{_stay_tag}.csv'
SAVE_AS = f'triplets_bodo_{_stay_tag}_{WARD_TO_USE}_{_option_tag}.pkl'

print(f"Save triplet output as: {SAVE_AS}")

In [None]:
import pandas as pd
import pickle 
from tqdm import tqdm
import numpy as np
import os

def set_random_gender(df_in, rate):
    """Impute a random gender for every patient whose gender is missing.

    A patient counts as "missing" when at least one of their rows has
    ``Gender == -1``. One value is drawn per such patient (1 with probability
    ``rate``, 0 otherwise) and written to all of that patient's rows.

    Args:
        df_in: DataFrame with at least the columns 'PatientID' and 'Gender'.
            It is not modified.
        rate: Probability of drawing gender 1.

    Returns:
        A copy of ``df_in`` with imputed 'Gender' values and a new column
        'StaticImputed' that is 1 on rows of imputed patients and 0 elsewhere.
    """
    df = df_in.copy()
    df['StaticImputed'] = 0

    patient_ids = df['PatientID']
    missing_ids = set(patient_ids[df['Gender'] == -1].unique())
    # Keep first-appearance order so each patient gets one draw, in the same
    # order a patient-by-patient loop would use.
    to_impute = [pid for pid in patient_ids.unique() if pid in missing_ids]
    if not to_impute:
        # Nothing to impute: skip the draw so an undefined rate cannot fail.
        return df

    draws = np.random.choice([0, 1], size=len(to_impute), p=[1 - rate, rate])
    gender_by_patient = dict(zip(to_impute, draws))

    mask = patient_ids.isin(to_impute)
    # to_numpy() makes the assignment positional, independent of the index.
    df.loc[mask, 'Gender'] = patient_ids[mask].map(gender_by_patient).to_numpy()
    df.loc[mask, 'StaticImputed'] = 1
    return df

# Load the scrubbed EWS measurements; keep the raw frame untouched in df_orig.
df_orig = pd.read_csv(PATH_DATA + LOAD_DATA)
df = df_orig.copy()

# Binary flag: 1 when AirOrO2 is 'Oksygen' (Norwegian for oxygen), else 0.
df['onO2'] = 0
df.loc[df.AirOrO2 == 'Oksygen', 'onO2'] = 1
del df["AirOrO2"]

# Binary flag: 1 for every Consciousness value other than 'Våken' (awake).
# NOTE(review): a missing Consciousness value also compares unequal to 'Våken'
# and is therefore flagged as Unconscious - confirm this is intended.
df['Unconscious'] = 0
df.loc[df.Consciousness != 'Våken', 'Unconscious'] = 1
del df["Consciousness"]

# Drop the stored row index and fix the column order used by the later cells.
del df['Index']
df = df.reindex(columns=[
    'PatientID', 'WardID', 'Timestamp', 'Systolic', 'Diastolic', 'O2', 'Pulse',
    'Temp', 'Rf', 'Gender', 'onO2', 'EwsScore', 'EwsType', 'Unconscious', 'WardName'])

# Encode gender numerically: M -> 1, F -> 0, unknown ('U' / 'None') -> -1.
# NOTE(review): only the literal strings 'U' and 'None' are mapped to -1; a
# gender that pandas parsed as NaN stays NaN here - verify against the CSV.
df.loc[df.Gender == 'M', 'Gender'] = 1
df.loc[df.Gender == 'F', 'Gender'] = 0
df.loc[df.Gender == 'U', 'Gender'] = -1
df.loc[df.Gender == 'None', 'Gender'] = -1

# Removing and imputing missing genders are mutually exclusive strategies.
if REMOVE_MISSING_GENDER == 'noMissGender' and IMPUTE_GENDER == 'ImputeGender':
    raise ValueError(f'{REMOVE_MISSING_GENDER} and {IMPUTE_GENDER} should not be used at the same time!')

if REMOVE_MISSING_GENDER == 'noMissGender':
    # Per-patient gender counts before and after dropping the unknown rows.
    value_counts = df.drop_duplicates(subset=['PatientID', 'Gender'])['Gender'].value_counts()
    print('Gender Before', value_counts)
    df = df[df['Gender'] != -1]
    value_counts = df.drop_duplicates(subset=['PatientID', 'Gender'])['Gender'].value_counts()
    print('Gender After', value_counts)

if IMPUTE_GENDER == 'ImputeGender':
    # impute Gender with 0, 1 with the same rate for the non missing values
    value_counts = df.drop_duplicates(subset=['PatientID', 'Gender'])['Gender'].value_counts()
    count_0 = value_counts.get(0, 0)
    count_1 = value_counts.get(1, 0)
    # Share of gender 1 among the (patient, gender) pairs with a known gender.
    rate = count_1/(count_0+count_1)
    print('Gender Before', value_counts)
    df = set_random_gender(df, rate)
    value_counts = df.drop_duplicates(subset=['PatientID', 'Gender'])['Gender'].value_counts()
    print('Gender After', value_counts)

if GET_WARD_CHANGES:
    # Number of distinct wards per patient minus one, repeated on every row.
    df['WardChange'] = df.groupby(['PatientID']).WardID.transform(lambda x: len(x.unique())-1)

# Use all wards or only a single WardID  # alternatives {'Ward2','Ward3','WardALL','WardNO'}
if WARD_TO_USE == 'WardALL':
    pass
elif WARD_TO_USE == 'Ward2':
    df = df[df.WardID == 2]
elif WARD_TO_USE == 'Ward3':
    df = df[df.WardID == 3]
elif WARD_TO_USE == 'WardNO':
    df = df.drop('WardID', axis=1)

print(f'{len(df.PatientID.unique())} patients in data set.')
# Rebind instead of sorting in place: df may be a filtered slice of the
# previous frame, where an in-place sort triggers chained-assignment warnings.
df = df.sort_values(by=['PatientID', 'Timestamp'], ascending=[True, True])

display(df)


In [None]:
# Normalize for optimal value: express each vital sign as its distance from a
# reference value. Every column not listed here is left unchanged.
optimal_values = {
    'Systolic': 120,
    'Diastolic': 80,
    'O2': 96,
    'Pulse': 70,
    'Temp': 37,
    'Rf': 16,
}

# WardName and EwsType are not used by the later cells.
df_optDist = df.drop(['WardName', 'EwsType'], axis=1)
for column, optimum in optimal_values.items():
    df_optDist[column] = df_optDist[column] - optimum

# NOTE: df is rebound to the shifted frame, so this cell must run only once
# per load; a second run fails on the already-dropped columns.
df = df_optDist

if WARD_TO_USE == 'WardNO':
    # The column is 'WardID'. It may already have been removed by the ward
    # selection in the loading cell, so a missing column is not an error.
    df = df.drop(columns=['WardID'], errors='ignore')
print(f'{len(df.PatientID.unique())} patients in data set.')

display(df)

# Use daily mean values

In [None]:
if USE_DAY_MEAN == '_dayMean':
    # Collapse each patient's measurements to one mean row per calendar day.
    # NOTE(review): the result stays in df_dayMean; the assignment back to df
    # is commented out, so later cells still read the per-measurement df and
    # the final display shows df, not df_dayMean - confirm this is intended.
    df_dayMean = df.copy()

    # Timestamp is parsed as epoch milliseconds; only the date part is kept.
    df_dayMean['Day'] = pd.to_datetime(df_dayMean['Timestamp'], unit='ms')
    df_dayMean['Day'] = df_dayMean['Day'].dt.date
    df_dayMean['Gender'] = df_dayMean['Gender'].fillna(-1)

    day_groups = df_dayMean.groupby(['PatientID', 'Day'])
    df_dayMean = day_groups.mean().reset_index()

    # Rows per patient after aggregation; patients with three or fewer rows
    # are removed.
    pat_count = df_dayMean['PatientID'].value_counts()
    single_day = pat_count[pat_count <= 3].index.tolist()
    print(f'{len(df_dayMean.PatientID.unique())} patients in data set.')
    keep_rows = ~df_dayMean['PatientID'].isin(single_day)
    df_dayMean = df_dayMean[keep_rows]
    print(f'{len(df_dayMean.PatientID.unique())} patients in data set.')

    # dayMean columns: PatientID  Timestamp  Day  WardID  Systolic  Diastolic  O2  Pulse  Temp  Rf  Gender  onO2  Unconscious ...
    # normal  columns: PatientID  Timestamp       WardID  Systolic  Diastolic  O2  Pulse  Temp  Rf  Gender  onO2  Unconscious
    day_mean_columns = [
        'PatientID', 'Timestamp', 'Day', 'WardID',
        'Systolic', 'Diastolic', 'O2', 'Pulse', 'Temp', 'Rf',
        'Gender', 'onO2', 'Unconscious', 'EwsScore',
        'StaticImputed', 'WardChange',
    ]
    df_dayMean = df_dayMean[day_mean_columns]
    # df = df_dayMean

    # Histogram of the per-patient row counts computed before the filtering.
    pat_count.hist(bins=30)
    print(f'{len(df_dayMean.PatientID.unique())} patients in data set.')
    display(df)


# Transform the data to triplet form.

In [None]:
# Build the per-variable statistics table: one row per column of df that is
# listed in DYNAMIC_PARAM or STATIC_PARAM, in df's column order.
df_var = pd.DataFrame(columns=['variable', 'mean', 'std'])

for variable in df.columns:

    # Dynamic variables use their mean and standard deviation over all rows.
    if variable in DYNAMIC_PARAM:
        dynamic_stats = {
            'variable': variable,
            'mean': df[variable].mean(),
            'std': df[variable].std(),
        }
        df_var.loc[len(df_var), df_var.columns] = dynamic_stats

    # Static variables get mean 0 and std 1.
    if variable in STATIC_PARAM:
        static_stats = {'variable': variable, 'mean': 0, 'std': 1}
        df_var.loc[len(df_var), df_var.columns] = static_stats

df_var

In [None]:
# make the outcome df, not used but needed for STraTS.
# One row per patient; ts_ind numbers the patients 0..n-1 and also creates the
# rows, so it has to be assigned before the scalar columns.
oc_triplet = pd.DataFrame(columns=['length_of_stay', 'in_hospital_mortality', 'ts_ind'])
oc_triplet['ts_ind'] = range(0, len(df.PatientID.unique()))

oc_triplet['in_hospital_mortality'] = 0
# add random ones to make the STraTS code run TODO: fix this to a better solution.
# Only rows that exist are flagged: with fewer than 889 patients the fixed
# label list would otherwise raise a KeyError.
placeholder_positive_rows = [row for row in (1, 3, 43, 652, 888) if row in oc_triplet.index]
oc_triplet.loc[placeholder_positive_rows, 'in_hospital_mortality'] = 1

# oc_triplet

In [None]:
# Transform the data to triplet form: one output row per
# (patient, measurement time, variable).
patient_ids = df.PatientID.unique()

# group the dataframe by PatientID
grouped = df.groupby('PatientID')

triplet_columns = ['ts_ind', 'PatientID', 'checkup', 'hour', 'variable', 'value', 'mean', 'std']

# Lookup table from the running series index to the original PatientID.
df_ts_id_PatientID = pd.DataFrame({'ts_ind': np.arange(len(patient_ids)), 'PatientID': patient_ids},
                                  columns=['ts_ind', 'PatientID'])

# Everything below depends only on df's columns and df_var, so it is computed
# once instead of once per patient.
normalize = df.columns.isin(DYNAMIC_PARAM)       # columns that get standardized
static_param = df.columns.isin(STATIC_PARAM)     # columns passed through as-is
value_columns = df.columns[normalize | static_param].tolist()
n_variables = len(value_columns)

# df_var rows follow df's column order, so these arrays line up with the
# selected columns.
dynamic_rows = df_var.variable.isin(df.columns[normalize])
dynamic_mean = df_var.loc[dynamic_rows, 'mean'].values
dynamic_std = df_var.loc[dynamic_rows, 'std'].values
value_rows = df_var.variable.isin(value_columns)
value_mean = df_var.loc[value_rows, 'mean'].values
value_std = df_var.loc[value_rows, 'std'].values

# Collect the per-patient frames and concatenate once after the loop;
# concatenating inside the loop copies the growing result on every iteration.
triplet_frames = []

for ts_id, ID in tqdm(enumerate(patient_ids), total=len(patient_ids),
                      desc=f'making data triplet style for {len(patient_ids)} patients'):
    patient_data = grouped.get_group(ID)

    # Hours since the patient's first measurement (Timestamp is in milliseconds).
    time_enter = patient_data.Timestamp.min()
    hours = (patient_data.Timestamp - time_enter) / 3600000
    # Running measurement number.
    # NOTE(review): assumes a patient's timestamps are unique; with duplicates
    # the tiled length below would not match the number of rows.
    checkups = range(0, len(np.unique(patient_data.Timestamp)))

    # normalize data except 'PatientID', 'Timestamp', 'WardID', 'Gender', 'onO2', 'Unconscious'
    normalized_values = (patient_data.loc[:, normalize] - dynamic_mean) / dynamic_std
    static_param_values = patient_data.loc[:, ~normalize]
    values = pd.concat([static_param_values, normalized_values], axis=1)

    # reshape data from wide to long format; melt stacks the variables in
    # value_columns order, each block holding the patient's rows in order
    values = pd.melt(values, id_vars=['Timestamp'], value_vars=value_columns, var_name='variable', value_name='value')
    values['hour'] = np.tile(hours.values, n_variables)
    values['checkup'] = np.tile(checkups, n_variables)
    values['ts_ind'] = ts_id
    values['PatientID'] = ID
    values['mean'] = np.repeat(value_mean, len(hours))
    values['std'] = np.repeat(value_std, len(hours))
    triplet_frames.append(values)

    # Hours between the first and the last row of this patient.
    oc_triplet.loc[ts_id, 'length_of_stay'] = hours.iloc[-1]

if triplet_frames:
    df_triplet = pd.concat(triplet_frames, ignore_index=True)
    # Triplet columns first, any remaining column (Timestamp) afterwards.
    extra_columns = [column for column in df_triplet.columns if column not in triplet_columns]
    df_triplet = df_triplet[triplet_columns + extra_columns]
else:
    df_triplet = pd.DataFrame(columns=triplet_columns)

df_triplet

In [None]:
# All series are used for training: the data is limited and is clustered later
# anyway, so good test/validation results are not the goal. The validation and
# test index sets are slices of the same series (70%-85% and 85%-100%).
n_series = len(oc_triplet.ts_ind)
bp1 = int(0.7 * n_series)
bp2 = int(0.85 * n_series)

ts_ind_column = oc_triplet['ts_ind']
train_ind = ts_ind_column.iloc[:].values.astype('int64')
valid_ind = ts_ind_column.iloc[bp1:bp2].values.astype('int64')
test_ind = ts_ind_column.iloc[bp2:].values.astype('int64')

n_patients = len(df_triplet.PatientID.unique())
print(f'{n_patients} patients in data set.')
print(f'Variables in dataset: {df_triplet.variable.unique()}')
df_triplet

# Save data.

In [None]:
# Pickle the triplet frame, the outcome frame and the three index arrays as a
# single list, but only when the target directory exists.
if os.path.exists(PATH_DATA):
    print('Saving Triplets as:', PATH_DATA+SAVE_AS)
    # The context manager closes (and flushes) the file even if dump fails.
    with open(PATH_DATA + SAVE_AS, 'wb') as pickle_file:
        pickle.dump([df_triplet, oc_triplet, train_ind, valid_ind, test_ind], pickle_file)
else:
    print('Can NOT save file, path does not exist, PATH_DATA:', PATH_DATA)
# df_ts_id_PatientID.to_excel('../data/bodo_ts_id_to_PatientID.xlsx')
