In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Directory that holds the input and output CSV files.
# It can be overridden without editing the notebook by setting the
# THESIS_CSV_DIR environment variable; when that is unset (or empty) the
# original hard-coded location is used.
# os.path.join(..., '') guarantees a trailing separator, because the other
# cells build file paths by plain concatenation (csv + '<name>.csv').
# NOTE(review): the name `csv` shadows the standard-library `csv` module; it is
# kept unchanged because the other cells reference it.
csv = os.path.join(
    os.environ.get('THESIS_CSV_DIR') or '/home/mei/nas/docker/thesis/data/csv/',
    '',
)

In [5]:
# Read flat.csv from the data directory into the `flat` DataFrame.
print('==> Loading data from labels and flat features files...')
flat_path = csv + 'flat.csv'
flat = pd.read_csv(flat_path)

==> Loading data from labels and flat features files...


In [6]:
# Opt in to pandas' future behaviour so that .replace() below does not silently
# downcast the object column.
pd.set_option('future.no_silent_downcasting', True)

# Make naming consistent with the other tables and index the rows by patient.
flat = flat.rename(columns={'patientunitstayid': 'patient'}).set_index('patient')

# apacheadmissiondx: admission diagnosis is dealt with in diagnoses.py, not in
# the flat features.
# unitdischargelocation / unitdischargestatus: removed from the feature table
# (NOTE(review): presumably because they are the targets kept in labels.csv — confirm).
flat = flat.drop(
    columns=['apacheadmissiondx', 'unitdischargelocation', 'unitdischargestatus']
)

## gender: male 1, female 0; rows with any other value (or missing) are dropped
flat['gender'] = flat['gender'].replace({'Male': 1, 'Female': 0})
# .copy() yields an independent frame, so the column assignments below do not
# write into a filtered slice (which raises SettingWithCopyWarning).
flat = flat[flat['gender'].isin([0, 1])].copy()
flat['gender'] = flat['gender'].astype(int)  # real integer dtype instead of object

## age: missing values are filled with 64; '> 89' is flagged in its own 0/1
## column and the age itself is then read as 89
# NOTE(review): the earlier comment said "fill na with 62 (mean), drop <18",
# but the code fills with 64 and never filters on age. The code's behaviour is
# kept here — confirm which fill value is intended and whether under-18 stays
# are already excluded upstream.
age_text = flat['age'].fillna('64').astype(str)  # astype(str) keeps .str usable if the column was parsed as numeric
flat['> 89'] = age_text.str.contains('> 89', regex=False).astype(int)
flat['age'] = age_text.str.replace('> ', '', regex=False).astype(float)

# Min-max scaling of the two weight columns: (x - min) / (max - min) per column.
features_for_min_max = ['admissionweight', 'dischargeweight']
weights = flat[features_for_min_max]
flat[features_for_min_max] = (weights - weights.min()) / (weights.max() - weights.min())

In [7]:
# Inspect the flat features after preprocessing.
flat

Unnamed: 0_level_0,hour,gender,age,admissionweight,dischargeweight,> 89
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
252784,15.0,1,56.0,0.261051,0.262847,0
253331,1.0,1,76.0,0.399234,0.397222,0
255112,5.0,1,52.0,0.176123,0.202083,0
258354,19.0,0,61.0,0.258893,0.261458,0
259414,1.0,1,81.0,0.374173,0.370486,0
...,...,...,...,...,...,...
3247116,21.0,0,52.0,0.214758,0.236111,0
3247360,21.0,0,25.0,0.170205,0.209375,0
3247421,1.0,1,59.0,0.265228,0.285069,0
3346588,17.0,0,71.0,0.422903,0.400347,0


In [8]:
# Read labels.csv from the data directory into the `labels` DataFrame.
labels_path = csv + 'labels.csv'
labels = pd.read_csv(labels_path)

In [9]:
# Inspect the raw labels table as loaded from labels.csv.
labels

Unnamed: 0,patientunitstayid,actualiculos,unitdischargelocation,unitdischargestatus
0,252784,2.0500,Floor,Alive
1,253331,1.7625,Floor,Alive
2,255112,10.7381,Death,Expired
3,258354,3.0090,Floor,Alive
4,259414,5.8562,Floor,Alive
...,...,...,...,...
10491,3247116,8.1256,Death,Expired
10492,3247360,3.0097,Floor,Alive
10493,3247421,2.9520,Floor,Alive
10494,3346588,2.1826,Telemetry,Alive


In [10]:
# Index the label table by patient, using the same column name as the flat features.
labels = labels.rename(columns={'patientunitstayid': 'patient'}).set_index('patient')

pd.set_option('future.no_silent_downcasting', True) # to avoid warning

# unitdischargestatus: Expired -> 1, Alive -> 0; rows with any other status
# (including missing values) are dropped.
labels['unitdischargestatus'] = labels['unitdischargestatus'].replace({'Expired': 1, 'Alive': 0})
# .copy() yields an independent frame, so later column assignments on `labels`
# do not write into a filtered slice (which raises SettingWithCopyWarning).
labels = labels[labels['unitdischargestatus'].isin([0, 1])].copy()
labels['unitdischargestatus'] = labels['unitdischargestatus'].astype(int)  # real integer dtype instead of object

In [11]:
## unitdischargelocation: group the raw values into 5 categories

def classify_discharge_location(location):
    """Return the discharge category for a raw unitdischargelocation value.

    The result is one of 'High Risk', 'Medium Risk', 'Low Risk', 'Death' or
    'Home'. Any value that is not listed in one of the groups below
    (for example a missing value) yields None.
    """
    # (category, members) pairs, checked in order; the groups do not overlap.
    groups = (
        ('High Risk', ['ICU', 'Other ICU', 'Other ICU (CABG)', 'Operating Room']),
        ('Medium Risk', ['Telemetry', 'Other Hospital', 'Other External', 'Other Internal', 'Step-Down Unit (SDU)']),
        ('Low Risk', ['Nursing Home', 'Skilled Nursing Facility', 'Floor', 'Acute Care/Floor']),
        ('Death', ['Death']),
        ('Home', ['Home', 'Rehabilitation']),
    )
    for category, members in groups:
        if location in members:
            return category
    return None


In [12]:

# Integer code for each discharge category returned by classify_discharge_location.
risk_mapping = { 'High Risk': 3, 'Medium Risk': 2, 'Low Risk': 1, 'Home': 0, 'Death': 4}

# Location -> category name -> integer code. Locations that
# classify_discharge_location does not recognise come back as None and map to
# NaN here, so no str() round-trip is needed to handle them.
risk_codes = (
    labels['unitdischargelocation']
    .apply(classify_discharge_location)
    .map(risk_mapping)
)

# Build the result as a new frame (no writes into a filtered slice):
# attach the code, drop rows whose location could not be classified, remove the
# raw location column, and store the code with a real integer dtype.
labels = (
    labels.assign(discharge_risk_category=risk_codes)
    .dropna(subset=['discharge_risk_category'])
    .drop(columns=['unitdischargelocation'])
    .astype({'discharge_risk_category': int})
)

print("number of patient after cleaning in unitdischargelocation:", len(labels))

number of patient after cleaning in unitdischargelocation: 10415


In [13]:
# Inspect the labels after cleaning and discharge-category encoding.
labels

Unnamed: 0_level_0,actualiculos,unitdischargestatus,discharge_risk_category
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
252784,2.0500,0,1
253331,1.7625,0,1
255112,10.7381,1,4
258354,3.0090,0,1
259414,5.8562,0,1
...,...,...,...
3247116,8.1256,1,4
3247360,3.0097,0,1
3247421,2.9520,0,1
3346588,2.1826,0,2


In [14]:
# Write both preprocessed tables next to the input files (index included).
print('==> Saving finalised preprocessed labels and flat features...')
for frame, filename in ((flat, 'preprocessed_flat.csv'), (labels, 'preprocessed_labels.csv')):
    frame.to_csv(csv + filename)

==> Saving finalised preprocessed labels and flat features...
