In [1]:
import pandas as pd
import numpy as np

In [2]:
# Directory holding the input and output CSV files. Keep the trailing slash:
# file paths are built with plain string concatenation (e.g. csv + 'flat.csv').
# NOTE(review): this is an absolute, machine-specific path, and the name shadows
# the stdlib `csv` module — consider a configurable data directory.
csv= '/home/mei/nas/docker/thesis/data/csv/'

In [6]:
# Read the flat features table. Only flat.csv is read in this cell, even though
# the message also mentions labels; labels.csv is read in a later cell.
print('==> Loading data from labels and flat features files...')
flat = pd.read_csv(csv + 'flat.csv')

==> Loading data from labels and flat features files...


In [7]:
# Make naming consistent with the other tables and index the frame by patient.
# apacheadmissiondx is dealt with in diagnoses.py, not in the flat features;
# the two unit-discharge columns are dropped here too (labels.csv carries them).
flat = (
    flat
    .rename(columns={'patientunitstayid': 'patient'})
    .set_index('patient')
    .drop(columns=['apacheadmissiondx', 'unitdischargelocation', 'unitdischargestatus'])
)

# gender: Male -> 1, Female -> 0. Any other value (including missing) maps to
# NaN and the row is dropped. `.map` avoids the silent-downcasting deprecation
# that `replace` hits on recent pandas, and `.copy()` makes the filtered frame
# independent so the column assignments below cannot raise
# SettingWithCopyWarning.
gender_code = flat['gender'].map({'Male': 1, 'Female': 0})
known_gender = gender_code.notna()
flat = flat.loc[known_gender].copy()
flat['gender'] = gender_code[known_gender].astype(int)

# age: ages above 89 appear as the string '> 89'. Missing ages are filled with
# 64, '> 89' is flagged in its own 0/1 indicator column, and the age itself is
# stored as the float 89.
# NOTE(review): the previous comment said "fill na with 62 (mean), drop <18",
# but the code fills with 64 and never drops under-18 patients — confirm which
# behaviour is intended.
# `astype(str)` keeps the `.str` accessor working even if the column was not
# parsed as pure text.
age_text = flat['age'].fillna('64').astype(str)
flat['> 89'] = age_text.str.contains('> 89', regex=False).astype(int)
flat['age'] = age_text.str.replace('> ', '', regex=False).astype(float)

# min-max scaling for the rest of the features; min/max skip NaN, so missing
# weights stay NaN after scaling.
features_for_min_max = ['admissionweight', 'dischargeweight']
weights = flat[features_for_min_max]
flat[features_for_min_max] = (weights - weights.min()) / (weights.max() - weights.min())

In [8]:
# Display the preprocessed flat features for a visual check.
flat

Unnamed: 0_level_0,hour,gender,age,admissionweight,dischargeweight,> 89
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
141168,15.0,0,70.0,0.098596,0.252353,0
141265,14.0,1,67.0,0.116959,0.270000,0
141266,21.0,1,73.0,0.140819,0.332059,0
141276,14.0,0,59.0,0.183158,0.460588,0
141284,23.0,1,63.0,,0.260294,0
...,...,...,...,...,...,...
3353094,23.0,0,78.0,0.069123,0.169412,0
3353140,10.0,0,89.0,0.080468,0.202353,1
3353147,17.0,1,24.0,0.179415,0.445000,0
3353194,10.0,0,51.0,0.073743,0.193529,0


In [22]:
# Read the labels table; the path is built by concatenation, so `csv` must end with '/'.
labels = pd.read_csv(csv + 'labels.csv')

In [23]:
# Display the labels exactly as loaded, before any cleaning.
labels

Unnamed: 0,patientunitstayid,actualiculos,unitdischargelocation,unitdischargestatus
0,141168,2.4972,Death,Expired
1,141265,4.2138,Floor,Alive
2,141266,1.0423,Floor,Alive
3,141276,1.1694,Home,Alive
4,141284,1.4416,Floor,Alive
...,...,...,...,...
40782,3353094,5.4923,Step-Down Unit (SDU),Alive
40783,3353140,1.3243,Skilled Nursing Facility,Alive
40784,3353147,1.0888,Floor,Alive
40785,3353194,2.4930,Death,Expired


In [24]:
# Index by patient, matching the naming used for the flat features.
labels = labels.rename(columns={'patientunitstayid': 'patient'}).set_index('patient')

# unitdischargestatus: Expired -> 1, Alive -> 0. Any other value (including
# missing) maps to NaN and the row is dropped. Using `.map` instead of
# `replace` means no pandas option has to be set to silence the downcasting
# warning (that option does not exist before pandas 2.2), and the column ends
# up with a real integer dtype.
status_code = labels['unitdischargestatus'].map({'Expired': 1, 'Alive': 0})
known_status = status_code.notna()
# `.copy()` so later cells can add columns without SettingWithCopyWarning.
labels = labels.loc[known_status].copy()
labels['unitdischargestatus'] = status_code[known_status].astype(int)

In [20]:
## unitdischargelocation: classify into 5 categories (High/Medium/Low Risk, Home, Death)

def classify_discharge_location(location):
    """Map a raw unit discharge location to one of five coarse categories.

    Returns 'High Risk', 'Medium Risk', 'Low Risk', 'Death' or 'Home'.
    A location that is not listed in any group yields None.
    """
    # Groups are checked from top to bottom; no location appears in two groups.
    groups = (
        ('High Risk', ('ICU', 'Other ICU', 'Other ICU (CABG)', 'Operating Room')),
        ('Medium Risk', ('Telemetry', 'Other Hospital', 'Other External', 'Other Internal', 'Step-Down Unit (SDU)')),
        ('Low Risk', ('Nursing Home', 'Skilled Nursing Facility', 'Floor', 'Acute Care/Floor')),
        ('Death', ('Death',)),
        ('Home', ('Home', 'Rehabilitation')),
    )
    for category, members in groups:
        if location in members:
            return category
    return None


In [25]:

# Integer code for each discharge category name.
risk_mapping = { 'High Risk': 3, 'Medium Risk': 2, 'Low Risk': 1, 'Home': 0, 'Death': 4}

# Location -> category name -> integer code. Locations the classifier does not
# recognise come back as None, which has no entry in risk_mapping and so
# becomes NaN.
risk_code = labels['unitdischargelocation'].apply(classify_discharge_location).map(risk_mapping)

# Keep only patients with a recognised discharge location and replace the raw
# location with its code. Plain column assignment on a fresh copy is used so
# the integer dtype sticks: on pandas 2.x `.loc[:, col] = ...` writes into the
# existing column and keeps that column's previous dtype.
has_risk = risk_code.notna()
labels = labels.loc[has_risk].drop(columns=['unitdischargelocation']).copy()
labels['discharge_risk_category'] = risk_code[has_risk].astype(int)

print("number of patient after cleaning in unitdischargelocation:", len(labels))

number of patient after cleaning in unitdischargelocation: 40504


In [26]:
# Display the cleaned labels for a visual check.
labels

Unnamed: 0_level_0,actualiculos,unitdischargestatus,discharge_risk_category
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
141168,2.4972,1,4
141265,4.2138,0,1
141266,1.0423,0,1
141276,1.1694,0,0
141284,1.4416,0,1
...,...,...,...
3353094,5.4923,0,2
3353140,1.3243,0,1
3353147,1.0888,0,1
3353194,2.4930,1,4


In [27]:
# Write both preprocessed frames next to the input files; the index (patient)
# is written as the first column of each CSV.
print('==> Saving finalised preprocessed labels and flat features...')
for frame, filename in ((flat, 'preprocessed_flat.csv'), (labels, 'preprocessed_labels.csv')):
    frame.to_csv(csv + filename)

==> Saving finalised preprocessed labels and flat features...
