In [1]:
import pandas as pd
import numpy as np

In [2]:
# Directory holding the input/output CSV files. File names are appended to this
# string by plain concatenation in later cells, so the trailing slash is required.
csv= '/home/mei/nas/docker/thesis/data/csv/'

In [3]:
# Read the flat feature table. The progress message also mentions labels, but
# only flat.csv is read in this cell; labels.csv is loaded in a later cell.
print('==> Loading data from labels and flat features files...')
flat = pd.read_csv(csv + 'flat.csv')

==> Loading data from labels and flat features files...


In [4]:
# Sanity check: compare the number of distinct stay ids with the number of rows
# (equal counts mean one row per stay).
patient_count = flat['patientunitstayid'].nunique()
record_count = flat.shape[0]
summary_template = "there are {} patients in the  and {} records in flat"
print(summary_template.format(patient_count, record_count))

there are 12260 patients in the  and 12260 records in flat


In [5]:
# Opt in to the future pandas behaviour so `.replace` below does not emit the
# silent-downcasting FutureWarning.
pd.set_option('future.no_silent_downcasting', True)

# Make naming consistent with the other tables: index the frame by `patient`.
flat = flat.rename(columns={'patientunitstayid': 'patient'}).set_index('patient')

# Admission diagnosis is dealt with in diagnoses.py, not in the flat features.
# The two discharge columns are removed here as well.
flat = flat.drop(columns=['apacheadmissiondx', 'unitdischargelocation', 'unitdischargestatus'])

## gender: Male -> 1, Female -> 0; rows with any other value are dropped.
flat.loc[:, 'gender'] = flat['gender'].replace({'Male': 1, 'Female': 0})
# .copy() makes the filtered frame independent of the original, so the column
# assignments below cannot trigger SettingWithCopyWarning.
flat = flat[flat['gender'].isin([0, 1])].copy()

## age: missing values are filled with '64', '> 89' is flagged in its own
## indicator column, then the '> ' prefix is stripped and the column cast to float.
# NOTE(review): the previous comment said "fill na with 62 (mean)" and "drop <18";
# the code fills with '64' and never drops under-18 rows -- confirm which is intended.
flat['age'] = flat['age'].fillna('64')
# Both patterns are literal text, so regex matching is switched off.
flat['> 89'] = flat['age'].str.contains('> 89', regex=False).astype(int)
flat['age'] = flat['age'].str.replace('> ', '', regex=False).astype(float)


In [6]:
## Admission and discharge weight: fill missing values with the column mean.
## Only admission weight is trimmed to its [0.1%, 99.9%] quantile range here;
## discharge weight is imputed and rounded but not filtered in this cell.
# .assign builds a new frame, so this is safe even if `flat` is a filtered slice.
flat = flat.assign(
    admissionweight=flat['admissionweight'].fillna(flat['admissionweight'].mean()),
    dischargeweight=flat['dischargeweight'].fillna(flat['dischargeweight'].mean()),
)

# Quantiles are computed after imputation, i.e. they include the mean-filled rows.
admission_low = flat['admissionweight'].quantile(0.001)   # 0.1% quantile
admission_high = flat['admissionweight'].quantile(0.999)  # 99.9% quantile

# between() is inclusive on both ends, matching the previous `>= low & <= high` mask.
within_range = flat['admissionweight'].between(admission_low, admission_high)

# DataFrame.round with a dict rounds only the named columns and returns a new
# frame, so nothing is assigned onto the filtered slice.
flat = flat.loc[within_range].round({'admissionweight': 1, 'dischargeweight': 1})

In [7]:
# Inspect the flat features after the gender, age and weight cleaning steps.
flat

Unnamed: 0_level_0,gender,age,admissionweight,dischargeweight,> 89
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
252784,1,56.0,75.0,75.7,0
253331,1,76.0,114.7,114.4,0
255112,1,52.0,50.6,58.2,0
258354,0,61.0,74.4,75.3,0
259414,1,81.0,107.5,106.7,0
...,...,...,...,...,...
3247360,0,25.0,48.9,60.3,0
3247421,1,59.0,76.2,82.1,0
3346588,0,71.0,121.5,115.3,0
3347496,0,56.0,71.2,71.3,0


In [8]:
# Load the raw labels table from the same CSV directory as flat.csv.
labels = pd.read_csv(csv + 'labels.csv')

In [9]:
# Inspect the raw labels before any cleaning.
labels

Unnamed: 0,patientunitstayid,dischargeweight,unitdischargelocation,unitdischargestatus,actualiculos
0,252784,75.7,Floor,Alive,2.0500
1,253331,114.4,Floor,Alive,1.7625
2,255112,58.2,Death,Expired,10.7381
3,258354,75.3,Floor,Alive,3.0090
4,259414,106.7,Floor,Alive,5.8562
...,...,...,...,...,...
12255,3247360,60.3,Floor,Alive,3.0097
12256,3247421,82.1,Floor,Alive,2.9520
12257,3346588,115.3,Telemetry,Alive,2.1826
12258,3347496,71.3,Floor,Alive,1.5340


In [10]:
# Index the labels by `patient`, matching the naming used for the flat features.
labels = labels.rename(columns={'patientunitstayid': 'patient'}).set_index('patient')

# Discharge weight: fill missing values with the mean, then keep only rows
# inside the [0.1%, 99.9%] quantile range (bounds inclusive).
labels['dischargeweight'] = labels['dischargeweight'].fillna(labels['dischargeweight'].mean())
discharge_low = labels['dischargeweight'].quantile(0.001)   # 0.1% quantile
discharge_high = labels['dischargeweight'].quantile(0.999)  # 99.9% quantile

# .copy() detaches the filtered frame so the column assignment below does not
# write into a slice of the original (SettingWithCopyWarning).
labels = labels.loc[labels['dischargeweight'].between(discharge_low, discharge_high)].copy()

pd.set_option('future.no_silent_downcasting', True)  # to avoid warning

# Discharge status: Expired -> 1, Alive -> 0; rows with any other value are dropped.
labels['unitdischargestatus'] = labels['unitdischargestatus'].replace({'Expired': 1, 'Alive': 0})
# Copy again: a later cell adds a new column to this frame.
labels = labels[labels['unitdischargestatus'].isin([0, 1])].copy()

In [11]:
##  unitdischargelocation, classify into 5 categories

def classify_discharge_location(location):
    """Map a raw unit discharge location to a coarse risk category.

    Args:
        location: Discharge location value (e.g. 'ICU', 'Floor', 'Home').

    Returns:
        One of 'High Risk', 'Medium Risk', 'Low Risk', 'Death' or 'Home',
        or None when the location is not in any of the known groups.
    """
    # (category label, member locations), checked in this order.
    category_table = (
        ('High Risk', ('ICU', 'Other ICU', 'Other ICU (CABG)', 'Operating Room')),
        ('Medium Risk', ('Telemetry', 'Other Hospital', 'Other External',
                         'Other Internal', 'Step-Down Unit (SDU)')),
        ('Low Risk', ('Nursing Home', 'Skilled Nursing Facility', 'Floor',
                      'Acute Care/Floor')),
        ('Death', ('Death',)),
        ('Home', ('Home', 'Rehabilitation')),
    )

    for category, members in category_table:
        if location in members:
            return category

    # Unknown or missing locations fall through without a category.
    return None


In [12]:

# Ordinal encoding of the coarse discharge categories.
risk_mapping = { 'High Risk': 3, 'Medium Risk': 2, 'Low Risk': 1, 'Home': 0, 'Death': 4}

# Classify each location, then map the category name to its code. Locations
# the classifier does not recognise come back as None and become NaN here.
# .assign creates a fresh column with its own dtype; the previous
# `.loc[:, col] = ...` writes kept the column as object, so the integer cast
# never took effect on the dtype.
labels = labels.assign(
    discharge_risk_category=labels['unitdischargelocation']
    .apply(classify_discharge_location)
    .map(risk_mapping)
)

# Drop rows whose location could not be classified, then store the codes as integers.
labels = labels.dropna(subset=['discharge_risk_category'])
labels = labels.astype({'discharge_risk_category': int})

# The raw location is no longer needed once it has been encoded.
labels = labels.drop(columns=['unitdischargelocation'])

print("number of patient after cleaning in unitdischargelocation:", len(labels))

number of patient after cleaning in unitdischargelocation: 12143


In [13]:
# Inspect the cleaned labels: unitdischargelocation has been replaced by discharge_risk_category.
labels

Unnamed: 0_level_0,dischargeweight,unitdischargestatus,actualiculos,discharge_risk_category
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
252784,75.7,0,2.0500,1
253331,114.4,0,1.7625,1
255112,58.2,1,10.7381,4
258354,75.3,0,3.0090,1
259414,106.7,0,5.8562,1
...,...,...,...,...
3247360,60.3,0,3.0097,1
3247421,82.1,0,2.9520,1
3346588,115.3,0,2.1826,2
3347496,71.3,0,1.5340,1


In [14]:
# Remove dischargeweight from the flat features; the labels table carries its own copy.
# NOTE(review): re-running this cell on the already-reduced frame is expected to
# fail because the column is gone.
flat=flat.drop(columns=['dischargeweight'])

In [15]:
# Inspect the flat features after dischargeweight has been removed.
flat

Unnamed: 0_level_0,gender,age,admissionweight,> 89
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
252784,1,56.0,75.0,0
253331,1,76.0,114.7,0
255112,1,52.0,50.6,0
258354,0,61.0,74.4,0
259414,1,81.0,107.5,0
...,...,...,...,...
3247360,0,25.0,48.9,0
3247421,1,59.0,76.2,0
3346588,0,71.0,121.5,0
3347496,0,56.0,71.2,0


In [16]:
# Attach gender and age from the flat features to the labels, matching on the
# shared `patient` index. A left join keeps every labels row; patients absent
# from `flat` get missing values in the two added columns.
demographics = flat[['gender', 'age']]
labels_ = labels.join(demographics, how='left')
labels_

Unnamed: 0_level_0,dischargeweight,unitdischargestatus,actualiculos,discharge_risk_category,gender,age
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
252784,75.7,0,2.0500,1,1,56.0
253331,114.4,0,1.7625,1,1,76.0
255112,58.2,1,10.7381,4,1,52.0
258354,75.3,0,3.0090,1,0,61.0
259414,106.7,0,5.8562,1,1,81.0
...,...,...,...,...,...,...
3247360,60.3,0,3.0097,1,0,25.0
3247421,82.1,0,2.9520,1,1,59.0
3346588,115.3,0,2.1826,2,0,71.0
3347496,71.3,0,1.5340,1,0,56.0


In [19]:
# Write both preprocessed tables next to the input CSVs (index included).
print('==> Saving finalised preprocessed labels and flat features...')
for frame, filename in ((flat, 'preprocessed_flat.csv'), (labels_, 'preprocessed_labels.csv')):
    frame.to_csv(csv + filename)

==> Saving finalised preprocessed labels and flat features...


In [20]:
# Directory holding the HDF5 artefacts; file names are appended by string
# concatenation, so the trailing slash is required.
hdf = '/home/mei/nas/docker/thesis/data/hdf/'
# NOTE(review): no cell visible in this notebook writes final_flat.h5 -- it is
# presumably produced by a separate step; confirm it is up to date before
# relying on it here.
final_flat=pd.read_hdf(hdf + 'final_flat.h5')

In [21]:
# Inspect the final flat features loaded from HDF5.
final_flat


Unnamed: 0,patient,gender,age,admissionweight,> 89
0,252784,1,0.565789,0.226936,0
1,253331,1,0.828947,0.422792,0
2,255112,1,0.513158,0.106561,0
3,258354,0,0.631579,0.223976,0
4,259414,1,0.894737,0.387272,0
...,...,...,...,...,...
11693,3247116,0,0.513158,0.161322,0
11694,3247421,1,0.605263,0.232856,0
11695,3346588,0,0.763158,0.456339,0
11696,3347496,0,0.565789,0.208189,0


In [23]:
# Restrict the labels to the patients present in final_flat, selecting rows by
# the `patient` index in the order the ids appear in final_flat.
# NOTE(review): label-based .loc with an array of ids is expected to raise
# KeyError if any id is missing from labels_ -- confirm that is the desired
# failure mode.
flat_patients = final_flat['patient'].unique()
labels_filtered = labels_.loc[flat_patients]
labels_filtered

Unnamed: 0_level_0,dischargeweight,unitdischargestatus,actualiculos,discharge_risk_category,gender,age
patient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
252784,75.7,0,2.0500,1,1,56.0
253331,114.4,0,1.7625,1,1,76.0
255112,58.2,1,10.7381,4,1,52.0
258354,75.3,0,3.0090,1,0,61.0
259414,106.7,0,5.8562,1,1,81.0
...,...,...,...,...,...,...
3247116,68.0,1,8.1256,4,0,52.0
3247421,82.1,0,2.9520,1,1,59.0
3346588,115.3,0,2.1826,2,0,71.0
3347496,71.3,0,1.5340,1,0,56.0


In [27]:
# Store the binary/ordinal label columns compactly as int8, in a single cast.
compact_dtypes = {
    'gender': 'int8',
    'unitdischargestatus': 'int8',
    'discharge_risk_category': 'int8',
}
labels_filtered = labels_filtered.astype(compact_dtypes)

# Show the resulting column dtypes.
print(labels_filtered.dtypes)


dischargeweight            float64
unitdischargestatus           int8
actualiculos               float64
discharge_risk_category       int8
gender                        int8
age                        float64
dtype: object


In [33]:
# Persist the final labels to HDF5 under key 'df'; mode='w' overwrites any existing file.
# NOTE(review): the recorded execution counts show this cell (In [33]) was last
# run after the reset_index cell that follows it (In [32]). A top-to-bottom run
# would instead write the frame while it is still indexed by patient -- confirm
# which layout is intended.
labels_filtered.to_hdf(hdf + 'final_labels.h5', key='df', mode='w')

In [32]:
# Turn the patient index back into an ordinary column.
# NOTE(review): this rebinds the same name, so the result depends on execution
# order; this cell sits after the to_hdf cell but has a lower execution count.
labels_filtered = labels_filtered.reset_index()

labels_filtered

Unnamed: 0,patient,dischargeweight,unitdischargestatus,actualiculos,discharge_risk_category,gender,age
0,252784,75.7,0,2.0500,1,1,56.0
1,253331,114.4,0,1.7625,1,1,76.0
2,255112,58.2,1,10.7381,4,1,52.0
3,258354,75.3,0,3.0090,1,0,61.0
4,259414,106.7,0,5.8562,1,1,81.0
...,...,...,...,...,...,...,...
11693,3247116,68.0,1,8.1256,4,0,52.0
11694,3247421,82.1,0,2.9520,1,1,59.0
11695,3346588,115.3,0,2.1826,2,0,71.0
11696,3347496,71.3,0,1.5340,1,0,56.0
