In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
filepath = "./gdrive/MyDrive/CSE6250project/"
# import os
# os.listdir(filepath)

To access files from shared folder, add the shared folder as shortcut to your
google drive

see: https://stackoverflow.com/questions/54351852/accessing-shared-with-me-with-colab

In [3]:
import pandas as pd
import numpy as np
import pickle
from google.colab import files


In [4]:
df_admissions_original = pd.read_csv(filepath+'ADMISSIONS.csv')
df_patients_original = pd.read_csv(filepath+'PATIENTS.csv')
df_diagnoses_original = pd.read_csv(filepath+'DIAGNOSES_ICD.csv')

In [5]:
df_admissions = df_admissions_original.copy()
df_patients = df_patients_original.copy()
df_diagnoses = df_diagnoses_original.copy()
print(df_admissions.shape)
print(df_patients.shape)
print(df_diagnoses.shape)

(58976, 19)
(46520, 8)
(651047, 5)


In [6]:
# get the most recent admissions to label death
newest_admissions = df_admissions.sort_values('ADMITTIME').groupby('SUBJECT_ID').tail(1)
# add death label based on deathtime is/is not null
newest_admissions['Mortality'] = np.where(newest_admissions['DEATHTIME'].isnull(), 0, 1)
df_mortality = newest_admissions[['SUBJECT_ID', 'Mortality']]
df_mortality = df_mortality.sort_values(by='SUBJECT_ID')
print(df_mortality['Mortality'].value_counts())
# df_mortality.to_csv('mortality_labels.csv', encoding='utf-8-sig')
# files.download('mortality_labels.csv')

0    40711
1     5809
Name: Mortality, dtype: int64


In [7]:
# filter records with more than one visit
admissions = df_admissions[['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DIAGNOSIS']]
multi_visit_adm = admissions[admissions.groupby('SUBJECT_ID')['SUBJECT_ID'].transform('size') > 1]
print('number of hospital admissions with more than one visit:')
print(len(multi_visit_adm.index))
print('number of readmissions:')
print(len(multi_visit_adm['SUBJECT_ID'].unique()))

number of hospital admissions with more than one visit:
19993
number of readmissions:
7537


In [8]:
# Create mortality dataframe based on multi visits patients only
multi_visit_patients = multi_visit_adm['SUBJECT_ID'].drop_duplicates()
df_mortality = pd.merge(multi_visit_patients, df_mortality, on='SUBJECT_ID', how='left').dropna()
print(df_mortality.shape)

(7537, 2)


In [None]:
df_mortality.to_csv('multi_visit_mortality_labels.csv', encoding='utf-8-sig')
files.download('multi_visit_mortality_labels.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [9]:
# icd-9 process
# icd-9 with two digits after the point
def convert_to_icd9(dxStr):
    if dxStr.startswith('E'):
        if len(dxStr) > 4:
            return dxStr[:4] + '.' + dxStr[4:]
        else:
            return dxStr
    else:
        if len(dxStr) > 3:
            return dxStr[:3] + '.' + dxStr[3:]
        else:
            return dxStr

# icd-9 only first three digits
def convert_to_3digit_icd9(dxStr):
    if dxStr.startswith('E'):
        if len(dxStr) > 4:
            return dxStr[:4]
        else:
            return dxStr
    else:
        if len(dxStr) > 3:
            return dxStr[:3]
        else:
            return dxStr
# build a code map using unique icd9 codes
def build_codemap(df_icd9, transform):
	"""
	:return: Dict of code map {main-digits of ICD9: unique feature ID}
	"""
	df_digits = df_icd9['ICD9_CODE'].dropna().apply(transform)
	codes = df_digits.unique()
	codemap = dict(zip(codes, range(len(codes))))
	return codemap


In [10]:
# Merge the admisions table and diagnoses_icd table
def merge_adm_diag(df_diagnoses, df_admissions, codemap, transform):
  # Convert diagnosis code into unique feature ID.
	df_diagnoses['ICD9_CODE_transformed'] = df_diagnoses['ICD9_CODE'].dropna().apply(transform)
	df_diagnoses['featureID'] = df_diagnoses['ICD9_CODE_transformed'].map(codemap)
	df_diagnoses = df_diagnoses.loc[df_diagnoses['featureID'].notnull()]

  # Group the diagnosis codes for the same visit.
	df_admissions = df_admissions[['SUBJECT_ID', 'HADM_ID', 'ADMITTIME']]
	df_merge_diag = pd.merge(df_admissions, df_diagnoses, on=['SUBJECT_ID','HADM_ID'], how='left').dropna()
	df_merge_diag['featureID'] = df_merge_diag['featureID'].astype(int)
	return df_merge_diag

In [12]:
codemap = build_codemap(df_diagnoses, convert_to_icd9)
# pickle.dump(codemap, open("mortality.codemap.train", 'wb'), pickle.HIGHEST_PROTOCOL)
# files.download('mortality.codemap.train')

In [13]:
df_merge_adm_diag = merge_adm_diag(df_diagnoses, multi_visit_adm, codemap, convert_to_icd9)
# df_merge_adm_diag.to_csv('admissions_diagnoses_merge.csv', encoding='utf-8-sig')
# files.download('admissions_diagnoses_merge.csv')
print('number of diagnoses:')
print(len(df_merge_adm_diag.index))
print('number of unique ICD9 codes:')
print(len(df_merge_adm_diag['ICD9_CODE_transformed'].unique()))

number of diagnoses:
260282
number of unique ICD9 codes:
4893


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merge_diag['featureID'] = df_merge_diag['featureID'].astype(int)


In [14]:
print(len(df_merge_adm_diag['SUBJECT_ID'].unique()))

7537


In [15]:
# Extract heart failure dataset with diagnoses of 428.x code
heart_code = df_merge_adm_diag['ICD9_CODE'].str.startswith('428')
def heart_failure_filter(df_merge_adm_diag):
    new_df = df_merge_adm_diag.loc[heart_code].copy()
    # first time the diagnoses appear
    new_df = new_df.groupby('SUBJECT_ID').first()
    new_df = new_df.reset_index()
    return new_df
heart_failure_dataset = heart_failure_filter(df_merge_adm_diag)
print('Heart failure dataset number of patients:')
print(len(heart_failure_dataset['SUBJECT_ID'].unique()))
# Create heart failure labels
adm_diag = df_merge_adm_diag[['SUBJECT_ID']].drop_duplicates()
heart_failure_dataset['HEART FAILURE'] = 1
heart_failure_patients = heart_failure_dataset[['SUBJECT_ID', 'HEART FAILURE']]
df_heart_failure = pd.merge(adm_diag, heart_failure_patients, on='SUBJECT_ID', how='left')
df_heart_failure['HEART FAILURE'] = np.where(df_heart_failure['HEART FAILURE'].isnull(), 0, 1)
# print(len(df_heart_failure['SUBJECT_ID'].unique()))
# df_heart_failure.to_csv('Heart_failure_labels.csv', encoding='utf-8-sig')
# files.download('Heart_failure_labels.csv')

Heart failure dataset number of patients:
3370


In [16]:
# Extract sepsis dataset with diagnoses of 99592 code (sepsis 3.0 definition)
def sepsis_filter(df_merge_adm_diag):
    new_df = df_merge_adm_diag.loc[df_merge_adm_diag['ICD9_CODE'].eq('99592')].copy()
    # first time the diagnoses appear
    new_df = new_df.groupby('SUBJECT_ID').first()
    new_df = new_df.reset_index()
    return new_df
sepsis_dataset = sepsis_filter(df_merge_adm_diag)
print('Sepsis dataset number of patients:')
print(len(sepsis_dataset['SUBJECT_ID'].unique()))
# Create sepsis labels
sepsis_dataset['SEPSIS'] = 1
sepsis_dataset = sepsis_dataset[['SUBJECT_ID', 'SEPSIS']]
df_sepsis = pd.merge(adm_diag, sepsis_dataset, on='SUBJECT_ID', how='left')
df_sepsis['SEPSIS'] = np.where(df_sepsis['SEPSIS'].isnull(), 0, 1)
# print(len(df_sepsis['SUBJECT_ID'].unique()))
# df_sepsis.to_csv('Sepsis_labels.csv', encoding='utf-8-sig')
# files.download('Sepsis_labels.csv')

Sepsis dataset number of patients:
1444


In [17]:
# Create readmission labels
readmissions = multi_visit_adm[['SUBJECT_ID']].drop_duplicates()
readmissions['Readmission'] = 1
print(len(readmissions['SUBJECT_ID'].unique()))
adm = df_admissions[['SUBJECT_ID']].drop_duplicates()
df_readmissions = pd.merge(adm, readmissions, on='SUBJECT_ID', how='left')
df_readmissions['Readmission'] = np.where(df_readmissions['Readmission'].isnull(), 0, 1)
print(len(df_readmissions['SUBJECT_ID'].unique()))
# df_readmissions.to_csv('Readmission_labels.csv', encoding='utf-8-sig')
# files.download('Readmission_labels.csv')

7537
46520


In [18]:
# create custom dataset for the model
def create_dataset(df, label_df, codemap, transform):
	"""
	:param codemap: 3-digit ICD-9 code feature map
	:param transform: e.g. convert_icd9
	:return: List(patient IDs), List(labels), Visit sequence data as a List of List of List.
	"""
	df_merge_visit = df.groupby(['SUBJECT_ID','ADMITTIME'],as_index=False).agg({'featureID': lambda x: list(x)})
 	# Visits for each patient must be sorted in chronological order.
	df_merge_visit = df_merge_visit.sort_values(by=['SUBJECT_ID', 'ADMITTIME'])
	# Group the visits for the same patient.
	df_merge = df_merge_visit.groupby('SUBJECT_ID',as_index=False).agg({'featureID': lambda x: list(x)})
	df_merge = df_merge.sort_values(by='SUBJECT_ID')
	patients_ids = df_merge['SUBJECT_ID']
 	# Make a visit sequence dataset as a List of patient Lists of visit Lists
	# Make patient-id List and label List too.
	label = pd.merge(patients_ids, label_df, on='SUBJECT_ID',how='left').dropna()
	patient_ids = list(df_merge['SUBJECT_ID'])
	labels = list(label.iloc[:, -1])
	seq_data = list(df_merge['featureID'])
	return patient_ids, labels, seq_data

In [19]:
# split mortality dataset into train, validation, test = (0.75, 0.1, 0.15)
# first split the 7537 patients with mortality label, then join each dataset with admission_diagnosis table
train_mortality = df_mortality.sample(frac = 0.75)
val_test_mortality = df_mortality.drop(train_mortality.index)
validation_mortality = val_test_mortality.sample(frac = 0.40)
test_mortality = val_test_mortality.drop(validation_mortality.index)
train_mortality_data = pd.merge(train_mortality, df_merge_adm_diag, on='SUBJECT_ID', how='left')
print(train_mortality_data.shape)
print(len(train_mortality_data['SUBJECT_ID'].unique()))
validation_mortality_data = pd.merge(validation_mortality, df_merge_adm_diag, on='SUBJECT_ID', how='left')
print(len(validation_mortality_data['SUBJECT_ID'].unique()))
test_mortality_data = pd.merge(test_mortality, df_merge_adm_diag, on='SUBJECT_ID', how='left')
print(len(test_mortality_data['SUBJECT_ID'].unique()))

# create mortality train, validation, test custom dataset
mortality_ids_train, mortality_labels_train, mortality_seqs_train = create_dataset(train_mortality_data, df_mortality, codemap, convert_to_icd9)
mortality_ids_validation, mortality_labels_validation, mortality_seqs_validation = create_dataset(validation_mortality_data, df_mortality, codemap, convert_to_icd9)
mortality_ids_test, mortality_labels_test, mortality_seqs_test = create_dataset(test_mortality_data, df_mortality, codemap, convert_to_icd9)


(194789, 9)
5653
754
1130


In [20]:
print(mortality_seqs_train[:10])

[[[26, 31, 120, 168, 244, 46, 167, 380], [377, 378, 143, 120, 46, 33, 244, 113, 379, 155]], [[24, 44, 4, 102, 98, 282, 26, 401], [277, 402, 386, 26, 403, 44, 77, 157]], [[26, 31, 92, 46, 34, 30, 167, 408, 410], [110, 253, 406, 288, 407, 92, 26, 113, 46, 167, 30, 408], [337, 406, 145, 1, 409, 213, 112, 92, 46, 167, 254, 408]], [[190, 191, 46, 77, 59, 159], [188, 77, 46, 189, 159]], [[192, 1, 4, 195, 44, 65, 193, 72, 194, 73, 25, 196, 197], [192, 1, 4, 5, 65, 62, 193, 194, 10, 44, 25]], [[237], [237, 238, 239, 240]], [[73, 278, 38, 279, 280, 46, 277, 281], [275, 276, 1, 102, 46, 167, 277, 25]], [[308, 188, 309, 1, 46, 155, 310, 311], [188, 308, 312, 302, 46]], [[299, 70, 601, 57, 42, 4, 102, 65, 171, 277, 318, 25, 828], [317, 1, 44, 318]], [[838, 234, 168, 101, 839], [834, 14, 111, 24, 837, 509, 101, 543, 427, 448, 44, 631, 26, 11, 7], [833, 14, 834, 234, 59, 22, 168, 509, 101, 244, 394, 216, 835, 279, 585, 836, 7]]]


In [21]:
print(mortality_ids_train[:10])

[23, 34, 36, 67, 68, 84, 94, 103, 105, 107]


In [22]:
print(mortality_labels_train[:10])

[0, 0, 0, 1, 0, 1, 0, 1, 0, 0]


In [23]:
# construct mortality train set
pickle.dump(mortality_ids_train, open("mortality.ids.train", 'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(mortality_labels_train, open("mortality.labels.train", 'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(mortality_seqs_train, open("mortality.seqs.train", 'wb'), pickle.HIGHEST_PROTOCOL)
files.download("mortality.ids.train")
files.download("mortality.labels.train")
files.download("mortality.seqs.train")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [24]:
# construct mortality validation set
pickle.dump(mortality_ids_validation, open("mortality.ids.validation", 'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(mortality_labels_validation, open("mortality.labels.validation", 'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(mortality_seqs_validation, open("mortality.seqs.validation", 'wb'), pickle.HIGHEST_PROTOCOL)
files.download("mortality.ids.validation")
files.download("mortality.labels.validation")
files.download("mortality.seqs.validation")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [25]:
# construct mortality test set
pickle.dump(mortality_ids_test, open("mortality.ids.test", 'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(mortality_labels_test, open("mortality.labels.test", 'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(mortality_seqs_test, open("mortality.seqs.test", 'wb'), pickle.HIGHEST_PROTOCOL)
files.download("mortality.ids.test")
files.download("mortality.labels.test")
files.download("mortality.seqs.test")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [26]:
# split sepsis dataset into train, validation, test = (0.75, 0.1, 0.15)
# first split the 7537 patients with sepsis label, then join each dataset with admission_diagnosis table
train_sepsis = df_sepsis.sample(frac = 0.75)
val_test_sepsis = df_sepsis.drop(train_sepsis.index)
validation_sepsis = val_test_sepsis.sample(frac = 0.40)
test_sepsis = val_test_sepsis.drop(validation_sepsis.index)
train_sepsis_data = pd.merge(train_sepsis, df_merge_adm_diag, on='SUBJECT_ID', how='left')
print(train_sepsis_data.shape)
print(len(train_sepsis_data['SUBJECT_ID'].unique()))
validation_sepsis_data = pd.merge(validation_sepsis, df_merge_adm_diag, on='SUBJECT_ID', how='left')
print(len(validation_sepsis_data['SUBJECT_ID'].unique()))
test_sepsis_data = pd.merge(test_sepsis, df_merge_adm_diag, on='SUBJECT_ID', how='left')
print(len(test_sepsis_data['SUBJECT_ID'].unique()))

# create sepsis train, validation, test custom dataset
sepsis_ids_train, sepsis_labels_train, sepsis_seqs_train = create_dataset(train_sepsis_data, df_sepsis, codemap, convert_to_icd9)
sepsis_ids_validation, sepsis_labels_validation, sepsis_seqs_validation = create_dataset(validation_sepsis_data, df_sepsis, codemap, convert_to_icd9)
sepsis_ids_test, sepsis_labels_test, sepsis_seqs_test = create_dataset(test_sepsis_data, df_sepsis, codemap, convert_to_icd9)

(195879, 9)
5653
754
1130


In [27]:
# construct sepsis train set
pickle.dump(sepsis_ids_train, open("sepsis.ids.train", 'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(sepsis_labels_train, open("sepsis.labels.train", 'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(sepsis_seqs_train, open("sepsis.seqs.train", 'wb'), pickle.HIGHEST_PROTOCOL)
files.download("sepsis.ids.train")
files.download("sepsis.labels.train")
files.download("sepsis.seqs.train")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [28]:
# construct sepsis validation set
pickle.dump(sepsis_ids_validation, open("sepsis.ids.validation", 'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(sepsis_labels_validation, open("sepsis.labels.validation", 'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(sepsis_seqs_validation, open("sepsis.seqs.validation", 'wb'), pickle.HIGHEST_PROTOCOL)
files.download("sepsis.ids.validation")
files.download("sepsis.labels.validation")
files.download("sepsis.seqs.validation")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [29]:
# construct sepsis test set
pickle.dump(sepsis_ids_test, open("sepsis.ids.test", 'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(sepsis_labels_test, open("sepsis.labels.test", 'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(sepsis_seqs_test, open("sepsis.seqs.test", 'wb'), pickle.HIGHEST_PROTOCOL)
files.download("sepsis.ids.test")
files.download("sepsis.labels.test")
files.download("sepsis.seqs.test")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [30]:
# split heart failure dataset into train, validation, test = (0.75, 0.1, 0.15)
# first split the 7537 patients with heart failure label, then join each dataset with admission and diagnosis table
train_heart_failure = df_heart_failure.sample(frac = 0.75)
val_test_heart_failure = df_heart_failure.drop(train_sepsis.index)
validation_heart_failure = val_test_heart_failure.sample(frac = 0.40)
test_heart_failure = val_test_heart_failure.drop(validation_heart_failure.index)
train_heart_data = pd.merge(train_heart_failure, df_merge_adm_diag, on='SUBJECT_ID', how='left')
print(train_heart_data.shape)
print(len(train_heart_data['SUBJECT_ID'].unique()))
validation_heart_data = pd.merge(validation_heart_failure, df_merge_adm_diag, on='SUBJECT_ID', how='left')
test_heart_data = pd.merge(test_heart_failure, df_merge_adm_diag, on='SUBJECT_ID', how='left')

# create heart failure train, validation, test custom dataset
heart_failure_ids_train, heart_failure_labels_train, heart_failure_seqs_train = create_dataset(train_heart_data, df_heart_failure, codemap, convert_to_icd9)
heart_failure_ids_validation, heart_failure_labels_validation, heart_failure_seqs_validation = create_dataset(validation_heart_data, df_heart_failure, codemap, convert_to_icd9)
heart_failure_ids_test, heart_failure_labels_test, heart_failure_seqs_test = create_dataset(test_heart_data, df_heart_failure, codemap, convert_to_icd9)


(195796, 9)
5653


In [31]:
# construct heart failure train set
pickle.dump(heart_failure_ids_train, open("heart_failure.ids.train", 'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(heart_failure_labels_train, open("heart_failure.labels.train", 'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(heart_failure_seqs_train, open("heart_failure.seqs.train", 'wb'), pickle.HIGHEST_PROTOCOL)
files.download("heart_failure.ids.train")
files.download("heart_failure.labels.train")
files.download("heart_failure.seqs.train")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [32]:
# construct heart failure validation set
pickle.dump(heart_failure_ids_validation, open("heart_failure.ids.validation", 'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(heart_failure_labels_validation, open("heart_failure.labels.validation", 'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(heart_failure_seqs_validation, open("heart_failure.seqs.validation", 'wb'), pickle.HIGHEST_PROTOCOL)
files.download("heart_failure.ids.validation")
files.download("heart_failure.labels.validation")
files.download("heart_failure.seqs.validation")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [33]:
# construct heart failure test set
pickle.dump(heart_failure_ids_test, open("heart_failure.ids.test", 'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(heart_failure_labels_test, open("heart_failure.labels.test", 'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(heart_failure_seqs_test, open("heart_failure.seqs.test", 'wb'), pickle.HIGHEST_PROTOCOL)
files.download("heart_failure.ids.test")
files.download("heart_failure.labels.test")
files.download("heart_failure.seqs.test")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [34]:
# split readmission dataset into train, validation, test = (0.75, 0.1, 0.15)
# first split the patients with readmission label, then join each dataset with admission_diagnosis table
print(len(df_readmissions['SUBJECT_ID'].unique()))
train_readm = df_readmissions.sample(frac = 0.75)
val_test_readm = df_readmissions.drop(train_readm.index)
validation_readm = val_test_readm.sample(frac = 0.40)
test_readm = val_test_readm.drop(validation_readm.index)
train_readm_data = pd.merge(train_readm, df_merge_adm_diag, on='SUBJECT_ID', how='left')
print(train_readm_data.shape)
print(len(train_readm_data['SUBJECT_ID'].unique()))
validation_readm_data = pd.merge(validation_readm, df_merge_adm_diag, on='SUBJECT_ID', how='left')
print(len(validation_readm_data['SUBJECT_ID'].unique()))
test_readm_data = pd.merge(test_readm, df_merge_adm_diag, on='SUBJECT_ID', how='left')
print(len(test_readm_data['SUBJECT_ID'].unique()))

# create sepsis train, validation, test custom dataset

readmission_ids_train, readmission_labels_train, readmission_seqs_train = create_dataset(train_readm_data, df_readmissions, codemap, convert_to_icd9)
readmission_ids_validation, readmission_labels_validation, readmission_seqs_validation = create_dataset(validation_readm_data, df_readmissions, codemap, convert_to_icd9)
readmission_ids_test, readmission_labels_test, readmission_seqs_test = create_dataset(test_readm_data, df_readmissions, codemap, convert_to_icd9)

46520
(227027, 9)
34890
4652
6978


In [35]:
# construct readmission train set
pickle.dump(readmission_ids_train, open("readmission.ids.train", 'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(readmission_labels_train, open("readmission.labels.train", 'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(readmission_seqs_train, open("readmission.seqs.train", 'wb'), pickle.HIGHEST_PROTOCOL)
files.download("readmission.ids.train")
files.download("readmission.labels.train")
files.download("readmission.seqs.train")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [36]:
# construct readmission validation set
pickle.dump(readmission_ids_validation, open("readmission.ids.validation", 'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(readmission_labels_validation, open("readmission.labels.validation", 'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(readmission_seqs_validation, open("readmission.seqs.validation", 'wb'), pickle.HIGHEST_PROTOCOL)
files.download("readmission.ids.validation")
files.download("readmission.labels.validation")
files.download("readmission.seqs.validation")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [37]:
# construct readmission test set
pickle.dump(readmission_ids_test, open("readmission.ids.test", 'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(readmission_labels_test, open("readmission.labels.test", 'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(readmission_seqs_test, open("readmission.seqs.test", 'wb'), pickle.HIGHEST_PROTOCOL)
files.download("readmission.ids.test")
files.download("readmission.labels.test")
files.download("readmission.seqs.test")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>