# Connect to the workspace:

In [1]:
# Load the workspace 
import azureml.core
from azureml.core import Workspace

ws = Workspace.from_config()
print('Use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Use Azure ML 1.34.0 to work with azure_python_sdk


# **DATA INFORMATION:**
MIMIC stands for Medical Information Mart for Intensive Care. It is part of a larger dataset called PhysioNet ,
which is a large open source collection of physiologic and clinical data submitted by many institutions.
https://mimic.mit.edu/docs/gettingstarted/ 

All tables have at least one unique identiier which is the
ROW_ID.

SUBJECT_ID: Refers to a unique patient.

HADM_ID: Refers to a hospital admission event for a patient.

ICUSTAY_ID: Refers to an ICU for a patient.

Each hospital admission of a patient is unique with “hadm_id”.

Each ICU stay of a patient is unique with “icustay_id”.

That means:

- One subject_id can be associated with multiple hadm_ids
when a patient had multiple admissions.

- One hadm_id can be linked to multiple icustay_id
when a patient had a multiple ICU stays during an admission.
(e.g., transferring between multiple ICUs




# Create a training script:

In [2]:
%%writefile EHR_training.py

# load libraries and dependencies
import azureml.core
from azureml.core import Run
from azureml.core import Workspace
from azureml.core import Dataset

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
import imblearn
from imblearn.over_sampling import SMOTE
from numpy import mean

from azureml.interpret import ExplanationClient
from interpret.ext.blackbox import TabularExplainer

import pandas as pd
import numpy as np
import re
import joblib
import os

pd.options.display.max_rows = 2000

# Get the experiment run context
run = Run.get_context()

# load dataset
patients = pd.read_csv("PATIENTS-copy.csv", index_col = None)
admissions = pd.read_csv("ADMISSIONS-copy.csv", index_col = None)
chartevents = pd.read_csv("CHARTEVENTS-copy.csv", index_col = None)
dictionary_itemid = pd.read_csv("D_ITEMS-copy.csv", index_col = None)

# data preprocessing
def overall_check(df):
    '''
    Check data type for each column,
    How many missing values per column,
    Check row duplicates
    '''
    print ('Check data type for each column \n', df.dtypes)
    print ('How many missing values per column \n', df.isnull().sum())
    print ('Check row duplicates ', df.duplicated().sum())

# overall check dataframe 'patients'
overall_check(patients)  

# parse dates in 'patients'
patients['dob'] = pd.to_datetime(patients['dob'], format="%Y-%m-%d %H:%M:%S", errors = 'coerce')
print ('Done parsing dates!')

# overall check dataframe 'admissions'
overall_check(admissions)  
# admittime, dischtime should be time
# one 'religion' missing value
# no row duplicates

# parse dates in 'admissions'
admissions['admittime'] = pd.to_datetime(admissions['admittime'], format="%Y-%m-%d %H:%M:%S", errors = 'coerce')
admissions['dischtime'] = pd.to_datetime(admissions['dischtime'], format="%Y-%m-%d %H:%M:%S", errors = 'coerce')
print ('Done parsing dates!')

# print na row on 'religion'
admissions[admissions['religion'].isna()]

# drop na row on 'religion'
admissions = admissions.dropna(subset=['religion'])
admissions[admissions['religion'].isna()]
print ('Done dropping NAs!')

# overall check dataframe 'chartevents'
overall_check(chartevents)  
# value should be float
# value, valuenum, valueuom should be replaced by the mean of each column
# no row duplicates

# object to numeric 
chartevents['value'] = pd.to_numeric(chartevents['value'], errors = 'coerce')
print ('Done converting!')

# imputation by mean (numerical) and 'Unknown' (categorical)
chartevents['value'].fillna((chartevents['value'].mean()), inplace=True)
chartevents['valuenum'].fillna((chartevents['valuenum'].mean()), inplace=True)
chartevents['valueuom'].fillna('Unknown', inplace=True)
print ('Done imputation')

# overall check dataframe 'dictionary_itemid'
overall_check(dictionary_itemid)  
# no missing values on row_id, itemid, label, linksto
# no duplicates

# d_items links to 'chartevents'
dictionary_itemid = dictionary_itemid[dictionary_itemid['linksto'].str.contains('chartevents')]

# move 'dob' to admissions
if 'dob' not in admissions:
    try:
        admissions = pd.merge(admissions, patients[['subject_id','dob']], on='subject_id', how='left')
    except KeyError:
        #error log
        print ('I have problem')

# calculate patients age when patients hospitalize
admissions['age'] = admissions[['admittime','dob']].apply(lambda x: round((x['admittime'].date() - x['dob'].date()).days/365), axis=1)

# convert to 'systolic bp', 'diastolic bp', 'respiratory rate', 'glucose'
dictionary_itemid['label'] = dictionary_itemid['label'].apply(
    lambda x:  'systolic bp' if ('bp' in x.lower() or 'blood pressure' in x.lower() 
    or 'blood' in x.lower()) and 'sys' in x.lower() else x)
dictionary_itemid['label'] = dictionary_itemid['label'].apply(
    lambda x:  'diastolic bp' if ('bp' in x.lower() or 'blood pressure' in x.lower()
    or 'blood' in x.lower()) and 'dia' in x.lower() else x)
dictionary_itemid['label'] = dictionary_itemid['label'].apply(
    lambda x:  'respiratory rate' if ('resp' and 'rate') in x.lower() else x)
dictionary_itemid['label'] = dictionary_itemid['label'].apply(lambda x: 'glucose' if ('glucose') in x.lower() else x)

# move 'label' to chartevents
if 'label' not in chartevents:
    try:
        chartevents = pd.merge(chartevents, dictionary_itemid[['itemid','label']], on='itemid', how='left')
    except KeyError:
        #error log
        print ('I have problem')

# chartevents 'label' only care about 'systolic bp|diastolic bp|respiratory rate|glucose'
chartevents = chartevents[chartevents['label'].str.contains('systolic bp|diastolic bp|respiratory rate|glucose', na=False)]
chartevents = chartevents[['hadm_id','valuenum','label']]
chartevents.drop_duplicates(keep=False,inplace=True)
chartevents = pd.pivot_table(chartevents, index=['hadm_id'], columns=['label'], values='valuenum')

# create is_death (death or not)
admissions['is_death'] = admissions['deathtime'].isnull().apply(lambda x:  0 if (x==True) else 1)

# create feature columns
df_feature = pd.merge(admissions[['subject_id', 'hadm_id', 'admittime', 'is_death', 'age', 'admission_type', 'ethnicity', 'diagnosis']], chartevents, on='hadm_id', how='inner')

# is_readmission
df_feature = df_feature.sort_values(['subject_id', 'admittime'])
df_feature['is_readmission'] = df_feature.index[:-1].to_series().apply(lambda i: 1 if (df_feature['subject_id'][i] == df_feature['subject_id'][i+1]) else 0)
df_feature['is_readmission'][df_feature.index[-1]] = 0
df_feature['is_readmission'] = pd.to_numeric(df_feature['is_readmission'], downcast="integer")

# overall check df_feature
overall_check(df_feature)

# imputation by mean (numerical)
df_feature['glucose'].fillna((df_feature['glucose'].mean()), inplace=True)
df_feature['respiratory rate'].fillna((df_feature['respiratory rate'].mean()), inplace=True)
df_feature['systolic bp'].fillna((df_feature['systolic bp'].mean()), inplace=True)

# frequency of categorical data
df_feature['admission_type'].value_counts()
df_feature['ethnicity'] = df_feature['ethnicity'].apply(
    lambda x: 'UNKNOWN' if 'SPECIFIED' in x.strip() or 'OTHER' in x.strip() 
    or 'UNABLE' in x.strip() else x)
df_feature['ethnicity'] = df_feature['ethnicity'].apply(
    lambda x: 'HISPANIC' if 'HISPANIC' in x.strip() else x)
df_feature['ethnicity'] = df_feature['ethnicity'].apply(
    lambda x: 'ASIAN' if 'AMERICAN INDIAN' in x.strip() else x)

# Use Pandas to One-Hot encode the admission_type
df_feature_with_one_hot = pd.get_dummies(df_feature, columns=["admission_type"], drop_first=False)

# normalizatiion by MinMaxScaler
# define min max scaler
scaler = MinMaxScaler()
# transform data
df_feature_with_one_hot[['diastolic bp','glucose','respiratory rate','systolic bp']] = scaler.fit_transform(df_feature_with_one_hot[['diastolic bp','glucose','respiratory rate','systolic bp']])

# features
features = ['is_death','age','admission_type_ELECTIVE','admission_type_EMERGENCY','admission_type_URGENT','diastolic bp','glucose','respiratory rate','systolic bp']
X, y = df_feature_with_one_hot[features].values, df_feature_with_one_hot['is_readmission']
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a decision tree model
model = DecisionTreeClassifier().fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
accuracy = np.average(y_hat == y_test)
print('Test Accuracy:', accuracy)

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ', auc)

# log results
run.log('AUC:',np.round(auc, 3))
run.log('Test Accuracy:',np.round(accuracy*100, 3))

# Save the trained model in the outputs folder
joblib.dump(value=model, filename='EHR_training.pkl')

# Get global feature importance using TabularExplainer
explainer = TabularExplainer(model, X_train, features=features, classes=[0,1])
explanation = explainer.explain_global(X_test)

# Upload the explanation
explain_client = ExplanationClient.from_run(run)
explain_client.upload_model_explanation(explanation, comment='Tabular Explanation')

run.complete()

Overwriting EHR_training.py


# Run an experiement:

In [3]:
# run the experiment
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.widgets import RunDetails

# Create an environment from a .yml file
env = Environment.from_conda_specification("environment", "environment.yml")

# Create a script config
script_config = ScriptRunConfig(source_directory='.',
                                script='EHR_training.py',
                                environment=env) 

# submit the experiment run
experiment_name = 'EHR'
experiment = Experiment(workspace=ws, name=experiment_name)
run = experiment.submit(config=script_config)

# Show the running experiment run in the notebook widget
RunDetails(run).show()

# Block until the experiment run has completed
run.wait_for_completion()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

{'runId': 'EHR_1646607914_cd72a437',
 'target': 'local',
 'status': 'Finalizing',
 'startTimeUtc': '2022-03-06T23:05:28.706803Z',
 'services': {},
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': 'a9c39306-4d44-4253-95ae-7ba6d62b7c95'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'script': 'EHR_training.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': [],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'local',
  'dataReferences': {},
  'data': {},
  'outputData': {},
  'datacaches': [],
  'jobName': None,
  'maxRunDurationSeconds': 2592000,
  'nodeCount': 1,
  'instanceTypes': [],
  'priority': None,
  'credentialPassthrough': False,
  'identity': None,
  'environment': {'name': 'environment',
   'version': 'Autosave_2022-03-06T01:53:17Z_ef626af7',
   'python': {'interpreterPath': 'python',
    'userManagedDependencies': False,
    'condaDependencies': {'dependencies': [

# Print the feature importance ang experiment logs:

In [5]:
# retrieve the feature importance values
from azureml.interpret import ExplanationClient

# get the feature explanations
client = ExplanationClient.from_run(run)
engineered_explanations = client.download_model_explanation()
feature_importances = engineered_explanations.get_feature_importance_dict()

# global feature importance
print('Feature\tImportance')
for key, value in feature_importances.items():
    print(key, '\t', value)

# Get logged metrics and files
metrics = run.get_metrics()
for key in metrics.keys():
    print(key, metrics.get(key))

Feature	Importance
glucose 	 0.15344214270767947
is_death 	 0.130257376020088
systolic bp 	 0.12904791797447165
respiratory rate 	 0.10196903117807081
age 	 0.05817325800376651
diastolic bp 	 0.0072421008579200774
admission_type_EMERGENCY 	 0.001173885750156949
admission_type_URGENT 	 0.0
admission_type_ELECTIVE 	 0.0
AUC: 0.726
Test Accuracy: 72.881
