In [1]:
import azureml.core
from azureml.core import Workspace

# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Ready to use Azure ML 1.30.0 to work with jp-ml-workspace-train


In [2]:
cpet_data = ws.datasets.get("cpet dataset cardiac2")

In [3]:
import os

experiment_folder = 'cpet_training-hyperdrive'
os.makedirs(experiment_folder, exist_ok=True)

print('Folder ready.')

Folder ready.


In [34]:
%%writefile $experiment_folder/cpet_cardiac_training.py
# Import libraries
import argparse
import joblib
import os
from azureml.core import Run
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.preprocessing import StandardScaler

print("Begin run...")
# Set regularization parameter
parser = argparse.ArgumentParser()
#parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01, help='regularization rate')
parser.add_argument('--n-estimators', type=int, dest='n_estimators', default=10, help='n_estimators')
parser.add_argument("--input-data", type=str, dest='input_data', help='training dataset')
args = parser.parse_args()
n_trees = args.n_estimators

# Get the experiment run context
run = Run.get_context()
run.log('Begin run...')

# load the diabetes dataset
print("Loading Data...")
run.log('Loading Data...')
cpet_data = run.input_datasets['training_data'].to_pandas_dataframe() # Get the training data from the estimator input
run.log('data',cpet_data.shape)
# {
#     "class_name": "StandardScaler",
#     "module": "sklearn.preprocessing",
#     "param_args": [],
#     "param_kwargs": {
#         "with_mean": true,
#         "with_std": true
#     },
#     "prepared_kwargs": {},
#     "spec_class": "preproc"
# }

# {
#     "class_name": "RandomForestClassifier",
#     "module": "sklearn.ensemble",
#     "param_args": [],
#     "param_kwargs": {
#         "bootstrap": true,
#         "class_weight": "balanced",
#         "criterion": "gini",
#         "max_features": 0.7,
#         "min_samples_leaf": 0.01,
#         "min_samples_split": 0.15052631578947367,
#         "n_estimators": 10,
#         "oob_score": true
#     },
#     "prepared_kwargs": {},
#     "spec_class": "sklearn"
# }

# Separate features and labels
X, y = cpet_data[['75_to_100_VCO2Slope','VEvsVCO2Slope','75_to_100_VO2Slope','15_to_85_VESlope','MeanVE/VCO2','MeanO2Pulse',
'PredictedMaxHR','MaxVO2_EST','DiffPeakHR','75_to_100_VEVCO2Slope','StdVE/VCO2','15_to_85_VEVCO2Slope',
'75_to_100_RRSlope','75_to_100_RERSlope','MeanRR','75_to_100_O2Slope','75_to_100_HRSlope']].values, cpet_data['CardiacLim'].values
scaler = StandardScaler(with_mean=True, with_std=True)
run.log('X',X)

scaler.fit(X)
X_scaled = scaler.transform(X)

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=0)
run.log('Model creation')

clf = RandomForestClassifier(bootstrap=True, class_weight="balanced", criterion="gini",max_features=".7",min_samples_leaf=0.01,
                            min_samples_split=0.15052631578947367,n_estimators=n_trees,oob_score=True)
run.log('Number of trees: ',  np.float(n_trees))
model = clf.fit(X_train,y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
print('AUC: ' + str(auc))
run.log('AUC', np.float(auc))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=model, filename='outputs/cpet_model.pkl')

run.complete()

Overwriting cpet_training-hyperdrive/cpet_cardiac_training.py


In [118]:
from azureml.core import Run
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.preprocessing import StandardScaler

cpet_ds = ws.datasets.get("cpet dataset cardiac2")
data_test = cpet_ds.to_pandas_dataframe()

# X, y = data_test[['75_to_100_VCO2Slope','VEvsVCO2Slope','75_to_100_VO2Slope','15_to_85_VESlope','MeanVE/VCO2','MeanO2Pulse',
# 'PredictedMaxHR','MaxVO2_EST','DiffPeakHR','75_to_100_VEVCO2Slope','StdVE/VCO2','15_to_85_VEVCO2Slope',
# '75_to_100_RRSlope','75_to_100_RERSlope','MeanRR','75_to_100_O2Slope','75_to_100_HRSlope']].values, data_test['CardiacLim'].values

X, y = data_test[['75_to_100_VO2Slope','DiffPeakVO2','75_to_100_HRSlope','StdVE/VCO2','15_to_85_RRSlope','PeakVE'
 ,'MeanHeartRate','75_to_100_RERSlope','MeanRER','HRvsVO2Slope','75_to_100_VEVCO2Slope','PeakVCO2'
 ,'PredictedMaxHR','75_to_100_VCO2Slope','MeanVE','75_to_100_VESlope','PeakVO2Real'		
 ,'LowestVE/VCO2','VO2atVT','PeakHeartRate']].values, data_test['CardiacLim'].values

#X, y = data_test[['75_to_100_VO2Slope','DiffPeakVO2','75_to_100_HRSlope','StdVE/VCO2','15_to_85_RRSlope','PeakVE'
#,'MeanHeartRate','75_to_100_RERSlope','MeanRER','HRvsVO2Slope','75_to_100_VEVCO2Slope','PeakVCO2'
# ,'PredictedMaxHR','75_to_100_VCO2Slope']].values, data_test['CardiacLim'].values


scaler = StandardScaler(with_mean=True, with_std=True)
scaler.fit(X)
X_scaled = scaler.transform(X)
#print('X',X_scaled)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.70,random_state=123)
clf = RandomForestClassifier(bootstrap=True, class_weight="balanced", criterion="gini",max_features=0.7,min_samples_leaf=0.01,
                            min_samples_split=0.15052631578947367,n_estimators=100,oob_score=True, random_state=0)
#print(clf)
model = clf.fit(X_train,y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

Accuracy: 0.7922077922077922


In [119]:
from sklearn import metrics
from sklearn.metrics import precision_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
y_pred = model.predict_proba(X_test)[:,1]
#print(model.predict_proba(X_test)[:,0])
#print(model.predict_proba(X_test)[:,0])
fpr, tpr, thresholds = metrics.roc_curve(y_test, model.predict_proba(X_test)[:,1])
print(metrics.auc(fpr, tpr))
average_precision = precision_score(y_test, model.predict(X_test), average='macro')
print(average_precision)
macro_recall=recall_score(y_test,  model.predict(X_test), average='macro')
print(macro_recall)

0.8456140350877193
0.7319892473118279
0.6892543859649123


In [135]:
bechmark_value_2={'Title':'Boruta Features','Columns':['75_to_100_VO2Slope','DiffPeakVO2','75_to_100_HRSlope','75_to_100_VCO2Slope','75_to_100_VESlope','75_to_100_RRSlope'
,'75_to_100_O2Slope','MeanVO2','StdO2Pulse','75_to_100_RERSlope','HRvsVO2Slope','LowestVO2','PeakVO2'
,'MeanHeartRate','PeakVO2Real','MeanO2Pulse','PeakRER']}
bechmark_value_1={'Title':'Grand collection Features','Columns':['75_to_100_VO2Slope','DiffPeakVO2','75_to_100_HRSlope','StdVE/VCO2','15_to_85_RRSlope','PeakVE'
 ,'MeanHeartRate','75_to_100_RERSlope','MeanRER','HRvsVO2Slope','75_to_100_VEVCO2Slope','PeakVCO2'
 ,'PredictedMaxHR','75_to_100_VCO2Slope','MeanVE','75_to_100_VESlope','PeakVO2Real'		
 ,'LowestVE/VCO2','VO2atVT','PeakHeartRate']}
bechmark_values =[bechmark_value_1, bechmark_value_2] 
for columns in bechmark_values:
    print('Perofrmance for the',columns['Title'])
    auc_arr = np.array([])
    acc_arr = np.array([])
    pre_arr = np.array([])
    rec_arr = np.array([])
    for i in range(35):
        X, y = data_test[columns['Columns']].values, data_test['CardiacLim'].values
        scaler = StandardScaler(with_mean=True, with_std=True)
        scaler.fit(X)
        X_scaled = scaler.transform(X)
        #print('X',X_scaled)
        X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.70,random_state=i)
        clf = RandomForestClassifier(bootstrap=True, class_weight="balanced", criterion="gini",max_features=0.7,min_samples_leaf=0.01,
                                    min_samples_split=0.15052631578947367,n_estimators=100,oob_score=True, random_state=i)
        #print(clf)
        model = clf.fit(X_train,y_train)

        # calculate accuracy
        y_hat = model.predict(X_test)
        acc = np.average(y_hat == y_test)
        acc_arr = np.append(acc_arr,acc)
        #print('Accuracy:', acc)
        fpr, tpr, thresholds = metrics.roc_curve(y_test, model.predict_proba(X_test)[:,1])
        #print(metrics.auc(fpr, tpr))
        auc_arr = np.append(auc_arr,metrics.auc(fpr, tpr))
        average_precision = precision_score(y_test, model.predict(X_test), average='macro')
        pre_arr = np.append(pre_arr, average_precision)
        #print(average_precision)
        macro_recall=recall_score(y_test,  model.predict(X_test), average='macro')
        rec_arr = np.append(rec_arr,macro_recall)
        #print(macro_recall)
        pass
    print('AUC: mean {:.2f}, std {:.2f}'.format(np.mean(auc_arr),np.std(auc_arr)))
    print('Accuracy: mean {:.2f}, std {:.2f}'.format(np.mean(acc_arr),np.std(acc_arr)))
    print('Precision: mean {:.2f}, std {:.2f}'.format(np.mean(pre_arr),np.std(pre_arr)))
    print('Recall: mean {:.2f}, std {:.2f}'.format(np.mean(rec_arr),np.std(rec_arr)))

Perofrmance for the Grand collection Features
AUC: mean 0.81, std 0.03
Accuracy: mean 0.75, std 0.03
Precision: mean 0.70, std 0.05
Recall: mean 0.66, std 0.05
Perofrmance for the Boruta Features
AUC: mean 0.78, std 0.03
Accuracy: mean 0.72, std 0.03
Precision: mean 0.65, std 0.04
Recall: mean 0.63, std 0.05


In [120]:
X, y = data_test[['75_to_100_VO2Slope','DiffPeakVO2','75_to_100_HRSlope','75_to_100_VCO2Slope','75_to_100_VESlope','75_to_100_RRSlope'
,'75_to_100_O2Slope','MeanVO2','StdO2Pulse','75_to_100_RERSlope','HRvsVO2Slope','LowestVO2','PeakVO2'
,'MeanHeartRate','PeakVO2Real','MeanO2Pulse','PeakRER']].values, data_test['CardiacLim'].values

#X, y = data_test[['75_to_100_VO2Slope','DiffPeakVO2','75_to_100_HRSlope','StdVE/VCO2','15_to_85_RRSlope','PeakVE'
#,'MeanHeartRate','75_to_100_RERSlope','MeanRER','HRvsVO2Slope','75_to_100_VEVCO2Slope','PeakVCO2'
# ,'PredictedMaxHR','75_to_100_VCO2Slope']].values, data_test['CardiacLim'].values


scaler = StandardScaler(with_mean=True, with_std=True)
scaler.fit(X)
X_scaled = scaler.transform(X)
#print('X',X_scaled)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.70,random_state=123)
clf = RandomForestClassifier(bootstrap=True, class_weight="balanced", criterion="gini",max_features=0.7,min_samples_leaf=0.01,
                            min_samples_split=0.15052631578947367,n_estimators=100,oob_score=True, random_state=0)
#print(clf)
model = clf.fit(X_train,y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
run.log('Accuracy', np.float(acc))

Accuracy: 0.7792207792207793


In [121]:
from sklearn import metrics
from sklearn.metrics import precision_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
y_pred = model.predict_proba(X_test)[:,1]
#print(model.predict_proba(X_test)[:,0])
#print(model.predict_proba(X_test)[:,0])
fpr, tpr, thresholds = metrics.roc_curve(y_test, model.predict_proba(X_test)[:,1])
print(metrics.auc(fpr, tpr))
average_precision = precision_score(y_test, model.predict(X_test), average='macro')
print(average_precision)
macro_recall=recall_score(y_test,  model.predict(X_test), average='macro')
print(macro_recall)

0.8208333333333334
0.7112903225806452
0.6723684210526316


In [106]:
from sklearn import metrics
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
y_pred = model.predict_proba(X_test)[:,1]
#print(model.predict_proba(X_test)[:,0])
#print(model.predict_proba(X_test)[:,0])
fpr, tpr, thresholds = metrics.roc_curve(y_test, model.predict_proba(X_test)[:,1])
print(metrics.auc(fpr, tpr))
average_precision = average_precision_score(y_test,y_pred)
print(average_precision)
macro_recall=recall_score(y_test,  model.predict(X_test), average='macro')
print(macro_recall)

0.800203873598369
0.543929399765017
0.6756371049949031


In [82]:
data_test.groupby(['CardiacLim']).count()

Unnamed: 0_level_0,MaxVO2_EST,PredictedMaxHR,PeakHeartRate,MeanHeartRate,MinHeartRate,StdHeartRate,LowestVE/VCO2,PeakVE/VCO2,MeanVE/VCO2,StdVE/VCO2,...,15_to_85_VCO2Slope,15_to_85_VESlope,15_to_85_RERSlope,15_to_85_RRSlope,15_to_85_O2Slope,15_to_85_VEVCO2Slope,15_to_85_VEVO2Slope,VO2atVT,VO2vsPeakVO2atVT,HasAnaerobicThresholdMean
CardiacLim,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,158,158,158,158,158,158,158,158,158,158,...,158,158,158,158,158,158,158,158,158,158
1,61,61,61,61,61,61,61,61,61,61,...,61,61,61,61,61,61,61,61,61,61


In [84]:
158/(158+61)

0.7214611872146118

In [16]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "jp-compute-fast-two"

try:
    # Check for existing compute target
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        training_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        print(ex)
    

Found existing cluster, use it.


In [36]:
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.train.hyperdrive import GridParameterSampling, HyperDriveConfig, PrimaryMetricGoal, choice
from azureml.widgets import RunDetails

# Create a Python environment for the experiment
sklearn_env = Environment("sklearn-env")

# Ensure the required packages are installed (we need scikit-learn, Azure ML defaults, and Azure ML dataprep)
packages = CondaDependencies.create(pip_packages=['scikit-learn','azureml-defaults','azureml-dataprep[pandas]'])
sklearn_env.python.conda_dependencies = packages

# Get the training dataset
cpet_ds = ws.datasets.get("cpet dataset cardiac2")
data_test = cpet_ds.to_pandas_dataframe()
#print(data_test.head)
#print(cpet_ds)
# Create a script config
script_config = ScriptRunConfig(source_directory=experiment_folder,
                              script='cpet_cardiac_training.py',
                              arguments = ['--n-estimators', 10, # Regularizaton rate parameter
                                           '--input-data', cpet_ds.as_named_input('training_data')], # Reference to dataset
                              environment=sklearn_env,
                              compute_target = training_cluster)

# Sample a range of parameter values
params = GridParameterSampling(
    {
        # There's only one parameter, so grid sampling will try each value - with multiple parameters it would try every combination
        '--n-estimators': choice(10,40,70,100)
    }
)

# Configure hyperdrive settings
hyperdrive = HyperDriveConfig(run_config=script_config, 
                          hyperparameter_sampling=params, 
                          policy=None, 
                          primary_metric_name='AUC', 
                          primary_metric_goal=PrimaryMetricGoal.MAXIMIZE, 
                          max_total_runs=6,
                          max_concurrent_runs=4)

# Run the experiment
experiment = Experiment(workspace = ws, name = 'cpet_training_hyperdrive')
run = experiment.submit(config=hyperdrive)

# Show the status in the notebook as the experiment runs
RunDetails(run).show()
run.wait_for_completion()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

{'runId': 'HD_28c7a600-3954-4cf6-8bef-b5aa8b37b56f',
 'target': 'jp-compute-fast-two',
 'status': 'Canceled',
 'startTimeUtc': '2021-06-22T14:57:33.54132Z',
 'endTimeUtc': '2021-06-22T14:59:33.792548Z',
 'error': {'error': {'code': 'UserError',
   'message': 'User errors were found in at least one of the child runs.',
   'messageParameters': {},
   'details': []},
  'time': '0001-01-01T00:00:00.000Z'},
 'properties': {'primary_metric_config': '{"name": "AUC", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '7c7eaa74-9c62-407b-8270-73ff8c63a8a0'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://jpmlworkspacet8118195379.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_28c7a600-3954-4cf6-8bef-b5aa8b37b56f/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=AhV3rdr86tW8kVhz5%2BTSMj4OB1Z9Dj%2BPPbRTPMC%2BbIM%3D&st=2021-06-22T14%3A