In [10]:
from azureml.core import Workspace, Experiment, Model
import joblib

# Load the workspace described by the local config file.
# (Alternative kept for reference: Workspace.get(name="capstone").)
ws = Workspace.from_config()

# Print the workspace identity, one field per line.
print('\n'.join([
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]))

Workspace name: capstone
Azure region: westeurope
Subscription id: 72f46e0e-1451-4b79-92cd-fc8f7797bda7
Resource group: test-Jesse


In [11]:
from azureml.core.compute import ComputeTarget, AmlCompute

# from https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/training/train-on-amlcompute/train-on-amlcompute.ipynb
from azureml.core.compute_target import ComputeTargetException

cpu_cluster_name = "cpu-cluster"

# Reuse the named compute target when it already exists in the workspace;
# a ComputeTargetException from the lookup triggers provisioning a new one.
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Block until the cluster reports completion, streaming status output.
cpu_cluster.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [15]:
# Five experiments were run; this maps each experiment name to a short
# human-readable description that is printed next to its score.
explanation = {
    'udacity-project-hyperparam-bandit': 'hyperdrive xgboost with bandit policy',
    'udacity-project-hyperparam-bayes': 'hyperdrive xgboost with bayesian sampling',
    'automl': 'AutoML on same dataset as hyperdrive',
    'AutoML-onlylogtransform': 'AutoML on original dataset (except log transform target)',
    'automl-kaggle-orgininal': 'AutoML on original dataset (no log transform target)',
}

def get_score(run):
    """Return the 'rmse' metric of a completed run as a float, else None.

    The score is also printed, as before. Metrics are fetched only once:
    `run.get_metrics()` is called a single time for completed runs and not
    at all for runs in any other status.

    Args:
        run: object exposing a `status` attribute and a `get_metrics()`
            method that returns a mapping containing the key 'rmse'.

    Returns:
        float | None: the rmse, or None when `run.status != 'Completed'`.

    Raises:
        KeyError: if a completed run has no 'rmse' metric.
    """
    score = float(run.get_metrics()['rmse']) if run.status == 'Completed' else None
    print(score)
    return score

def get_main_run(exp_name):
    """Return the first run yielded by the experiment named `exp_name`.

    Only the first item of `exp.get_runs()` is consumed, so the full run
    history is not materialised just to pick one element.
    NOTE(review): the Azure ML SDK documents `get_runs()` as yielding runs in
    reverse chronological order, which would make this the latest run - confirm.

    Args:
        exp_name: name of an experiment in the notebook-level workspace `ws`.

    Returns:
        The first run object produced by `Experiment.get_runs()`.

    Raises:
        IndexError: if the experiment has no runs (same exception type as the
            previous `list(...)[0]` implementation, with a clearer message).
    """
    exp = Experiment(workspace=ws, name=exp_name)
    first_run = next(iter(exp.get_runs()), None)
    if first_run is None:
        raise IndexError(f"experiment '{exp_name}' has no runs")
    return first_run

def get_best_run(name, run):
    """Print the description and best score of a parent run; return its best child.

    The description is looked up in the notebook-level `explanation` dict and
    printed first, followed by a tab and the score rounded to 4 decimals.

    Args:
        name: experiment name; must be a key of `explanation`.
        run: parent run whose `type` is 'hyperdrive' or 'automl'.

    Returns:
        The best child run, or None when the run type is not supported or the
        parent run reports no best child. Previously the unsupported-type
        branch fell through to `return best_run` with the name unbound and
        raised UnboundLocalError.

    Raises:
        KeyError: if `name` is not in `explanation`, or the best run lacks the
            expected metric.
    """
    print(explanation[name], end = '\t')
    if run.type == 'hyperdrive':
        best_run = run.get_best_run_by_primary_metric()
        metric_key = 'rmse'
    elif run.type == 'automl':
        best_run = run.get_best_child()
        metric_key = 'root_mean_squared_error'
    else:
        print('wrong type')
        return None

    # NOTE(review): the SDK documents that a hyperdrive parent may return None
    # when no child logged the primary metric - guard instead of crashing.
    if best_run is None:
        print('no best run')
        return None

    print(round(best_run.get_metrics()[metric_key],4))
    return best_run

# One parent run per experiment (in the order of `explanation`), then the best
# child of each; get_best_run prints one "description<TAB>score" line per entry.
main_runs = {exp_name: get_main_run(exp_name) for exp_name in explanation}
best_runs = {
    name: get_best_run(name, main_run)
    for name, main_run in main_runs.items()
}
    

hyperdrive xgboost with bandit policy	0.1259
hyperdrive xgboost with bayesian sampling	0.127
AutoML on same dataset as hyperdrive	0.1167
AutoML on original dataset (except log transform target)	0.1248
AutoML on original dataset (no log transform target)	26738.5013


## Register model

In [5]:
# Register the AutoML parent run's model as 'automl2', download it locally and
# load the pickled pipeline into `model`.
# Fix: the model was downloaded to 'models/automl2' but loaded from
# 'models/automl/...'; both steps now share one directory constant.
AUTOML2_DIR = 'models/automl2'

registered_model = main_runs['automl'].register_model(model_name = 'automl2')
registered_model.download(target_dir=AUTOML2_DIR, exist_ok = True)

# NOTE(review): assumes the download contains 'outputs/model.pkl' below the
# target directory, as the original load path implied - confirm.
# joblib.load unpickles arbitrary code: only load artifacts from a trusted workspace.
model = joblib.load(AUTOML2_DIR + '/outputs/model.pkl')


In [6]:
# Unpack the (run, fitted model) pair returned by the 'automl' parent run.
best_run, best_model = main_runs['automl'].get_output()

In [8]:
# Register the content of the best run's './outputs/' path as model 'automl3'
# (the recorded output below shows it was created as automl3:1).
best_run.register_model(model_name = 'automl3', model_path='./outputs/')

Model(workspace=Workspace.create(name='capstone', subscription_id='72f46e0e-1451-4b79-92cd-fc8f7797bda7', resource_group='test-Jesse'), name=automl3, id=automl3:1, version=1, tags={}, properties={})

In [9]:
# Display the fitted pipeline returned by get_output().
best_model

RegressionPipeline(pipeline=Pipeline(memory=None,
                                     steps=[('datatransformer',
                                             DataTransformer(enable_dnn=None,
                                                             enable_feature_sweeping=None,
                                                             feature_sweeping_config=None,
                                                             feature_sweeping_timeout=None,
                                                             featurization_config=None,
                                                             force_text_dnn=None,
                                                             is_cross_validation=None,
                                                             is_onnx_compatible=None,
                                                             logger=None,
                                                             observer=None,
                                         

## Get test set

In [125]:
# Training features: read the preprocessed CSV, give the unnamed index column
# the name 'Column1' and drop the target column 'y'.
train_url = "https://raw.githubusercontent.com/jvanelteren/housing/master/datasets/housing_after_preprocessing.csv"
train_x = (
    pd.read_csv(train_url)
    .rename(columns={'Unnamed: 0': 'Column1'})
    .drop(columns=['y'])
)

from sklearn.impute import SimpleImputer
class DFSimpleImputer(SimpleImputer):
    """SimpleImputer whose `transform` returns a DataFrame instead of an array.

    The result keeps the column labels and, as a fix, the row index of the
    input frame. Previously the index was silently replaced by a fresh
    RangeIndex, which misaligns rows for any input that is not already
    0..n-1 indexed.

    Do not set `add_indicator=True`: the extra indicator columns would no
    longer match `X.columns`.

    NOTE: a second definition of this class exists further down in this
    notebook (training-script cell); keep the two in sync.
    """

    def transform(self, X, y=None):
        """Impute `X` and return a DataFrame shaped like the input.

        Args:
            X: DataFrame to impute; its `columns` and `index` label the result.
            y: accepted for call-signature compatibility and not used.

        Returns:
            pandas.DataFrame with the imputed values.
        """
        return pd.DataFrame(super().transform(X), columns=X.columns, index=X.index)

    def __repr__(self):
        return 'SimpleImputer'
# Learn the most frequent value of every training column; fit() returns the
# imputer itself, so construction and fitting can be chained.
imp = DFSimpleImputer(strategy='most_frequent').fit(train_x)

In [126]:
# Test features: read the CSV, rename the unnamed index column, fill missing
# values with the imputer fitted on the training data, then cast every column
# to the dtype of the matching training column.
test_url = "https://raw.githubusercontent.com/jvanelteren/housing/master/datasets/final_test.csv"
test_x = pd.read_csv(test_url).rename(columns={'Unnamed: 0': 'Column1'})
test_x = imp.transform(test_x).astype(train_x.dtypes.to_dict())


float64
object
int64


In [131]:
def submit(model, filename='submission.csv', data=None, first_id=1461):
    """Predict with `model` and write a submission CSV to ./submissions/.

    The CSV has two columns: 'Id' (consecutive integers starting at
    `first_id`) and 'SalePrice' (`np.exp` of the model's predictions).

    Changes compared with the previous version, all backward compatible:
      * the Id range is derived from the number of predictions instead of the
        hard-coded 1461..2919, so inputs of any length are supported;
      * the output directory is created when missing;
      * the feature frame can be passed explicitly instead of relying on the
        notebook-level `test_x`.

    Args:
        model: object with a `predict(X)` method.
        filename: file name inside './submissions/'.
        data: features to predict on; defaults to the global `test_x`.
        first_id: Id assigned to the first prediction.

    Returns:
        None. The result is written to disk.
    """
    import os  # local import: this cell must not depend on a later import cell

    features = test_x if data is None else data
    pred = np.asarray(model.predict(features)).ravel()

    out = pd.DataFrame({
        'Id': range(first_id, first_id + len(pred)),
        'SalePrice': np.exp(pred),
    })

    out_dir = './submissions/'
    os.makedirs(out_dir, exist_ok=True)
    out.to_csv(out_dir + filename, index=False)
# Write ./submissions/submission.csv (the default filename) using `model`,
# which must already exist in the kernel from an earlier cell.
submit(model)

In [19]:
import argparse
import os
import numpy as np
import joblib
import pandas as pd
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.base import TransformerMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
import xgboost as xgb


# Create TabularDatasets using TabularDatasetFactory and convert them to pandas.
# The training file is referenced once and split into features (everything
# except 'y') and target ('y') instead of being loaded twice.
train_url = ["https://raw.githubusercontent.com/jvanelteren/housing/master/datasets/housing_after_preprocessing.csv"]
train_ds = TabularDatasetFactory.from_delimited_files(train_url)
train_x = train_ds.drop_columns('y')
train_y = train_ds.keep_columns('y')

test_url = ["https://raw.githubusercontent.com/jvanelteren/housing/master/datasets/final_test.csv"]
test_x = TabularDatasetFactory.from_delimited_files(test_url)

train_x = train_x.to_pandas_dataframe()
train_y = train_y.to_pandas_dataframe()
test_x = test_x.to_pandas_dataframe()

# Normalise missing values to np.nan in both frames.
# Fix: test_x was previously assigned from train_x here, which replaced the
# test set with a copy of the training features.
train_x = train_x.fillna(value=np.nan)
test_x = test_x.fillna(value=np.nan)


# def get_pipeline(impute_cat='DFSIMPLEIMPUTER', impute_num =DFSIMPLEIMPUTER', scale=DFMINMAX',onehot='default',remove_outliers='default'):
class DFGetDummies(TransformerMixin):
    """Dummy-encode frames with `pd.get_dummies`, aligned to the fitted columns.

    The author intended this as a DataFrame-returning stand-in for sklearn's
    OneHotEncoder. The encoded training frame is kept in `self.train` and the
    most recently encoded frame in `self.test`.
    """

    def fit(self, X, y=None):
        """Encode `X`, remember the result as the reference layout, return self."""
        encoded_train = pd.get_dummies(X)
        self.train = encoded_train
        return self

    def transform(self, X, y=None):
        """Encode `X` and reindex it to the columns seen during `fit`.

        Columns missing from `X` are added and filled with 0; columns not
        seen during `fit` are dropped by the reindex.
        """
        encoded = pd.get_dummies(X)
        self.test = encoded
        reference_columns = self.train.columns
        return encoded.reindex(columns=reference_columns, fill_value=0)

    def __repr__(self):
        return 'DFGetDummies'

class DFSimpleImputer(SimpleImputer):
    """SimpleImputer whose `transform` returns a DataFrame instead of an array.

    The result keeps the column labels and, as a fix, the row index of the
    input frame. Previously the index was silently replaced by a fresh
    RangeIndex, which misaligns rows for any input that is not already
    0..n-1 indexed.

    Do not set `add_indicator=True`: the extra indicator columns would no
    longer match `X.columns`.

    NOTE: an earlier definition of this class exists in this notebook
    ("Get testset" section); keep the two in sync.
    """

    def transform(self, X, y=None):
        """Impute `X` and return a DataFrame shaped like the input.

        Args:
            X: DataFrame to impute; its `columns` and `index` label the result.
            y: accepted for call-signature compatibility and not used.

        Returns:
            pandas.DataFrame with the imputed values.
        """
        return pd.DataFrame(super().transform(X), columns=X.columns, index=X.index)

    def __repr__(self):
        return 'SimpleImputer'

class DFMinMaxScaler(MinMaxScaler):
    """MinMaxScaler whose `transform` returns a DataFrame instead of an array.

    The result keeps the column labels and, as a fix, the row index of the
    input frame, consistent with DFSimpleImputer. Previously the index was
    replaced by a fresh RangeIndex.
    """

    def transform(self, X, y=None):
        """Scale `X` and return a DataFrame with the input's columns and index.

        Args:
            X: DataFrame to scale.
            y: accepted for call-signature compatibility and not used.
        """
        return pd.DataFrame(super().transform(X), columns=X.columns, index=X.index)

    def __repr__(self):
        return 'DFMinMaxScaler'

class DFColumnTransformer(ColumnTransformer):
    """ColumnTransformer that stacks its parts into one DataFrame.

    Works only with non-sparse outputs: every transformer must return a
    DataFrame, because the column labels are read from each part.
    """

    def _hstack(self, Xs):
        """Concatenate the per-transformer frames side by side.

        The parts are stacked with `np.hstack`, relabelled with their original
        column names in order, and passed through `infer_objects()` before
        being returned.
        """
        frames = list(Xs)
        column_names = []
        for frame in frames:
            column_names.extend(frame.columns)
        stacked = pd.DataFrame(np.hstack(frames), columns=column_names)
        return stacked.infer_objects()
#%%
def get_pipeline():
    """Build the preprocessing pipeline used before model training.

    The input is split by dtype into two branches that a DFColumnTransformer
    merges back into a single DataFrame:

      * numeric columns: most-frequent imputation (scaling is currently
        disabled);
      * all other columns: most-frequent imputation followed by dummy
        encoding.

    Returns:
        sklearn Pipeline with the single step 'col_trans'.
    """
    categorical_transformer = Pipeline(steps=[
        ('impute_cat', DFSimpleImputer(strategy='most_frequent')),
        ('cat_to_num', DFGetDummies()),
    ])

    numeric_transformer = Pipeline(steps=[
        ('impute_num', DFSimpleImputer(strategy='most_frequent')),
        # ('scale_num', DFMinMaxScaler()),  # scaling intentionally left out
    ])

    col_trans = DFColumnTransformer(transformers=[
        ('numeric', numeric_transformer, make_column_selector(dtype_include=np.number)),
        ('category', categorical_transformer, make_column_selector(dtype_exclude=np.number)),
    ])

    return Pipeline(steps=[('col_trans', col_trans)])

#%%
# Build the preprocessing pipeline, fit it on the training features and apply
# the fitted transformers to the test features (fit must happen first).
# NOTE: train_x and test_x are overwritten with their transformed versions, so
# this cell is not idempotent - re-run the loading code above before re-running it.
pipe = get_pipeline()
train_x = pipe.fit_transform(train_x, train_y)
test_x = pipe.transform(test_x)


In [24]:
import joblib
# Get the best hyperdrive run, show its arguments and score, then download and
# load the model saved by that run.
# NOTE(review): `hyperdrive_run` is not defined in the visible cells - it must
# already exist in the kernel from an earlier cell.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
print(best_run.get_details()['runDefinition']['arguments'])

# The hyperdrive runs in this notebook report 'rmse' (see get_best_run), so
# indexing 'Accuracy' raised KeyError; 'Accuracy' is kept only as a fallback.
print(best_run_metrics.get('rmse', best_run_metrics.get('Accuracy')))

# Select the model artifact by name rather than trusting it to be the last
# file listed, and download it to the exact path that is loaded below.
files = best_run.get_file_names()
model_file = next((f for f in files if f.endswith('model.joblib')), None)
if model_file is None:
    raise FileNotFoundError(f"no 'model.joblib' among the run files: {files}")
best_run.download_file(model_file, output_file_path='./outputs/model.joblib')

# joblib.load unpickles arbitrary code: only load artifacts from a trusted run.
joblib.load('./outputs/model.joblib')

ModelNotFoundException: ModelNotFoundException:
	Message: Model /outputs/model.pkl not found in cache at azureml-models or in current working directory d:\Documenten\GitHub\courses\Nanodegree Azure Machine Learning\Azure ML Capstone. For more info, set logging level to DEBUG.
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Model /outputs/model.pkl not found in cache at azureml-models or in current working directory d:\\Documenten\\GitHub\\courses\\Nanodegree Azure Machine Learning\\Azure ML Capstone. For more info, set logging level to DEBUG."
    }
}

In [7]:
# Ad-hoc check of a single experiment: take the first run returned for it and
# print the best child's RMSE rounded to 3 decimals.
# NOTE(review): `all_exp` is not defined in the visible cells - presumably a
# list of experiment names; confirm.
# NOTE(review): this repeats the logic of get_main_run / get_best_run defined
# earlier in this notebook.
exp_name = all_exp[2]
print(explanation[exp_name], end='\t')
exp = Experiment(workspace=ws, name=exp_name)
run = list(exp.get_runs())[0]
if run.type == 'hyperdrive':
    best_run = run.get_best_run_by_primary_metric()
    print(round(best_run.get_metrics()['rmse'],3))
elif run.type == 'automl':
    # The recorded output of this cell is "AttributeError: 'Run' object has no
    # attribute 'get_best_child'": the object returned by get_runs() was a
    # plain Run here even though its type is 'automl'.
    best_run = run.get_best_child()
    print(round(best_run.get_metrics()['root_mean_squared_error'],3))
# Disabled: register the best run's model under the experiment name.
# best_run.register_model(
#    model_name=exp_name, 
#    model_path="outputs/model.pkl")

AutoML on same dataset as hyperdrive	

AttributeError: 'Run' object has no attribute 'get_best_child'