Load the necessary libraries for the experimentation.

In [1]:
import azureml.core
import azureml.dataprep as dprep
from azureml.core.workspace import Workspace
from azureml.core.runconfig import DataReferenceConfiguration
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.train.automl import AutoMLConfig
from azureml.core.experiment import Experiment
from IPython.display import display
from imblearn.over_sampling import SMOTENC
import pandas as pd
import logging
import os
import csv

In [None]:
# Load the AML workspace from the local config file.
ws = Workspace.from_config('data/config.json')
# Name of the run history container (experiment) in the workspace.
experiment_name = 'automated-ml-classification'
# Local project folder; the data-loading script is written into it further down.
project_folder = './automated-ml-classification'
# Create the folder up front: the `%%writefile $project_folder/get_data.py` cell
# raises FileNotFoundError on a fresh checkout when the directory is missing.
os.makedirs(project_folder, exist_ok=True)

# Collect the environment details and show them as a one-column table.
output = {}
output['SDK version'] = azureml.core.VERSION
output['Subscription ID'] = ws.subscription_id
output['Workspace'] = ws.name
output['Resource Group'] = ws.resource_group
output['Location'] = ws.location
output['Project Directory'] = project_folder
# Show full cell contents instead of truncating long values.
# NOTE(review): pandas >= 1.0 deprecates -1 in favour of None; confirm the pandas
# version pinned by the SDK before changing this value.
pd.set_option('display.max_colwidth', -1)
pd.DataFrame(data=output, index=['']).T

Load the train and test data using the dataprep library.

In [3]:
# Folder that holds the input CSV files.
dataset_root = "data"

# Read the three CSV files as dataprep dataflows; the names on the left line up
# one-to-one with the file names in the tuple.
train_values, train_labels, test_values = (
    dprep.read_csv(path=os.path.join(dataset_root, csv_name))
    for csv_name in ('train_values.csv', 'train_labels.csv', 'test_values.csv')
)

# Preview the first five rows of the training features and the training labels.
for dataflow in (train_values, train_labels):
    display(dataflow.head(5))

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,...,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,...,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,...,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,building_id,damage_grade
0,802906,3
1,28830,2
2,94947,3
3,590882,2
4,201944,3


Remove the unnecessary columns and convert each dataset into a pandas DataFrame.

In [4]:
# Training features: drop the building_id column, then materialise as a pandas DataFrame.
dflow_X = train_values.drop_columns(['building_id'])
x_train = dflow_X.to_pandas_dataframe()

# Training labels: keep only the damage_grade column.
dflow_y = train_labels.keep_columns('damage_grade')
y_train = dflow_y.to_pandas_dataframe()

# Test features: same column drop as the training features.
dflow_test = test_values.drop_columns(['building_id'])
x_test = dflow_test.to_pandas_dataframe()

Convert the columns into the right format (categorical and numerical columns).

In [7]:
# Columns cast to str, and columns cast to int.
string_columns = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']
integer_columns = ['count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage']

# Apply the same casts, in place, to both the training and the test features.
for frame in (x_train, x_test):
    for column in string_columns:
        frame[column] = frame[column].astype(str)
    for column in integer_columns:
        frame[column] = frame[column].astype(int)


Define which of the features are categorical so that SMOTENC can perform synthetic oversampling.

In [13]:
# Column positions 3-6 are the ones cast to int above (count_floors_pre_eq, age,
# area_percentage, height_percentage); every other position in 0..37 is passed
# to SMOTENC as categorical.
# NOTE(review): assumes the feature frame has exactly 38 columns - confirm.
numeric_positions = {3, 4, 5, 6}
cat_features = [position for position in range(38) if position not in numeric_positions]

In [14]:
# Oversample with SMOTENC, treating the columns listed in cat_features as categorical.
# random_state pins the synthetic samples so a fresh run reproduces the same training set.
sm = SMOTENC(n_jobs=4, categorical_features=cat_features, random_state=42)
# Hand the labels over as a 1-D array: passing the single-column DataFrame makes
# scikit-learn's column_or_1d emit a conversion warning. Wrapping in pd.DataFrame
# first keeps this line working whether y_train is currently a DataFrame or an array.
# NOTE: this cell overwrites x_train / y_train, so re-running it resamples the
# already resampled data; restart from the loading cells for a clean result.
x_train, y_train = sm.fit_resample(X=x_train, y=pd.DataFrame(y_train).values.ravel())

  y = column_or_1d(y, warn=True)


Convert the data back into a dataframe so it can be prepared for the AML experiment

In [19]:
# Wrap the resampled features and labels in pandas DataFrames (same names as before).
x_train, y_train = (pd.DataFrame(resampled) for resampled in (x_train, y_train))

In [None]:
# Save the new train dataset as TSV without header or index; get_data.py reads
# these files back with header=None.
train_files = ["data/X_train.tsv", "data/y_train.tsv"]
x_train.to_csv(train_files[0], index=False, header=False, quoting=csv.QUOTE_ALL, sep="\t")
y_train.to_csv(train_files[1], index=False, header=False, sep="\t")
# Upload only the two training files to the datastore. Uploading the whole
# ./data folder would also push config.json (workspace/subscription details),
# the raw CSVs and any earlier submission file, none of which the remote run reads.
ds = ws.get_default_datastore()
ds.upload_files(files=train_files, relative_root='data', target_path='bai_data', overwrite=True, show_progress=True)
# Define the data reference for the experiment: bai_data on the datastore is
# downloaded under /tmp/azureml_runs on the compute target.
dr = DataReferenceConfiguration(datastore_name=ds.name,
                   path_on_datastore='bai_data',
                   path_on_compute='/tmp/azureml_runs',
                   mode='download', # download files from datastore to compute target
                   overwrite=False)
# Look up the existing compute target for the experiment by name.
compute_target = ComputeTarget(workspace=ws, name='aml-compute')
# Create a new RunConfig object.
conda_run_config = RunConfiguration(framework="python")
# Set compute target to AmlCompute and run inside the default CPU docker image.
conda_run_config.target = compute_target
conda_run_config.environment.docker.enabled = True
conda_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE
conda_run_config.data_references = {ds.name: dr}
# Add the dependencies the remote run needs.
cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'], conda_packages=['numpy','py-xgboost<=0.80'])
conda_run_config.environment.python.conda_dependencies = cd

Set up a data-loading script for use in the experiments.

In [None]:
%%writefile $project_folder/get_data.py

import pandas as pd

def get_data():
    """Read the training features and labels from the TSV files on the compute target.

    Both files are read as tab-separated, header-less tables.

    Returns:
        dict: ``"X"`` holds the feature table as an array, ``"y"`` holds the
        first (index 0) column of the label table as an array.
    """
    read_options = dict(delimiter="\t", header=None, quotechar='"')
    features = pd.read_csv("/tmp/azureml_runs/bai_data/X_train.tsv", **read_options)
    labels = pd.read_csv("/tmp/azureml_runs/bai_data/y_train.tsv", **read_options)

    return {"X": features.values, "y": labels[0].values}

Define the AutoML configuration.

In [None]:
# Settings that are unpacked into AutoMLConfig as keyword arguments.
automl_settings = dict(
    iteration_timeout_minutes=1440,  # 1440 minutes = 24 hours
    iterations=100,
    preprocess=True,
    verbosity=logging.INFO,
    n_cross_validations=5,
    max_concurrent_iterations=2,
    max_cores_per_iteration=2,
)

# Location of the data-loading script inside the project folder.
data_script_path = project_folder + "/get_data.py"

# Classification task scored on weighted AUC, run with the run configuration
# built earlier; the remaining options come from automl_settings.
automated_ml_config = AutoMLConfig(
    task='classification',
    primary_metric='AUC_weighted',
    debug_log='automated_ml_errors.log',
    path=project_folder,
    data_script=data_script_path,
    run_configuration=conda_run_config,
    **automl_settings
)

Submit the AutoML experiment

In [None]:
# Create (or attach to) the named experiment in the workspace and submit the
# AutoML configuration, streaming the run output into the notebook.
experiment = Experiment(workspace=ws, name=experiment_name)
local_run = experiment.submit(automated_ml_config, show_output=True)

Obtain the best model

In [44]:
# Retrieve the best run together with its fitted pipeline, then print both.
best_run, fitted_model = local_run.get_output()
for item in (best_run, fitted_model):
    print(item)

Run(Experiment: automated-ml-classification,
Id: AutoML_e5d6054d-3c28-44b5-9d8b-3f3ebb8fcfc5_51,
Type: None,
Status: Completed)
Pipeline(memory=None,
     steps=[('datatransformer', DataTransformer(enable_feature_sweeping=None, feature_sweeping_timeout=None,
        is_onnx_compatible=None, logger=None, observer=None, task=None)), ('SparseNormalizer', <automl.client.core.common.model_wrappers.SparseNormalizer object at 0x000001FD237006D8>), ('XGBoostClassifier', <automl.client.core.common.model_wrappers.XGBoostClassifier object at 0x000001FD23700BA8>)])
Y_transformer(['LabelEncoder', LabelEncoder()])


Apply the best model to the test dataset

In [21]:
# Predict on the raw value matrix of the test features and show the predictions.
test_matrix = x_test.values
result = fitted_model.predict(test_matrix)
print(result)

[3 2 2 ... 2 2 1]



Prepare the results for submission

In [22]:
# Load the submission template, fill in the predictions and write the result.
submission = pd.read_csv('data/submission_format.csv')
# Use bracket assignment: attribute-style assignment (`submission.damage_grade = ...`)
# only updates an existing column and otherwise silently sets a plain Python
# attribute, which would leave the written CSV without the predictions.
submission['damage_grade'] = result
submission.to_csv('data/automl_submission.csv', index=False)