<center>

# Classification of clients of a bank's marketing campaign

## Armando Medina
    
#### (October, 2020)
</center>

<br />

In [None]:
from azureml.core import Workspace, Experiment

# Connect to the Azure ML workspace by name and create a handle to the
# "udacity-project" experiment, then print the workspace details.
#ws = Workspace.get(name="quick-starts-ws-122643")
ws = Workspace.get(name="Experiments")

exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name,
      'Azure region: ' + ws.location,
      'Subscription id: ' + ws.subscription_id,
      'Resource group: ' + ws.resource_group, sep='\n')

# No interactive run is opened with exp.start_logging() here: such a run was
# never completed, and its handle (`run`) is reassigned by the later
# experiment.submit(...) cell, leaving the interactive run open with no handle
# to close it.

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the compute target to reuse or create.
#cpu_cluster_name = "udacity-first-project"
cpu_cluster_name = "TheGPUMachine"

# Reuse the compute target when it already exists in the workspace; otherwise
# provision a new AmlCompute cluster under the same name.
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="Standard_D2_V2",
        max_nodes=4,
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Wait on the compute target in both cases before continuing.
cpu_cluster.wait_for_completion(show_output=True)

In [None]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn

from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive import choice
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform
import os

# Random sampling over the two hyperparameters passed to train.py:
# C drawn uniformly from [0.01, 100] and max_iter picked from three values.
ps = RandomParameterSampling({
    'C': uniform(0.01, 100),
    'max_iter': choice(100, 1000, 10000),
})

# Early-termination policy for the sweep.
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=2)

# Create a "training" directory in the working directory if no entry with
# that name exists yet.
if "training" not in os.listdir():
    os.mkdir("./training")

# Extra pip packages installed in the estimator's environment.
azureml_pip_packages = [
    'azureml-defaults',
    'azureml-contrib-interpret',
    'azureml-telemetry',
    'azureml-interpret',
]

# SKLearn estimator that runs train.py from the current directory on the cluster.
est = SKLearn(
    source_directory=".",
    entry_script='./train.py',
    compute_target=cpu_cluster,
    pip_packages=azureml_pip_packages,
)

# HyperDrive sweep: maximize the 'Accuracy' metric over 4 runs, all of which
# may run concurrently.
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=4,
    max_concurrent_runs=4,
)

In [None]:
from azureml.core.experiment import Experiment

# Submit the HyperDrive configuration to the "hyperparamenter_tuning" experiment.
experiment = Experiment(workspace=ws, name="hyperparamenter_tuning")
run = experiment.submit(config=hyperdrive_config, show_output=True)

# Show the run-details widget for the submitted sweep.
hyperdrive_widget = RunDetails(run)
hyperdrive_widget.show()

In [None]:
# Wait for the submitted run to complete before reading its results.
run.wait_for_completion(show_output=True)

In [None]:
import joblib
from azureml.core import Model
from azureml.core.resource_configuration import ResourceConfiguration

# Select the best run of the sweep by its primary metric.
best_run = run.get_best_run_by_primary_metric()

# Register that run's saved model file as 'sklearn-lr'.
registration_settings = dict(
    model_name='sklearn-lr',
    model_path='./outputs/model.joblib',
    model_framework=Model.Framework.SCIKITLEARN,
    model_framework_version='0.22.2',
    resource_configuration=ResourceConfiguration(cpu=1, memory_in_gb=7.0),
)
model = best_run.register_model(**registration_settings)

# Print the best run's metrics, then the registered model, one per line.
print(best_run.get_metrics(), model, sep='\n')

### Model Evaluation using Confusion Matrix

In [None]:
from sklearn.linear_model import LogisticRegression
from azureml.data.dataset_factory import TabularDatasetFactory
from cleandata import clean_data
from sklearn.model_selection import train_test_split


# Load the bank-marketing training CSV, clean it, and hold out 33% for evaluation.
ds = TabularDatasetFactory.from_delimited_files("https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv", validate=True, include_path=False, infer_column_types=True, set_column_types=None, separator=',', header=True, partition_format=None, support_multi_line=False, empty_as_string=False)
x, y = clean_data(ds)
# NOTE(review): x_test is only a true hold-out set if train.py uses the same
# split (test_size and random_state) — confirm against train.py.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

# exist_ok=True so re-running this cell overwrites a previously downloaded
# model file instead of failing; the deprecated `exists_ok` argument is dropped.
model.download(target_dir='outputs/', exist_ok=True)

# Load the downloaded scikit-learn model.
lr_model = joblib.load('outputs/model.joblib')

# Predictions and score on the held-out data.
predictions = lr_model.predict(x_test)
score = lr_model.score(x_test, y_test)

In [None]:
!pip install seaborn

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
import numpy as np

# Confusion matrix of the held-out labels against the model's predictions.
cm = metrics.confusion_matrix(y_true=y_test, y_pred=predictions)

In [None]:
# Heatmap of the confusion matrix, each cell shown as a share of all samples.
cm_share = cm / cm.sum()

plt.figure(figsize=(12, 12))
sns.heatmap(
    cm_share,
    annot=True,
    fmt=".2%",
    linewidths=.5,
    square=True,
    cmap='Blues_r',
);
plt.ylabel('Actual label');
plt.xlabel('Predicted label');

# The title reports the score computed for the same test data.
all_sample_title = 'Accuracy Score: {0}'.format(score)
plt.title(all_sample_title, size=15);

In [None]:
from azureml.data.dataset_factory import TabularDatasetFactory
# Location of the bank-marketing training data.
BANK_MARKETING_CSV = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

# Read the comma-separated file (with a header row) as a tabular dataset.
ds = TabularDatasetFactory.from_delimited_files(
    BANK_MARKETING_CSV,
    validate=True,
    include_path=False,
    infer_column_types=True,
    set_column_types=None,
    separator=',',
    header=True,
    partition_format=None,
    support_multi_line=False,
    empty_as_string=False,
)

In [None]:
from cleandata import clean_data
import os

# Split the dataset into features and target with the project's clean_data helper.
x, y = clean_data(ds)

# Keep the feature column names as a plain list.
feature_names = [column for column in x.columns]

### Explanation

In [None]:
!pip install --upgrade --upgrade-strategy eager azureml-sdk azureml-contrib-explain-model azureml-interpret 
#!pip install azureml-contrib-explain-model==1.0.65 --force-reinstall
!pip install -U scikit-learn==0.22.2.post1 --force-reinstall

In [None]:
#from azureml.contrib.interpret.explanation.explanation_client import ExplanationClient
from azureml.interpret import ExplanationClient

# Explanation client bound to the best run.
client = ExplanationClient.from_run(best_run)

# Download the model explanation (top_k=5) and read its local importance
# values and expected values.
global_explanation = client.download_model_explanation(top_k=5)
local_importance_values, expected_values = (
    global_explanation.local_importance_values,
    global_explanation.expected_values,
)

In [None]:
# Download the top-5 explanation and read the ranked global importance values
# together with the matching feature names.
global_explanation_topk = client.download_model_explanation(top_k=5)
global_importance_values, global_importance_names = (
    global_explanation_topk.get_ranked_global_values(),
    global_explanation_topk.get_ranked_global_names(),
)

In [None]:
# Print the ranked importance values first, then the feature names.
for template, ranked in (
    ('global importance values: {}', global_importance_values),
    ('global importance names: {}', global_importance_names),
):
    print(template.format(ranked))

In [None]:
from azureml.core.model import Model
import joblib
# Look up the registered 'sklearn-lr' model in the workspace, download it
# (overwriting any existing copy), and load the estimator from the downloaded path.
registered_model = Model(workspace=ws, name='sklearn-lr')
model_path = registered_model.download(exist_ok=True)
original_model = joblib.load(model_path)

In [None]:
from azureml.core import Dataset
from azureml.data.dataset_factory import DataType
from azureml.data.dataset_factory import TabularDatasetFactory

# Build the cleaned dataset for AutoML: load the raw CSV, clean it, write it to
# ./data, upload that folder to the workspace's default datastore, and split
# the registered tabular dataset 80/20.
df = TabularDatasetFactory.from_delimited_files("https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv", validate=True, include_path=False, infer_column_types=True, set_column_types=None, separator=',', header=True, partition_format=None, support_multi_line=False, empty_as_string=False)

x_automl, y_automl = clean_data(df)

# Work on a copy so that attaching the label column does not also add it to
# the feature frame x_automl.
data_train = x_automl.copy()
data_train['y'] = y_automl
# Column names of the combined frame (this list includes the label column 'y').
feature_names = list(data_train.columns)

# Create ./data if it does not exist yet.
os.makedirs("data", exist_ok=True)

local_path = './data/data_clean.csv'
# to_csv is called without index=False, so the DataFrame index is written as an
# extra unnamed first column and becomes a column of the tabular dataset below.
# NOTE(review): consider index=False once it is confirmed that no other
# consumer of data_clean.csv relies on that column.
data_train.to_csv(local_path)

datastore = ws.get_default_datastore()

# overwrite=True so a re-run replaces files already present in the datastore
# instead of leaving older copies in place.
datastore.upload(src_dir='data', target_path='data', overwrite=True)

datastore_paths = [(datastore, 'data/data_clean.csv')]

dataset = Dataset.Tabular.from_delimited_files(path=datastore_paths)

# 80% training / 20% validation, with a fixed seed.
training_data, validation_data = dataset.random_split(percentage=0.8, seed=1)

In [None]:
from azureml.train.automl import AutoMLConfig

# AutoML classification on the split data, run on the compute cluster.
# The metric argument is `primary_metric`; `primary_metric_name` is the
# HyperDriveConfig spelling and is not an AutoMLConfig parameter.
# NOTE(review): 'MaxAbsScaler SVM' does not look like an AutoML model name —
# confirm against the supported-models list.
automl_config = AutoMLConfig(task='classification',
                             experiment_timeout_minutes=30,
                             primary_metric='accuracy',
                             blocked_models=['XGBoostClassifier', 'MaxAbsScaler SVM'],
                             training_data=training_data,
                             validation_data=validation_data,
                             label_column_name='y',
                             compute_target=cpu_cluster
                            )

In [None]:
# Submit the AutoML configuration through the existing `experiment` handle.
automl_run = experiment.submit(config=automl_config, show_output=True)

In [None]:
from azureml.widgets import RunDetails
# Show the run-details widget for the AutoML run.
automl_widget = RunDetails(automl_run)
automl_widget.show()

In [None]:
# Retrieve and save your best automl model.
from azureml.core import Model
from azureml.core.resource_configuration import ResourceConfiguration

# Fetch the best AutoML child run and its fitted model, selected by accuracy.
best_run, fitted_model = automl_run.get_output(metric="accuracy")
print(best_run)

# Register the AutoML run's model and print the id of the registered model.
description, tags = 'Best AutoML Model', None
model = automl_run.register_model(tags=tags, description=description)
print(automl_run.model_id)

In [None]:
# Print the fitted model returned by get_output for the best run.
fitted_model_report = 'fitted_model:\n {}\n\n'.format(fitted_model)
print(fitted_model_report)

In [None]:
# Display the best AutoML run as the cell output.
best_run

### Handle imbalanced data

The target variable `y` is extremely imbalanced. This imbalance biases the model, as the confusion matrix shows.

The `handle-imbalanced-data.ipynb` notebook included in this project shows how the problem is corrected by creating a balanced dataset with the Synthetic Minority Oversampling Technique (SMOTE).

In [None]:
# Bar chart of how many rows fall in each class of the target `y`.
# The counts are ordered by class value (sort_index) rather than by frequency,
# so the bars keep matching the fixed 'No'/'Yes' labels even if 'Yes' were the
# more frequent class.
# NOTE(review): assumes the negative class sorts first (e.g. 0/1 or 'no'/'yes')
# — confirm against clean_data.
class_counts = data_train['y'].value_counts().sort_index()

plt.bar(['No', 'Yes'], class_counts.values, facecolor='lawngreen', edgecolor='lightseagreen', linewidth=0.5)
plt.title('Has the client subscribed a term deposit?', fontsize=14)
plt.xlabel('Answer')
plt.ylabel('No.')
plt.show()

In [None]:
# Tabular datasets for the balanced training file and the validation file,
# both read from the 'data' folder of the datastore.
datastore_training_paths = [(datastore, 'data/data_balanced.csv')]
datastore_validation_paths = [(datastore, 'data/data_validation.csv')]

training_dataset = Dataset.Tabular.from_delimited_files(path=datastore_training_paths)
validation_dataset = Dataset.Tabular.from_delimited_files(path=datastore_validation_paths)


# AutoML classification on the balanced data.
# The metric argument is `primary_metric`; `primary_metric_name` is the
# HyperDriveConfig spelling and is not an AutoMLConfig parameter.
# NOTE(review): 'MaxAbsScaler SVM' does not look like an AutoML model name —
# confirm against the supported-models list.
automl_config = AutoMLConfig(task='classification',
                             experiment_timeout_minutes=30,
                             primary_metric='accuracy',
                             blocked_models=['XGBoostClassifier', 'MaxAbsScaler SVM'],
                             training_data=training_dataset,
                             validation_data=validation_dataset,
                             label_column_name='y',
                             compute_target=cpu_cluster
                            )

automl_run = experiment.submit(automl_config, show_output=True)

In [None]:
# Show the run-details widget for the AutoML run on the balanced data.
automl_widget = RunDetails(automl_run)
automl_widget.show()

In [None]:
# Retrieve and save your best automl model.
from azureml.core import Model
from azureml.core.resource_configuration import ResourceConfiguration

# Fetch the best AutoML child run and its fitted model, selected by accuracy.
best_run, fitted_model = automl_run.get_output(metric="accuracy")
print(best_run)

# Register the AutoML run's model and print the id of the registered model.
description, tags = 'Best AutoML Model', None
model = automl_run.register_model(tags=tags, description=description)
print(automl_run.model_id)

In [None]:
# Display the best AutoML run as the cell output.
best_run

In [None]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn

from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive import choice
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform
import os

# Random sampling over the two hyperparameters passed to the training script:
# C drawn uniformly from [0.01, 100] and max_iter picked from three values.
ps = RandomParameterSampling({
    'C': uniform(0.01, 100),
    'max_iter': choice(100, 1000, 10000),
})

# Early-termination policy for the sweep.
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=2)

# Create a "training" directory in the working directory if no entry with
# that name exists yet.
if "training" not in os.listdir():
    os.mkdir("./training")

# Extra pip packages installed in the estimator's environment.
azureml_pip_packages = [
    'azureml-defaults',
    'azureml-contrib-interpret',
    'azureml-telemetry',
    'azureml-interpret',
]

# SKLearn estimator that runs train_balanced.py from the current directory on
# the cluster.
est = SKLearn(
    source_directory=".",
    entry_script='./train_balanced.py',
    compute_target=cpu_cluster,
    pip_packages=azureml_pip_packages,
)

# HyperDrive sweep: maximize the 'Accuracy' metric over 4 runs, all of which
# may run concurrently.
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=4,
    max_concurrent_runs=4,
)

In [None]:
from azureml.core.experiment import Experiment

# Submit the HyperDrive configuration to the "hyperparamenter_tuning" experiment.
experiment = Experiment(workspace=ws, name="hyperparamenter_tuning")
run = experiment.submit(config=hyperdrive_config, show_output=True)

# Show the run-details widget for the submitted sweep.
hyperdrive_widget = RunDetails(run)
hyperdrive_widget.show()

In [None]:
# Wait for the submitted run to complete before reading its results.
run.wait_for_completion(show_output=True)

In [None]:
import joblib
from azureml.core import Model
from azureml.core.resource_configuration import ResourceConfiguration

# read_csv below needs pandas, which no earlier visible cell imports.
import pandas as pd

# Select the best run of the sweep by its primary metric and register its
# saved model file as 'sklearn-lr'.
best_run = run.get_best_run_by_primary_metric()

model = best_run.register_model(model_name='sklearn-lr',
                                model_path='./outputs/model.joblib',
                                model_framework=Model.Framework.SCIKITLEARN,
                                model_framework_version='0.22.2',
                                resource_configuration=ResourceConfiguration(cpu=1, memory_in_gb=7.0)
                               )

# Download the registered model, overwriting any earlier local copy
# (the deprecated `exists_ok` argument is dropped).
model.download(target_dir='outputs/', exist_ok=True)

# Load the downloaded scikit-learn model.
lr_model = joblib.load('outputs/model.joblib')

# Validation data read from the local ./data folder.
data_test = pd.read_csv('data/data_validation.csv')

y_test = data_test["y"]
# Features are every column except the label and "Unnamed: 0".
# NOTE(review): "Unnamed: 0" is presumably an index column written by to_csv
# — confirm. A non-mutating drop leaves data_test unchanged.
x_test = data_test.drop(columns=["y", "Unnamed: 0"])

# Predictions, score and confusion matrix on the validation data.
predictions = lr_model.predict(x_test)
score = lr_model.score(x_test, y_test)
cm = metrics.confusion_matrix(y_test, predictions)

# Heatmap of the confusion matrix, each cell shown as a share of all samples.
plt.figure(figsize=(12, 12))
sns.heatmap(cm/np.sum(cm), annot=True, fmt=".2%", linewidths=.5, square=True, cmap='Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(score)
plt.title(all_sample_title, size=15);

### Cluster clean up

In [None]:
# Uncomment to delete the compute cluster once all experiments are finished.
#cpu_cluster.delete()