In [None]:
# import necessary libraries
import boto3
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.session import Session
from sagemaker.tuner import (
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
    IntegerParameter,
)
from sagemaker.xgboost.estimator import XGBoost

# Execution role obtained from the SageMaker SDK; it is passed to the estimator below.
role = get_execution_role()
# S3 bucket that holds the train/validation CSVs and receives the training output.
default_bucket = "sagemaker-data113"

## XGBoost Hyperparameter Tuning and Training Jobs

In [None]:
# Build the training and validation data channels from the CSV files in the S3 bucket
# (the files were saved in 'churn_data_prep.ipynb').
s3_input_train, s3_input_validation = (
    TrainingInput(s3_data=f"s3://{default_bucket}/{split}.csv", content_type="csv")
    for split in ("train", "validation")
)

The following hyperparameters are fixed:

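- `eval_metric` evaluation metric computed on the validation data. It is set to `auc` (area under the ROC curve) rather than the error rate = #(wrong_cases)/#(total_cases) at a threshold of 0.5, which was the default for binary classification in older XGBoost releases.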
- `objective` is `binary:logistic`: logistic regression for binary classification; the model outputs a probability.
- `num_round` controls the number of boosting rounds, i.e. the number of successive models that are trained on the residuals of the previous iterations. More rounds should produce a better fit on the training data, but can be computationally expensive or lead to overfitting.
- `rate_drop` The dropout rate, i.e. the fraction of previous trees to drop during the dropout. XGBoost only uses this parameter with the `dart` booster.



In [None]:
# Hyperparameters held constant for every training job; they are passed to the
# estimator below. All values are given as strings.
fixed_hyperparameters = {
    "eval_metric": "auc",
    "objective": "binary:logistic",
    "num_round": "100",
    # `rate_drop` is a DART-booster parameter in XGBoost; with the default
    # `gbtree` booster it has no effect, so the DART booster is selected explicitly.
    "booster": "dart",
    "rate_drop": "0.3",
}

In [None]:
# One SageMaker session is shared by the image lookup and the estimator.
sess = sagemaker.Session()

# Look up the XGBoost container image (version 1.5-1) for the session's region.
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=sess.boto_region_name,
    version="1.5-1",
)

# Estimator carrying the fixed hyperparameters; it is handed to the tuner below.
estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    hyperparameters=fixed_hyperparameters,
    output_path=f"s3://{default_bucket}/output",
    sagemaker_session=sess,
)

The following hyperparameters are varied during tuning:

- `eta` controls how aggressive each round of boosting is (step-size shrinkage). Smaller values lead to more conservative boosting.
- `min_child_weight` Minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than `min_child_weight`, the building process gives up further partitioning. The larger the value, the more conservative the algorithm is.
- `alpha` L1 regularization term on weights. Increasing this value makes models more conservative.
- `max_depth` Maximum depth of a tree. Increasing this value makes the model more complex and likely to be overfit.

In [None]:
# Search space handed to the tuner: three continuous ranges and one integer range.
hyperparameter_ranges = dict(
    eta=ContinuousParameter(min_value=0, max_value=1),
    min_child_weight=ContinuousParameter(min_value=1, max_value=10),
    alpha=ContinuousParameter(min_value=0, max_value=2),
    max_depth=IntegerParameter(min_value=1, max_value=10),
)

In [None]:
# Name of the metric passed to the tuner as its objective; the estimator's fixed
# "eval_metric" hyperparameter is "auc".
objective_metric_name = "validation:auc"

In [None]:
# Tuning job definition: at most 10 training jobs in total, at most 2 in parallel.
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    max_jobs=10,
    max_parallel_jobs=2,
)

In [None]:
# Map each channel name to its S3 input, then start the tuning job.
data_channels = {"train": s3_input_train, "validation": s3_input_validation}
tuner.fit(data_channels, include_cls_metadata=False)

In [None]:
# Fetch the description of the tuning job that was just launched.
sm_client = boto3.client("sagemaker")
tuning_job_result = sm_client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.job_name
)

# Report how many of its training jobs are counted as completed.
job_count = tuning_job_result["TrainingJobStatusCounters"]["Completed"]
print(f"{job_count:d} training jobs have completed")

### Fetch Tuning results

In [None]:
# Collect the per-training-job results of the tuning job into a DataFrame.
# A dedicated name is used for the analytics object so that `tuner` keeps
# referring to the HyperparameterTuner created earlier; rebinding it would break
# any cell that calls `tuner.latest_tuning_job` when it is run again.
tuner_analytics = sagemaker.HyperparameterTuningJobAnalytics(
    tuning_job_result["HyperParameterTuningJobName"]
)
full_df = tuner_analytics.dataframe()

# Objective configuration of the tuning job; the type decides the sort direction.
objective = tuning_job_result["HyperParameterTuningJobConfig"]["HyperParameterTuningJobObjective"]
is_minimize = objective["Type"] != "Maximize"
objective_name = objective["MetricName"]

# `df` is always defined, even when no training job has been recorded yet, so the
# final display line cannot raise a NameError.
if len(full_df) > 0:
    # Keep only the jobs whose final objective value passes the > -inf comparison.
    df = full_df[full_df["FinalObjectiveValue"] > -float("inf")]
else:
    df = full_df

if len(df) > 0:
    # Best job first: ascending when minimizing, descending when maximizing.
    df = df.sort_values("FinalObjectiveValue", ascending=is_minimize)
    print("Number of training jobs with valid objective: %d" % len(df))
    print({"lowest": min(df["FinalObjectiveValue"]), "highest": max(df["FinalObjectiveValue"])})
    pd.set_option("display.max_colwidth", None)  # Don't truncate TrainingJobName
else:
    print("No training jobs have reported valid results yet.")

df

In [None]:
# Read the tuned hyperparameter values of the best training job from the
# tuning-job description fetched earlier, then display them.
best_training_job = tuning_job_result["BestTrainingJob"]
best_hyperparameters = best_training_job["TunedHyperParameters"]
best_hyperparameters

The scatter plot shows that the points are spread well apart from each other. Hence, we have set the ranges well for hyperparameter optimization.


## Register the model

Go to the Amazon SageMaker console. Under Training -> Hyperparameter Tuning Jobs, select the hyperparameter tuning job that was initiated in this notebook (find it by its name, or use the Creation Time column). In the hyperparameter tuning job there is a tab showing the Best Trained Model summary and a "Create Model" button, which registers the model container. We will later use this image to create a deployment endpoint for real-time prediction.