# Bring Your Own XGBoost training code

## Configure Variables

In [85]:
import pandas as pd
import boto3
import sagemaker
from sagemaker.debugger import rule_configs, Rule, DebuggerHookConfig
from sagemaker.experiments.run import Run, load_run
sm_sess = sagemaker.session.Session()



sess = boto3.Session()
region = sess.region_name
sm = sess.client('sagemaker')
role = sagemaker.get_execution_role()

aws_region = sess.region_name
account_id = sess.client('sts', region_name=region).get_caller_identity()["Account"]
s3_bucket = 'bring-your-own-training-code-{}-{}'.format(sess.region_name, account_id)
s3_prefix = 'xgboost-churn'

try:
    if sess.region_name == "us-east-1":
        sess.client('s3').create_bucket(Bucket=bucket)
    else:
        sess.client('s3').create_bucket(Bucket=bucket, 
                                        CreateBucketConfiguration={'LocationConstraint': sess.region_name})
except Exception as e:
    print("Looks like you already have a bucket of this name. That's good!")

sm_xgb_framework_version = '1.7-1'
sm_xgb_image_name = sagemaker.image_uris.retrieve(framework='xgboost', region=region, version=framework_version)

print("Setting some useful environment variables (bucket, prefix, region, docker_image_name)...")
%store s3_bucket
%store s3_prefix
%store aws_region
%store sm_xgb_image_name
%store sm_xgb_framework_version

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


Setting some useful environment variables (bucket, prefix, region, docker_image_name)...
Stored 's3_bucket' (str)
Stored 's3_prefix' (str)
Stored 'aws_region' (str)
Stored 'sm_xgb_image_name' (str)
Stored 'sm_xgb_framework_version' (str)


## Download the data and upload to S3

In [63]:
# Download the zip archive of data sets that contains churn.txt.
# NOTE(review): --no-check-certificate disables TLS certificate verification for this
# download — confirm this is acceptable, or fix the local CA store and drop the flag.
!wget https://higheredbcs.wiley.com/legacy/college/larose/0470908742/ds/data_sets.zip --no-check-certificate
# Extract the archive; -o overwrites files left over from a previous run without prompting.
!unzip -o data_sets.zip
# Keep only churn.txt in the working directory ...
!mv "Data sets"/churn.txt .
# ... then delete the remaining extracted files and the archive itself.
!rm -rf "Data sets" data_sets.zip

--2024-05-21 17:16:14--  https://higheredbcs.wiley.com/legacy/college/larose/0470908742/ds/data_sets.zip
Resolving higheredbcs.wiley.com (higheredbcs.wiley.com)... 172.64.147.222, 104.18.40.34, 2606:4700:4400::6812:2822, ...
Connecting to higheredbcs.wiley.com (higheredbcs.wiley.com)|172.64.147.222|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 1003616 (980K) [application/zip]
Saving to: ‘data_sets.zip’


2024-05-21 17:16:14 (150 MB/s) - ‘data_sets.zip’ saved [1003616/1003616]

Archive:  data_sets.zip
 extracting: Data sets/adult.zip     
  inflating: Data sets/cars.txt      
  inflating: Data sets/cars2.txt     
  inflating: Data sets/cereals.CSV   
  inflating: Data sets/churn.txt     
  inflating: Data sets/ClassifyRisk  
  inflating: Data sets/ClassifyRisk - Missing.txt  
 extracting: Data sets/DKD2e data sets.zip  
  inflating: Data sets/nn1.txt       


In [64]:
# Re-save the downloaded churn.txt as churn.csv (no index column is written).
import pandas as pd
data = pd.read_csv(filepath_or_buffer="churn.txt")
data.to_csv(path_or_buf="churn.csv", index=False)

In [65]:
# Push to S3
local_raw_path = "churn.csv"
raw_dir = f"{prefix}/data/raw"
s3uri_raw = sagemaker.s3.S3Uploader.upload(local_raw_path, f's3://{bucket}/{raw_dir}')
## Store the variable
%store s3uri_raw
s3uri_raw

Stored 's3uri_raw' (str)


's3://bring-your-own-training-code-us-east-1-047922237497/xgboost-churn/data/raw/churn.csv'

## Import the libraries

In [66]:
import io
import os
import sys
import time
import json
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from IPython.display import display as dis
from time import strftime, gmtime
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer
from IPython import display 

In [68]:
# Allow up to 500 columns in DataFrame displays so wide frames are not truncated.
pd.options.display.max_columns = 500

# Load the converted data set and preview its first two rows.
churn = pd.read_csv("./churn.csv")
churn.head(n=2)

Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False.
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False.


### Change the "Area Code" type and drop unused columns

In [69]:
# Columns removed from the frame before encoding.
columns_to_drop = ["Phone", "Day Charge", "Eve Charge", "Night Charge", "Intl Charge"]

# Cast "Area Code" to object (presumably so the later pd.get_dummies call treats it
# as a category rather than a number), then drop the columns listed above.
churn = churn.astype({"Area Code": object}).drop(columns=columns_to_drop)

### Convert our categorical features into numeric features and put Churn as the first column

In [70]:
# One-hot encode the categorical columns of the churn frame.
encoded = pd.get_dummies(churn)

# The "Churn?_True." indicator is the target; both churn indicators are removed
# from the feature columns.
target = encoded["Churn?_True."]
predictors = encoded.drop(columns=["Churn?_False.", "Churn?_True."])

# Target goes first, followed by the features; every column is cast to float.
model_data = pd.concat([target, predictors], axis=1).astype(float)

### Split the data into training, validation, and test sets.

In [71]:
# Shuffle once with a fixed seed so the split is reproducible, then cut the rows at
# 70% and 90% to get the train / validation / test sets.
# Positional slicing is used instead of np.split: np.split on a DataFrame goes through
# the deprecated DataFrame.swapaxes and emits a FutureWarning on recent pandas.
shuffled = model_data.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(shuffled))
validation_end = int(0.9 * len(shuffled))

train_data = shuffled.iloc[:train_end]
validation_data = shuffled.iloc[train_end:validation_end]
test_data = shuffled.iloc[validation_end:]

# Write each split without header row and without index column.
train_data.to_csv("train.csv", header=False, index=False)
validation_data.to_csv("validation.csv", header=False, index=False)
test_data.to_csv("test.csv", header=False, index=False)
train_data.shape, validation_data.shape, test_data.shape

  return bound(*args, **kwds)


((2333, 70), (666, 70), (334, 70))

### Upload these files to S3

In [72]:
# Upload each split to s3://<s3_bucket>/<s3_prefix>/<split>/<split>.csv.
# Keys are built with "/" explicitly: S3 keys always use forward slashes, whereas
# os.path.join would use the local OS separator.
s3_resource = boto3.Session().resource("s3")
for split in ("train", "validation", "test"):
    s3_resource.Bucket(s3_bucket).Object(f"{s3_prefix}/{split}/{split}.csv").upload_file(f"{split}.csv")

### Specify TrainingInput Data

In [73]:
s3_input_train = TrainingInput(s3_data="s3://{}/{}/train".format(bucket, prefix), content_type="csv")
s3_input_validation = TrainingInput(s3_data="s3://{}/{}/validation/".format(bucket, prefix), content_type="csv")

#Store variable
%store s3_input_train
%store s3_input_validation

Stored 's3_input_train' (TrainingInput)
Stored 's3_input_validation' (TrainingInput)


# Train the model

In [78]:
def create_date():
    """Return the current UTC time formatted as YYYY-mm-dd-HH-MM-SS."""
    return strftime("%Y-%m-%d-%H-%M-%S", gmtime())


# Experiment name: fixed prefix plus the creation timestamp.
experiment_name = "byo-code-churn-prediction-xgboost-" + create_date()

In [81]:
# Built-in SageMaker Debugger rules attached to the training job:
# loss not decreasing, overtraining and overfit.
debug_rules = [
    Rule.sagemaker(make_config())
    for make_config in (
        rule_configs.loss_not_decreasing,
        rule_configs.overtraining,
        rule_configs.overfit,
    )
]

In [82]:
# Hyperparameters handed to the XGBoost estimator.
hyperparams = dict(
    max_depth=5,
    subsample=0.8,
    num_round=600,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    objective='binary:logistic',
    verbosity=0,
)

In [87]:
from sagemaker.xgboost.estimator import XGBoost

# Estimator that runs the user-provided train.py inside the SageMaker XGBoost container.
train_script_name = 'train.py'
framework_xgb = XGBoost(image_uri=sm_xgb_image_name,
                        entry_point=train_script_name,
                        role=role,
                        framework_version=sm_xgb_framework_version,
                        py_version="py3",
                        hyperparameters=hyperparams,
                        instance_count=1,
                        instance_type='ml.m4.xlarge',
                        # s3_bucket / s3_prefix are the names defined in the configuration cell.
                        output_path=f's3://{s3_bucket}/{s3_prefix}/output',
                        base_job_name='BYO-Code-xgboost-customer-churn',
                        sagemaker_session=sm_sess,
                        rules=debug_rules
                        )


In [88]:
# Input channels for the training job, keyed by channel name.
training_channels = {
    'train': s3_input_train,
    'validation': s3_input_validation,
}
run_label = f"BYO-Code-01-framework-mode-run-{create_date()}"

# Fit inside an Experiments run context (presumably so the training job is tracked
# under the experiment named above — confirm in the SageMaker Experiments console).
with Run(experiment_name=experiment_name, run_name=run_label, sagemaker_session=sm_sess) as run:
    framework_xgb.fit(inputs=training_channels)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: BYO-Code-xgboost-customer-churn-2024-05-21-17-24-41-745


2024-05-21 17:24:42 Starting - Starting the training job...
2024-05-21 17:25:11 Starting - Preparing the instances for trainingLossNotDecreasing: InProgress
Overtraining: InProgress
Overfit: InProgress
...
2024-05-21 17:25:31 Downloading - Downloading input data...
2024-05-21 17:26:11 Downloading - Downloading the training image......
2024-05-21 17:27:12 Training - Training image download completed. Training in progress...[34m[2024-05-21 17:27:15.024 ip-10-2-78-200.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-05-21 17:27:15.048 ip-10-2-78-200.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-05-21:17:27:15:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-05-21:17:27:15:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-05-21:17:27:15:INFO] Invoking user training script.[0m
[34m[2024-05-21:17:27:15:INFO] Module train does not provide a setup.py. [0m
[34mGe