In [2]:
!pip3 install -U sagemaker

Collecting sagemaker
  Downloading sagemaker-2.207.0-py3-none-any.whl.metadata (13 kB)
Collecting urllib3<3.0.0,>=1.26.8 (from sagemaker)
  Using cached urllib3-2.0.7-py3-none-any.whl.metadata (6.6 kB)
Downloading sagemaker-2.207.0-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hUsing cached urllib3-2.0.7-py3-none-any.whl (124 kB)
Installing collected packages: urllib3, sagemaker
  Attempting uninstall: urllib3
    Found existing installation: urllib3 2.1.0
    Uninstalling urllib3-2.1.0:
      Successfully uninstalled urllib3-2.1.0
  Attempting uninstall: sagemaker
    Found existing installation: sagemaker 2.199.0
    Uninstalling sagemaker-2.199.0:
      Successfully uninstalled sagemaker-2.199.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency con

In [3]:
import os
from time import gmtime, strftime

import boto3
import pandas as pd
import sagemaker
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# SageMaker session plus the region and default bucket derived from it.
sess = sagemaker.Session()
region = sess.boto_region_name
bucket = sess.default_bucket()

# Execution role that is handed to the estimator when the training job is created.
role = sagemaker.get_execution_role()

# NOTE(review): `prefix` is not referenced by any cell visible here — confirm before removing.
prefix = "DEMO-breast-cancer-prediction-xgboost-highlevel"

# Base S3 location used below for the uploaded split files and the training output.
s3_private_path_tsv = "s3://{}/amazon-reviews-pds/tsv".format(bucket)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


## Read In Data and Create Train/Val/Test Splits

In [62]:
# Load the cleaned churn table and drop the 'Unnamed: 0' and 'id' columns,
# which are not used as features.
churn_csv = "../data/internet_churn_cleaned.csv"
df = pd.read_csv(churn_csv).drop(columns=['Unnamed: 0', 'id'])
df.head()

Unnamed: 0,is_tv_subscriber,is_movie_package_subscriber,subscription_age,bill_avg,service_failure_count,download_avg,upload_avg,download_over_limit,churn,remaining_contract_0-1_years,remaining_contract_1-2_years,remaining_contract_2-3_years,remaining_contract_no_contract
0,1,0,11.95,25,0,8.4,2.3,0,0,1,0,0,0
1,0,0,8.22,0,0,0.0,0.0,0,1,0,0,0,1
2,1,0,8.91,16,0,13.7,0.9,0,1,1,0,0,0
3,0,0,6.87,21,1,0.0,0.0,0,1,0,0,0,1
4,0,0,6.39,0,0,0.0,0.0,0,1,0,0,0,1


In [63]:
# 80/10/10 split: hold out 20% of the rows first, then cut that hold-out in half
# to obtain the test and validation sets.
X = df.drop(columns='churn')
y = df['churn']

X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=24
)
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=24
)

In [64]:
# Standardize the continuous columns; every other column is passed through
# unchanged (remainder='passthrough').
cols_scale = ['subscription_age','bill_avg','service_failure_count','download_avg','upload_avg','download_over_limit']
SS = ColumnTransformer([('scaler',StandardScaler(),cols_scale)],remainder='passthrough')

# Learn the scaling statistics from the training split only, then apply the same
# fitted transformer to the test and validation splits. Re-fitting on each split
# would scale every split differently and leak held-out statistics.
X_train = SS.fit_transform(X_train)
X_test = SS.transform(X_test)
X_val = SS.transform(X_val)

In [65]:
# Names for the columns of the transformed arrays: the scaled columns come first,
# followed by the passthrough columns. Keep this list in sync with `cols_scale`
# and with the column order of the input frame.
new_cols = [
    'subscription_age',
    'bill_avg',
    'service_failure_count',
    'download_avg',
    'upload_avg',
    'download_over_limit',
    'is_tv_subscriber',
    'is_movie_package_subscriber',
    'remaining_contract_0-1_years',
    'remaining_contract_1-2_years',
    'remaining_contract_2-3_years',
    'remaining_contract_no_contract',
]

# Wrap each transformed array back into a labelled DataFrame.
X_train, X_test, X_val = (
    pd.DataFrame(split, columns=new_cols) for split in (X_train, X_test, X_val)
)
X_train.head()

Unnamed: 0,subscription_age,bill_avg,service_failure_count,download_avg,upload_avg,download_over_limit,is_tv_subscriber,is_movie_package_subscriber,remaining_contract_0-1_years,remaining_contract_1-2_years,remaining_contract_2-3_years,remaining_contract_no_contract
0,2.503554,0.227845,-0.334555,-0.121946,-0.195863,-0.209692,1.0,0.0,0.0,1.0,0.0,0.0
1,0.773575,0.003701,-0.334555,0.977917,0.40514,-0.209692,1.0,0.0,0.0,1.0,0.0,0.0
2,1.283257,0.30256,-0.334555,-0.489083,-0.364566,-0.209692,0.0,0.0,0.0,0.0,0.0,1.0
3,0.146274,-0.668732,-0.334555,-0.309387,-0.248583,-0.209692,1.0,1.0,0.0,1.0,0.0,0.0
4,0.136472,-0.071014,-0.334555,-0.451905,-0.332934,-0.209692,1.0,0.0,0.0,0.0,0.0,1.0


## Upload Split Files to S3 Bucket

In [66]:
# Append y cols to dataframes
def attach_label_first(features, labels, label="churn"):
    """Return a frame holding `labels` as an integer first column, then the features.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature columns for one split.
    labels : pandas.Series
        Target values for the same rows, in the same order as `features`.
    label : str, default "churn"
        Name of the label column in the result.

    Returns
    -------
    pandas.DataFrame
        New frame with `label` first (cast to int) followed by the feature columns.

    Notes
    -----
    Both inputs are updated in place, exactly as the previous copy-pasted code
    did: `labels` receives a fresh 0..n-1 index and `features` gains a `label`
    column. NOTE(review): later cells may rely on that — confirm before making
    this function side-effect free.
    """
    # Give the labels a positional index so the column assignment below aligns
    # them row-for-row with `features` instead of by their old row labels.
    labels.index = pd.RangeIndex(len(labels))
    features[label] = labels

    # Reorder by name (label first) and cast on the new frame, which avoids
    # assigning into a slice of `features` (SettingWithCopyWarning).
    ordered = [label] + [col for col in features.columns if col != label]
    return features[ordered].astype({label: int})


train_df = attach_label_first(X_train, y_train)
val_df = attach_label_first(X_val, y_val)
test_df = attach_label_first(X_test, y_test)

# Kept so that any cell still reading `col_names` sees the same value as before.
col_names = list(test_df.columns)

In [67]:
# Write split dfs to csv files (no header row, no index column)
splits_dir = "../data/splits"
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(splits_dir, exist_ok=True)

for split_name, split_df in {"train": train_df, "val": val_df, "test": test_df}.items():
    split_df.to_csv("{}/{}.csv".format(splits_dir, split_name), header=False, index=False)

In [68]:
# Upload data to S3 bucket
!aws s3 cp --recursive ../data/splits/ $s3_private_path_tsv/

upload: ../data/splits/test.csv to s3://sagemaker-us-east-1-129457079970/amazon-reviews-pds/tsv/test.csv
upload: ../data/splits/val.csv to s3://sagemaker-us-east-1-129457079970/amazon-reviews-pds/tsv/val.csv
upload: ../data/splits/train.csv to s3://sagemaker-us-east-1-129457079970/amazon-reviews-pds/tsv/train.csv


In [69]:
!aws s3 ls $s3_private_path_tsv/

                           PRE .ipynb_checkpoints/
                           PRE xgb_output/
2024-01-22 22:16:46    9828620 dataset.csv
2024-02-06 01:00:11    1062003 test.csv
2024-02-06 01:00:11    8410493 train.csv
2024-02-06 01:00:11    1056164 val.csv


In [70]:
%%time
from time import gmtime, strftime

job_name = "xgb-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
output_location = "{}/xgb_output/{}".format(s3_private_path_tsv, job_name)
image = sagemaker.image_uris.retrieve(
    framework="xgboost", region=boto3.Session().region_name, version="1.7-1"
)

sm_estimator = sagemaker.estimator.Estimator(
    image,
    role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size=50,
    input_mode="File",
    output_path=output_location,
    sagemaker_session=sess,
)

sm_estimator.set_hyperparameters(
    objective="binary:logistic",
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    verbosity=0,
    num_round=100,
)

train_data = sagemaker.inputs.TrainingInput(
    "{}/train.csv".format(s3_private_path_tsv),
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
)
validation_data = sagemaker.inputs.TrainingInput(
    "{}/val.csv".format(s3_private_path_tsv),
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
)
data_channels = {"train": train_data, "validation": validation_data}

# Start training by calling the fit method in the estimator
sm_estimator.fit(inputs=data_channels, job_name=job_name, logs=True)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: xgb-2024-02-06-01-00-18


2024-02-06 01:00:19 Starting - Starting the training job...
2024-02-06 01:00:35 Starting - Preparing the instances for training.........
2024-02-06 01:02:04 Downloading - Downloading input data...
2024-02-06 01:02:38 Downloading - Downloading the training image......
2024-02-06 01:03:39 Training - Training image download completed. Training in progress..[34m[2024-02-06 01:03:51.091 ip-10-0-107-202.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-02-06 01:03:51.113 ip-10-0-107-202.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-02-06:01:03:51:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-02-06:01:03:51:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2024-02-06:01:03:51:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-02-06:01:03:51:INFO] Running XGBoost Sagemaker in algorithm mode[

In [75]:
!aws s3 ls $s3_private_path_tsv/xgb_output/xgb-2024-02-06-01-00-18/xgb-2024-02-06-01-00-18/output/

2024-02-06 01:04:04      75883 model.tar.gz
