In [2]:
!pip3 install -U sagemaker

Collecting sagemaker
  Using cached sagemaker-2.207.1-py3-none-any.whl.metadata (13 kB)
Collecting docker (from sagemaker)
  Downloading docker-7.0.0-py3-none-any.whl.metadata (3.5 kB)
Using cached sagemaker-2.207.1-py3-none-any.whl (1.4 MB)
Downloading docker-7.0.0-py3-none-any.whl (147 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.6/147.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: docker, sagemaker
  Attempting uninstall: sagemaker
    Found existing installation: sagemaker 2.198.0
    Uninstalling sagemaker-2.198.0:
      Successfully uninstalled sagemaker-2.198.0
Successfully installed docker-7.0.0 sagemaker-2.207.1
[0m

In [4]:
import os
import boto3
import sagemaker
import pandas as pd
#import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

role = sagemaker.get_execution_role()
sess = sagemaker.Session()
region = sess.boto_region_name

bucket = sess.default_bucket()
prefix = "DEMO-breast-cancer-prediction-xgboost-highlevel"
s3_private_path_tsv = "s3://{}/amazon-reviews-pds/tsv".format(bucket)

## Read In Data and Create Train/Val?test Splits

In [9]:
df = pd.read_csv("../data/internet_churn_cleaned.csv")
print(df.columns)

df = df.drop(labels= ['Unnamed: 0', 'id'], axis=1)
df = df[df[df.columns].min(axis=1) >= 0]

#df = df.rename(columns = {'reamining_contract':'remaining_contract'})
#df['remaining_contract'] = df['remaining_contract'].astype(str)

#df['remaining_contract'].replace('nan', 'no contract', inplace=True)

df[['download_avg','upload_avg']] = df[['download_avg','upload_avg']].fillna(df[['download_avg','upload_avg']].median())
        
#df = pd.get_dummies(df, columns = ['remaining_contract'],dtype = int)
df= df.rename({'remaining_contract_0-1 years':'remaining_contract_0-1_years',
              'remaining_contract_1-2 years': 'remaining_contract_1-2_years',
              'remaining_contract_2-3 years': 'remaining_contract_2-3_years',
              'remaining_contract_no contract':'remaining_contract_no_contract'},axis = 1)


df.head(5)

Index(['Unnamed: 0', 'id', 'is_tv_subscriber', 'is_movie_package_subscriber',
       'subscription_age', 'bill_avg', 'service_failure_count', 'download_avg',
       'upload_avg', 'download_over_limit', 'churn',
       'remaining_contract_0-1_years', 'remaining_contract_1-2_years',
       'remaining_contract_2-3_years', 'remaining_contract_no_contract'],
      dtype='object')


Unnamed: 0,is_tv_subscriber,is_movie_package_subscriber,subscription_age,bill_avg,service_failure_count,download_avg,upload_avg,download_over_limit,churn,remaining_contract_0-1_years,remaining_contract_1-2_years,remaining_contract_2-3_years,remaining_contract_no_contract
0,1,0,11.95,25,0,8.4,2.3,0,0,1,0,0,0
1,0,0,8.22,0,0,0.0,0.0,0,1,0,0,0,1
2,1,0,8.91,16,0,13.7,0.9,0,1,1,0,0,0
3,0,0,6.87,21,1,0.0,0.0,0,1,0,0,0,1
4,0,0,6.39,0,0,0.0,0.0,0,1,0,0,0,1


In [10]:
# Split into train and test, validation 80/10/10
X = df.drop(labels = 'churn', axis = 1)
y = df['churn']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2,random_state = 24)
X_test,X_val,y_test,y_val = train_test_split(X_test,y_test,test_size = 0.5,random_state = 24)

In [11]:
# Scale variables
cols_scale = ['subscription_age','bill_avg','service_failure_count','download_avg','upload_avg','download_over_limit']
SS = ColumnTransformer([('scaler',StandardScaler(),cols_scale)],remainder='passthrough')
X_train = SS.fit_transform(X_train)
X_test = SS.fit_transform(X_test)
X_val = SS.fit_transform(X_val)

In [12]:
new_cols = ['subscription_age','bill_avg','service_failure_count',
                                           'download_avg','upload_avg','download_over_limit','is_tv_subscriber',
                                           'is_movie_package_subscriber','remaining_contract_0-1_years',
                                           'remaining_contract_1-2_years','remaining_contract_2-3_years',
                                           'remaining_contract_no_contract']
X_train = pd.DataFrame(X_train,columns = new_cols)
X_test = pd.DataFrame(X_test,columns = new_cols)
X_val = pd.DataFrame(X_val,columns = new_cols)
X_train.head()

Unnamed: 0,subscription_age,bill_avg,service_failure_count,download_avg,upload_avg,download_over_limit,is_tv_subscriber,is_movie_package_subscriber,remaining_contract_0-1_years,remaining_contract_1-2_years,remaining_contract_2-3_years,remaining_contract_no_contract
0,2.502725,0.232133,-0.335453,-0.121422,-0.190893,-0.209741,1.0,0.0,0.0,1.0,0.0,0.0
1,0.772996,0.005874,-0.335453,0.97446,0.390738,-0.209741,1.0,0.0,0.0,1.0,0.0,0.0
2,1.282604,0.307553,-0.335453,-0.487231,-0.354158,-0.209741,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.22662,0.382972,-0.335453,0.363236,0.472371,-0.209741,0.0,0.0,0.0,0.0,0.0,1.0
4,0.135986,-0.069546,-0.335453,-0.450187,-0.323546,-0.209741,1.0,0.0,0.0,0.0,0.0,1.0


## Upload Split Files to S3 Bucket

In [13]:
# Append y cols to dataframes
y_train.index = list(range(0,len(y_train)))
train_df = X_train
train_df["churn"] = y_train
col_names = list(train_df.columns)
col_names = [col_names[-1]] + col_names[:-1]
train_df = train_df[col_names]
train_df["churn"] = train_df["churn"].astype(int)

y_val.index = list(range(0,len(y_val)))
val_df = X_val
val_df["churn"] = y_val
col_names = list(val_df.columns)
col_names = [col_names[-1]] + col_names[:-1]
val_df = val_df[col_names]
val_df["churn"] = val_df["churn"].astype(int)

# y_test.index = list(range(0,len(y_test)))
test_df = X_test
# test_df["churn"] = y_test
# col_names = list(test_df.columns)
# col_names = [col_names[-1]] + col_names[:-1]
# test_df = test_df[col_names]
# test_df["churn"] = test_df["churn"].astype(int)

In [14]:
# Write split dfs to csv files
train_df.to_csv("../data/splits/train.csv", header=False, index=False)
val_df.to_csv("../data/splits/val.csv", header=False, index=False)
test_df.to_csv("../data/splits/test.csv", header=False, index=False)

In [15]:
# Upload data to S3 bucket
!aws s3 cp --recursive ../data/splits/ $s3_private_path_tsv/

upload: ../data/splits/test.csv to s3://sagemaker-us-east-1-129457079970/amazon-reviews-pds/tsv/test.csv
upload: ../data/splits/val.csv to s3://sagemaker-us-east-1-129457079970/amazon-reviews-pds/tsv/val.csv
upload: ../data/splits/train.csv to s3://sagemaker-us-east-1-129457079970/amazon-reviews-pds/tsv/train.csv


In [16]:
!aws s3 ls $s3_private_path_tsv/

                           PRE .ipynb_checkpoints/
                           PRE xgb_output/
2024-01-22 22:16:46    9828620 dataset.csv
2024-02-13 01:12:08    1049475 test.csv
2024-02-13 01:12:08    8347937 train.csv
2024-02-13 01:12:08    1059318 val.csv


In [17]:
%%time
from time import gmtime, strftime

job_name = "xgb-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
output_location = "{}/xgb_output/{}".format(s3_private_path_tsv, job_name)
image = sagemaker.image_uris.retrieve(
    framework="xgboost", region=boto3.Session().region_name, version="1.7-1"
)

sm_estimator = sagemaker.estimator.Estimator(
    image,
    role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size=50,
    input_mode="File",
    output_path=output_location,
    sagemaker_session=sess,
)

sm_estimator.set_hyperparameters(
    objective="binary:logistic",
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    verbosity=0,
    num_round=100,
)

train_data = sagemaker.inputs.TrainingInput(
    "{}/train.csv".format(s3_private_path_tsv),
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
)
validation_data = sagemaker.inputs.TrainingInput(
    "{}/val.csv".format(s3_private_path_tsv),
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
)
data_channels = {"train": train_data, "validation": validation_data}

# Start training by calling the fit method in the estimator
sm_estimator.fit(inputs=data_channels, job_name=job_name, logs=True)

INFO:sagemaker:Creating training-job with name: xgb-2024-02-13-01-12-25


2024-02-13 01:12:25 Starting - Starting the training job...
2024-02-13 01:12:40 Starting - Preparing the instances for training.........
2024-02-13 01:14:12 Downloading - Downloading input data...
2024-02-13 01:14:47 Downloading - Downloading the training image.........
2024-02-13 01:16:20 Training - Training image download completed. Training in progress.
2024-02-13 01:16:20 Uploading - Uploading generated training model[34m[2024-02-13 01:16:14.381 ip-10-0-160-219.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-02-13 01:16:14.403 ip-10-0-160-219.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-02-13:01:16:14:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-02-13:01:16:14:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2024-02-13:01:16:14:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[20

In [18]:
!aws s3 ls $s3_private_path_tsv/xgb_output/xgb-2024-02-06-01-00-18/xgb-2024-02-06-01-00-18/output/

2024-02-06 01:04:04      75883 model.tar.gz


In [19]:
# Set up transform job for model testing
#%%time

sm_transformer = sm_estimator.transformer(1, "ml.m4.xlarge")

# start a transform job
input_location = "{}/test.csv".format(s3_private_path_tsv)  # use input data without ID column
sm_transformer.transform(input_location, content_type="text/csv", split_type="Line")
sm_transformer.wait()

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-02-13-01-19-56-369
INFO:sagemaker:Creating transform job with name: sagemaker-xgboost-2024-02-13-01-19-57-110


................................................
[34m[2024-02-13:01:27:55:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-02-13:01:27:55:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-02-13:01:27:55:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
      proxy_read_timeout 60s;
      proxy_pass http://gunicorn

In [20]:
import re


def get_csv_output_from_s3(s3uri, batch_file):
    file_name = "{}.out".format(batch_file)
    match = re.match("s3://([^/]+)/(.*)", "{}/{}".format(s3uri, file_name))
    output_bucket, output_prefix = match.group(1), match.group(2)
    s3.download_file(output_bucket, output_prefix, file_name)
    return pd.read_csv(file_name, sep=",", header=None)

In [21]:
s3 = boto3.client("s3")
output_df = get_csv_output_from_s3(sm_transformer.output_path, "test.csv")
output_df.head(8)

Unnamed: 0,0
0,0.995523
1,0.097457
2,0.962098
3,0.942585
4,0.999898
5,0.165067
6,0.089544
7,0.518209


In [22]:
output_df.columns = ["pred"]
output_df['pred'] = output_df['pred'].apply(lambda x: 0 if x < 0.5 else 1)
output_df

Unnamed: 0,pred
0,1
1,0
2,1
3,1
4,1
...,...
7222,0
7223,0
7224,1
7225,1


In [23]:
from sklearn.metrics import classification_report
classification_report(y_test, output_df["pred"], output_dict=True)

{'0': {'precision': 0.8613550815558344,
  'recall': 0.8491032776747063,
  'f1-score': 0.8551853005294301,
  'support': 3234.0},
 '1': {'precision': 0.8791780143599901,
  'recall': 0.8893062860005009,
  'f1-score': 0.8842131474103586,
  'support': 3993.0},
 'accuracy': 0.871315898713159,
 'macro avg': {'precision': 0.8702665479579123,
  'recall': 0.8692047818376036,
  'f1-score': 0.8696992239698944,
  'support': 7227.0},
 'weighted avg': {'precision': 0.8712024553882675,
  'recall': 0.871315898713159,
  'f1-score': 0.8712235172992582,
  'support': 7227.0}}