# XGBoostによる二値分類（ローカル）
- ローカルからAWS上のトレーニングジョブを実行
- XGBoostはECRイメージを使用
- バッチトランスフォームでテストデータを推論

## Data preparation

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Fetch the Wisconsin Diagnostic Breast Cancer (WDBC) table from the UCI repository.
# header=None: no line of the download is consumed as column names.
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',
    header=None,
)

In [3]:
# Column names taken from wdbc.names: an id, the diagnosis label, then ten
# measurements, each present with the suffixes _mean, _se and _worst (in that order).
_wdbc_measurements = ["radius", "texture", "perimeter", "area", "smoothness",
                      "compactness", "concavity", "concave points", "symmetry",
                      "fractal_dimension"]
data.columns = ["id", "diagnosis"] + [
    f"{measurement}_{suffix}"
    for suffix in ("mean", "se", "worst")
    for measurement in _wdbc_measurements
]

In [4]:
# Save the table to a local data.csv (comma-separated, row index omitted).
data.to_csv("data.csv", sep=',', index=False)

In [5]:
# Display 8 randomly sampled rows as a quick look at the raw table.
data.sample(8)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
556,924964,B,10.16,19.59,64.73,311.7,0.1003,0.07504,0.005025,0.01116,...,10.65,22.88,67.88,347.3,0.1265,0.12,0.01005,0.02232,0.2262,0.06742
383,90251,B,12.39,17.48,80.64,462.9,0.1042,0.1297,0.05892,0.0288,...,14.18,23.13,95.23,600.5,0.1427,0.3593,0.3206,0.09804,0.2819,0.1118
490,91376701,B,12.25,22.44,78.18,466.5,0.08192,0.052,0.01714,0.01261,...,14.17,31.99,92.74,622.9,0.1256,0.1804,0.123,0.06335,0.31,0.08203
552,924084,B,12.77,29.43,81.35,507.9,0.08276,0.04234,0.01997,0.01499,...,13.87,36.0,88.1,594.7,0.1234,0.1064,0.08653,0.06498,0.2407,0.06484
35,854253,M,16.74,21.59,110.1,869.5,0.0961,0.1336,0.1348,0.06018,...,20.01,29.02,133.5,1229.0,0.1563,0.3835,0.5409,0.1813,0.4863,0.08633
140,868999,B,9.738,11.97,61.24,288.5,0.0925,0.04102,0.0,0.0,...,10.62,14.1,66.53,342.9,0.1234,0.07204,0.0,0.0,0.3105,0.08151
238,883270,B,14.22,27.85,92.55,623.9,0.08223,0.1039,0.1103,0.04408,...,15.75,40.54,102.5,764.0,0.1081,0.2426,0.3064,0.08219,0.189,0.07796
254,886226,M,19.45,19.33,126.5,1169.0,0.1035,0.1188,0.1379,0.08591,...,25.7,24.57,163.1,1972.0,0.1497,0.3161,0.4317,0.1999,0.3379,0.0895


In [6]:
# Encode the label numerically: 1 where diagnosis equals "M", 0 for everything else.
data['diagnosis'] = (data['diagnosis'] == "M").astype('int64')
# Display 8 random rows to check the encoded column.
data.sample(8)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
254,886226,1,19.45,19.33,126.5,1169.0,0.1035,0.1188,0.1379,0.08591,...,25.7,24.57,163.1,1972.0,0.1497,0.3161,0.4317,0.1999,0.3379,0.0895
15,84799002,1,14.54,27.54,96.73,658.8,0.1139,0.1595,0.1639,0.07364,...,17.46,37.13,124.1,943.2,0.1678,0.6577,0.7026,0.1712,0.4218,0.1341
201,877989,1,17.54,19.32,115.1,951.6,0.08968,0.1198,0.1036,0.07488,...,20.42,25.84,139.5,1239.0,0.1381,0.342,0.3508,0.1939,0.2928,0.07867
173,871641,0,11.08,14.71,70.21,372.7,0.1006,0.05743,0.02363,0.02583,...,11.35,16.82,72.01,396.5,0.1216,0.0824,0.03938,0.04306,0.1902,0.07313
175,872113,0,8.671,14.45,54.42,227.2,0.09138,0.04276,0.0,0.0,...,9.262,17.04,58.36,259.2,0.1162,0.07057,0.0,0.0,0.2592,0.07848
370,9012315,1,16.35,23.29,109.0,840.4,0.09742,0.1497,0.1811,0.08773,...,19.38,31.03,129.3,1165.0,0.1415,0.4665,0.7087,0.2248,0.4824,0.09614
296,891936,0,10.91,12.35,69.14,363.7,0.08518,0.04721,0.01236,0.01369,...,11.37,14.82,72.42,392.2,0.09312,0.07506,0.02884,0.03194,0.2143,0.06643
377,9013579,0,13.46,28.21,85.89,562.1,0.07517,0.04726,0.01271,0.01117,...,14.69,35.63,97.11,680.6,0.1108,0.1457,0.07934,0.05781,0.2694,0.07061


In [7]:
# Split the rows into three disjoint sets: training (~80%), validation (~10%)
# and batch inference (~10%). One uniform draw in [0, 1) per row decides its
# set, so the proportions are approximate rather than exact.
# The draws come from a seeded generator so that re-running the notebook
# produces the same split (and therefore the same files written and uploaded below).
split_rng = np.random.RandomState(42)
rand_split = split_rng.rand(len(data))
train_list = rand_split < 0.8
val_list = (rand_split >= 0.8) & (rand_split < 0.9)
batch_list = rand_split >= 0.9

# Training and validation sets keep the label but drop the 'id' column.
data_train = data[train_list].drop(['id'], axis=1)
data_val = data[val_list].drop(['id'], axis=1)
# The batch set drops the label and keeps 'id'.
data_batch = data[batch_list].drop(['diagnosis'], axis=1)
# Same batch rows with the 'id' column removed as well.
data_batch_noID = data_batch.drop(['id'], axis=1)

## Setup

In [8]:
import boto3, os, sagemaker

In [9]:
# SageMaker session; used below to upload files to S3 and passed to the estimator and model.
sess = sagemaker.Session()

In [10]:
# S3 bucket and key prefix under which this notebook stores its data and outputs.
# The bucket defaults to the original hard-coded value but can be overridden with
# the SAGEMAKER_BUCKET environment variable, so the notebook can run against
# another account without editing this cell.
# Alternative: bucket = sess.default_bucket()
bucket = os.environ.get('SAGEMAKER_BUCKET', 'dge-sagemaker-test')
prefix = 'breast-cancer-prediction-xgboost'

In [11]:
# Write the training split as CSV without header or index, then upload it to
# s3://<bucket>/<prefix>/train. The upload call is the last expression so the
# notebook displays the resulting S3 URI.
train_file = 'train_data.csv'
data_train.to_csv(train_file, header=False, index=False)
sess.upload_data(
    train_file,
    bucket=bucket,
    key_prefix=f'{prefix}/train',
)

's3://dge-sagemaker-test/breast-cancer-prediction-xgboost/train/train_data.csv'

In [12]:
# Write the validation split as CSV without header or index, then upload it to
# s3://<bucket>/<prefix>/validation (the returned S3 URI is displayed).
validation_file = 'validation_data.csv'
data_val.to_csv(validation_file, header=False, index=False)
sess.upload_data(
    validation_file,
    bucket=bucket,
    key_prefix=f'{prefix}/validation',
)

's3://dge-sagemaker-test/breast-cancer-prediction-xgboost/validation/validation_data.csv'

In [13]:
# Write the batch-inference rows (with the id column) as CSV without header or
# index, then upload them to s3://<bucket>/<prefix>/batch.
batch_file = 'batch_data.csv'
data_batch.to_csv(batch_file, header=False, index=False)
sess.upload_data(
    batch_file,
    bucket=bucket,
    key_prefix=f'{prefix}/batch',
)

's3://dge-sagemaker-test/breast-cancer-prediction-xgboost/batch/batch_data.csv'

In [14]:
# Write the batch-inference rows without the id column as CSV (no header, no
# index), then upload them next to the id-carrying file under <prefix>/batch.
batch_file_noID = 'batch_data_noID.csv'
data_batch_noID.to_csv(batch_file_noID, header=False, index=False)
sess.upload_data(
    batch_file_noID,
    bucket=bucket,
    key_prefix=f'{prefix}/batch',
)

's3://dge-sagemaker-test/breast-cancer-prediction-xgboost/batch/batch_data_noID.csv'

In [15]:
# IAM role ARN passed to the estimator and the model below.
# When running on a SageMaker notebook instance, use instead:
#   role = sagemaker.get_execution_role()
# For local execution the ARN is given explicitly. It defaults to the original
# hard-coded value and can be overridden with the SAGEMAKER_ROLE_ARN environment
# variable, so the notebook can run in another account without editing this cell.
role = os.environ.get(
    'SAGEMAKER_ROLE_ARN',
    'arn:aws:iam::078451633032:role/service-role/AmazonSageMaker-ExecutionRole-20191003T162893',
)

## Training job

In [24]:
# S3 location handed to the estimator as output_path.
output_location = f's3://{bucket}/{prefix}'

In [25]:
# Get the ECR container image URI for XGBoost version '1.0-1' in the region of
# the default boto3 session.
from sagemaker.amazon.amazon_estimator import get_image_uri

region_name = boto3.Session().region_name
image = get_image_uri(region_name, 'xgboost', '1.0-1')



In [26]:
# Estimator describing the training job: which container image to run, under
# which role, on what hardware, and where to write the results.
sm_estimator = sagemaker.estimator.Estimator(
    image,
    role,
    train_instance_count=1,
    train_instance_type='ml.m5.4xlarge',
    train_volume_size=50,
    input_mode='File',
    output_path=output_location,
    sagemaker_session=sess,
)



In [27]:
# Hyperparameters forwarded to the XGBoost container for the training job.
xgb_hyperparameters = {
    "objective": "binary:logistic",
    "max_depth": 5,
    "eta": 0.2,
    "gamma": 4,
    "min_child_weight": 6,
    "subsample": 0.8,
    "silent": 0,
    "num_round": 100,
}
sm_estimator.set_hyperparameters(**xgb_hyperparameters)

In [28]:
# Build one CSV input channel per uploaded split; both channels share the same
# settings and differ only in the S3 prefix they read from.
data_channels = {
    channel: sagemaker.session.s3_input(
        f's3://{bucket}/{prefix}/{channel}',
        distribution='FullyReplicated',
        content_type='text/csv',
        s3_data_type='S3Prefix',
    )
    for channel in ('train', 'validation')
}
train_data = data_channels['train']
validation_data = data_channels['validation']



In [29]:
%%time
# Start the training job with the train/validation channels; logs=True asks for
# the job's log output, which appears below this cell.
sm_estimator.fit(inputs=data_channels, logs=True)

2020-07-10 06:02:52 Starting - Starting the training job...
2020-07-10 06:02:55 Starting - Launching requested ML instances......
2020-07-10 06:04:10 Starting - Preparing the instances for training...
2020-07-10 06:04:46 Downloading - Downloading input data...
2020-07-10 06:05:19 Training - Training image download completed. Training in progress..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34m[06:05:20] 449x30 matrix with 13470 entries loaded from

## Batch Transform

### Create a transform job with the default configurations

In [43]:
# Batch-transform handle derived from the trained estimator: one ml.m4.xlarge
# instance, CSV results assembled line by line under <prefix>/output.
sm_transformer = sm_estimator.transformer(
    instance_count=1,
    instance_type='ml.m4.xlarge',
    assemble_with='Line',
    accept='text/csv',
    output_path=f's3://{bucket}/{prefix}/output',
)



In [34]:
%%time
# start a transform job
# input_location = f's3://{bucket}/{prefix}/batch/{batch_file_noID}' # use input data without ID column
input_location = f's3://{bucket}/{prefix}/batch/{batch_file}' # use input data with ID column
# train_dataにはIDがないので、IDを外して推測し、出力時にIDを付加するフィルターを付ける
sm_transformer.transform(input_location, split_type='Line', content_type='text/csv', input_filter='$[1:]', join_source='Input', output_filter='$[0,-1]')
sm_transformer.wait()

......................[34m[2020-07-10:06:43:06:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2020-07-10:06:43:06:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2020-07-10:06:43:06:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;

  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }

  server {
    listen 8080 deferred;
    client_max_body_size 0;

    keepalive_timeout 3;

    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
      proxy_read_timeout 60s;
      proxy_pass http://gunicorn;
    }

    locatio

### 推論

In [35]:
import json
import io
from urllib.parse import urlparse

In [36]:
def get_csv_output_from_s3(s3uri, file_name):
    """Download one S3 object and return its contents as text.

    Args:
        s3uri: URI of the form ``s3://bucket/prefix`` naming the location that
            holds the object. The prefix may be empty and may end with ``/``.
        file_name: Name of the object relative to ``s3uri``.

    Returns:
        The object body decoded as UTF-8.
    """
    parsed_url = urlparse(s3uri)
    bucket_name = parsed_url.netloc
    # Strip surrounding slashes so a trailing '/' in s3uri does not yield a
    # 'prefix//file' key, and an empty prefix does not yield a '/file' key.
    key_prefix = parsed_url.path.strip('/')
    key = f'{key_prefix}/{file_name}' if key_prefix else file_name
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket_name, key)
    return obj.get()["Body"].read().decode('utf-8')

In [37]:
# Fetch the transform result for batch_file (stored as '<input name>.out'),
# parse it as header-less CSV and show the first 8 rows.
transform_result_name = f'{batch_file}.out'
output = get_csv_output_from_s3(sm_transformer.output_path, transform_result_name)
output_buffer = io.StringIO(output)
output_df = pd.read_csv(output_buffer, sep=",", header=None)
output_df.head(8)

Unnamed: 0,0,1
0,843786,0.444697
1,844359,0.992445
2,84458202,0.970348
3,84501001,0.794606
4,852781,0.983478
5,857343,0.013051
6,857374,0.00658
7,858477,0.01014


### トレーニングジョブで出力されたモデルを読み込んで推論

In [55]:
# Show the S3 URI of the model artifact produced by the training job.
sm_estimator.model_data

's3://dge-sagemaker-test/breast-cancer-prediction-xgboost/sagemaker-xgboost-2020-07-10-06-02-53-004/output/model.tar.gz'

In [56]:
# Wrap the artifact written by the training job in a Model that uses the same
# container image, role and session.
model_artifact_uri = sm_estimator.model_data
model = sagemaker.model.Model(
    model_artifact_uri,
    image,
    role=role,
    sagemaker_session=sess,
)



In [57]:
# Batch-transform handle created from the Model: one ml.m4.xlarge instance,
# CSV results assembled line by line under <prefix>/output.
transformer = model.transformer(
    instance_count=1,
    instance_type='ml.m4.xlarge',
    assemble_with='Line',
    accept='text/csv',
    output_path=f's3://{bucket}/{prefix}/output',
)

In [58]:
%%time
# start a transform job
# input_location = f's3://{bucket}/{prefix}/batch/{batch_file_noID}' # use input data without ID column
input_location = f's3://{bucket}/{prefix}/batch/{batch_file}' # use input data with ID column
# train_dataにはIDがないので、IDを外して推測し、出力時にIDを付加するフィルターを付ける
transformer.transform(input_location, split_type='Line', content_type='text/csv', input_filter='$[1:]', join_source='Input', output_filter='$[0,-1]')
transformer.wait()

.......................[34m[2020-07-10:07:51:00:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2020-07-10:07:51:00:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2020-07-10:07:51:00:INFO] nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;
[0m
[34mworker_rlimit_nofile 4096;
[0m
[34mevents {
  worker_connections 2048;[0m
[34m}
[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;

  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }

  server {
    listen 8080 deferred;
    client_max_body_size 0;

    keepalive_timeout 3;

    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
      proxy_read_timeout 60s;
      proxy_pass http://gunicorn;
    }

    locati

In [59]:
# Fetch the transform result for batch_file (stored as '<input name>.out'),
# parse it as header-less CSV and show the first 8 rows.
transform_result_name = f'{batch_file}.out'
output = get_csv_output_from_s3(transformer.output_path, transform_result_name)
output_buffer = io.StringIO(output)
output_df = pd.read_csv(output_buffer, sep=",", header=None)
output_df.head(8)

Unnamed: 0,0,1
0,843786,0.444697
1,844359,0.992445
2,84458202,0.970348
3,84501001,0.794606
4,852781,0.983478
5,857343,0.013051
6,857374,0.00658
7,858477,0.01014
