In [1]:
import os
import re
import boto3
import sagemaker
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer
from botocore.exceptions import ClientError

Matplotlib is building the font cache; this may take a moment.


In [2]:
session = boto3.session.Session()
smsession = sagemaker.Session()

In [3]:
REGION = session.region_name
PREFIX = 'dongkyl-proto'
PREFIX

'dongkyl-proto'

In [4]:
BUCKET_NAME = smsession.default_bucket()
BUCKET_NAME

'sagemaker-ap-northeast-2-331769466833'

In [5]:
s3 = session.resource('s3')
smclient = session.client('sagemaker')

In [6]:
bucket = s3.Bucket(BUCKET_NAME)

# 1. 버킷 생성

In [7]:
try:
    bucket.create(
        ACL='private',
        CreateBucketConfiguration={
            'LocationConstraint': REGION,
        },
    )
except ClientError as e:
    if e.response['Error']['Code'] == 'BucketAlreadyOwnedByYou':
        print('Bucket have been already created..')
    else:
        raise e

Bucket have been already created..


In [8]:
bucket.Acl

<bound method ResourceFactory._create_class_partial.<locals>.create_resource of s3.Bucket(name='sagemaker-ap-northeast-2-331769466833')>

# 2. 원본 데이터 준비

In [9]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls

--2021-05-14 13:15:57--  https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5539328 (5.3M) [application/x-httpd-php]
Saving to: ‘default of credit card clients.xls.2’


2021-05-14 13:15:59 (4.16 MB/s) - ‘default of credit card clients.xls.2’ saved [5539328/5539328]



In [10]:
dataset = pd.read_excel('default of credit card clients.xls')
dataset.head(5)

Unnamed: 0.1,Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
1,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
2,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0


By reading the dataset we see that it has 30,000 records, and each record has 23 associated attributes to describe features relevant to the credit scores of the person the record represents. The attributes are the following:

    X1: Amount of the given credit.
    X2: Gender (1 = male; 2 = female).
    X3: Education (1 = graduate school; 2 = university; 3 = high school; 4 = others).
    X4: Marital status (1 = married; 2 = single; 3 = others).
    X5: Age (year).
    X6 – X11: History of past payments. Tracked past monthly payment records (from April to September, 2005) are displayed as follows: X6 = the repayment status in September, 2005; X7 = the repayment status in August, 2005… X11 = the repayment status in April, 2005. The measurement scale for the repayment status is: -1 = pay duly; 1 = payment delay for one month; 2 = payment delay for two months… 8 = payment delay for eight months; 9 = payment delay for nine months and above.
    X12-X17: Amount of bill statement X12 = amount of bill statement in September, 2005; X13 = amount of bill statement in August, 2005… X17 = amount of bill statement in April, 2005.
    X18-X23: Amount of previous payment. X18 = amount paid in September, 2005; X19 = amount paid in August, 2005…. X23 = amount paid in April, 2005.
    Y: Did the person default? (Yes = 1, No = 0)

In [11]:
dataset = dataset.drop('Unnamed: 0', axis=1)
dataset = pd.concat([dataset['Y'], dataset.drop(['Y'], axis=1)], axis=1)
dataset.head(5)

Unnamed: 0,Y,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23
0,default payment next month,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
1,1,20000,2,2,1,24,2,2,-1,-1,...,689,0,0,0,0,689,0,0,0,0
2,1,120000,2,2,2,26,-1,2,0,0,...,2682,3272,3455,3261,0,1000,1000,1000,0,2000
3,0,90000,2,2,2,34,0,0,0,0,...,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000
4,0,50000,2,2,1,37,0,0,0,0,...,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000


# 3. 데이터셋 분리 

In [12]:
train_data, validation_data, test_data = np.split(dataset.sample(frac=1, random_state=1729), [int(0.7 * len(dataset)), int(0.9 * len(dataset))])
train_data.to_csv('train.csv', header=False, index=False)
validation_data.to_csv('validation.csv', header=False, index=False)

# 4. 데이터셋 업로드

In [13]:
bucket.Object(os.path.join(PREFIX, 'train/train.csv')).upload_file('train.csv')
bucket.Object(os.path.join(PREFIX, 'validation/validation.csv')).upload_file('validation.csv')

s3_input_train = TrainingInput(s3_data='s3://{}/{}/train'.format(bucket.name, PREFIX), content_type='csv')
s3_input_validation = TrainingInput(s3_data='s3://{}/{}/validation/'.format(bucket.name, PREFIX), content_type='csv')

# 5. Xgboost 트레이닝잡 생성

In [14]:
role = get_execution_role(sagemaker_session=smsession)
xgb_container = sagemaker.image_uris.retrieve('xgboost', REGION, '1.2-1')

In [15]:
hp = {
    'eta': '0.1',
    'objective': 'binary:logistic',
    'num_round': '25'
}

In [16]:
xgb = sagemaker.estimator.Estimator(xgb_container,
                                    role,
                                    hyperparameters=hp,
                                    instance_count=1, 
                                    instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket.name, PREFIX),
                                    sagemaker_session=smsession)

In [17]:
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

2021-05-14 13:16:25 Starting - Starting the training job...
2021-05-14 13:16:26 Starting - Launching requested ML instancesProfilerReport-1620998184: InProgress
.........
2021-05-14 13:18:25 Starting - Preparing the instances for training......
2021-05-14 13:19:26 Downloading - Downloading input data
2021-05-14 13:19:26 Training - Downloading the training image......
2021-05-14 13:20:26 Uploading - Uploading generated training model[34m[2021-05-14 13:20:15.166 ip-10-0-95-100.ap-northeast-2.compute.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:D

# 6. 엔드포인트 생성

In [18]:
xgb_predictor = xgb.deploy(
    initial_instance_count = 1,
    instance_type = 'ml.m4.xlarge',
    serializer = CSVSerializer())

---------------!

# 7. 인퍼런스 테스트

In [21]:
test_data.head(5)

Unnamed: 0,Y,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23
22404,0,180000,2,2,1,28,0,0,0,0,...,68650,67895,68442,70131,3200,2500,3000,2500,3000,5000
21001,0,30000,1,2,1,36,1,-1,-1,-1,...,0,1170,780,0,780,0,1170,0,0,0
25401,1,20000,2,1,2,24,2,0,0,0,...,16327,17970,18819,16642,1500,1500,2000,1000,1000,1000
14066,0,60000,2,3,1,38,0,0,0,0,...,23027,12947,390,700,1864,1206,259,390,700,13628
25301,0,50000,2,1,2,25,0,0,0,0,...,7161,8377,8789,8813,1500,2000,1500,1000,500,500


In [19]:
def predict(data, rows=500):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = ''
    for array in split_array:
        predictions = ','.join([predictions, xgb_predictor.predict(array).decode('utf-8')])

    return np.fromstring(predictions[1:], sep=',')

predictions = predict(test_data.to_numpy()[:,1:])
predictions

array([0.09881981, 0.29045957, 0.70511222, ..., 0.14971405, 0.4061085 ,
       0.10524765])