In [1]:
import boto3
import sagemaker

In [2]:
# Default SageMaker session; used below to look up the default S3 bucket.
# NOTE: a later cell rebinds `sagemaker_session` (to a LocalSession when
# `instance_type` is 'local' / 'local_gpu').
sagemaker_session = sagemaker.session.Session()
# Execution role that is handed to the estimator further down.
role = sagemaker.get_execution_role()

In [3]:
# All S3 artefacts of this lab live under the `xgboost/` prefix of the
# session's default bucket.
bucket = sagemaker_session.default_bucket()
code_location = f"s3://{bucket}/xgboost/code"    # passed to the estimator as `code_location`
output_path = f"s3://{bucket}/xgboost/output"    # passed to the estimator as `output_path`

### Define hyperparameters

In [4]:
# Hyperparameters handed to the XGBoost estimator. Every value is kept as a
# string, exactly as it is passed on to the estimator.
hyperparameters = {
    # Extra weight for the positive class. Presumably derived from the class
    # imbalance shown at the end of this notebook (3869 vs. 131 rows ~ 29.5:1)
    # -- TODO confirm.
    "scale_pos_weight": "29",
    "max_depth": "3",
    "eta": "0.2",
    "objective": "binary:logistic",
    "num_round": "100",
}

### Cluster Specification

In [5]:
# Training cluster settings.
instance_count = 1

# "local" runs the training container on this machine (see the session
# selection in the next cell). For a managed training instance, switch to an
# instance type such as "ml.m5.large".
instance_type = "local"

# Upper bound for the training job: 1 * 60 * 60 = 3600.
max_run = 1 * 60 * 60

# `max_wait` is only set when spot instances are requested; otherwise it
# stays None.
use_spot_instances = False
max_wait = 1 * 60 * 60 if use_spot_instances else None

In [6]:
# Pick the session that matches the requested instance type: a regular
# SageMaker session for managed instances, a LocalSession for local mode.
if instance_type not in ('local', 'local_gpu'):
    sagemaker_session = sagemaker.session.Session()
else:
    from sagemaker.local import LocalSession

    sagemaker_session = LocalSession()
    # Enable the `local_code` option of local mode (see the SageMaker SDK
    # local-mode documentation).
    sagemaker_session.config = {'local': {'local_code': True}}

### Define training cluster

In [7]:
from sagemaker.xgboost.estimator import XGBoost

# XGBoost framework estimator wired up from the settings defined in the
# cells above.
estimator = XGBoost(
    # Training script and its source directory
    entry_point="xgboost_starter_script.py",
    source_dir='src',
    framework_version="1.3-1",
    hyperparameters=hyperparameters,
    # Where code and outputs are stored
    code_location=code_location,
    output_path=output_path,
    # Execution context
    role=role,
    sagemaker_session=sagemaker_session,
    # Cluster specification
    instance_count=instance_count,
    instance_type=instance_type,
    max_run=max_run,
    use_spot_instances=use_spot_instances,  # use spot instances when enabled
    max_wait=max_wait,
)

### Prepare training dataset

In [8]:
data_path=f's3://{bucket}/xgboost/dataset'
!aws s3 sync ./dataset/ $data_path


The user-provided path ./dataset/ does not exist.


In [9]:
# Choose the training input location: a file:// URI to the local dataset
# directory in local mode, otherwise the S3 dataset prefix.
if instance_type in ['local', 'local_gpu']:
    from pathlib import Path

    # Workshop root = the part of the working directory before
    # 'lab_1_training' (or the working directory itself when that folder name
    # is absent). Joining with pathlib avoids the doubled slash
    # ('...//data/dataset/') that plain string concatenation produced when the
    # notebook was run from inside lab_1_training.
    workshop_root = Path(str(Path.cwd()).split('lab_1_training')[0])
    local_dataset_dir = workshop_root / 'data' / 'dataset'
    inputs = f'file://{local_dataset_dir.as_posix()}/'
else:
    inputs = data_path
inputs

'file:///home/ec2-user/SageMaker/sg-workshop/data/dataset/'

### Start training

In [10]:
# Launch training with a single input channel named 'inputdata'.
# NOTE(review): the channel name presumably has to match what the training
# script in src/ reads -- confirm against xgboost_starter_script.py.
# `wait=False` is requested; the recorded output below shows that in local
# mode the container still ran to completion inside this cell.
estimator.fit(
    inputs={'inputdata': inputs},
    wait=False,
)

Creating saiekor0yh-algo-1-asqap ... 
Creating saiekor0yh-algo-1-asqap ... done
Attaching to saiekor0yh-algo-1-asqap
[36msaiekor0yh-algo-1-asqap |[0m [2022-09-18 01:43:09.477 c24af3c65c57:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[36msaiekor0yh-algo-1-asqap |[0m [2022-09-18:01:43:09:INFO] Imported framework sagemaker_xgboost_container.training
[36msaiekor0yh-algo-1-asqap |[0m [2022-09-18:01:43:09:INFO] No GPUs detected (normal if no gpus installed)
[36msaiekor0yh-algo-1-asqap |[0m [2022-09-18:01:43:09:INFO] Invoking user training script.
[36msaiekor0yh-algo-1-asqap |[0m [2022-09-18:01:43:09:INFO] Installing module with the following command:
[36msaiekor0yh-algo-1-asqap |[0m /miniconda3/bin/python3 -m pip install . 
[36msaiekor0yh-algo-1-asqap |[0m Processing /opt/ml/code
[36msaiekor0yh-algo-1-asqap |[0m   Preparing metadata (setup.py) ... [?25ldone
[36msaiekor0yh-algo-1-asqap |[0m [?25hBuilding wheels for collected packages: xgboost-starter-script
[36

Failed to delete: /tmp/tmp275g3vxz/algo-1-asqap Please remove it manually.


===== Job Complete =====


In [11]:
# Show the logs of the estimator's training job (useful because `fit` above
# was called with wait=False).
estimator.logs()

### Appendix: Dataset

In [12]:
import pandas as pd

In [13]:
from pathlib import Path

# Locate the dataset relative to the workshop root (the part of the working
# directory before 'lab_1_training', or the working directory itself when
# that folder name is absent) so this cell agrees with the training-input
# path used earlier, regardless of where the notebook is launched from.
local_dataset_dir = Path(str(Path.cwd()).split('lab_1_training')[0]) / 'data' / 'dataset'
train_prep_df = pd.read_csv(local_dataset_dir / 'train.csv')

# Preview five rows per `fraud` class. The fixed seed keeps the preview
# identical across "Restart Kernel -> Run All".
train_prep_df.groupby('fraud').sample(n=5, random_state=42)

Unnamed: 0,fraud,vehicle_claim,total_claim_amount,customer_age,months_as_customer,num_claims_past_year,num_insurers_past_5_years,policy_deductable,policy_annual_premium,customer_zip,...,collision_type_missing,incident_severity_Major,incident_severity_Minor,incident_severity_Totaled,authorities_contacted_Ambulance,authorities_contacted_Fire,authorities_contacted_None,authorities_contacted_Police,police_report_available_No,police_report_available_Yes
2698,0,32568.092499,62068.092499,51,237,0,1,750,3000,92407,...,0,0,0,1,0,0,0,1,0,1
1202,0,8242.194286,11342.194286,26,18,1,1,750,3000,95838,...,0,0,1,0,0,0,1,0,1,0
50,0,4670.252898,8670.252898,22,53,0,1,750,3000,93722,...,0,0,1,0,0,0,1,0,1,0
3128,0,12549.631698,79049.631698,65,115,0,1,750,2800,95129,...,0,0,1,0,0,0,1,0,1,0
2687,0,23904.842415,29404.842415,37,36,0,2,750,3000,93267,...,0,1,0,0,0,0,0,1,0,1
1974,1,11204.650914,28004.650914,55,142,1,1,750,3000,85706,...,0,0,1,0,0,0,1,0,1,0
1500,1,31550.781254,43250.781254,47,184,0,1,750,3000,89103,...,0,0,0,1,0,0,0,1,0,1
775,1,14998.032388,26798.032388,49,208,0,1,750,3000,90302,...,0,1,0,0,0,0,0,1,1,0
165,1,14571.624297,19771.624297,21,23,0,3,750,3000,86301,...,0,1,0,0,0,0,0,1,0,1
1320,1,9104.430774,67204.430774,61,154,0,1,750,3000,93105,...,0,1,0,0,0,0,0,1,1,0


In [14]:
# Row count per `fraud` class; the output below shows a strong imbalance
# (3869 rows with fraud=0 vs. 131 rows with fraud=1).
train_prep_df.groupby('fraud').size()

fraud
0    3869
1     131
dtype: int64