In [1]:
import boto3
import sagemaker

In [2]:
# IAM role that is later handed to the estimator, and the SageMaker session
# used to look up the default bucket. The two calls are independent.
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.session.Session()

In [3]:
# Everything this notebook writes to S3 goes under the session's default
# bucket, beneath the `xgboost/` prefix.
bucket = sagemaker_session.default_bucket()
output_path = f's3://{bucket}/xgboost/output'  # given to the estimator as output_path
code_location = f's3://{bucket}/xgboost/code'  # given to the estimator as code_location

### Define hyperparameters

In [4]:
# Hyperparameters forwarded to the XGBoost training script; every value is
# kept as a string.
# NOTE(review): scale_pos_weight=29 is close to the negative/positive ratio of
# the training set shown at the end of this notebook (3869 / 131) — confirm
# that is where the value comes from.
hyperparameters = {
    "scale_pos_weight": "29",
    "max_depth": "3",
    "eta": "0.2",
    "objective": "binary:logistic",
    "num_round": "100",
}

In [6]:
# Sizing of the training job.
# instance_type "local" is treated as local mode by the cells below; switch it
# to a managed instance type such as "ml.m5.large" to train remotely.
instance_type = "local"
instance_count = 1
max_run = 60 * 60  # 3600, i.e. one hour if the unit is seconds

# max_wait is only set when spot instances are requested; otherwise it stays None.
use_spot_instances = False
max_wait = 1 * 60 * 60 if use_spot_instances else None

In [7]:
# Choose the session implementation that matches the requested instance type.
if instance_type not in ('local', 'local_gpu'):
    # Managed training: a regular SageMaker session.
    sagemaker_session = sagemaker.session.Session()
else:
    # Local mode: use a LocalSession and set its `local_code` config flag.
    from sagemaker.local import LocalSession
    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}

### Define training cluster

In [8]:
from sagemaker.xgboost.estimator import XGBoost

# XGBoost estimator assembled from the settings defined in the cells above.
estimator = XGBoost(
    # User training code
    entry_point="xgboost_starter_script.py",
    source_dir='src',
    framework_version="1.3-1",
    hyperparameters=hyperparameters,
    # Session and permissions
    role=role,
    sagemaker_session=sagemaker_session,
    # S3 locations for the packaged code and the job output
    code_location=code_location,
    output_path=output_path,
    # Compute resources and time limits
    instance_type=instance_type,
    instance_count=instance_count,
    max_run=max_run,
    use_spot_instances=use_spot_instances,  # whether to use spot instances
    max_wait=max_wait,
)

### Prepare training dataset

In [9]:
data_path=f's3://{bucket}/xgboost/dataset'
!aws s3 sync ./dataset/ $data_path

upload: dataset/test.csv to s3://sagemaker-ap-northeast-2-677146750822/xgboost/dataset/test.csv
upload: dataset/train.csv to s3://sagemaker-ap-northeast-2-677146750822/xgboost/dataset/train.csv


In [15]:
# Pick the training-data location that matches the execution mode.
if instance_type in ('local', 'local_gpu'):
    from pathlib import Path

    # Local mode reads the dataset from disk through a file:// URI.
    cwd = Path.cwd()
    file_path = f'file://{cwd}'  # kept so any later cell that reads it still works

    # The notebook may be started from the project root or from somewhere
    # inside its `lab_1_training` folder. In the second case, drop that folder
    # and everything below it. Matching whole path components (rather than
    # splitting the string on a substring) avoids a doubled slash in the URI
    # and does not trip on directories whose name merely contains
    # "lab_1_training".
    if 'lab_1_training' in cwd.parts:
        workshop_root = Path(*cwd.parts[:cwd.parts.index('lab_1_training')])
    else:
        workshop_root = cwd
    inputs = f'file://{workshop_root}/data/dataset/'
else:
    # Managed training reads the copy under the S3 prefix in data_path.
    inputs = data_path
inputs

'file:///home/ec2-user/SageMaker/sg-workshop/data/dataset/'

### Start training

In [16]:
# Launch training with the dataset attached as the 'inputdata' channel.
# wait=False is passed, so this call is not asked to block on the job.
estimator.fit(
    inputs={'inputdata': inputs},
    wait=False,
)

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



Login Succeeded
Creating w3tplxqm0o-algo-1-ym692 ... 
Creating w3tplxqm0o-algo-1-ym692 ... done
Attaching to w3tplxqm0o-algo-1-ym692
[36mw3tplxqm0o-algo-1-ym692 |[0m [2022-09-17 04:51:12.420 66fe7ff16fbb:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[36mw3tplxqm0o-algo-1-ym692 |[0m [2022-09-17:04:51:12:INFO] Imported framework sagemaker_xgboost_container.training
[36mw3tplxqm0o-algo-1-ym692 |[0m [2022-09-17:04:51:12:INFO] No GPUs detected (normal if no gpus installed)
[36mw3tplxqm0o-algo-1-ym692 |[0m [2022-09-17:04:51:12:INFO] Invoking user training script.
[36mw3tplxqm0o-algo-1-ym692 |[0m [2022-09-17:04:51:12:INFO] Module xgboost_starter_script does not provide a setup.py. 
[36mw3tplxqm0o-algo-1-ym692 |[0m Generating setup.py
[36mw3tplxqm0o-algo-1-ym692 |[0m [2022-09-17:04:51:12:INFO] Generating setup.cfg
[36mw3tplxqm0o-algo-1-ym692 |[0m [2022-09-17:04:51:12:INFO] Generating MANIFEST.in
[36mw3tplxqm0o-algo-1-ym692 |[0m [2022-09-17:04:51:12:INFO] Installing 

Failed to delete: /tmp/tmpttjvq2tl/algo-1-ym692 Please remove it manually.


===== Job Complete =====


In [17]:
# fit() above was called with wait=False, so request the training job's logs
# explicitly here.
estimator.logs()

In [18]:
import pandas as pd

In [20]:
# Load the training set and preview five rows from each `fraud` class.
# A fixed random_state keeps the preview identical across re-runs
# (Restart Kernel -> Run All); without it the sampled rows change every time.
train_prep_df = pd.read_csv('data/dataset/train.csv')
train_prep_df.groupby('fraud').sample(n=5, random_state=42)

Unnamed: 0,fraud,vehicle_claim,total_claim_amount,customer_age,months_as_customer,num_claims_past_year,num_insurers_past_5_years,policy_deductable,policy_annual_premium,customer_zip,...,collision_type_missing,incident_severity_Major,incident_severity_Minor,incident_severity_Totaled,authorities_contacted_Ambulance,authorities_contacted_Fire,authorities_contacted_None,authorities_contacted_Police,police_report_available_No,police_report_available_Yes
3649,0,26952.916976,33052.916976,27,42,0,1,750,3000,83638,...,0,0,0,1,0,0,0,1,0,1
2227,0,19394.043734,22594.043734,34,39,0,2,750,2750,93551,...,0,1,0,0,0,0,0,1,0,1
3703,0,32257.812211,64357.812211,56,139,0,1,750,3000,85614,...,0,0,0,1,0,0,0,1,0,1
1607,0,21303.283717,38303.283717,44,98,0,1,750,3000,89102,...,0,1,0,0,0,0,0,1,0,1
3799,0,18373.833878,23173.833878,38,66,0,1,750,2900,85037,...,0,1,0,0,0,0,0,1,1,0
414,1,3079.236859,4179.236859,32,55,0,1,750,3000,94703,...,1,1,0,0,0,0,0,1,1,0
730,1,25540.624982,45140.624982,67,196,0,1,750,3000,92064,...,0,0,0,1,0,0,0,1,0,1
1642,1,19908.068639,79508.068639,29,7,0,5,750,3000,90245,...,0,0,0,1,1,0,0,0,1,0
1460,1,6323.758264,7223.758264,29,2,0,5,750,2850,93215,...,1,0,1,0,0,0,0,1,0,1
1944,1,11361.251613,25861.251613,25,26,0,4,750,3000,94539,...,0,0,1,0,0,0,1,0,1,0


In [21]:
# Number of rows per value of the `fraud` column, to inspect class balance.
train_prep_df.groupby('fraud').size()

fraud
0    3869
1     131
dtype: int64