In [5]:
!pip install xgboost

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p38/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [6]:
import pandas as pd

In [7]:
# Cap how much of a DataFrame is rendered: at most 20 rows and 10 columns.
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 10)

### Load dataset

In [8]:
import xgboost as xgb

# Read the training table, then separate the feature columns from the
# 'fraud' target column.
data = pd.read_csv('../dataset/train.csv')
train = data.drop(columns='fraud')
label = data['fraud'].to_frame()

# Bundle features and labels into the DMatrix used by xgb.cv / xgb.train.
dtrain = xgb.DMatrix(train, label=label)

In [9]:
# Preview the first five rows of the full table (features plus 'fraud').
data.head(n=5)

Unnamed: 0,fraud,vehicle_claim,total_claim_amount,customer_age,months_as_customer,...,authorities_contacted_Fire,authorities_contacted_None,authorities_contacted_Police,police_report_available_No,police_report_available_Yes
0,0,8913.668763,80513.668763,54,94,...,0,1,0,1,0
1,0,19746.724395,26146.724395,41,165,...,0,0,1,0,1
2,0,11652.969918,22052.969918,57,155,...,0,0,1,0,1
3,0,11260.930936,115960.930936,39,80,...,0,1,0,1,0
4,0,27987.704652,31387.704652,39,60,...,0,0,1,1,0


In [10]:
# Count rows per 'fraud' value to see how the two classes are balanced.
data.groupby(by='fraud').size()

fraud
0    3869
1     131
dtype: int64

In [11]:
# Preview the first five rows of the feature table (the 'fraud' column removed).
train.head(n=5)

Unnamed: 0,vehicle_claim,total_claim_amount,customer_age,months_as_customer,num_claims_past_year,...,authorities_contacted_Fire,authorities_contacted_None,authorities_contacted_Police,police_report_available_No,police_report_available_Yes
0,8913.668763,80513.668763,54,94,0,...,0,1,0,1,0
1,19746.724395,26146.724395,41,165,0,...,0,0,1,0,1
2,11652.969918,22052.969918,57,155,0,...,0,0,1,0,1
3,11260.930936,115960.930936,39,80,0,...,0,1,0,1,0
4,27987.704652,31387.704652,39,60,0,...,0,0,1,1,0


### Hyperparameters

In [12]:
# Booster settings; they are collected into the `params` dict further down.
max_depth, eta = 3, 0.2
objective = 'binary:logistic'
# NOTE(review): presumably derived from the class ratio shown earlier
# (3869 negatives / 131 positives, about 29.5) -- confirm.
scale_pos_weight = 29

In [13]:
# Collect the booster settings into the dict handed to xgb.cv and xgb.train.
params = {
    'max_depth': max_depth,
    'eta': eta,
    'objective': objective,
    'scale_pos_weight': scale_pos_weight,
}

In [14]:
# Cross-validation settings passed to xgb.cv: upper bound on boosting rounds,
# number of folds, and the early-stopping patience.
num_boost_round, nfold, early_stopping_rounds = 999, 5, 10

### Cross-Validation

In [15]:
# Stratified k-fold cross-validation scored by AUC, with early stopping.
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    nfold=nfold,
    stratified=True,  # split train/validation folds according to the label (0, 1) distribution
    metrics='auc',
    early_stopping_rounds=early_stopping_rounds,
    seed=0,
)

In [16]:
# Display the per-round cross-validation table returned by xgb.cv
# (mean and std of train/test AUC).
cv_results

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.819224,0.005817,0.769926,0.047341
1,0.842821,0.012217,0.806518,0.021315
2,0.854213,0.005705,0.806512,0.021542
3,0.861384,0.008274,0.812601,0.030358
4,0.873625,0.009501,0.814152,0.032305
5,0.881067,0.009787,0.813072,0.027425
6,0.886745,0.009047,0.810738,0.025871
7,0.895144,0.009728,0.816828,0.023247
8,0.898417,0.008864,0.817527,0.025424
9,0.903438,0.010094,0.818247,0.024297


### Get mean scores

In [17]:
# Report the mean train/validation AUC from the last row of the CV table.
last_round = cv_results.iloc[-1]
print(f"[0]#011train-auc:{last_round['train-auc-mean']}")
print(f"[1]#011validation-auc:{last_round['test-auc-mean']}")

[0]#011train-auc:0.9405190344815983
[1]#011validation-auc:0.8218406316371057


In [18]:
# Package the last-row mean AUCs into the nested dict that is later dumped
# to the metrics JSON file.
final_scores = cv_results.iloc[-1]
metrics_data = {
    'classification_metrics': {
        'validation:auc': {'value': final_scores['test-auc-mean']},
        'train:auc': {'value': final_scores['train-auc-mean']},
    },
}

### Train the model on the full training dataset

In [19]:
# Fit on the whole training DMatrix, using as many boosting rounds as there
# are rows in the cross-validation results.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=cv_results.shape[0],
)

### Save the model artifact and the train/validation metrics

In [23]:
# File name under which the trained booster is saved.
model_location = "xgboost-model-pytorch"

In [24]:
# Persist the trained booster to the path held in model_location.
model.save_model(fname=model_location)

In [25]:
import json

metrics_location = 'metrics.json'

# Serialize the collected metrics dict to a JSON file.
with open(metrics_location, mode='w') as metrics_file:
    json.dump(metrics_data, metrics_file)