# Boston Housing Price XGBoost Sample Model

In [2]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.5.2-py3-none-manylinux2014_x86_64.whl (173.6 MB)
     |████████████████████████████████| 173.6 MB 3.9 kB/s             
Installing collected packages: xgboost
Successfully installed xgboost-1.5.2


In [3]:
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np

# Load the dataset and wrap its feature matrix in a DataFrame whose columns
# carry the dataset's own feature names.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older scikit-learn release to run.
boston = load_boston()
data = pd.DataFrame(boston.data, columns=boston.feature_names)
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [4]:
# Target values (y) for the feature frame above; later passed to train_test_split as the labels
boston.target

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    data,
    boston.target,
    test_size=0.2,
    random_state=0,
)

In [8]:
# Build and fit the XGBoost regression model
import xgboost as xgb

# Hyperparameters collected in one mapping so they can be read and tuned at a glance.
xgb_params = {
    'objective': 'reg:squarederror',
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 5,
    'alpha': 10,
    'n_estimators': 10,
}
xg_reg = xgb.XGBRegressor(**xgb_params)

# Keep the value returned by fit() as `model`; later cells use both `model` and `xg_reg`.
model = xg_reg.fit(X_train, Y_train)

In [11]:
# Inspect which features matter most, using the booster's 'weight' importance scores
booster = model.get_booster()
booster.get_score(importance_type='weight')

{'CRIM': 11.0,
 'INDUS': 11.0,
 'NOX': 8.0,
 'RM': 3.0,
 'AGE': 7.0,
 'RAD': 1.0,
 'PTRATIO': 5.0,
 'B': 13.0,
 'LSTAT': 8.0}

In [12]:
# Evaluate the fitted regressor on the held-out test split
from sklearn.metrics import mean_squared_error

# Predict
pred = xg_reg.predict(X_test)

# Score: RMSE is the square root of the mean squared error
mse = mean_squared_error(Y_test, pred)
rmse = np.sqrt(mse)
print(f'RMSE: {rmse}')

RMSE: 10.966912554267134


# AWS SageMaker 자습서 - 은행 고객이 예금 증서(CD)에 가입할지 예측

In [None]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np                                
import pandas as pd                               
import matplotlib.pyplot as plt                   
from IPython.display import Image                 
from IPython.display import display               
from time import gmtime, strftime                 
from sagemaker.predictor import csv_serializer   

# Define IAM role
role = get_execution_role()
prefix = 'sagemaker/DEMO-xgboost-dm'

# Each region has its own XGBoost container image.
containers = {'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
              'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
              'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
              'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'}

# Region of the current boto3 session (the region this instance runs in)
my_region = boto3.session.Session().region_name

# Fail early with a readable message: without this guard, an unset region or one
# missing from `containers` surfaces as an opaque TypeError/KeyError in the print below.
if my_region not in containers:
    raise ValueError(
        f"Region {my_region!r} has no XGBoost container configured; "
        f"supported regions: {sorted(containers)}"
    )

print("Success - the MySageMakerInstance is in the " + my_region + " region. You will use the " + containers[my_region] + " container for your SageMaker endpoint.")

In [None]:
# Create the S3 bucket

bucket_name = 'your-s3-bucket-name' # <--- CHANGE THIS VARIABLE TO A UNIQUE NAME FOR YOUR BUCKET
s3 = boto3.resource('s3')
try:
    # us-east-1 is created without a location constraint; every other region passes one.
    create_kwargs = {'Bucket': bucket_name}
    if my_region != 'us-east-1':
        create_kwargs['CreateBucketConfiguration'] = {'LocationConstraint': my_region}
    s3.create_bucket(**create_kwargs)
    print('S3 bucket created successfully')
except Exception as e:
    # Best effort: report the failure and let the notebook continue.
    print('S3 error: ',e)

In [14]:
# Download the dataset onto the AWS SageMaker instance
data_url = "https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv"
try:
    urllib.request.urlretrieve(data_url, "bank_clean.csv")
    print('Success: downloaded bank_clean.csv.')
except Exception as e:
    print('Data load error: ',e)

# Read the downloaded CSV into a DataFrame, using its first column as the index
try:
    model_data = pd.read_csv('./bank_clean.csv', index_col=0)
    print('Success: Data loaded into dataframe.')
except Exception as e:
    print('Data load error: ',e)

Success: downloaded bank_clean.csv.
Success: Data loaded into dataframe.


In [16]:
# Show every row in shuffled order (frac=1); random_state=1 fixes the ordering so the output is repeatable
model_data.sample(frac=1, random_state=1)

Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_no,y_yes
35577,32,1,999,0,1,0,0,1,0,0,...,0,1,0,0,0,0,1,0,1,0
13950,33,2,999,0,1,0,0,1,0,0,...,1,0,0,0,0,0,1,0,1,0
29451,25,5,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,1
32295,34,1,999,0,1,0,0,1,0,0,...,1,0,0,0,0,0,1,0,1,0
27477,53,7,999,0,1,0,0,0,0,0,...,1,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7813,43,1,999,0,1,1,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
32511,42,7,999,1,1,0,0,1,0,0,...,1,0,0,0,0,1,0,0,1,0
5192,36,1,999,0,1,0,0,1,0,0,...,1,0,0,0,0,0,1,0,1,0
12172,37,1,999,0,1,0,1,0,0,0,...,0,0,0,1,0,0,1,0,1,0


In [17]:
# Shuffle the rows (.sample(frac=1)) with a fixed seed, then cut at the 70% mark:
# the first 70% of shuffled rows become the training set, the remainder the test set.
# Positional .iloc slices are used instead of np.split, which relies on the
# deprecated DataFrame.swapaxes when handed a DataFrame.
shuffled = model_data.sample(frac=1, random_state=1)
split_at = int(0.7 * len(shuffled))
train_data, test_data = shuffled.iloc[:split_at], shuffled.iloc[split_at:]

# Every row must land in exactly one of the two partitions.
assert len(train_data) + len(test_data) == len(model_data)
print(train_data.shape, test_data.shape)


(28831, 61) (12357, 61)


In [18]:
# AWS SageMaker의 사전 구축된 XGBoost 모델을 사용하려면, 훈련 데이터의 헤더와 첫번째의 열의 형식을 다시 지정!
# S3 버킷에서 데이터를 로드!