#1. 데이터 읽기

In [None]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset shipped with scikit-learn; its .data, .target
# and .feature_names attributes are what the following cells read.
cancer = load_breast_cancer()

In [None]:
# Feature matrix as a DataFrame with one named column per feature.
df_data = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)

# Targets as a single-column DataFrame, then flattened to a 1-D array.
df_labels = pd.DataFrame(data=cancer.target)
labels = df_labels.to_numpy().ravel()

# A notebook cell only displays its last expression, so the head(2) preview is
# evaluated but not shown; the flattened label array is what gets displayed.
df_data.head(2)
labels

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

#2. 모델 만들기

##2.1 데이터 분할

In [None]:
from sklearn.model_selection import train_test_split

# Keep 30% of the rows aside as the test set; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_data,
    labels,
    test_size=0.3,
    random_state=62,
)
print(X_train.shape, X_test.shape)

(398, 30) (171, 30)


##2.2 모델 만들기

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import time

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed time; unlike time.time() it cannot jump when the system
# clock is adjusted. Take the reading right before training starts.
start_time = time.perf_counter()

# Baseline model with default hyperparameters.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
end_time = time.perf_counter()

# Prediction happens after the end timestamp on purpose: it is not part of the
# training time being measured.
pred = xgb.predict(X_test)

print('정확도 : ', accuracy_score(y_test, pred))
print('수행 시간 : ', end_time - start_time)

정확도 :  0.9415204678362573
수행 시간 :  0.24432992935180664


##2.3 옵션 적용

In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Validation set used for early stopping. It is carved out of the training
# data so the test set stays unseen until the final accuracy check; monitoring
# (X_test, y_test) here would pick the stopping point on the very data that is
# then used to report the score.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=62
)
evals = [(X_val, y_val)]

# n_estimators : number of boosting rounds (trees); more trees can improve the
#                fit but take longer to train.
# learning_rate: shrinkage applied to each new tree; smaller values usually
#                need more trees and therefore more time.
# max_depth    : maximum depth of each tree, commonly set between 3 and 10.
xgb = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)

# early_stopping_rounds: stop when the metric on eval_set has not improved for
#                        this many consecutive rounds.
# eval_set             : data the metric is evaluated on after every round.
# eval_metric          : metric used for that evaluation.
# verbose              : print the metric after every round.
# NOTE(review): XGBoost 1.6 deprecated passing early_stopping_rounds and
# eval_metric to fit() in favour of the constructor; move them there when the
# installed version requires it.
xgb.fit(
    X_fit,
    y_fit,
    early_stopping_rounds=100,
    eval_set=evals,
    eval_metric='logloss',
    verbose=True,
)
pred = xgb.predict(X_test)

print('정확도 : ', accuracy_score(y_test, pred))

#3. 하이퍼 파라미터

In [None]:
import time

from sklearn.model_selection import GridSearchCV, train_test_split

# Base estimator cloned by the grid search; the searched parameters override
# the values given here.
xgb = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)

# Early-stopping validation data is split off the training set so the test set
# never influences hyperparameter selection.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=62
)
evals = [(X_val, y_val)]


def get_best_hyper_parameter(n_estimators_list, learning_rate_list, max_depth_list, subsample_list):
    """Grid-search XGBoost hyperparameters and print the best combination.

    Uses the module-level ``xgb`` estimator, the ``X_fit`` / ``y_fit`` training
    rows and the ``evals`` validation set for early stopping.

    Args:
        n_estimators_list: candidate values for ``n_estimators``.
        learning_rate_list: candidate values for ``learning_rate``.
        max_depth_list: candidate values for ``max_depth``.
        subsample_list: candidate values for ``subsample``.

    Returns:
        The fitted ``GridSearchCV`` object, so callers can read
        ``best_params_`` / ``best_score_`` instead of copying them from the
        printed summary.
    """
    parameters = {
        'n_estimators': n_estimators_list,
        'learning_rate': learning_rate_list,
        'max_depth': max_depth_list,
        'subsample': subsample_list,
    }

    # Monotonic clock: the search can run long enough for wall-clock
    # adjustments to distort a time.time() difference.
    start_time = time.perf_counter()

    # cv is reduced to 2 folds because the full search takes too long otherwise.
    grid_cv = GridSearchCV(xgb, param_grid=parameters, scoring='accuracy', cv=2)
    # verbose=False: every candidate fit would otherwise print one line per
    # boosting round and bury the summary below.
    grid_cv.fit(
        X_fit,
        y_fit,
        early_stopping_rounds=50,
        eval_set=evals,
        eval_metric='logloss',
        verbose=False,
    )

    end_time = time.perf_counter()
    print('\n====================================================')
    print('최적의 파라미터 : ', grid_cv.best_params_)
    print('최고의 정확도 : ', grid_cv.best_score_)
    print('수행 시간 : ', end_time - start_time)
    print('====================================================\n')

    return grid_cv

In [None]:
# Coarse first pass over a wide grid: 4 * 4 * 3 * 4 = 192 combinations.
n_estimators_list = [100, 200, 300, 400]
learning_rate_list = [0.1, 0.4, 0.7, 1]
max_depth_list = [3, 6, 9]
subsample_list = [0.1, 0.4, 0.7, 1]

get_best_hyper_parameter(
    n_estimators_list,
    learning_rate_list,
    max_depth_list,
    subsample_list,
)

In [None]:
# Second pass over a narrower grid: 3 * 3 * 2 * 6 = 108 combinations.
n_estimators_list = [150, 200, 250]
learning_rate_list = [0.05, 0.1, 0.15]
max_depth_list = [3, 4]
subsample_list = [0.5, 0.6, 0.65, 0.7, 0.75, 0.8]

get_best_hyper_parameter(
    n_estimators_list,
    learning_rate_list,
    max_depth_list,
    subsample_list,
)

스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.
[33]	validation_0-logloss:0.195963
[34]	validation_0-logloss:0.194181
[35]	validation_0-logloss:0.194885
[36]	validation_0-logloss:0.193885
[37]	validation_0-logloss:0.193756
[38]	validation_0-logloss:0.194096
[39]	validation_0-logloss:0.191978
[40]	validation_0-logloss:0.19387
[41]	validation_0-logloss:0.193259
[42]	validation_0-logloss:0.19248
[43]	validation_0-logloss:0.192522
[44]	validation_0-logloss:0.193025
[45]	validation_0-logloss:0.193959
[46]	validation_0-logloss:0.194637
[47]	validation_0-logloss:0.194861
[48]	validation_0-logloss:0.193846
[49]	validation_0-logloss:0.192744
[50]	validation_0-logloss:0.192814
[51]	validation_0-logloss:0.191389
[52]	validation_0-logloss:0.190981
[53]	validation_0-logloss:0.190025
[54]	validation_0-logloss:0.189067
[55]	validation_0-logloss:0.188142
[56]	validation_0-logloss:0.188003
[57]	validation_0-logloss:0.187678
[58]	validation_0-logloss:0.189172
[59]	validation_0-logloss:0.18811
[60]	val

In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Chosen hyperparameters, kept as single-element lists like the grid-search
# inputs. NOTE(review): presumably the best combination reported by the grid
# search; confirm against its printed summary.
n_estimators_list = [250]
learning_rate_list = [0.05]
max_depth_list = [3]
subsample_list = [0.6]

# Validation set used for early stopping. It is carved out of the training
# data so the test set stays unseen until the final accuracy check; monitoring
# (X_test, y_test) here would pick the stopping point on the very data that is
# then used to report the score.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=62
)
evals = [(X_val, y_val)]

# Build the model from the lists above so the values live in one place.
# n_estimators : number of boosting rounds (trees).
# learning_rate: shrinkage applied to each new tree.
# max_depth    : maximum depth of each tree, commonly set between 3 and 10.
# subsample    : fraction of training rows sampled for each tree.
xgb = XGBClassifier(
    n_estimators=n_estimators_list[0],
    learning_rate=learning_rate_list[0],
    max_depth=max_depth_list[0],
    subsample=subsample_list[0],
)

# early_stopping_rounds: stop when the metric on eval_set has not improved for
#                        this many consecutive rounds.
# eval_set             : data the metric is evaluated on after every round.
# eval_metric          : metric used for that evaluation.
# verbose              : print the metric after every round.
# NOTE(review): XGBoost 1.6 deprecated passing early_stopping_rounds and
# eval_metric to fit() in favour of the constructor; move them there when the
# installed version requires it.
xgb.fit(
    X_fit,
    y_fit,
    early_stopping_rounds=100,
    eval_set=evals,
    eval_metric='logloss',
    verbose=True,
)
pred = xgb.predict(X_test)

print('정확도 : ', accuracy_score(y_test, pred))

[0]	validation_0-logloss:0.65749
Will train until validation_0-logloss hasn't improved in 100 rounds.
[1]	validation_0-logloss:0.624492
[2]	validation_0-logloss:0.596138
[3]	validation_0-logloss:0.569356
[4]	validation_0-logloss:0.544033
[5]	validation_0-logloss:0.521017
[6]	validation_0-logloss:0.500194
[7]	validation_0-logloss:0.48196
[8]	validation_0-logloss:0.464693
[9]	validation_0-logloss:0.448295
[10]	validation_0-logloss:0.432536
[11]	validation_0-logloss:0.418143
[12]	validation_0-logloss:0.405139
[13]	validation_0-logloss:0.391089
[14]	validation_0-logloss:0.377231
[15]	validation_0-logloss:0.366129
[16]	validation_0-logloss:0.356593
[17]	validation_0-logloss:0.346849
[18]	validation_0-logloss:0.336792
[19]	validation_0-logloss:0.328108
[20]	validation_0-logloss:0.32107
[21]	validation_0-logloss:0.314772
[22]	validation_0-logloss:0.307943
[23]	validation_0-logloss:0.302048
[24]	validation_0-logloss:0.295589
[25]	validation_0-logloss:0.288883
[26]	validation_0-logloss:0.281564