In [110]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

In [None]:
# Load the breast-cancer dataset object that ships with scikit-learn.
dataset = load_breast_cancer()
# Bare expression: the notebook echoes the loaded object as this cell's output.
dataset

In [29]:
# Independent variables: the feature matrix.
X_features = dataset.data
# Names of the feature columns.
feature_names = dataset.feature_names
# Dependent variable: malignant or benign (0: malignant | 1: benign).
y_label = dataset.target

# Preview, in order: the first feature row, all feature names, the first label.
print(X_features[:1])
print(feature_names)
print(y_label[:1])

[[1.799e+01 1.038e+01 1.228e+02 1.001e+03 1.184e-01 2.776e-01 3.001e-01
  1.471e-01 2.419e-01 7.871e-02 1.095e+00 9.053e-01 8.589e+00 1.534e+02
  6.399e-03 4.904e-02 5.373e-02 1.587e-02 3.003e-02 6.193e-03 2.538e+01
  1.733e+01 1.846e+02 2.019e+03 1.622e-01 6.656e-01 7.119e-01 2.654e-01
  4.601e-01 1.189e-01]]
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
[0]


In [None]:
# Wrap the feature matrix in a DataFrame whose columns are named by
# feature_names, then append the labels as a trailing 'target' column.
cancer_df = pd.DataFrame(X_features, columns=feature_names).assign(target=y_label)

# Show the resulting frame as the cell output.
cancer_df

In [41]:
# Count the number of samples in each target class (benign vs. malignant).
cancer_df["target"].value_counts()

1    357
0    212
Name: target, dtype: int64

In [71]:
# Independent variables: every row, every column except the last one ('target').
cancer_df.iloc[:, :-1]

# Dependent variable.
cancer_df["target"]

# random_state=None: the split is different on every run.
# No test_size given -> default train/test proportions (75%/25%).
train_test_split(cancer_df.iloc[:, :-1], cancer_df["target"])

# random_state=<int>: the split is the same on every run.
# test_size=0.2 -> train/test (80%/20%).
tts = train_test_split(
    cancer_df.iloc[:, :-1],
    cancer_df["target"],
    test_size=0.2,
    random_state=111,
)

In [72]:
# Unpack the stored split result into train/test features and train/test labels.
X_train, X_test, y_train, y_test = tts

In [79]:
# Build a classifier configured with 10 boosted trees (n_estimators=10).
xgboost = XGBClassifier(n_estimators=10)

# Train on the training split. eval_set / eval_metric make XGBoost report the
# 'error' metric on the held-out split after every boosting round.
xgboost.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric="error")

[0]	validation_0-error:0.04386
[1]	validation_0-error:0.035088
[2]	validation_0-error:0.061404
[3]	validation_0-error:0.035088
[4]	validation_0-error:0.052632
[5]	validation_0-error:0.052632
[6]	validation_0-error:0.052632
[7]	validation_0-error:0.04386
[8]	validation_0-error:0.04386
[9]	validation_0-error:0.017544


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=10, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
# Hard class predictions (0/1) for the held-out split.
pred = xgboost.predict(X_test)
# Predicted probability of class 1, used as the score input for ROC AUC.
pred_proba = xgboost.predict_proba(X_test)[:, 1]

# Confusion matrix
print("혼동행렬:\n", confusion_matrix(y_test, pred))
print("=" * 30)

# Precision with class 1 as the positive class (the default pos_label)
print("1의 precision(정밀도) : \n", precision_score(y_test, pred))
print("=" * 30)

# Precision with class 0 as the positive class
print("0의 precision(정밀도) : \n", precision_score(y_test, pred, pos_label=0))
print("=" * 30)

# Recall with class 1 as the positive class (the default pos_label)
print("1의 recall(재현율): \n", recall_score(y_test, pred))
print("=" * 30)

# Recall with class 0 as the positive class; the label names class 0 to match pos_label
print("0의 recall(재현율): \n", recall_score(y_test, pred, pos_label=0))
print("=" * 30)

# F1 score for class 1
print(f1_score(y_test, pred, pos_label=1))
print("=" * 30)

# ROC AUC measures how well the scores rank the samples, so it is computed from
# the class-1 probabilities rather than from the thresholded 0/1 predictions.
# No `labels` argument is passed: this is a binary target.
print(roc_auc_score(y_test, pred_proba))

# 실습

In [121]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

In [118]:
# Reload the dataset so this section starts from freshly loaded data.
dataset = load_breast_cancer()

# Feature matrix, feature column names, and class labels.
X_features = dataset.data
feature_names = dataset.feature_names
y_label = dataset.target

# Build the frame in this cell instead of relying on a `cancer_df` left over
# from an earlier cell; the labels go in as the last column, 'target'.
cancer_df = pd.DataFrame(data=X_features, columns=feature_names)
cancer_df['target'] = y_label

# Seeded split, so the same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(
    cancer_df.iloc[:, :-1],   # every column except the trailing 'target'
    cancer_df["target"],
    test_size=0.2,            # train/test (80%/20%)
    random_state=111,
)

In [None]:
# Build a classifier configured with 10 boosted trees (n_estimators=10).
xgboost = XGBClassifier(n_estimators=10)

# Train on the training split.
xgboost.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],  # held-out split that XGBoost evaluates during training
    eval_metric='error',          # metric reported for eval_set
)

# Predict with the model fitted in this cell so the confusion matrix describes
# this model rather than whatever `pred` another cell left behind.
pred = xgboost.predict(X_test)

print("혼동행렬:\n", confusion_matrix(y_test, pred))

In [None]:
# Base estimator for the search: 100 boosted trees.
xgb_clf = XGBClassifier(n_estimators=100)

# Hyper-parameter grid: 3 * 5 * 5 * 6 = 450 combinations.
# The depth key must be spelled 'max_depth' to match the XGBClassifier
# parameter; a misspelled key does not tune tree depth.
params = {
    'max_depth': [5, 7, 9],
    'min_child_weight': [1, 3, 5, 7, 9],
    'colsample_bytree': [0.1, 0.3, 0.5, 0.75, 1.0],
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
}

gridcv = GridSearchCV(xgb_clf, param_grid=params)

# The extra keyword arguments are forwarded to each underlying
# XGBClassifier.fit call made during the search.
gridcv.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric="error",
)

In [None]:
# Best parameter combination found by the grid search.
print("GridSearchCV 최적 파라미터 : ", gridcv.best_params_)

# Predict X_test with the XGBoost model that uses the best parameters.
pred = gridcv.predict(X_test)
print("예측: \n", pred)

# Confusion matrix
print("confusion_matrix : \n", confusion_matrix(y_test, pred))

# Precision with class 1 as the positive class (the default pos_label)
print("1의 precision(정밀도) : \n", precision_score(y_test, pred))
print("=" * 30)

# Precision with class 0 as the positive class
print("0의 precision(정밀도) : \n", precision_score(y_test, pred, pos_label=0))
print("=" * 30)

# Recall with class 1 as the positive class (the default pos_label)
print("1의 recall(재현율): \n", recall_score(y_test, pred))
print("=" * 30)

# Recall with class 0 as the positive class; the label names class 0 to match pos_label
print("0의 recall(재현율): \n", recall_score(y_test, pred, pos_label=0))
print("=" * 30)

# F1 score for class 1
print(f1_score(y_test, pred, pos_label=1))
print("=" * 30)


In [None]:
# Feature importances of the estimator that the grid search selected as best.
ftr_importances_values = gridcv.best_estimator_.feature_importances_

# Attach feature names so each importance value can be identified.
ftr_importances = pd.Series(ftr_importances_values, index=feature_names)

# Sort the Series, most important first. The raw `ftr_importances_values`
# array has no `sort_values` method, so the sort must go through the Series.
ftr_importances = ftr_importances.sort_values(ascending=False)

In [138]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Horizontal bar chart of the feature importances: value on x, feature name on y.
# figsize is a keyword argument of plt.figure (width, height in inches).
plt.figure(figsize=(8, 6))
plt.title("Feature importances")
sns.barplot(x=ftr_importances, y=ftr_importances.index)
plt.show()

# s

In [None]:
# Hyper-parameter grid: 3 * 5 * 5 * 6 = 450 combinations.
# The depth key must be spelled 'max_depth' to match the XGBClassifier parameter.
params = {
    'max_depth': [5, 7, 9],
    'min_child_weight': [1, 3, 5, 7, 9],
    'colsample_bytree': [0.1, 0.3, 0.5, 0.75, 1.0],
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
}


# To get a well-performing classifier, fit a model for every combination above
# and pick the best max_depth, min_child_weight, colsample_bytree and
# learning_rate values.
# NOTE: `xgb_clf` is the XGBClassifier created in an earlier cell.
gridcv = GridSearchCV(xgb_clf, param_grid=params)
gridcv.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric="error",
)


# The selected parameters can be printed like this.
print("GridSearchCV 최적 파라미터 : ", gridcv.best_params_)

# gridcv.best_estimator_ gives direct access to a classifier that uses those
# best parameters.
gridcv.best_estimator_