# TF-IDF를 활용한 XGBoost모델 구현  

입력값에 대해 TF-IDF 값으로 벡터화 진행


## 1. 데이터 불러오기

### 데이터 확인

In [1]:
import pandas as pd

DATA_IN_PATH = 'C:/Users/JS/Desktop/mbti_data/'
TRAIN_CLEAN_DATA = 'preprocessing_final2.csv'

train_data = pd.read_csv(DATA_IN_PATH + TRAIN_CLEAN_DATA)

train_data.head()

Unnamed: 0.1,Unnamed: 0,articleid,mbti,title,content,menu_id,mbti_label,e_i,n_s,f_t,j_p
0,0,277264,estj,Estj istj 차이는 뭘까요?,저는 estj인데 istj 특징을 보면 별로 저랑 다르지않은것같아요 제 주변에 es...,11,6,1,0,0,1
1,1,277263,istj,블루스 추천,동영상 n n nGrover Washington Jr Ain t No S nshin...,11,14,0,0,0,1
2,2,277216,enfp,ENFP 여자 ESTJ 남자친구랑 오래갈 수 있을까요 ㅠㅠ,안녕하세요 ENFP 여자입니다 매번 짧고 금방 식는 연애만 해 왔는데지금 남자친구가...,11,1,1,1,1,0
3,3,276899,istj,생각에 확신이 없는 ISTJ,직장밴드에 이해하기 애매한 글이 올라왔어요 제 생각에는 찬성 반대로만 물으면 안 될...,11,14,0,0,0,1
4,4,276311,estj,원하는 연애스타일 있으신가요?,저는 개인적으로 알콩달콩도 좋지만 레드벨벳 싸이코 노래같은 그런연애 해보고 싶더라구...,11,6,1,0,0,1


In [2]:
from konlpy.tag import Okt
import re

def preprocessing(text, okt, remove_stopwords=False, stop_words=[]) :
    # 1. 한글 및 공백 제외 문자 모두 제거
    text_split = re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]","", text)
    # 2. okt 객체를 사용해 형태소 단위로 나눈다
    text_split = okt.morphs(text, stem=True)
    # 3. 불용어 제거
    if remove_stopwords :
        text_split = [token for token in text_split if not token in stop_words]
        
    return text_split

In [3]:
stop_words = ['은', '는', '이', '가', '하', '아', '것', '들', '의', '있', '되', '수', '보', '주', '등', '한', '.', '..', 'ㅠ', 'ㅠㅠ', '을', '를', '에',' 하다']
okt=Okt()
clean_content = []

for content in train_data['content']:
    #비어있는 데이터에서 멈추지 않도록 문자열인 경우에만 진행
    if type(content) == str:
        clean_content.append(preprocessing(content, okt, remove_stopwords=True, stop_words=stop_words))
    else :
        clean_content.append([]) #string이 아니면 비어있는 값 추가
        
clean_content[:4]

[['저',
  'estj',
  '인데',
  'istj',
  '특징',
  '보다',
  '별로',
  '저',
  '랑',
  '다르다',
  '않다',
  '같다',
  '제',
  '주변',
  'estj',
  'istj',
  '늘다',
  '명도',
  '없다',
  '비교',
  '하다',
  '없다',
  '다른',
  '분들',
  '시기',
  '에는',
  '둘',
  '차이점',
  '보이다'],
 ['동영상',
  'n',
  'n',
  'nGrover',
  'Washington',
  'Jr',
  'Ain',
  't',
  'No',
  'S',
  'nshine',
  'n',
  'nyo',
  't',
  'e',
  'n',
  'n',
  'n',
  'nBill',
  'withers',
  '음악',
  '으로',
  '알려지다',
  '있다',
  '이건',
  'Grover',
  'Washington',
  'Jr',
  '연',
  '곡',
  '으로',
  '기다',
  '버젼',
  '이다'],
 ['안녕하다',
  'ENFP',
  '여자',
  '이다',
  '매번',
  '짧다',
  '금방',
  '식다',
  '연애',
  '만',
  '해',
  '오다',
  '지금',
  '남자친구',
  '진짜',
  '너무',
  '좋다',
  '얼굴',
  '도',
  '잘생기다',
  '다정하다',
  '가끔',
  '씩',
  '그',
  '성격',
  '적',
  '인',
  '차이',
  '느끼다',
  '직',
  '까지',
  '막',
  '크다',
  '충돌',
  '없다',
  '그리다',
  '제',
  '엄청',
  '집착',
  '하다',
  '찡찡거리다',
  '스타일',
  '인데',
  'estj',
  '분',
  '이렇다',
  '귀찮다',
  '남자친구',
  '좀',
  '버겁다',
  '같다',
  '노력',
  '하다',
  '고치다',
  '있다',
  '자다'

## 2. 데이터 인덱스화

### 인덱스 변환

In [4]:
import json
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer

content = clean_content

tokenizer = Tokenizer()
tokenizer.fit_on_texts(content)
text_sequences = tokenizer.texts_to_sequences(content)

In [5]:
print(text_sequences[0])

[10, 372, 75, 280, 455, 8, 327, 10, 88, 218, 17, 7, 15, 208, 372, 280, 65, 3159, 16, 971, 1, 16, 53, 163, 1160, 118, 417, 1994, 113]


### 단어사전 만들기

In [6]:
word_vocab = tokenizer.word_index
word_vocab ["<PAD>"] = 0

print("전체 단어 개수 : ", len(word_vocab))

전체 단어 개수 :  64340


### 패딩 처리

In [7]:
MAX_SEQUENCE_LENGTH = 200   #문장 최대 길이

train_inputs = pad_sequences(text_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

print('Shape of train data : ', train_inputs.shape)

Shape of train data :  (43561, 200)


10369개의 데이터가 200이라는 길이를 동일하게 가지게 되었음  

### 저장하기

In [8]:
import numpy as np

#라벨값 넘파이 배열로 불러오기
train_label_e_i = np.array(train_data['e_i'])
train_label_n_s = np.array(train_data['n_s'])
train_label_f_t = np.array(train_data['f_t'])
train_label_j_p = np.array(train_data['j_p'])

DATA_IN_PATH = 'C:/Users/JS/desktop/mbti_data/'
TRAIN_INPUT_DATA = 'nsmc_train_input.npy'
TRAIN_E_I_DATA = 'nsmc_train_e_i_data.npy'
TRAIN_N_S_DATA = 'nsmc_train_n_s_data.npy'
TRAIN_F_T_DATA = 'nsmc_train_f_t_data.npy'
TRAIN_J_P_DATA = 'nsmc_train_j_p_data.npy'

DATA_CONFIGS = 'data_configs.json'
DEFAULT_PATH = ''

#단어 사전 및전체 단어 개수는 새롭게 딕셔너리 값을 지정해 저장
data_configs={}

data_configs['vocab']=word_vocab
data_configs['vocab_size']=len(word_vocab)+1 #vocab size 추가

import os
#저장하는 디렉터리가 존재하지 않으면 생성
if not os.path.exists(DEFAULT_PATH + DATA_IN_PATH):
    os.makedirs(DEFAULT_PATH + DATA_IN_PATH)

#전처리 된 데이터를 넘파이 형태로 저장
np.save(open(DEFAULT_PATH + DATA_IN_PATH + TRAIN_INPUT_DATA, 'wb'), train_inputs)
np.save(open(DEFAULT_PATH + DATA_IN_PATH + TRAIN_E_I_DATA, 'wb'), train_label_e_i)
np.save(open(DEFAULT_PATH + DATA_IN_PATH + TRAIN_N_S_DATA, 'wb'), train_label_n_s)
np.save(open(DEFAULT_PATH + DATA_IN_PATH + TRAIN_F_T_DATA, 'wb'), train_label_f_t)
np.save(open(DEFAULT_PATH + DATA_IN_PATH + TRAIN_J_P_DATA, 'wb'), train_label_j_p)


#데이터 사전을 json 형태로 저장
json.dump(data_configs, open(DEFAULT_PATH + DATA_IN_PATH + DATA_CONFIGS, 'w', encoding='UTF-8'), ensure_ascii=False)

## 3. 모델링 및 성능 평가

## 1) E or I lable
### 데이터셋 불러오기

In [9]:
import numpy as np

DATA_IN_PATH = 'C:/Users/JS/desktop/mbti_data/'

TRAIN_INPUT_DATA = 'nsmc_train_input.npy'
TRAIN_E_I_DATA = 'nsmc_train_e_i_data.npy'

#데이터 가져오기
train_input = np.load(open(DATA_IN_PATH + TRAIN_INPUT_DATA, 'rb'))
train_label_e_i = np.load(open(DATA_IN_PATH + TRAIN_E_I_DATA, 'rb'))

### 데이터셋 분리

In [10]:
from sklearn.model_selection import train_test_split

RANDOM_SEED = 42
TEST_SPLIT = 0.2

train_input, eval_input, train_label, eval_label = train_test_split(train_input, train_label_e_i, test_size=TEST_SPLIT, random_state=RANDOM_SEED)

### 모델링

In [11]:
import xgboost as xgb

train_data = xgb.DMatrix(train_input, label=train_label)
eval_data = xgb.DMatrix(eval_input, label=eval_label)

In [12]:
#하이퍼 파라미터
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'rmse'

#리스트
data_list = [(train_data, 'train'), (eval_data, 'valid')]

#학습
bst = xgb.train(params, train_data, num_boost_round=1000, evals=data_list, early_stopping_rounds=100)

[0]	train-rmse:0.48803	valid-rmse:0.48929
[1]	train-rmse:0.48066	valid-rmse:0.48346
[2]	train-rmse:0.47608	valid-rmse:0.48080
[3]	train-rmse:0.47320	valid-rmse:0.47938
[4]	train-rmse:0.47124	valid-rmse:0.47878
[5]	train-rmse:0.46884	valid-rmse:0.47862
[6]	train-rmse:0.46751	valid-rmse:0.47848
[7]	train-rmse:0.46593	valid-rmse:0.47881
[8]	train-rmse:0.46447	valid-rmse:0.47890
[9]	train-rmse:0.46318	valid-rmse:0.47898
[10]	train-rmse:0.46165	valid-rmse:0.47907
[11]	train-rmse:0.45973	valid-rmse:0.47917
[12]	train-rmse:0.45797	valid-rmse:0.47964
[13]	train-rmse:0.45616	valid-rmse:0.47992
[14]	train-rmse:0.45545	valid-rmse:0.47999
[15]	train-rmse:0.45378	valid-rmse:0.48037
[16]	train-rmse:0.45260	valid-rmse:0.48050
[17]	train-rmse:0.45224	valid-rmse:0.48062
[18]	train-rmse:0.45097	valid-rmse:0.48076
[19]	train-rmse:0.45007	valid-rmse:0.48094
[20]	train-rmse:0.44859	valid-rmse:0.48100
[21]	train-rmse:0.44731	valid-rmse:0.48125
[22]	train-rmse:0.44649	valid-rmse:0.48117
[23]	train-rmse:0.445

In [13]:
#predict()를 통해 예측 결과를 추정할 수 있는 확률값 반환
pred_probs = bst.predict(eval_data)
print('수행 결과값 10개만 표시, 예측 확률값으로 표시됨')
print(np.round(pred_probs[:10], 3))

#예측 확률이 0.5보다 크면 1, 그렇지 않으면 0으로 예측값 결정해 리스트 객체인 preds에 저장
preds = [1 if x>0.5 else 0 for x in pred_probs]
print('예측값 10개만 표시 : ', preds[:10])

수행 결과값 10개만 표시, 예측 확률값으로 표시됨
[0.32  0.373 0.278 0.328 0.668 0.378 0.42  0.237 0.493 0.308]
예측값 10개만 표시 :  [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]


### E or I 평가

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None) :
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    #ROC-AUC 추가
    roc_auc = roc_auc_score(y_test, pred_proba)
    print('오차 행렬')
    print(confusion)
    #ROC-AUC print 추가
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1: {3:.4f}, AUC: {4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))

In [15]:
get_clf_eval(eval_label, preds, pred_probs)

오차 행렬
[[5134  502]
 [2734  343]]
정확도: 0.6286, 정밀도: 0.4059, 재현율: 0.1115, F1: 0.1749, AUC: 0.5217


## 2) N or S label
### 데이터셋 불러오기

In [16]:
DATA_IN_PATH = 'C:/Users/JS/desktop/mbti_data/'

TRAIN_INPUT_DATA = 'nsmc_train_input.npy'
TRAIN_N_S_DATA = 'nsmc_train_n_s_data.npy'

#데이터 가져오기
train_input = np.load(open(DATA_IN_PATH + TRAIN_INPUT_DATA, 'rb'))
train_label_n_s = np.load(open(DATA_IN_PATH + TRAIN_N_S_DATA, 'rb'))

### 데이터셋 분리

In [17]:
train_input, eval_input, train_label, eval_label = train_test_split(train_input, train_label_n_s, test_size=TEST_SPLIT, random_state=RANDOM_SEED)

### 모델링

In [18]:
import xgboost as xgb

train_data = xgb.DMatrix(train_input, label=train_label)
eval_data = xgb.DMatrix(eval_input, label=eval_label)

In [19]:
#하이퍼 파라미터
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'rmse'

#리스트
data_list = [(train_data, 'train'), (eval_data, 'valid')]

#학습
bst = xgb.train(params, train_data, num_boost_round=1000, evals=data_list, early_stopping_rounds=100)

[0]	train-rmse:0.46342	valid-rmse:0.46500
[1]	train-rmse:0.44364	valid-rmse:0.44715
[2]	train-rmse:0.43308	valid-rmse:0.43772
[3]	train-rmse:0.42679	valid-rmse:0.43323
[4]	train-rmse:0.42315	valid-rmse:0.43082
[5]	train-rmse:0.42100	valid-rmse:0.42955
[6]	train-rmse:0.42007	valid-rmse:0.42862
[7]	train-rmse:0.41936	valid-rmse:0.42799
[8]	train-rmse:0.41890	valid-rmse:0.42751
[9]	train-rmse:0.41827	valid-rmse:0.42731
[10]	train-rmse:0.41755	valid-rmse:0.42680
[11]	train-rmse:0.41725	valid-rmse:0.42660
[12]	train-rmse:0.41676	valid-rmse:0.42644
[13]	train-rmse:0.41633	valid-rmse:0.42606
[14]	train-rmse:0.41446	valid-rmse:0.42592
[15]	train-rmse:0.41420	valid-rmse:0.42583
[16]	train-rmse:0.41365	valid-rmse:0.42573
[17]	train-rmse:0.41336	valid-rmse:0.42553
[18]	train-rmse:0.41313	valid-rmse:0.42531
[19]	train-rmse:0.41296	valid-rmse:0.42526
[20]	train-rmse:0.41275	valid-rmse:0.42521
[21]	train-rmse:0.41224	valid-rmse:0.42510
[22]	train-rmse:0.41179	valid-rmse:0.42500
[23]	train-rmse:0.411

In [20]:
#predict()를 통해 예측 결과를 추정할 수 있는 확률값 반환
pred_probs = bst.predict(eval_data)
print('수행 결과값 10개만 표시, 예측 확률값으로 표시됨')
print(np.round(pred_probs[:10], 3))

#예측 확률이 0.5보다 크면 1, 그렇지 않으면 0으로 예측값 결정해 리스트 객체인 preds에 저장
preds = [1 if x>0.5 else 0 for x in pred_probs]
print('예측값 10개만 표시 : ', preds[:10])

수행 결과값 10개만 표시, 예측 확률값으로 표시됨
[0.793 0.727 0.765 0.955 0.573 0.71  0.959 0.73  0.931 0.763]
예측값 10개만 표시 :  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


### N or S 평가

In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None) :
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    #ROC-AUC 추가
    roc_auc = roc_auc_score(y_test, pred_proba)
    print('오차 행렬')
    print(confusion)
    #ROC-AUC print 추가
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1: {3:.4f}, AUC: {4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))

In [22]:
get_clf_eval(eval_label, preds, pred_probs)

오차 행렬
[[  71 2055]
 [  80 6507]]
정확도: 0.7550, 정밀도: 0.7600, 재현율: 0.9879, F1: 0.8591, AUC: 0.5825


## 3) F or T label
### 데이터셋 불러오기

In [23]:
DATA_IN_PATH = 'C:/Users/JS/desktop/mbti_data/'

TRAIN_INPUT_DATA = 'nsmc_train_input.npy'
TRAIN_F_T_DATA = 'nsmc_train_f_t_data.npy'

#데이터 가져오기
train_input = np.load(open(DATA_IN_PATH + TRAIN_INPUT_DATA, 'rb'))
train_label_f_t = np.load(open(DATA_IN_PATH + TRAIN_F_T_DATA, 'rb'))

### 데이터셋 분리

In [24]:
train_input, eval_input, train_label, eval_label = train_test_split(train_input, train_label_f_t, test_size=TEST_SPLIT, random_state=RANDOM_SEED)

### 모델링

In [25]:
import xgboost as xgb

train_data = xgb.DMatrix(train_input, label=train_label)
eval_data = xgb.DMatrix(eval_input, label=eval_label)

In [26]:
#하이퍼 파라미터
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'rmse'

#리스트
data_list = [(train_data, 'train'), (eval_data, 'valid')]

#학습
bst = xgb.train(params, train_data, num_boost_round=1000, evals=data_list, early_stopping_rounds=100)

[0]	train-rmse:0.49648	valid-rmse:0.49918
[1]	train-rmse:0.49406	valid-rmse:0.49853
[2]	train-rmse:0.49182	valid-rmse:0.49797
[3]	train-rmse:0.49021	valid-rmse:0.49754
[4]	train-rmse:0.48964	valid-rmse:0.49718
[5]	train-rmse:0.48901	valid-rmse:0.49701
[6]	train-rmse:0.48847	valid-rmse:0.49649
[7]	train-rmse:0.48764	valid-rmse:0.49627
[8]	train-rmse:0.48723	valid-rmse:0.49599
[9]	train-rmse:0.48633	valid-rmse:0.49574
[10]	train-rmse:0.48597	valid-rmse:0.49545
[11]	train-rmse:0.48562	valid-rmse:0.49525
[12]	train-rmse:0.48530	valid-rmse:0.49492
[13]	train-rmse:0.48502	valid-rmse:0.49471
[14]	train-rmse:0.48474	valid-rmse:0.49451
[15]	train-rmse:0.48431	valid-rmse:0.49440
[16]	train-rmse:0.48399	valid-rmse:0.49429
[17]	train-rmse:0.48369	valid-rmse:0.49411
[18]	train-rmse:0.48335	valid-rmse:0.49403
[19]	train-rmse:0.48204	valid-rmse:0.49324
[20]	train-rmse:0.48182	valid-rmse:0.49308
[21]	train-rmse:0.47958	valid-rmse:0.49347
[22]	train-rmse:0.47936	valid-rmse:0.49349
[23]	train-rmse:0.478

In [27]:
#predict()를 통해 예측 결과를 추정할 수 있는 확률값 반환
pred_probs = bst.predict(eval_data)
print('수행 결과값 10개만 표시, 예측 확률값으로 표시됨')
print(np.round(pred_probs[:10], 3))

#예측 확률이 0.5보다 크면 1, 그렇지 않으면 0으로 예측값 결정해 리스트 객체인 preds에 저장
preds = [1 if x>0.5 else 0 for x in pred_probs]
print('예측값 10개만 표시 : ', preds[:10])

수행 결과값 10개만 표시, 예측 확률값으로 표시됨
[0.503 0.543 0.447 0.298 0.498 0.53  0.449 0.635 0.577 0.584]
예측값 10개만 표시 :  [1, 1, 0, 0, 0, 1, 0, 1, 1, 1]


### F or T 평가

In [28]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None) :
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    #ROC-AUC 추가
    roc_auc = roc_auc_score(y_test, pred_proba)
    print('오차 행렬')
    print(confusion)
    #ROC-AUC print 추가
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1: {3:.4f}, AUC: {4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))

In [29]:
get_clf_eval(eval_label, preds, pred_probs)

오차 행렬
[[1215 2841]
 [ 995 3662]]
정확도: 0.5597, 정밀도: 0.5631, 재현율: 0.7863, F1: 0.6563, AUC: 0.5709


## 4) J or P label
### 데이터셋 불러오기

In [30]:
DATA_IN_PATH = 'C:/Users/JS/desktop/mbti_data/'

TRAIN_INPUT_DATA = 'nsmc_train_input.npy'
TRAIN_F_T_DATA = 'nsmc_train_j_p_data.npy'

#데이터 가져오기
train_input = np.load(open(DATA_IN_PATH + TRAIN_INPUT_DATA, 'rb'))
train_label_j_p = np.load(open(DATA_IN_PATH + TRAIN_J_P_DATA, 'rb'))

### 데이터셋 분리

In [31]:
train_input, eval_input, train_label, eval_label = train_test_split(train_input, train_label_j_p, test_size=TEST_SPLIT, random_state=RANDOM_SEED)

### 모델링

In [32]:
import xgboost as xgb

train_data = xgb.DMatrix(train_input, label=train_label)
eval_data = xgb.DMatrix(eval_input, label=eval_label)

In [33]:
#하이퍼 파라미터
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'rmse'

#리스트
data_list = [(train_data, 'train'), (eval_data, 'valid')]

#학습
bst = xgb.train(params, train_data, num_boost_round=1000, evals=data_list, early_stopping_rounds=100)

[0]	train-rmse:0.48044	valid-rmse:0.48329
[1]	train-rmse:0.47055	valid-rmse:0.47452
[2]	train-rmse:0.46573	valid-rmse:0.46978
[3]	train-rmse:0.46312	valid-rmse:0.46712
[4]	train-rmse:0.46062	valid-rmse:0.46570
[5]	train-rmse:0.45964	valid-rmse:0.46491
[6]	train-rmse:0.45875	valid-rmse:0.46435
[7]	train-rmse:0.45819	valid-rmse:0.46387
[8]	train-rmse:0.45757	valid-rmse:0.46353
[9]	train-rmse:0.45719	valid-rmse:0.46324
[10]	train-rmse:0.45685	valid-rmse:0.46300
[11]	train-rmse:0.45440	valid-rmse:0.46303
[12]	train-rmse:0.45390	valid-rmse:0.46276
[13]	train-rmse:0.45353	valid-rmse:0.46252
[14]	train-rmse:0.45265	valid-rmse:0.46235
[15]	train-rmse:0.45240	valid-rmse:0.46209
[16]	train-rmse:0.45047	valid-rmse:0.46239
[17]	train-rmse:0.45006	valid-rmse:0.46236
[18]	train-rmse:0.44985	valid-rmse:0.46223
[19]	train-rmse:0.44955	valid-rmse:0.46199
[20]	train-rmse:0.44933	valid-rmse:0.46188
[21]	train-rmse:0.44908	valid-rmse:0.46176
[22]	train-rmse:0.44887	valid-rmse:0.46155
[23]	train-rmse:0.448

In [34]:
#predict()를 통해 예측 결과를 추정할 수 있는 확률값 반환
pred_probs = bst.predict(eval_data)
print('수행 결과값 10개만 표시, 예측 확률값으로 표시됨')
print(np.round(pred_probs[:10], 3))

#예측 확률이 0.5보다 크면 1, 그렇지 않으면 0으로 예측값 결정해 리스트 객체인 preds에 저장
preds = [1 if x>0.5 else 0 for x in pred_probs]
print('예측값 10개만 표시 : ', preds[:10])

수행 결과값 10개만 표시, 예측 확률값으로 표시됨
[0.284 0.294 0.431 0.102 0.339 0.399 0.572 0.319 0.317 0.155]
예측값 10개만 표시 :  [0, 0, 0, 0, 0, 0, 1, 0, 0, 0]


### J or T 평가

In [35]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None) :
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    #ROC-AUC 추가
    roc_auc = roc_auc_score(y_test, pred_proba)
    print('오차 행렬')
    print(confusion)
    #ROC-AUC print 추가
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1: {3:.4f}, AUC: {4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))

In [36]:
get_clf_eval(eval_label, preds, pred_probs)

오차 행렬
[[5539  395]
 [2457  322]]
정확도: 0.6727, 정밀도: 0.4491, 재현율: 0.1159, F1: 0.1842, AUC: 0.5897
