# 모델 학습 및 평가

## 개요
DataFabric의 JupyterHub 환경에서 머신러닝 모델을 만들고 평가할 수 있습니다.
DataFabric 분석환경은 Scikit-Learn과 LightGBM 같은 머신러닝 파이썬 패키지를 제공하며 노트북에서 이를 임포트하여 사용할 수 있습니다.
이 문서에서는 구글 빅쿼리에 저장된 데이터를 활용하여 추천 모델을 학습하고 평가하는 예제를 설명합니다.


## 모델 학습
모델 학습 주제 설명

### 1. 데이터 확인

학습 데이터셋을 만들기 전에 먼저 활용할 데이터를 확인해보는 과정입니다.  
사용자 Feature 정보와 라벨링 데이터를 저장하고 있는 테이블을 다음과 같이 확인할 수 있습니다.

In [1]:
from pydatafabric.gcp import load_bigquery_ipython_magic

load_bigquery_ipython_magic()

In [2]:
%%bq 
SELECT * 
FROM  smart-ruler-304409.cds_amt.TB_AMT_AFLCO_CUST_DNA_DATA 
LIMIT 10

Query complete after 0.03s: 100%|██████████| 1/1 [00:00<00:00, 1347.35query/s]
Downloading: 100%|██████████| 10/10 [00:01<00:00,  9.05rows/s]

BigQuery execution took 2 seconds.





Unnamed: 0,YM_WCNT,CUST_ID,AFLCO_CD,DATA_CRTN_DT,TOP1_STR_CD,TOP1_STR_LA,TOP1_STR_LO,TOP1_STR_DSTNC,TOP2_STR_CD,TOP2_STR_LA,...,DAVG_PURCHS_AMT_CHG_RT,TPO1,TPO2,TPO3,TPO4,TPO5,TPO6,TPO7,TPO8,TOP1_TPO
0,20210201,58cdd0fb1a5b3cac0a19eecc3bd627255917d2559b2e04...,1,2022-01-19,1420,,,,,,...,,,,,,,,,,
1,20210201,25b63c3f524812cda90d4fd4b06811f7e5f2b341b37bef...,1,2022-01-19,1034,34.811627,126.425205,,,,...,,,,,,,,,,
2,20210201,3d41dcd1e94834681c0fc6a61faceef3d24a5b2f18c3ef...,1,2022-01-19,1413,37.646978,126.894776,,1688.0,37.662884,...,,,,,,,,,,
3,20210201,3b50922d2cd07426da3c7b56863be702fbe079cb7739e2...,1,2022-01-19,1600,37.255145,127.119251,,,,...,,,,,,,,,,
4,20210201,28f52450c58fc88e6b706af14725bfdb1b5bd7044cb55d...,1,2022-01-19,1009,35.407698,127.374466,,,,...,,,,,,,,,,
5,20210201,6b46392240533cd4076e911e02534b8cd2b33788f2b690...,1,2022-01-19,1055,35.885227,128.589788,,,,...,,,,,,,,,,
6,20210201,b97ccbf532894a1cdb3c26d7956eaca6b392ce17ea366b...,1,2022-01-19,1420,,,,,,...,,,,,,,,,,
7,20210201,03bf7fb3109186b8fe0a10c1465212216ad19f72985f5c...,1,2022-01-19,1057,37.531035,126.7369,,,,...,,,,,,,,,,
8,20210201,167e2a36e7ed17debd4de22f94ac8bd5f9974a4a4aece0...,1,2022-01-19,1087,37.462969,127.036218,,,,...,,,,,,,,,,
9,20210201,365af665928f2f2fa4fdbd290038ccf29db0325ffae825...,1,2022-01-19,1024,37.361089,126.93142,,,,...,,,,,,,,,,


### 2. 학습 데이터 생성
구글 빅쿼리에 저장된 데이터를 다음과 같이 쿼리하여 Pandas DataFrame으로 저장합니다.

In [11]:
from pydatafabric.gcp import bq_to_pandas

query = f"""
SELECT * 
FROM  smart-ruler-304409.cds_amt.TB_AMT_AFLCO_CUST_DNA_DATA 
WHERE YM_WCNT='20220301'
LIMIT 100
"""

df = bq_to_pandas(query)

destination: emart-datafabric._2dd36219768c7c869a5680edf9fd6e104ea57800.anon5774d10bad6dbd5b020221bba984ee7ad43900f2
total_rows: 100
slot_secs: 5.752



Downloading: 100%|██████████| 100/100 [00:00<00:00, 102.67rows/s]


결측값을 포함하는 데이터를 제거합니다.

In [2]:
df = df.dropna()

In [12]:
import numpy as np
df["column"] = np.random.randint(0.0,1.0, size=len(df))
df

Unnamed: 0,YM_WCNT,CUST_ID,AFLCO_CD,DATA_CRTN_DT,TOP1_STR_CD,TOP1_STR_LA,TOP1_STR_LO,TOP1_STR_DSTNC,TOP2_STR_CD,TOP2_STR_LA,...,TPO1,TPO2,TPO3,TPO4,TPO5,TPO6,TPO7,TPO8,TOP1_TPO,column
0,20220301,7a4738f5685ed1d74b5f9f8331d0b4f02e5d9facff07a1...,001,2022-03-01,4544,,,,,,...,,,,,,,,,,0
1,20220301,2cbfff1b6ae1040b1b2abe7bc543ada5818e1174963d8e...,001,2022-03-01,4548,,,,,,...,,,,,,,,,,0
2,20220301,5fb8325c59e015e940459e74f9f40129af315e47b7ded7...,001,2022-03-01,1656,37.116707000,126.911944000,,,,...,,,,,,,,,,0
3,20220301,ce3d336c0f99dd7af8b289d708b1e9cfed6d6d7fe4a823...,001,2022-03-01,1402,36.357731000,127.362905000,,,,...,,,,,,,,,,0
4,20220301,9246ae6d7cff604fa3ce21d3fe5c9c4bdb9b8e98db7df4...,001,2022-03-01,1162,35.229339000,128.872242000,,,,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,20220301,be51fb6960e5a99021a216718ca7998b2ceec4d81cfcee...,001,2022-03-01,1154,37.743184000,127.102364000,,,,...,,,,,,,,,,0
96,20220301,8acf5d7addda21d33fb1b445e60378768d52e44a2fce0b...,001,2022-03-01,1421,35.185007000,129.112172000,9.200000000,,,...,,,,,,,,,,0
97,20220301,0c028e7d9722a94a3a83bae916df9debca24c614a5a065...,001,2022-03-01,1087,37.462969000,127.036218000,,,,...,,,,,,,,,,0
98,20220301,1561e9269d8eb4ced1ed2d080712fba44dfde48bc55915...,001,2022-03-01,1416,37.480139000,127.148402000,31.560000000,,,...,,,,,,,,,,0


라벨 정보를 숫자로 인코딩합니다.

In [13]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['label_encoding'] = le.fit_transform(df['column'])

사용자 Feature를 x로 라벨을 y로 하는 데이터셋을 만듭니다.  
데이터셋은 학습과 평가로 나누도록 합니다.

In [None]:
from sklearn.model_selection import train_test_split

idx_col = ['CUST_ID', 'dt', 'column', 'dataset_type', 'label', 'label_dt', 'label_encoding']
cols = set(df.columns)
features = list(cols - set(idx_col))

label_col = 'label_encoding'
train_x, valid_x, train_y, valid_y = train_test_split(df[features], df[label_col], test_size=0.2, random_state=1234)

성능이 좋은 변수만 사용하도록 처리하여 데이터셋을 생성하는 과정입니다.

In [None]:
import pandas as pd
import lightgbm as lgb
from sklearn.feature_selection import SelectKBest, f_classif

first_k = SelectKBest(score_func=f_classif).fit(train_x, train_y)
score = first_k.scores_
importance_df = pd.DataFrame(index=train_x.columns, data={'score':first_k.scores_}).sort_values('score', ascending=False)

k = 100
k_features = list(importance_df.index[:k])

num_classes = len(le.classes_)

lgb_trn = lgb.Dataset(train_x[k_features], train_y, feature_name=k_features)
lgb_val = lgb.Dataset(valid_x[k_features], valid_y, feature_name=k_features)

### 2. 모델 학습

준비된 데이터셋을 이용하여 학습을 합니다. 우선 Bayesian Optimazation을 이용해서 하이퍼 파라미터를 최적화하는 작업을 수행합니다.

In [None]:
from bayes_opt import BayesianOptimization

LR = 0.05
NUM_THREAD = 8

def lgb_eval(num_leaves, feature_fraction, lambda_l1, lambda_l2, num_boost_round):
    params = {
        'boosting_type': 'gbdt',
        'objective': 'multiclassova',
        'num_class': num_classes,
        'metric': 'multi_logloss',
        'is_unbalance': True,
        'learning_rate': LR,
        'num_threads': NUM_THREAD,
        'num_leaves': int(num_leaves),
        'feature_fraction': feature_fraction,
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2
    }
    
    num_boost_round = int(num_boost_round)
    
    print('num_leaves: ', num_leaves)
    print('feature_fraction: ', feature_fraction)
    print('lambda_l1: ', lambda_l1)
    print('lambda_l2: ', lambda_l2)
    print('_num_boost_round: ', num_boost_round)
    
    clf = lgb.train(params,
                    train_set=lgb_trn,
                    valid_sets=lgb_val,
                    num_boost_round=num_boost_round,
                    early_stopping_rounds=100,
                    verbose_eval=False)
    
    return clf.best_score['valid_0']['multi_logloss']

TARGET_PARAMS = {'num_leaves': (8, 64),
                 'feature_fraction': (0.5, 1.0),
                 'lambda_l1': (0, 100),
                 'lambda_l2': (0, 1000),
                 'num_boost_round': (500, 2000)}

lgbBO = BayesianOptimization(lgb_eval, TARGET_PARAMS)
lgbBO.maximize(init_points=5, n_iter=3)

In [None]:
result = pd.concat([pd.DataFrame(lgbBO.res)['target'], pd.json_normalize(pd.DataFrame(lgbBO.res)['params'])], axis=1).sort_values(by='target')[::1]
print(result)

최적화된 파라미터를 사용하여 학습합니다.

In [None]:
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclassova',
    'num_class': len(le.classes_),
    'metric': 'multi_logloss',
    'is_unbalance': True,
    'num_threads': NUM_THREAD,
    'num_leaves': result['num_leaves'].iloc[0].astype('int32'),
    'feature_fraction': result['feature_fraction'].iloc[0],
    'learning_rate': LR,
    'lambda_l1': result['lambda_l1'].iloc[0],
    'lambda_l2': result['lambda_l2'].iloc[0]
}

train_x, valid_x, train_y, valid_y = train_test_split(df[k_features], df[label_col], test_size=0.2, random_state=1234)
lgb_trn = lgb.Dataset(train_x, train_y)
lgb_val = lgb.Dataset(valid_x, valid_y)

# training
clf = lgb.train(lgb_params,
                train_set=lgb_trn,
                valid_sets=[lgb_trn, lgb_val],
                num_boost_round=5000,
                early_stopping_rounds=100,
                verbose_eval=50)

### 3. 학습된 모델 평가

학습한 모델을 평가하는 과정입니다.

In [None]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

# model evaluation
def model_evaluation(model, x_trn, y_trn, top_k):
    # cross-table
    predict = model.predict(x_trn).argmax(axis=1)
    print(pd.crosstab(predict, y_trn))
    
    # metric by each label
    labels, cnt = np.unique(predict, return_counts=True)
    metrics_summary = precision_recall_fscore_support(y_true=y_trn, y_pred=predict, labels=labels)
    metrics_sum_index = ['precision', 'recall', 'f1-score', 'support']
    
    class_report = pd.DataFrame(
        list(metrics_summary),
        index = metrics_sum_index,
        columns = labels)
    
    print(class_report.T)
    
    # metric by each label
    y_tmp = pd.DataFrame(y_trn).reset_index(drop=True)
    y_tmp['pred_score'] = list(map(lambda x: x.argsort()[-top_k:][::-1], model.predict(x_trn)))
    y_tmp['flag'] = y_tmp.apply(lambda x: int(x['label_encoding'] in x['pred_score']), axis=1)
    acc = y_tmp['flag'].sum()/y_tmp['flag'].count()
    print(acc)
    
    return acc

print("[Accuracy for Training Set]")
acc_trn = model_evaluation(clf, train_x, train_y, 3)

print("[Accuracy for Validation Set]")
acc_val = model_evaluation(clf, valid_x, valid_y, 3)

학습된 모델은 이후 Prediction 단계에서 사용될 수 있습니다.  
다음 단계에서 사용하기 위해 HDFS에 모델을 저장합니다.

In [None]:
import os
import pickle
from pydatafabric.ye import get_hdfs_conn

model_name = 'example_model'
model_version = 'v0'

clf.params['label_encoder'] = le
clf.params['training_multi_logloss'] = clf.best_score['training']['multi_logloss']
clf.params['valid_multi_logloss'] = clf.best_score['valid_1']['multi_logloss']
clf.params['acc_trn'] = acc_trn
clf.params['acc_val'] = acc_val
clf.params['model_name'] = model_name
clf.params['model_version'] = model_version

# save model file
output_path = f'/data/tmp/{model_name}/{model_version}'
connection = get_hdfs_conn()
with connection.open(os.path.join(output_path), 'wb') as f:
    pickle.dump(clf, f)

참고로 mlops-sdk를 사용하여 모델의 형상을 관리하실 수 있습니다.
모델 관리를 위해서는 mlops에 모델을 등록하셔야 됩니다. MLS에 대한 자세한 내용은 <a href="https://rec.shinsegae.ai/swagger/index.html" target="_blank">mlops-sdk 문서</a>를 참고하시기 바랍니다.
아래는 실제 등록된 모델이 저장된 경로를 가져오는 예제입니다.

In [None]:


# Code Example