# 데모

## 라이브러리 import 및 설정

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from hyperopt import STATUS_OK, Trials, hp, space_eval, tpe, fmin
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot as plt
from matplotlib import rcParams
import numpy as np
from pathlib import Path
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, train_test_split
import seaborn as sns
import warnings

In [3]:
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
pd.set_option('max_columns', 100)
pd.set_option("display.precision", 4)
warnings.simplefilter('ignore')

## 학습데이터 로드

[03-pandas-eda.ipynb](https://github.com/kaggler-tv/dku-kaggle-class/blob/master/notebook/03-pandas-eda.ipynb)에서 생성한 `feature.csv` 피처파일 사용

In [4]:
data_dir = Path('input')
feature_dir = Path('feature')
sub_dir = Path('sub')
tst_dir = Path('tst')
val_dir = Path('val')

trn_file = data_dir / 'train.csv'
tst_file = data_dir / 'test.csv'
sample_file = data_dir / 'sample_submission.csv'

target_col = 'class'
n_fold = 5
n_class = 3
seed = 42

In [5]:
algo_name = 'RF_hyperopt'
feature_name = 'feature'
model_name = f'{algo_name}_{feature_name}'

feature_file = feature_dir / f'{feature_name}.csv'
p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'
sub_file = sub_dir / f'{model_name}.csv'

In [6]:
df = pd.read_csv(feature_file, index_col=0)
print(df.shape)
df.head()

(400000, 20)


Unnamed: 0_level_0,z,redshift,dered_u,dered_g,dered_r,dered_i,dered_z,nObserve,airmass_u,class,d_dered_u,d_dered_g,d_dered_r,d_dered_i,d_dered_z,d_dered_ig,d_dered_zg,d_dered_rz,d_dered_iz,d_obs_det
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,16.9396,-8.1086e-05,23.1243,20.2578,18.9551,17.6321,16.9089,18,1.1898,0.0,-0.1397,-0.079,-0.0544,-0.0403,-0.0307,-2.6257,-3.3488,2.0462,0.7232,0
1,13.1689,0.0045061,14.9664,14.0045,13.4114,13.2363,13.1347,1,1.2533,1.0,-0.0857,-0.0574,-0.041,-0.0322,-0.0343,-0.7683,-0.8698,0.2767,0.1016,0
2,15.35,0.00047198,16.6076,15.6866,15.44,15.3217,15.2961,2,1.0225,0.0,-0.1787,-0.1388,-0.0963,-0.0718,-0.054,-0.3649,-0.3905,0.144,0.0257,0
3,19.6346,5.8143e-06,25.3536,20.9947,20.0873,19.7947,19.5552,4,1.2054,0.0,-0.307,-0.1941,-0.1339,-0.1003,-0.0795,-1.2,-1.4395,0.5321,0.2395,1
4,17.9826,-3.3247e-05,23.7714,20.4338,18.863,18.1903,17.8759,13,1.1939,0.0,-0.682,-0.2653,-0.1794,-0.1339,-0.1067,-2.2436,-2.5579,0.9871,0.3144,1


In [7]:
y = df[target_col].values[:320000]
df.drop(target_col, axis=1, inplace=True)
trn = df.iloc[:320000].values
tst = df.iloc[320000:].values
feature_name = df.columns.tolist()
print(y.shape, trn.shape, tst.shape)

(320000,) (320000, 19) (80000, 19)


## Hyperparameter Tuning

In [8]:
X_trn, X_val, y_trn, y_val = train_test_split(trn, y, test_size=.2, random_state=seed)

In [9]:
params = {
    'n_jobs'                    : -1,
    'random_state'              : seed,
    'verbose'                   : 0,
    'warm_start'                : False,  #False로 해야 비교가 되지...
    
    
    #'bootstrap'                 : True,
    #'oob_score'                 : hp.choice('oob_score', [True, False]),
    'bootstrap'                : False,
    
    #class_weight               : None
    
    #'min_weight_fraction_leaf' : .0
    #'min_samples_split'        : 2
    # 'max_leaf_nodes'          : None,
    # 'min_impurity_decrease'   : .0,     : 불순도(?)감소가 주어진 값 이상이 되어야 분리됨,
    # 'min_impurity_split'      : None,      : early-stopping을 위한 threshold
    #'ccp_alpha'                : .0,
    #'max_samples'              : None,
}

#oob_score는 bootstrap이 true일 때만 가능
space = {
    'criterion'                 : hp.choice('criterion', ['entropy', 'gini']),
    'max_features'              : hp.choice('max_features', ['sqrt', 'log2', None]), 
    'n_estimators'              : hp.choice('n_estimators', np.arange(50, 300, dtype=int)),
    #overfitting 방지
    'min_samples_leaf'          : hp.choice('min_samples_leaf', np.arange(1, 30, dtype=int)),
    'max_depth'                 : hp.quniform('max_depth', 10, 100, 10)
}

In [10]:
def objective(hyperparams):
    model = RandomForestClassifier(**params, **hyperparams)
    model.fit(X=X_trn, y=y_trn)
    score = accuracy_score(y_val, model.predict(X_val))

    return {'loss': -score, 'status': STATUS_OK, 'model': model}

trials = Trials()
best = fmin(fn=objective, space=space, trials=trials,
            algo=tpe.suggest, max_evals=100, verbose=1)
best

  0%|          | 0/100 [01:29<?, ?trial/s, best loss=?]


KeyboardInterrupt: 

'criterion': 1,
 'max_depth': 80.0,
 'max_features': 1,
 'min_samples_leaf': 1,
 'n_estimators': 139

## Stratified K-Fold Cross Validation

In [12]:
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

## LightGBM 모델 학습

In [14]:
p_val = np.zeros((trn.shape[0], n_class))
p_tst = np.zeros((tst.shape[0], n_class))

for i, (i_trn, i_val) in enumerate(cv.split(trn, y), 1):
    print(f'training model for CV #{i}')
    clf = RandomForestClassifier(**params, criterion= 'gini',
                                 max_depth= 80.0,
                                 max_features= 'log2',
                                 min_samples_leaf= 1,
                                 n_estimators= 139)
    clf.fit(trn[i_trn], y[i_trn])
    
    p_val[i_val, :] = clf.predict_proba(trn[i_val])
    p_tst += clf.predict_proba(tst) / n_fold

training model for CV #1
training model for CV #2
training model for CV #3
training model for CV #4
training model for CV #5


In [18]:
print(f'{accuracy_score(y, np.argmax(p_val, axis=1)) * 100:.4f}%')

93.0666%


In [None]:
print(p_val.shape, p_tst.shape)

In [19]:
np.savetxt(p_val_file, p_val, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, p_tst, fmt='%.6f', delimiter=',')