# 데모

## 라이브러리 import 및 설정

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import lightgbm as lgb
from matplotlib import pyplot as plt
from matplotlib import rcParams
import numpy as np
from pathlib import Path
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
import seaborn as sns
import warnings

In [3]:
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
pd.set_option('max_columns', 100)
pd.set_option("display.precision", 4)
warnings.simplefilter('ignore')

## 학습데이터 로드

[03-pandas-eda.ipynb](https://github.com/kaggler-tv/dku-kaggle-class/blob/master/notebook/03-pandas-eda.ipynb)에서 생성한 `feature.csv` 피처파일 사용

In [4]:
data_dir = Path('..dataset/input/')
feature_dir = Path('..dataset/input/')
val_dir = Path('../build/val')
tst_dir = Path('../build/tst')
sub_dir = Path('../build/sub')

trn_file = data_dir / 'train.csv'
tst_file = data_dir / 'test.csv'
sample_file = data_dir / 'sample_submission.csv'

target_col = 'class'
n_fold = 5
n_class = 3
seed = 42

In [5]:
algo_name = 'lgbcv'
feature_name = 'feature'
model_name = f'{algo_name}_{feature_name}'

feature_file = feature_dir / f'{feature_name}.csv'
p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'
sub_file = sub_dir / f'{model_name}.csv'

In [6]:
df = pd.read_csv('C:/Users/leeji/ML_/2020 캐글뽀개기/dataset/input/feature.csv', index_col=0)
print(df.shape)
df.head()

(400000, 20)


Unnamed: 0_level_0,z,redshift,dered_u,dered_g,dered_r,dered_i,dered_z,nObserve,airmass_u,class,d_dered_u,d_dered_g,d_dered_r,d_dered_i,d_dered_z,d_dered_ig,d_dered_zg,d_dered_rz,d_dered_iz,d_obs_det
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,16.9396,-8.1086e-05,23.1243,20.2578,18.9551,17.6321,16.9089,2.9444,1.1898,0.0,-0.1397,-0.079,-0.0544,-0.0403,-0.0307,-2.6257,-3.3488,2.0462,0.7232,-15.0556
1,13.1689,0.0045061,14.9664,14.0045,13.4114,13.2363,13.1347,0.6931,1.2533,1.0,-0.0857,-0.0574,-0.041,-0.0322,-0.0343,-0.7683,-0.8698,0.2767,0.1016,-0.3069
2,15.35,0.00047198,16.6076,15.6866,15.44,15.3217,15.2961,1.0986,1.0225,0.0,-0.1787,-0.1388,-0.0963,-0.0718,-0.054,-0.3649,-0.3905,0.144,0.0257,-0.9014
3,19.6346,5.8143e-06,25.3536,20.9947,20.0873,19.7947,19.5552,1.6094,1.2054,0.0,-0.307,-0.1941,-0.1339,-0.1003,-0.0795,-1.2,-1.4395,0.5321,0.2395,-1.3906
4,17.9826,-3.3247e-05,23.7714,20.4338,18.863,18.1903,17.8759,2.6391,1.1939,0.0,-0.682,-0.2653,-0.1794,-0.1339,-0.1067,-2.2436,-2.5579,0.9871,0.3144,-9.3609


In [7]:
y = df[target_col].values[:320000]
df.drop(target_col, axis=1, inplace=True)
trn = df.iloc[:320000].values
tst = df.iloc[320000:].values
feature_name = df.columns.tolist()
print(y.shape, trn.shape, tst.shape)

(320000,) (320000, 19) (80000, 19)


## Stratified K-Fold Cross Validation

In [8]:
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

## LightGBM 모델 학습

In [9]:
p_val = np.zeros((trn.shape[0], n_class))
p_tst = np.zeros((tst.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(trn, y), 1):
    print(f'training model for CV #{i}')
    clf = lgb.LGBMClassifier(objective='multiclass',
                             n_estimators=1000,
                             num_leaves=64,
                             learning_rate=0.1,
                             min_child_samples=10,
                             subsample=.5,
                             subsample_freq=1,
                             colsample_bytree=.8,
                             random_state=seed,
                             n_jobs=-1)
    clf.fit(trn[i_trn], y[i_trn],
            eval_set=[(trn[i_val], y[i_val])],
            eval_metric='multiclass',
            early_stopping_rounds=10)
    
    p_val[i_val, :] = clf.predict_proba(trn[i_val])
    p_tst += clf.predict_proba(tst) / n_fold

training model for CV #1
[1]	valid_0's multi_logloss: 0.860242
Training until validation scores don't improve for 10 rounds
[2]	valid_0's multi_logloss: 0.761507
[3]	valid_0's multi_logloss: 0.675529
[4]	valid_0's multi_logloss: 0.604629
[5]	valid_0's multi_logloss: 0.546718
[6]	valid_0's multi_logloss: 0.505092
[7]	valid_0's multi_logloss: 0.467961
[8]	valid_0's multi_logloss: 0.440039
[9]	valid_0's multi_logloss: 0.411566
[10]	valid_0's multi_logloss: 0.382604
[11]	valid_0's multi_logloss: 0.358461
[12]	valid_0's multi_logloss: 0.337889
[13]	valid_0's multi_logloss: 0.319551
[14]	valid_0's multi_logloss: 0.303063
[15]	valid_0's multi_logloss: 0.288774
[16]	valid_0's multi_logloss: 0.276421
[17]	valid_0's multi_logloss: 0.266034
[18]	valid_0's multi_logloss: 0.25627
[19]	valid_0's multi_logloss: 0.24705
[20]	valid_0's multi_logloss: 0.240043
[21]	valid_0's multi_logloss: 0.232841
[22]	valid_0's multi_logloss: 0.226516
[23]	valid_0's multi_logloss: 0.220679
[24]	valid_0's multi_logloss

[130]	valid_0's multi_logloss: 0.163441
[131]	valid_0's multi_logloss: 0.163416
[132]	valid_0's multi_logloss: 0.163423
[133]	valid_0's multi_logloss: 0.163405
[134]	valid_0's multi_logloss: 0.163393
[135]	valid_0's multi_logloss: 0.163356
[136]	valid_0's multi_logloss: 0.163338
[137]	valid_0's multi_logloss: 0.163326
[138]	valid_0's multi_logloss: 0.163399
[139]	valid_0's multi_logloss: 0.163428
[140]	valid_0's multi_logloss: 0.163397
[141]	valid_0's multi_logloss: 0.163383
[142]	valid_0's multi_logloss: 0.163355
[143]	valid_0's multi_logloss: 0.163365
[144]	valid_0's multi_logloss: 0.163432
[145]	valid_0's multi_logloss: 0.163416
[146]	valid_0's multi_logloss: 0.163417
[147]	valid_0's multi_logloss: 0.163447
Early stopping, best iteration is:
[137]	valid_0's multi_logloss: 0.163326
training model for CV #3
[1]	valid_0's multi_logloss: 0.860647
Training until validation scores don't improve for 10 rounds
[2]	valid_0's multi_logloss: 0.761945
[3]	valid_0's multi_logloss: 0.67616
[4]	va

[66]	valid_0's multi_logloss: 0.168247
[67]	valid_0's multi_logloss: 0.168083
[68]	valid_0's multi_logloss: 0.167932
[69]	valid_0's multi_logloss: 0.167781
[70]	valid_0's multi_logloss: 0.167661
[71]	valid_0's multi_logloss: 0.167501
[72]	valid_0's multi_logloss: 0.167429
[73]	valid_0's multi_logloss: 0.167289
[74]	valid_0's multi_logloss: 0.167162
[75]	valid_0's multi_logloss: 0.167074
[76]	valid_0's multi_logloss: 0.166987
[77]	valid_0's multi_logloss: 0.167141
[78]	valid_0's multi_logloss: 0.167577
[79]	valid_0's multi_logloss: 0.167076
[80]	valid_0's multi_logloss: 0.167048
[81]	valid_0's multi_logloss: 0.166954
[82]	valid_0's multi_logloss: 0.166808
[83]	valid_0's multi_logloss: 0.166674
[84]	valid_0's multi_logloss: 0.166588
[85]	valid_0's multi_logloss: 0.166402
[86]	valid_0's multi_logloss: 0.166286
[87]	valid_0's multi_logloss: 0.166209
[88]	valid_0's multi_logloss: 0.166164
[89]	valid_0's multi_logloss: 0.166047
[90]	valid_0's multi_logloss: 0.165987
[91]	valid_0's multi_logl

[90]	valid_0's multi_logloss: 0.167365
[91]	valid_0's multi_logloss: 0.167443
[92]	valid_0's multi_logloss: 0.167326
[93]	valid_0's multi_logloss: 0.166697
[94]	valid_0's multi_logloss: 0.16669
[95]	valid_0's multi_logloss: 0.168145
[96]	valid_0's multi_logloss: 0.166572
Early stopping, best iteration is:
[86]	valid_0's multi_logloss: 0.164274


In [10]:
print(f'{accuracy_score(y, np.argmax(p_val, axis=1)) * 100:.4f}%')

93.1072%


In [11]:
print(p_val.shape, p_tst.shape)

(320000, 3) (80000, 3)


In [12]:
np.savetxt(p_val_file, p_val, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, p_tst, fmt='%.6f', delimiter=',')

FileNotFoundError: [Errno 2] No such file or directory: '..\\build\\val\\lgbcv_feature.val.csv'

## 피처 중요도 시각화

In [None]:
imp = pd.DataFrame({'feature': df.columns, 'importance': clf.feature_importances_})
imp = imp.sort_values('importance').set_index('feature')
imp.plot(kind='barh')

## 제출 파일 생성

In [None]:
sub = pd.read_csv(sample_file, index_col=0)
print(sub.shape)
sub.head()

In [None]:
sub[target_col] = np.argmax(p_tst, axis=1)
sub.head()

In [None]:
sub[target_col].value_counts()

In [None]:
sub.to_csv(sub_file)