# 월간 데이콘 2 천체 유형 분류
## Public 2위 Private 1위 코드

매달 재밌는 대회 개최해주셔서 감사합니다.
모두 고생하셨습니다!


Feature Engineering

1. Magnitude, row별 max, min, max-min, std, sum Feature 추가

2. 모든 magnitude들의 조합(2)으로 diff feature 추가 

3. 각 magnitude 별 max-max, min-min, sum-sum 을 구함

4. 정확히 이것이 무엇인지는 모르겠는데 ugriz를 다른 system으로 변환하는 것 같았습니다. (성능 차이 거의 없음)

5. fiberID별 fiberMag mean, (fiber_u,g,r,i,z)/fiberMag_mean

6. 아래 사이트를 읽어보고 icolor, scolor, p1 등 feature 추가
 -> https://www.sdss.org/dr16/algorithms/segue_target_selection/#Legacy

7. asinh 변환

8. 위에서 구한 diff feature들의 표준편차

9. 원본 Magnitude들 decomposition 수행하여 Feature 추가

10. Permutation Importance를 사용하여 Feature Selection


Modeling

1. LightGBM Single Model이었고 Parameter는 모두 Hyper Optimization으로 찾았습니다.

2. 'boosting_type': 'dart' 로 한것이 효과가 좋았습니다. 
 -> gbdt가 0.3285정도 나왔고 dart는 0.3255, goss는 0.3300 정도 나왔습니다.

3. stratifiedkfold 5fold를 사용했고 stratified에 type을 넣었습니다.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgbm
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

import itertools

# #Data Load

In [2]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
submission = pd.read_csv('../data/sample_submission.csv')

Feature Engineering

In [3]:
# Encode the string labels in train['type'] as integers, numbered in the order
# in which the class columns appear in the submission file (first column skipped).
# https://dacon.io/competitions/official/235573/codeshare/690
column_number = {
    column: position
    for position, column in enumerate(submission.columns[1:])
}


def to_number(x, dic):
    """Return the value stored under key ``x`` in ``dic``.

    Raises ``KeyError`` when ``x`` is not a key of ``dic``.
    """
    return dic[x]


train['type_num'] = train['type'].apply(lambda label: to_number(label, column_number))

In [4]:
train.shape

(199991, 24)

In [5]:
# Column names of each magnitude family (u, g, r, i, z), collected once as lists
# because they are reused throughout the feature engineering below.
psfMag_col = [name for name in train.columns if 'psfMag' in name]
fiberMag_col = [name for name in train.columns if 'fiberMag' in name]
petroMag_col = [name for name in train.columns if 'petroMag' in name]
modelMag_col = [name for name in train.columns if 'modelMag' in name]

In [6]:
# For every magnitude family, add the row-wise max, min, max-min, std and sum
# over its band columns, to both train and test.
# Mean, skew and other statistics were tried but made the CV score worse,
# so they are not used.
magnitude_families = (
    ('psfMag', psfMag_col),
    ('fiberMag', fiberMag_col),
    ('petroMag', petroMag_col),
    ('modelMag', modelMag_col),
)
for prefix, band_cols in magnitude_families:
    for frame in (train, test):
        bands = frame[band_cols]
        frame[f'{prefix}_max'] = bands.max(axis=1)
        frame[f'{prefix}_min'] = bands.min(axis=1)
        frame[f'{prefix}_diff'] = frame[f'{prefix}_max'] - frame[f'{prefix}_min']
        frame[f'{prefix}_std'] = bands.std(axis=1)
        frame[f'{prefix}_sum'] = bands.sum(axis=1)

각각의 psfMag, fiberMag, petroMag, modelMag (u, g, r, i, z) 

- 최대 (4개)
- 최소 (4개)
- 최대-최소 (4개)
- 표준편차 (4개)
- 합 (4개)

총 20개의 새로운 열 생성


In [7]:
train.shape

(199991, 44)

## diff feature 추가 예: psfMag_z - psfMag_i 
- SDSS legacy solution 등을 보면 대부분 magnitude 간 차이를 사용하기 때문에 이런 diff feature가 의미가 있을 것이라고 판단
- 그리고 각 magnitude에서만 diff를 구하는 것이 아닌 itertools combinations를 활용하여 전체 magnitude에서 diff를 구함
- 총 190가지 조합이 나오고 여기서 안 좋은 것은 permutation importance를 활용하여 feature 제거 수행


In [10]:
type(psfMag_col[::-1]+fiberMag_col[::-1]+petroMag_col[::-1]+modelMag_col[::-1])

list

In [11]:
# Demo: combinations(..., 2) yields every unordered pair of the inputs once.
for pair in itertools.combinations(['a','b','c'], 2):
    print(*pair)

a b
a c
b c


### 각각의 열끼리 뺄셈 연산을 통해 새로운 190개 열 생성

In [12]:
# Pairwise differences between all magnitude columns of all four families.
# Each family's column list is reversed before the lists are concatenated, and
# every created column name is recorded in diff_feature for later use.
reversed_mag_cols = psfMag_col[::-1]+fiberMag_col[::-1]+petroMag_col[::-1]+modelMag_col[::-1]
diff_feature = []
for left, right in itertools.combinations(reversed_mag_cols, 2):
    diff_name = f'{left}_{right}_diff'
    for frame in (train, test):
        frame[diff_name] = frame[left]-frame[right]
    diff_feature.append(diff_name)

In [13]:
train.shape

(199991, 234)

### 각 magnitude 별로 최대값, 최소값, 합을 구한 후에 각각을 뺄셈연산을 통해 총 18개 변수 생성

In [14]:
# For every pair of magnitude families, add the difference of their row-wise
# max, min and sum features (max-max, min-min, sum-sum).
for first, second in itertools.combinations(['psfMag','fiberMag','petroMag','modelMag'],2):
    for stat in ('max', 'min', 'sum'):
        for frame in (train, test):
            frame[f'{first}_{second}_{stat}_diff'] = frame[f'{first}_{stat}'] - frame[f'{second}_{stat}']
    

In [15]:
train.shape

(199991, 252)

In [16]:
# Written without much domain knowledge: these features simply implement the
# published transformation formulas from the page linked below.
# Other transformations were tried as well but did not improve the score.
# http://classic.sdss.org/dr4/algorithms/sdssUBVRITransform.html
def make_2flux_feature(train, test, c1, c2, func, mag_list=None):
    """Add ``func(<mag>_<c1>, <mag>_<c2>)`` as a new column to both frames.

    For each prefix ``mag`` in ``mag_list``, the columns ``f'{mag}_{c1}'`` and
    ``f'{mag}_{c2}'`` are read as NumPy arrays and passed to ``func``; the
    result is stored in column ``f'{mag}_{func.__name__}'``.

    ``train`` and ``test`` are modified in place and nothing is returned.
    ``mag_list`` must be an iterable of prefixes; the default ``None`` is not
    iterable and raises ``TypeError``.
    """
    for mag in mag_list:
        first_col = f'{mag}_{c1}'
        second_col = f'{mag}_{c2}'
        out_col = f'{mag}_{func.__name__}'
        for frame in (train, test):
            frame[out_col] = func(frame[first_col].values, frame[second_col].values)
        
# Colour transformations whose names and coefficients follow the "jester"
# quasar relations on the SDSS page referenced for make_2flux_feature.
# Each function is a plain linear map of (x1 - x2); the band pair it is applied
# to is chosen by the make_2flux_feature call directly below it.
def quasar_UB_jester(x1, x2):
    """Return ``0.75*(x1-x2)-0.81``; applied to the (u, g) bands below."""
    return 0.75*(x1-x2)-0.81

make_2flux_feature(train, test, 'u','g',quasar_UB_jester,['psfMag'])

def quasar_BV_jester(x1, x2):
    """Return ``0.62*(x1-x2)+0.15``; applied to the (g, r) bands below."""
    return 0.62*(x1-x2)+0.15

make_2flux_feature(train, test, 'g','r',quasar_BV_jester,['psfMag'])

def quasar_VR_jester(x1, x2):
    """Return ``0.38*(x1-x2)+0.27``; applied to the (r, i) bands below."""
    return 0.38*(x1-x2)+0.27

make_2flux_feature(train, test, 'r','i',quasar_VR_jester,['psfMag'])

def quasar_RcIc_jester(x1, x2):
    """Return ``0.72*(x1-x2)+0.27``; applied to the (r, i) bands below."""
    return 0.72*(x1-x2)+0.27

# Fix: this call previously passed ('u', 'g'), a copy of the U-B call above,
# whereas the referenced relation is Rc-Ic = 0.72*(r-i) + 0.27.
# NOTE(review): this changes the values of 'psfMag_quasar_RcIc_jester'; the
# feature selection and hyper-parameters were tuned with the old values.
make_2flux_feature(train, test, 'r','i',quasar_RcIc_jester,['psfMag'])

In [17]:
def groupby_helper(all_data, source, target, agg_func):
    """Attach group-level aggregates of the ``target`` columns to every row.

    Parameters
    ----------
    all_data : DataFrame to aggregate and to merge the result back into.
    source : list of column names to group by.
    target : list of column names to aggregate.
    agg_func : list of aggregation names (strings) passed to ``.agg``.

    Returns
    -------
    (merged, new_col) : a new DataFrame equal to ``all_data`` left-merged with
    the aggregates on ``source``, and the list of added column names, each
    named ``'<source joined by _>_<target>_<agg>'``.
    """
    aggregated = all_data.groupby(source)[target].agg(agg_func)
    key_prefix = '_'.join(source)
    # Same target-major order as the columns produced by .agg with a list.
    new_col = [
        key_prefix + '_' + column + '_' + func_name
        for column in target
        for func_name in agg_func
    ]
    aggregated.columns = new_col
    merged = all_data.merge(aggregated.reset_index(), on=source, how='left')
    return merged, new_col

# Per-fiberID mean of every fiberMag band, aggregated over train and test
# stacked together (test rows come after the train rows).
stacked = pd.concat([train, test], ignore_index=True)
all_data, new_c = groupby_helper(stacked, ['fiberID'], fiberMag_col, ['mean'])

# Ratio of each fiberMag band to its per-fiberID mean.
for mean_col, band_col in zip(new_c, fiberMag_col):
    all_data[f'{band_col}_div_mean'] = all_data[band_col]/all_data[mean_col]

FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # This is added back by InteractiveShellApp.init_path()


#### psfMag와 연관된 새로운 열 추가

In [18]:
# https://www.sdss.org/dr16/algorithms/segue_target_selection/#Legacy
# Linear combinations of the psfMag bands, with coefficients taken from the
# page above.
psf_u = all_data['psfMag_u']
psf_g = all_data['psfMag_g']
psf_r = all_data['psfMag_r']
psf_i = all_data['psfMag_i']

all_data['psf_icolor'] = psf_u*(-0.436) + psf_g*(1.129) + psf_r*(-0.119) + psf_i*(-0.574) +0.1984
all_data['psf_scolor'] = psf_u*(-0.249) + psf_g*(0.794) + psf_r*(-0.555) +0.234
all_data['psf_p1'] = (psf_u-psf_g)*(0.91) + (psf_g-psf_r)*(0.415) -1.280
# psfMag_r divided by its standard deviation over all rows of all_data.
all_data['psfMag_r_std_div'] = psf_r/psf_r.std()

In [19]:
all_data.shape

(210000, 270)

#### modelMag와 연관된 새로운 열 추가

In [20]:
# https://www.sdss.org/dr16/algorithms/legacy_target_selection/
# Two linear combinations of the modelMag g-r and r-i colours.
model_g_minus_r = all_data['modelMag_g']-all_data['modelMag_r']
model_r_minus_i = all_data['modelMag_r']-all_data['modelMag_i']

all_data['modelMag_borthogonal'] = model_r_minus_i-model_g_minus_r/4-0.177
all_data['modelMag_parallel'] = 0.7*model_g_minus_r + 1.2*(model_r_minus_i-0.177)


In [21]:
all_data.shape

(210000, 272)

#### psfMag의 asinh 변환 feature 추가

In [22]:
# https://www.sdss.org/dr12/algorithms/magnitudes/
# Adds one 'psfMag_<band>_asinh' column per band, computed from that band's
# psfMag column and the per-band constants b and f0 below.
color_list = ['u', 'g', 'r', 'i', 'z']
# NOTE(review): in Python 10e-10 equals 1e-9, so 1.4*10e-10 is 1.4e-9. The
# linked page presumably lists b on the order of 1e-10 -- confirm the intent.
b_list = [1.4*10e-10, 0.9*10e-10, 1.2*10e-10, 1.8*10e-10, 7.4*10e-10]
# NOTE(review): these values look like per-band zero-flux magnitudes rather
# than reference fluxes -- confirm against the linked page.
f0_list = [24.63, 25.11, 24.80, 24.36, 22.83]
for c, b, f0 in zip(color_list, b_list, f0_list):
    # NOTE(review): the asinh-magnitude definition is believed to use the factor
    # -2.5/ln(10) applied to a flux ratio f/f0; this line multiplies by ln(10)
    # and divides a magnitude by f0. Left unchanged because the intended
    # feature is unclear -- verify before relying on its physical meaning.
    all_data[f'psfMag_{c}_asinh'] = -2.5*np.log(10)*(np.arcsinh((all_data[f'psfMag_{c}']/f0)/(2*b))+np.log(b))


In [23]:
all_data.shape

(210000, 277)

In [26]:
#!pip install eli5

## Permutation_importance

In [31]:
# import eli5
# from eli5.sklearn import PermutationImportance
# from sklearn.ensemble import RandomForestClassifier
# # from sklearn.inspection import permutation_importance  # sklearn 22 버전부터 해당

# my_model=RandomForestClassifier().fit(train,test)
# perm = PermutationImportance(my_model, random_state=1).fit(val_X, val_y)
# # eli5.show_weights(perm, feature_names = val_X.columns.tolist()) # Notebook에서 실행 가능

# print(eli5.format_as_text(eli5.explain_weights(perm, feature_names = val_X.columns.tolist())))


### permutation importance로 찾아낸 나쁜 feature들 제거

In [34]:
# Features judged harmful by permutation importance; every name listed here is
# excluded when the training feature list (train_columns) is built.
bad_feature = ['petroMag_i_modelMag_i_diff','petroMag_g_modelMag_g_diff','psfMag_g_modelMag_g_diff',
 'psfMag_g_petroMag_g_diff','psfMag_r_petroMag_r_diff','petroMag_modelMag_min_diff',
 'psfMag_fiberMag_min_diff','psfMag_modelMag_min_diff','psfMag_fiberMag_sum_diff',
 'psfMag_u_fiberMag_u_diff','psfMag_u_modelMag_u_diff','psfMag_modelMag_sum_diff',
 'psfMag_z_fiberMag_z_diff','psfMag_petroMag_min_diff','psfMag_z_modelMag_z_diff',
 'fiberMag_modelMag_sum_diff','psfMag_fiberMag_max_diff','psfMag_modelMag_max_diff',
 'petroMag_z_modelMag_z_diff','fiberMag_modelMag_min_diff','fiberMag_petroMag_min_diff',
 'fiberMag_modelMag_max_diff','petroMag_u_modelMag_u_diff','fiberMag_u_modelMag_u_diff',
 'fiberMag_z_petroMag_z_diff','petroMag_modelMag_sum_diff','fiberMag_z_modelMag_z_diff',
 'fiberMag_u_petroMag_u_diff','psfMag_z_petroMag_z_diff','petroMag_modelMag_max_diff',
 'psfMag_petroMag_max_diff','fiberMag_petroMag_max_diff','fiberMag_petroMag_sum_diff',
 'psfMag_u_petroMag_u_diff','petroMag_i_modelMag_z_diff','psfMag_petroMag_sum_diff',
 'fiberMag_u_div_mean','fiberMag_z_modelMag_i_diff','petroMag_z_petroMag_i_diff',
 'psfMag_z_petroMag_i_diff','psfMag_g_petroMag_r_diff','fiberMag_i_petroMag_z_diff',
 'fiberMag_z_petroMag_i_diff','petroMag_z','fiberMag_g_petroMag_u_diff',
 'psfMag_i_petroMag_z_diff','petroMag_z_modelMag_i_diff','fiberMag_r_petroMag_i_diff',
 'petroMag_sum','fiberMag_r_petroMag_g_diff','psfMag_r_petroMag_i_diff',
 'fiberMag_u','psfMag_u','petroMag_max',
 'petroMag_r_petroMag_g_diff','psfMag_i_petroMag_r_diff','petroMag_u',
 'fiberMag_r_modelMag_z_diff','petroMag_g_modelMag_r_diff','petroMag_diff',
 'petroMag_std','fiberMag_z_petroMag_r_diff','psfMag_i_fiberMag_g_diff',
 'psfMag_z_petroMag_r_diff','psfMag_g_petroMag_i_diff','fiberMag_r_modelMag_u_diff',
 'petroMag_r_modelMag_z_diff','fiberMag_g_petroMag_i_diff','fiberMag_z_modelMag_r_diff',
 'psfMag_i_psfMag_g_diff','psfMag_i_petroMag_g_diff','fiberMag_diff',
 'petroMag_z_petroMag_r_diff','psfMag_std','modelMag_std',
 'modelMag_diff','psfMag_diff','fiberMag_std',
 'petroMag_z_modelMag_r_diff','fiberMag_r_fiberMag_u_diff','psfMag_u_petroMag_r_diff',
 'fiberMag_u_petroMag_r_diff','psfMag_r_petroMag_z_diff','fiberMag_u_modelMag_r_diff',
 'petroMag_i_modelMag_g_diff','petroMag_r_petroMag_u_diff','fiberMag_z_fiberMag_g_diff',
 'psfMag_r_petroMag_u_diff','fiberMag_i_petroMag_g_diff','fiberMag_r_petroMag_u_diff',
 'psfMag_g_fiberMag_z_diff','petroMag_r_modelMag_u_diff','psfMag_g_modelMag_z_diff',
 'petroMag_i_petroMag_g_diff','psfMag_z_fiberMag_g_diff','petroMag_g_modelMag_i_diff',
 'fiberMag_z_modelMag_g_diff','modelMag_z_modelMag_g_diff','psfMag_z_psfMag_g_diff',
 'fiberMag_g_modelMag_z_diff','fiberMag_z_petroMag_g_diff','psfMag_z_petroMag_g_diff',
 'psfMag_g_petroMag_z_diff','petroMag_u_modelMag_r_diff','fiberMag_g_petroMag_z_diff',
 'psfMag_i_psfMag_u_diff','psfMag_u_petroMag_i_diff','psfMag_z_petroMag_u_diff',
 'petroMag_z_petroMag_g_diff','psfMag_i_fiberMag_u_diff','psfMag_u_fiberMag_i_diff',
 'psfMag_u_fiberMag_z_diff','petroMag_z_modelMag_g_diff','psfMag_i_modelMag_u_diff',
 'fiberMag_u_modelMag_z_diff','petroMag_g_modelMag_z_diff','fiberMag_u_petroMag_z_diff',
 'psfMag_i_petroMag_u_diff','fiberMag_i_fiberMag_u_diff','fiberMag_u_modelMag_i_diff',
 'petroMag_i_petroMag_u_diff','psfMag_u_modelMag_z_diff','petroMag_i_modelMag_u_diff',
 'psfMag_z_modelMag_u_diff','fiberMag_i_petroMag_u_diff','petroMag_z_petroMag_u_diff',
 'psfMag_z_fiberMag_u_diff','petroMag_z_modelMag_u_diff','fiberMag_z_modelMag_u_diff',
 'fiberMag_u_petroMag_i_diff','fiberMag_z_petroMag_u_diff','modelMag_z_modelMag_u_diff',
 'petroMag_u_modelMag_i_diff','fiberMag_z_fiberMag_u_diff','petroMag_u_modelMag_z_diff']

In [36]:
# Candidate feature columns: everything except the id/label columns and the
# features rejected by permutation importance. The exclusion set is built once
# instead of re-concatenating the lists for every column.
excluded_columns = set(['id','type','type_num']+bad_feature)
train_columns = [c for c in all_data.columns if c not in excluded_columns]

# Row-wise standard deviation over the diff features that survived the
# permutation-importance filter (found experimentally to help).
# The surviving names are kept in diff_feature order: a set intersection has an
# arbitrary, hash-dependent order, which made the column order (and so the
# floating-point accumulation order of the std) differ from run to run.
kept_columns = set(train_columns)
intersect_good_feature = [c for c in diff_feature if c in kept_columns]
all_data['diff_feature_std'] = all_data[intersect_good_feature].std(axis=1)

In [39]:
# Number of rejected features, candidate features, and all_data columns.
print(*(len(group) for group in (bad_feature, train_columns, all_data.columns)))

135 139 278


In [None]:
# Rows with a non-null 'type' are the labelled training rows; the remaining
# rows are the test set, whose index is renumbered from 0.
has_label = all_data['type'].notnull()
train = all_data.loc[has_label]
test = all_data.loc[~has_label].reset_index(drop=True)

In [40]:
train.shape, test.shape

((199991, 256), (10009, 254))

In [None]:
from sklearn.decomposition import TruncatedSVD, PCA, FastICA, FactorAnalysis, KernelPCA
from sklearn.decomposition import IncrementalPCA, LatentDirichletAllocation,MiniBatchSparsePCA, SparsePCA

def get_decomposition_feature(train, test, feature, param, decompose_func, prefix):
    """Append decomposition components of ``feature`` columns to both frames.

    Parameters
    ----------
    train, test : DataFrames containing the ``feature`` columns.
    feature : list of column names fed to the decomposer.
    param : dict of keyword arguments for ``decompose_func``; must contain
        ``'n_components'``, which also determines how many columns are added.
    decompose_func : class (or factory) whose instance provides
        ``fit_transform`` and ``transform``.
    prefix : prefix of the new columns, named ``f'{prefix}_{k}'``.

    Returns
    -------
    (train, test) : new DataFrames with the component columns appended. The
    decomposer is fitted on ``train`` only and then applied to ``test``.
    """
    n_components = param['n_components']
    component_cols = [f'{prefix}_{c}' for c in range(n_components)]

    de = decompose_func(**param)
    de_train = de.fit_transform(train[feature])
    de_test = de.transform(test[feature])

    # Give the component frames the same index as their source frame:
    # pd.concat(axis=1) aligns on index labels, so a default RangeIndex would
    # misalign (and introduce NaN rows) whenever the input index is not 0..n-1.
    train_components = pd.DataFrame(de_train, columns=component_cols, index=train.index)
    test_components = pd.DataFrame(de_test, columns=component_cols, index=test.index)

    train = pd.concat([train, train_components], axis=1)
    test = pd.concat([test, test_components], axis=1)
    return train, test

# Decompose the original magnitude columns only and add the components back as
# features, reduced to 5 components per method.
org_feature = psfMag_col+fiberMag_col+petroMag_col+modelMag_col
decom_common_param = {'n_components':5,'random_state':42}
for decomposer, column_prefix in ((TruncatedSVD, 'tsvd5'), (FastICA, 'ica5')):
    train, test = get_decomposition_feature(
        train, test, org_feature, decom_common_param, decomposer, column_prefix)


In [None]:
# Drop the features rejected by permutation importance.
# https://eli5.readthedocs.io/en/latest/
# As the author recalls, this improved the leaderboard score by roughly 0.001~0.003.
non_feature_columns = ['id','type','type_num']+bad_feature
print(len(train.columns))
train_columns = [name for name in train.columns if name not in non_feature_columns]
num_class = train['type'].nunique()

print(len(train_columns))

In [None]:
# Parameters found by hyper-parameter optimization.
# Uses LightGBM's dart booster. The original note says it was about 0.03 better
# on the leaderboard, although the scores quoted here differ by about 0.003:
# gbdt scored around 0.3285, dart 0.3255 and goss 0.3300.
lgb_param_dart = {'objective': 'multiclass', 
 'num_class': 19,  # hard-coded; the training cell reads the class count from this entry
 'boosting_type': 'dart', 
 'subsample_freq': 5, 
 'num_leaves': 92, 
 'min_data_in_leaf': 64, 
 'subsample_for_bin': 23000, 
 'max_depth': 10, 
 'feature_fraction': 0.302, 
 'bagging_fraction': 0.904, 
 'lambda_l1': 0.099, 
 'lambda_l2': 1.497, 
 # NOTE(review): LightGBM's parameter docs appear to list min_child_weight as an
 # alias of min_sum_hessian_in_leaf, which is also set below -- confirm which
 # of the two values actually takes effect.
 'min_child_weight': 38.011, 
 'nthread': 32, 
 'metric': 'multi_logloss', 
 'learning_rate': 0.021, 
 'min_sum_hessian_in_leaf': 3, 
 'drop_rate': 0.846244, 
 'skip_drop': 0.792465, 
 'max_drop': 65,
 'seed': 42,
 # NOTE(review): presumably an alias for the number of boosting rounds, which
 # the training cell also passes as num_boost_round=1000 -- confirm precedence.
 'n_estimators': 1000}

In [None]:
# 5-fold stratified cross-validation of the LightGBM dart model.
# Produces out-of-fold predictions for train (oof_train), fold-averaged
# predictions for test (oof_test) and the per-fold validation log loss.
print(train_columns)
print(lgb_param_dart)
x_train = train.copy()
y_train = train['type_num']
x_test = test.copy()

num_class = lgb_param_dart['num_class']
oof_train = np.zeros((len(x_train),num_class))
oof_test = np.zeros((len(x_test),num_class))
log_loss_score_list= []
NFOLD = 5
SEED = 42
# All class ids, passed to log_loss so the columns of the predicted probability
# matrix are matched to classes even if a validation fold lacks some class.
class_labels = np.arange(num_class)

# Stratified 5-fold split on the encoded type; SEED replaces the literal 42
# that was hard-coded here (same value).
folds = StratifiedKFold(n_splits=NFOLD, shuffle=True, random_state=SEED)
for fold_, (trn_index, val_index) in enumerate(folds.split(x_train, y_train)):
    print(f"{fold_+1} FOLD Start!!")
    trn_x, trn_y = x_train.iloc[trn_index][train_columns], y_train.iloc[trn_index]
    val_x, val_y = x_train.iloc[val_index][train_columns], y_train.iloc[val_index]
    # NOTE(review): silent= and verbose_eval= are accepted by the LightGBM
    # version this notebook was written for; newer releases may reject them.
    dtrain = lgbm.Dataset(trn_x, label=trn_y, silent=True)
    dcross = lgbm.Dataset(val_x, label=val_y, silent=True)

    # dart cannot use early stopping, so first run with a generous
    # num_boost_round, then fix it to the round that was best across the
    # 5 folds and train again.
    clf = lgbm.train(lgb_param_dart, train_set=dtrain, num_boost_round=1000, valid_sets=[dtrain, dcross],
                       verbose_eval=100)

    val_pred = clf.predict(val_x)
    oof_train[val_index, :] = val_pred

    log_loss_score = log_loss(val_y, val_pred, labels=class_labels)
    log_loss_score_list.append(log_loss_score)
    print(f"{fold_+1} FOLD LogLoss: ", log_loss_score)

    # The submission is the average of the 5 fold models' test predictions.
    oof_test += clf.predict(x_test[train_columns])/NFOLD

In [None]:
# Per-fold validation log loss, followed by its mean and standard deviation.
print(log_loss_score_list)
fold_scores = np.asarray(log_loss_score_list)
fold_scores.mean(), fold_scores.std()

In [None]:
# Build the submission from the fold-averaged test predictions, using the
# sample submission's class columns as column names and its 'id' as the index.
# Fix: read the sample file from '../data/', the directory every other load in
# this notebook uses; this cell previously pointed at '../input/'.
sample_submission = pd.read_csv('../data/sample_submission.csv')
submission = pd.DataFrame(data=oof_test, columns=sample_submission.columns[1:], index=sample_submission['id'])
# NOTE(review): assumes the '../output/' directory already exists -- confirm.
submission.to_csv('../output/submission.csv', index=True)