# 월간 데이콘 2 천체 유형 분류
## Public 12위

안녕하세요 데이콘!  
이번 대회에 참가하여, Public 12등으로 대회를 마무리하였습니다.  
제출 파일을 잘못 선택하여 정확한 Private 점수는 알 수가 없습니다.  

일주일동안 대회를 진행하면서, 많은 변수를 만들어보면서 진행하였습니다.  
참고한 사이트는 http://classic.sdss.org/dr4/algorithms/sdssUBVRITransform.html  입니다.  
모델링은 XGBoost, CatBoost, LightGBM을 사용하였으며, 마지막에는 Weight Ensemble을 사용하였습니다.  
시간이 없어서 하이퍼파라미터는 Manual Search를 하거나 단순히 CV가 잘 나오는 값으로 선택하였습니다.  
모두 수고하셨습니다!  

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Modeling
import lightgbm as lgb
from catboost import Pool, CatBoostClassifier
import xgboost as xgb

# Metric, Kfold
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold, StratifiedKFold

# Weight Ensemble
from scipy.optimize import minimize

import time
import warnings 
warnings.filterwarnings('ignore')


## Read Data

In [None]:
# Sample submission: supplies the class column names and the id order
# used when the final prediction file is written.
submission = pd.read_csv('../data/sample_submission.csv')

# Train / test tables, both indexed by the 'id' column.
test = pd.read_csv('../data/test.csv', index_col='id')
train = pd.read_csv('../data/train.csv', index_col='id')

## Feature Engineering

In [None]:
def SpectralClass(x):
    """Map a B-V colour index to a spectral class letter.

    The classes are contiguous bins over ``x`` in ascending order::

        x < -0.3          -> "O"
        -0.3 <= x < 0     -> "B"
        0    <= x < 0.33  -> "A"
        0.33 <= x < 0.6   -> "F"
        0.6  <= x < 0.81  -> "G"
        0.81 <= x < 1.4   -> "K"
        x >= 1.4          -> "M"

    The previous implementation tested ``x > 0`` before the larger
    thresholds, so "A", "F", "G" and "K" could never be returned and
    values in [-0.3, 0] fell through to "M".

    Args:
        x: B-V colour index (a number; NaN is accepted).

    Returns:
        One of "O", "B", "A", "F", "G", "K", "M".
    """
    # (exclusive upper bound, class letter), ordered from bluest to reddest.
    upper_bounds = (
        (-0.3, "O"),
        (0, "B"),
        (0.33, "A"),
        (0.6, "F"),
        (0.81, "G"),
        (1.4, "K"),
    )
    for bound, letter in upper_bounds:
        if x < bound:
            return letter

    # Everything at or above the last bound; NaN also lands here because
    # every comparison with NaN is False (same fallback as before).
    return "M"
def ugriz(df):
    """Add colour-index and magnitude-difference features to ``df`` in place.

    Band columns are discovered by suffix (``_u``, ``_g``, ``_r``, ``_i``,
    ``_z``) and paired *positionally* with the prefixes in ``mag``. The
    function therefore relies on every band listing its columns in the
    order psf, fiber, petro, model.

    Columns added, in this order:

    * ``<mag>_u_g``, ``<mag>_g_r``, ``<mag>_r_i``, ``<mag>_g_i``,
      ``<mag>_i_z`` -- differences between two bands of the same magnitude.
    * ``model_psf_<band>``, ``model_fiber_<band>``, ``model_petro_<band>``,
      ``fiber_psf_<band>`` -- differences between magnitude types in one band.
    * ``<mag>_b_v`` -- ``0.98 * (g - r) + 0.22``, and
      ``star_spectrum_<mag>`` -- ``SpectralClass`` applied to that value.

    Args:
        df: DataFrame holding the magnitude columns; it is modified in place.

    Returns:
        The same ``df`` object, with the new columns attached.
    """
    mag = ['psfMag', 'fiberMag', 'PetroMag', 'model']
    colors = list('ugriz')

    # Snapshot the per-band column lists before any derived column is added
    # (several derived names, e.g. 'model_psf_u', also end in a band suffix).
    # An explicit dict replaces the former locals()[color] lookup.
    band_cols = {
        band: list(df.columns[df.columns.str.endswith('_' + band)])
        for band in colors
    }

    # Band-pair differences, created pair by pair to keep the column order.
    band_pairs = (('u', 'g'), ('g', 'r'), ('r', 'i'), ('g', 'i'), ('i', 'z'))
    for first, second in band_pairs:
        for idx, prefix in enumerate(mag):
            df[prefix + '_' + first + '_' + second] = (
                df[band_cols[first][idx]] - df[band_cols[second][idx]]
            )

    # model-[psf,fiber,petro] and fiber-[psf] within each band.
    for color in colors:
        cols = band_cols[color]
        psf, fiber, petro, model = cols[0], cols[1], cols[2], cols[3]
        df['model_psf_' + color] = df[model] - df[psf]
        df['model_fiber_' + color] = df[model] - df[fiber]
        df['model_petro_' + color] = df[model] - df[petro]
        df['fiber_psf_' + color] = df[fiber] - df[psf]

    # B-V estimated from g-r, then bucketed into a spectral class letter.
    # NOTE(review): the coefficients look like the g-r -> B-V transform from
    # the SDSS UBVRI page linked in the notebook intro -- confirm.
    for idx, prefix in enumerate(mag):
        g_minus_r = df[band_cols['g'][idx]] - df[band_cols['r'][idx]]
        df[prefix + '_b_v'] = 0.98 * g_minus_r + 0.22
        df['star_spectrum_' + prefix] = df[prefix + '_b_v'].apply(SpectralClass)

    return df

In [None]:
# Attach the engineered colour / magnitude-difference columns to both tables
# (train is processed first, then test).
train, test = ugriz(train), ugriz(test)

In [None]:
# Bucket fiberID into 8 right-closed intervals labelled '0'..'7':
# (0,100], (100,200], ..., (500,600], (600,640], (640,1000].
labels = [str(k) for k in range(8)]
bins = [0, 100, 200, 300, 400, 500, 600, 640, 1000]

train['fiberID2'] = pd.cut(train['fiberID'], bins, right=True, labels=labels)
test['fiberID2'] = pd.cut(test['fiberID'], bins, right=True, labels=labels)

In [None]:
# One-hot encode the spectral-class columns and the binned fiber ID.
star = list(train.columns[train.columns.str.startswith('star_')])
star.append('fiberID2')

train = pd.get_dummies(data=train, columns=star)
test = pd.get_dummies(data=test, columns=star)

# The two frames are encoded independently, so a category that occurs in
# only one of them yields mismatched dummy columns and a KeyError later on
# `test[features]`. Align test to train's columns (minus the label):
# dummies missing from test are added as all-zero, test-only ones are dropped.
test = test.reindex(columns=[c for c in train.columns if c != 'type'],
                    fill_value=0)



In [None]:
# Class names come from the sample submission header (every column after the
# first); each class is encoded as its position in that header.
type_list = submission.columns[1:].tolist()
type_dict = dict(zip(type_list, range(len(type_list))))

# Replace the string label with its integer code; an unknown label raises KeyError.
train['type'] = train['type'].apply(type_dict.__getitem__)

## Modeling

In [None]:
# Label vector, and the model inputs: every column except the label itself.
target = train['type']
features = [col for col in train.columns if col != 'type']

## LIGHTGBM

In [None]:
# LightGBM training parameters for the 19-class problem.
param = dict(
    # Tree size / number of output classes / shrinkage.
    num_leaves=10,
    num_class=19,
    learning_rate=0.03,
    # Row and column subsampling.
    # NOTE(review): LightGBM documents that bagging_fraction only takes
    # effect when bagging_freq is non-zero; bagging_freq is not set here --
    # confirm whether row bagging was meant to be active.
    bagging_fraction=0.7,
    feature_fraction=0.7,
    max_depth=8,
    seed=1337,
    # L1 / L2 regularisation strengths.
    lambda_l1=4.972,
    lambda_l2=2.276,
    feature_fraction_seed=1337,
    bagging_seed=1337,
    objective='multiclass',
    boosting_type='gbdt',
    verbose=1,
    metric='multi_logloss',
    # NOTE(review): per the LightGBM parameter docs these two options are
    # not used by the plain 'multiclass' objective -- confirm they are
    # intentionally kept.
    is_unbalance=True,
    boost_from_average=False,
)
%%time

# Stratified 5-fold CV for LightGBM. Each fold fills its slice of the
# out-of-fold matrix `oof_lgb` and contributes an equal share to the
# averaged test prediction `lgb_pred`.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# One probability column per class; the width follows the model's num_class
# instead of repeating the literal 19.
oof_lgb = np.zeros((len(train), param['num_class']))
lgb_pred = np.zeros((len(test), param['num_class']))

start = time.time()

for fold_, (trn_idx, val_idx) in enumerate(skf.split(train, train['type'])):
    print("fold num_: {}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][features], label=target.iloc[val_idx])

    # Upper bound on boosting rounds; early stopping normally ends training sooner.
    num_round = 5000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 100)
    oof_lgb[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    # Average over the actual number of folds so that changing n_splits
    # cannot silently mis-scale the test probabilities.
    lgb_pred += clf.predict(test[features], num_iteration=clf.best_iteration) / skf.n_splits

print('\nCross Validation Is Complete')
print("CV score: {:<8.5f}".format(log_loss(target, oof_lgb)))

## CatBoost

In [None]:
# CatBoost multiclass model; the same estimator object is re-fitted on every fold.
model = CatBoostClassifier(loss_function='MultiClass',
                           early_stopping_rounds=50,
                           random_state=42,
                           task_type="CPU",
                           learning_rate=0.03,
                           iterations=5000)

skf = StratifiedKFold(n_splits=5, random_state=74, shuffle=True)

# Out-of-fold and test probability matrices, one column per class
# (19 matches the num_class used for the other two models).
oof_cat = np.zeros((len(train),19))
cat_pred = np.zeros((len(test),19))


for idx, (train_index, valid_index) in enumerate(skf.split(train, train['type'])):
    y_train, y_valid = target.iloc[train_index], target.iloc[valid_index]
    X_train, X_valid = train[features].iloc[train_index,:], train[features].iloc[valid_index,:]
    _train = Pool(X_train, label=y_train)
    _valid = Pool(X_valid, label=y_valid)
    print( "\nFold ", idx)
    fit_model = model.fit(_train,
                          eval_set=_valid,
                          use_best_model=True,
                          verbose=500 )
    pred = fit_model.predict_proba(X_valid)
    print( "  Log loss = ", log_loss(y_valid, pred) )
    oof_cat[valid_index] = pred
    cat_pred += fit_model.predict_proba(test[features])
# Turn the accumulated sum into a mean over the actual number of folds,
# rather than a hard-coded 5 that must be kept in sync with n_splits.
cat_pred /= skf.n_splits

## XGBoost

In [None]:
# XGBoost training parameters: 19-class soft-probability output,
# trained and predicted on GPU.
xgb_params = dict([
    ('eta', 0.03),                      # learning rate
    ('max_depth', 6),
    ('objective', 'multi:softprob'),    # per-class probabilities
    ('alpha', 4.972),                   # L1 regularisation
    ('lambda', 2.276),                  # L2 regularisation
    ('num_class', 19),
    ('subsample', 0.7),
    ('colsample_bytree', 0.7),
    ('random_state', 42),
    ('eval_metric', 'mlogloss'),
    ('tree_method', 'gpu_hist'),
    ('predictor', 'gpu_predictor'),
])
%%time

# Stratified 5-fold CV for XGBoost. Each fold fills its slice of the
# out-of-fold matrix `oof_xgb` and contributes an equal share to the
# averaged test prediction `xgb_pred`.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# One probability column per class; the width follows the model's num_class.
oof_xgb = np.zeros((len(train), xgb_params['num_class']))
xgb_pred = np.zeros((len(test), xgb_params['num_class']))

start = time.time()

for fold_, (trn_idx, val_idx) in enumerate(skf.split(train, train['type'])):
    print("fold num_: {}".format(fold_))
    trn_data = xgb.DMatrix(train.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = xgb.DMatrix(train.iloc[val_idx][features], label=target.iloc[val_idx])

    # Early stopping monitors the last entry of the watchlist ('valid').
    watchlist = [(trn_data, 'train'), (val_data, 'valid')]
    num_round = 5000
    clf = xgb.train(params = xgb_params,
                    dtrain = trn_data,
                    num_boost_round  = num_round,
                    evals = watchlist,
                    verbose_eval=100,
                    early_stopping_rounds = 100
                )
    # `best_iteration` is a 0-based round index, whereas `ntree_limit` is a
    # count: passing best_iteration drops the best round itself, and a value
    # of 0 means "use every tree". `best_ntree_limit` is the attribute
    # XGBoost provides for this argument.
    oof_xgb[val_idx] = clf.predict(xgb.DMatrix(train.iloc[val_idx][features]), ntree_limit=clf.best_ntree_limit)

    # Average over the actual number of folds instead of a hard-coded 5.
    xgb_pred += clf.predict(xgb.DMatrix(test[features]), ntree_limit=clf.best_ntree_limit) / skf.n_splits

print('\nCross Validation Is Complete')
print("CV score: {:<8.5f}".format(log_loss(target, oof_xgb)))

## Weight Ensemble

In [None]:
# Out-of-fold probability matrices, in the order the blend weights refer to them.
predictions = [oof_xgb, oof_lgb, oof_cat]

def log_loss_func(weights):
    """Return the log loss of the weighted sum of the out-of-fold predictions.

    Args:
        weights: one weight per entry of ``predictions`` (scipy's
            ``minimize`` passes them as a numpy array).

    Returns:
        ``log_loss`` of ``train['type']`` against the blended probabilities.
    """
    blended = sum(w * oof for w, oof in zip(weights, predictions))
    return log_loss(train['type'], blended)
    
# The solver needs an initial guess; every weight starts at 0.5.
# Running minimize from several random starting points would be more robust.
starting_values = [0.5 for _ in predictions]

# Equality constraint (weights sum to 1) and the SLSQP solver, as suggested by user 16universe:
#https://kaggle2.blob.core.windows.net/forum-message-attachments/75655/2393/otto%20model%20weights.pdf?sv=2012-02-12&se=2015-05-03T21%3A22%3A17Z&sr=b&sp=r&sig=rkeA7EJC%2BiQ%2FJ%2BcMpcA4lYQLFh6ubNqs2XAkGtFsAv0%3D
cons = {'type': 'eq', 'fun': lambda w: 1-sum(w)}
# Each individual weight is restricted to the interval [0, 1].
bounds = [(0, 1) for _ in predictions]

res = minimize(
    log_loss_func,
    starting_values,
    method='SLSQP',
    bounds=bounds,
    constraints=cons,
)

print('Ensamble Score: {best_score}'.format(best_score=res['fun']))
print('Best Weights: {weights}'.format(weights=res['x']))

Ensamble Score: 0.32772569398255863  
Best Weights: [0.57025908 0.03675345 0.39298747]  

In [None]:
# Take the blend weights straight from the optimiser result rather than a
# pasted copy of an earlier printout (previously 0.57025908, 0.03675345,
# 0.39298747), which goes stale whenever features or models change.
# The weight order follows `predictions`: xgb, lgb, cat.
weight_result = res['x']
final = xgb_pred*weight_result[0] + lgb_pred*weight_result[1] + cat_pred*weight_result[2]

## Submit

In [None]:
# Wrap the blended probabilities in the sample submission's layout:
# rows keyed by its id column, class columns taken from its header,
# then turn the id index back into an ordinary column.
final = (
    pd.DataFrame(final, index=submission['id'], columns=submission.columns[1:])
    .reset_index()
)
final.to_csv('final_victory.csv', index=False)