In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from tqdm import tqdm_notebook

from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
import time

import warnings
warnings.filterwarnings(action='ignore') 

In [8]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
submission = pd.read_csv('./data/sample_submission.csv')

# id : 구분자
# rho : 측정 거리 (단위: mm)
# src : 광원 스펙트럼 (650 nm ~ 990 nm)
# dst : 측정 스펙트럼 (650 nm ~ 990 nm)

# hhb : 디옥시헤모글로빈 농도
# hbo2 : 옥시헤모글로빈 농도
# ca : 칼슘 농도
# na : 나트륨 농도

train.drop('id', axis=1, inplace=True)
test.drop('id', axis=1, inplace=True)

print ("Size of train data : {}" .format(train.shape))
print ("Size of test data : {}" .format(test.shape))

Size of train data : (10000, 75)
Size of test data : (10000, 71)


In [4]:
target_col = train.loc[:, 'hhb':'na'].columns
feature_col = train.columns.difference(target_col)

Xtrain = train[feature_col]
Xtest = test[feature_col]

Ytrain = train[target_col]
Ytrain1 = Ytrain['hhb']
Ytrain2 = Ytrain['hbo2']
Ytrain3 = Ytrain['ca']
Ytrain4 = Ytrain['na']

In [5]:
# base_model parameter

lgbm_base_param = {'objective':'regression',
              'metric':'mae',
              'random_state': 18,
              'learning_rate':0.1, 
              'subsample':0.7, 
              #'feature_fraction':0.8,
              #'bagging_fraction':0.7,
              #'tree_learner': 'serial',
              'subsample_freq': 1,
              'reg_lambda': 7,
              'reg_alpha': 5,
              'num_leaves': 50,
              #'seed' : 1993,
              'n_estimators': 900,
              'colsample_bytree': 0.8  
            }

lgbm_base_model = lgb.LGBMRegressor(verbose = -1, silent=False, importance_type='gain',
                                **lgbm_base_param)

multi_model = MultiOutputRegressor(lgbm_base_model)

In [6]:
def model_scoring_cv(model, x, y, cv=10):
    start = time.time()
    score = -cross_val_score(model, x, y, cv=cv, scoring='neg_mean_absolute_error').mean()
    stop = time.time()
    print(f"Validation Time : {round(stop-start, 3)} sec")
    return score

## Skewness

In [7]:
dst_list = Xtrain.filter(regex='_dst$', axis=1).columns

train_dst = Xtrain[dst_list]
test_dst = Xtest[dst_list]

for i in Xtrain.index:
    train_dst.loc[i] = train_dst.loc[i].interpolate()
    
for i in Xtest.index:
    test_dst.loc[i] = test_dst.loc[i].interpolate()

KeyboardInterrupt: 

In [None]:
# 보간법으로 못 채운 부분은 다음 10nm 파장의 값을 이용하여 채워줌
train_dst.loc[train_dst['700_dst'].isnull(),'700_dst'] = train_dst.loc[train_dst['700_dst'].isnull(),'710_dst']
train_dst.loc[train_dst['690_dst'].isnull(),'690_dst'] = train_dst.loc[train_dst['690_dst'].isnull(),'700_dst']
train_dst.loc[train_dst['680_dst'].isnull(),'680_dst'] = train_dst.loc[train_dst['680_dst'].isnull(),'690_dst']
train_dst.loc[train_dst['670_dst'].isnull(),'670_dst'] = train_dst.loc[train_dst['670_dst'].isnull(),'680_dst']
train_dst.loc[train_dst['660_dst'].isnull(),'660_dst'] = train_dst.loc[train_dst['660_dst'].isnull(),'670_dst']
train_dst.loc[train_dst['650_dst'].isnull(),'650_dst'] = train_dst.loc[train_dst['650_dst'].isnull(),'660_dst']

test_dst.loc[test_dst['700_dst'].isnull(),'700_dst'] = test_dst.loc[test_dst['700_dst'].isnull(),'710_dst']
test_dst.loc[test_dst['690_dst'].isnull(),'690_dst'] = test_dst.loc[test_dst['690_dst'].isnull(),'700_dst']
test_dst.loc[test_dst['680_dst'].isnull(),'680_dst'] = test_dst.loc[test_dst['680_dst'].isnull(),'690_dst']
test_dst.loc[test_dst['670_dst'].isnull(),'670_dst'] = test_dst.loc[test_dst['670_dst'].isnull(),'680_dst']
test_dst.loc[test_dst['660_dst'].isnull(),'660_dst'] = test_dst.loc[test_dst['660_dst'].isnull(),'670_dst']
test_dst.loc[test_dst['650_dst'].isnull(),'650_dst'] = test_dst.loc[test_dst['650_dst'].isnull(),'660_dst']