In [24]:
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.multioutput import MultiOutputRegressor
import optuna
from sklearn.model_selection import cross_val_score, train_test_split, cross_val_predict
from sklearn.metrics import mean_absolute_error
import shap

from lightgbm import LGBMRegressor, LGBMClassifier

from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor
import warnings ; warnings.filterwarnings('ignore')
import time
from sklearn.metrics import f1_score, roc_auc_score, classification_report


train=pd.read_csv('./data/train.csv', index_col='id')
test=pd.read_csv('./data/test.csv', index_col='id')
submission=pd.read_csv('./data/sample_submission.csv', index_col='id')

print(train.shape, test.shape, submission.shape)

(10000, 75) (10000, 71) (10000, 4)


In [25]:
feature_names=list(test)
target_names=list(submission)

Xtrain = train[feature_names]
Xtest = test[feature_names]

Ytrain=train[target_names]
Ytrain1=Ytrain['hhb']
Ytrain2=Ytrain['hbo2']
Ytrain3=Ytrain['ca']
Ytrain4=Ytrain['na']

In [26]:
Xtrain.head()

Unnamed: 0_level_0,rho,650_src,660_src,670_src,680_src,690_src,700_src,710_src,720_src,730_src,...,900_dst,910_dst,920_dst,930_dst,940_dst,950_dst,960_dst,970_dst,980_dst,990_dst
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,25,0.3795,0.42993,0.52076,0.57166,0.67818,0.75476,0.8358,0.93623,0.96333,...,,3.527371e-18,,6.455564e-19,,0.0,,1.067504e-18,5.9989490000000004e-18,4.3785130000000007e-17
1,10,0.0,0.0,0.01813,0.0,0.0,0.01974,0.00321,0.0,0.0,...,2.647633e-09,,5.23348e-09,1.264238e-08,1.343132e-08,6.112685e-09,2.130547e-09,,9.710091e-09,
2,25,0.0,0.03289,0.02416,0.0361,0.05843,0.09015,0.14944,0.18578,0.25584,...,4.6291250000000004e-18,1.409413e-18,3.23748e-18,0.0,0.0,0.0,0.0,0.0,1.329725e-18,
3,10,0.27503,0.31281,0.32898,0.41041,0.46587,0.52769,0.64369,0.73562,0.79865,...,1.274876e-10,6.118947e-11,,1.663888e-10,2.245998e-10,1.299511e-10,7.782625e-11,,4.088921e-10,
4,15,1.01521,1.00872,0.9893,0.98874,1.01773,1.01632,1.00009,0.98217,1.01564,...,0.0,,1.537619e-14,4.996522e-14,1.457955e-13,8.769053e-14,,1.330237e-13,,


In [27]:
Ytrain.head()

Unnamed: 0_level_0,hhb,hbo2,ca,na
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5.59,4.32,8.92,4.29
1,0.0,2.83,7.25,4.64
2,10.64,3.0,8.4,5.16
3,5.67,4.01,5.05,4.35
4,11.97,4.41,10.78,2.42


In [8]:
base_params={'learning_rate':0.01,
            'max_depth':16,
            'boosting':'gbdt',
            'objective':'regression',
            'metric':'mse',
            'is_training_metric':True,
            'num_leaves':144,
            'feature_fraction':0.9,
            'bagging_fraction':0.7,
            'bagging_freq':5,
            'seed':2020}

base_model=LGBMRegressor(objective='l1', subsample_freq=1, silent=False, random_state=18, 
                         importance_type='gain', params=base_params)

multi_model=MultiOutputRegressor(base_model)

In [9]:
def model_scoring_cv(model, x, y, cv=5):
    start=time.time()
    score=-cross_val_score(model, x, y, cv=cv, scoring='neg_mean_absolute_error').mean()
    stop=time.time()
    print(f"Validation Time : {round(stop-start, 3)} sec")
    return score

In [10]:
src_list=['650_src', '660_src', '670_src', '680_src', '690_src', '700_src', '710_src', '720_src', '730_src', 
          '740_src', '750_src', '760_src', '770_src', '780_src', '790_src', '800_src', '810_src', '820_src', 
          '830_src', '840_src', '850_src', '860_src', '870_src', '880_src', '890_src', '900_src', '910_src', 
          '920_src', '930_src', '940_src', '950_src', '960_src', '970_src', '980_src', '990_src']

dst_list=['650_dst', '660_dst', '670_dst', '680_dst', '690_dst', '700_dst', '710_dst', '720_dst', '730_dst', 
          '740_dst', '750_dst', '760_dst', '770_dst', '780_dst', '790_dst', '800_dst', '810_dst', '820_dst', 
          '830_dst', '840_dst', '850_dst', '860_dst', '870_dst', '880_dst', '890_dst', '900_dst', '910_dst', 
          '920_dst', '930_dst', '940_dst', '950_dst', '960_dst', '970_dst', '980_dst', '990_dst']

In [8]:
model_scoring_cv(multi_model, Xtrain.fillna(-1), Ytrain)

Validation Time : 737.005 sec


1.2719531111067477

In [11]:
alpha=Xtrain[dst_list]
beta=Xtest[dst_list]

for i in tqdm(Xtrain.index):
    alpha.loc[i] = alpha.loc[i].interpolate()
    
for i in tqdm(Xtest.index):
    beta.loc[i] = beta.loc[i].interpolate()

100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [00:23<00:00, 427.31it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [00:24<00:00, 411.64it/s]


In [12]:
print(alpha.isnull().sum(), '\n', beta.isnull().sum())

650_dst    1948
660_dst     393
670_dst      78
680_dst      17
690_dst       5
700_dst       1
710_dst       0
720_dst       0
730_dst       0
740_dst       0
750_dst       0
760_dst       0
770_dst       0
780_dst       0
790_dst       0
800_dst       0
810_dst       0
820_dst       0
830_dst       0
840_dst       0
850_dst       0
860_dst       0
870_dst       0
880_dst       0
890_dst       0
900_dst       0
910_dst       0
920_dst       0
930_dst       0
940_dst       0
950_dst       0
960_dst       0
970_dst       0
980_dst       0
990_dst       0
dtype: int64 
 650_dst    2026
660_dst     402
670_dst      85
680_dst      22
690_dst       6
700_dst       1
710_dst       0
720_dst       0
730_dst       0
740_dst       0
750_dst       0
760_dst       0
770_dst       0
780_dst       0
790_dst       0
800_dst       0
810_dst       0
820_dst       0
830_dst       0
840_dst       0
850_dst       0
860_dst       0
870_dst       0
880_dst       0
890_dst       0
900_dst       0
910_dst  

In [13]:
alpha.loc[alpha['700_dst'].isnull(),'700_dst']=alpha.loc[alpha['700_dst'].isnull(),'710_dst']
alpha.loc[alpha['690_dst'].isnull(),'690_dst']=alpha.loc[alpha['690_dst'].isnull(),'700_dst']
alpha.loc[alpha['680_dst'].isnull(),'680_dst']=alpha.loc[alpha['680_dst'].isnull(),'690_dst']
alpha.loc[alpha['670_dst'].isnull(),'670_dst']=alpha.loc[alpha['670_dst'].isnull(),'680_dst']
alpha.loc[alpha['660_dst'].isnull(),'660_dst']=alpha.loc[alpha['660_dst'].isnull(),'670_dst']
alpha.loc[alpha['650_dst'].isnull(),'650_dst']=alpha.loc[alpha['650_dst'].isnull(),'660_dst']

beta.loc[beta['700_dst'].isnull(),'700_dst']=beta.loc[beta['700_dst'].isnull(),'710_dst']
beta.loc[beta['690_dst'].isnull(),'690_dst']=beta.loc[beta['690_dst'].isnull(),'700_dst']
beta.loc[beta['680_dst'].isnull(),'680_dst']=beta.loc[beta['680_dst'].isnull(),'690_dst']
beta.loc[beta['670_dst'].isnull(),'670_dst']=beta.loc[beta['670_dst'].isnull(),'680_dst']
beta.loc[beta['660_dst'].isnull(),'660_dst']=beta.loc[beta['660_dst'].isnull(),'670_dst']
beta.loc[beta['650_dst'].isnull(),'650_dst']=beta.loc[beta['650_dst'].isnull(),'660_dst']

In [14]:
Xtrain[dst_list] = np.array(alpha)
Xtest[dst_list] = np.array(beta)

In [None]:
model_scoring_cv(multi_model, Xtrain, Ytrain)

In [15]:
for col in dst_list:
    Xtrain[col] = Xtrain[col] * (Xtrain['rho'] ** 2)
    Xtest[col] = Xtest[col] * (Xtest['rho']**2)

In [16]:
gap_feature_names=[]
for i in range(650, 1000, 10):
    gap_feature_names.append(str(i) + '_gap')

alpha=pd.DataFrame(np.array(Xtrain[src_list]) - np.array(Xtrain[dst_list]), columns=gap_feature_names, index=train.index)
beta=pd.DataFrame(np.array(Xtest[src_list]) - np.array(Xtest[dst_list]), columns=gap_feature_names, index=test.index)

Xtrain=pd.concat((Xtrain, alpha), axis=1)
Xtest=pd.concat((Xtest, beta), axis=1)

print(Xtrain.shape, Ytrain.shape, Xtest.shape)

(10000, 106) (10000, 4) (10000, 106)


In [17]:
Xtrain.head()

Unnamed: 0_level_0,rho,650_src,660_src,670_src,680_src,690_src,700_src,710_src,720_src,730_src,...,900_gap,910_gap,920_gap,930_gap,940_gap,950_gap,960_gap,970_gap,980_gap,990_gap
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,25,0.3795,0.42993,0.52076,0.57166,0.67818,0.75476,0.8358,0.93623,0.96333,...,0.14493,0.1315,0.12442,0.10786,0.11984,0.09851,0.10965,0.07424,0.07777,0.05536
1,10,0.0,0.0,0.01813,0.0,0.0,0.01974,0.00321,0.0,0.0,...,0.41456,0.63716,0.842299,0.974799,0.998029,0.971459,0.87045,0.753299,0.634139,0.536989
2,25,0.0,0.03289,0.02416,0.0361,0.05843,0.09015,0.14944,0.18578,0.25584,...,0.46222,0.39113,0.27879,0.1907,0.16603,0.11617,0.06744,0.02343,0.03926,0.03136
3,10,0.27503,0.31281,0.32898,0.41041,0.46587,0.52769,0.64369,0.73562,0.79865,...,0.17876,0.15139,0.15656,0.13392,0.14247,0.10126,0.09928,0.09239,0.10523,0.06349
4,15,1.01521,1.00872,0.9893,0.98874,1.01773,1.01632,1.00009,0.98217,1.01564,...,0.983,1.00616,0.99045,1.01321,1.01559,1.01072,0.99439,0.98597,0.99889,1.01412


In [18]:
Xtest.head()

Unnamed: 0_level_0,rho,650_src,660_src,670_src,680_src,690_src,700_src,710_src,720_src,730_src,...,900_gap,910_gap,920_gap,930_gap,940_gap,950_gap,960_gap,970_gap,980_gap,990_gap
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10000,15,0.15406,0.23275,0.30977,0.42949,0.51264,0.62558,0.7434,0.85418,0.90815,...,0.02841,0.0129,0.00132,0.0,0.0,0.01904,-3.217423e-12,0.01952,-8.235266e-12,0.01285
10001,15,0.48552,0.56939,0.67575,0.79089,0.85114,0.92581,0.98071,0.98177,0.98678,...,0.01144,0.00703,0.00096,0.01406,-1.085789e-11,0.00223,0.01188,-1.653393e-11,0.0091,0.00201
10002,10,0.46883,0.56085,0.62442,0.73172,0.81724,0.91517,0.94801,0.99108,1.01261,...,0.13508,0.14524,0.10918,0.1028,0.10387,0.09468,0.06623,0.08721,0.06559,0.0688
10003,10,0.06905,0.07517,0.10226,0.14905,0.16182,0.19659,0.26085,0.36753,0.51432,...,0.04793,0.03783,0.04006,0.02746,0.04642,0.0063,0.01501,0.01071,0.0331,0.01281
10004,25,0.00253,0.00757,0.01649,0.00128,0.0,0.0,0.00105,0.01975,0.0,...,0.72828,0.83838,0.92615,0.96783,0.99902,1.00555,0.95138,0.91243,0.85656,0.76263


In [None]:
model_scoring_cv(multi_model, Xtrain, Ytrain)

In [19]:
epsilon=1e-10

for dst_col, src_col in zip(dst_list, src_list):
    dst_val=Xtrain[dst_col]
    src_val=Xtrain[src_col] + epsilon
    delta_ratio = dst_val / src_val
    Xtrain[dst_col + '_' + src_col + '_ratio'] = delta_ratio
    
    dst_val=Xtest[dst_col]
    src_val=Xtest[src_col] + epsilon
    
    delta_ratio = dst_val / src_val
    Xtest[dst_col + '_' + src_col + '_ratio'] = delta_ratio
    
print(Xtrain.shape, Xtest.shape)

(10000, 141) (10000, 141)


In [None]:
model_scoring_cv(multi_model, Xtrain, Ytrain)

In [20]:
alpha_real=Xtrain[dst_list]
alpha_imag=Xtrain[dst_list]

beta_real=Xtest[dst_list]
beta_imag=Xtest[dst_list]

for i in tqdm(alpha_real.index):
    alpha_real.loc[i]=alpha_real.loc[i] - alpha_real.loc[i].mean()
    alpha_imag.loc[i]=alpha_imag.loc[i] - alpha_real.loc[i].mean()
    
    alpha_real.loc[i] = np.fft.fft(alpha_real.loc[i], norm='ortho').real
    alpha_imag.loc[i] = np.fft.fft(alpha_imag.loc[i], norm='ortho').imag

    
for i in tqdm(beta_real.index):
    beta_real.loc[i]=beta_real.loc[i] - beta_real.loc[i].mean()
    beta_imag.loc[i]=beta_imag.loc[i] - beta_imag.loc[i].mean()
    
    beta_real.loc[i] = np.fft.fft(beta_real.loc[i], norm='ortho').real
    beta_imag.loc[i] = np.fft.fft(beta_imag.loc[i], norm='ortho').imag
    
real_part=[]
imag_part=[]

for col in dst_list:
    real_part.append(col + '_fft_real')
    imag_part.append(col + '_fft_imag')
    
alpha_real.columns=real_part
alpha_imag.columns=imag_part
alpha = pd.concat((alpha_real, alpha_imag), axis=1)

beta_real.columns=real_part
beta_imag.columns=imag_part
beta=pd.concat((beta_real, beta_imag), axis=1)

100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [00:47<00:00, 210.83it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [00:47<00:00, 212.35it/s]


In [21]:
Xtrain=pd.concat((Xtrain, alpha), axis=1)
Xtest=pd.concat((Xtest, beta), axis=1)

print(Xtrain.shape, Ytrain.shape, Xtest.shape)

(10000, 211) (10000, 4) (10000, 211)


In [None]:
model_scoring_cv(multi_model, Xtrain, Ytrain)

In [22]:
Xtrain=Xtrain.drop(columns=src_list)
Xtest=Xtest.drop(columns=src_list)

print(Xtrain.shape, Ytrain.shape, Xtest.shape)

(10000, 176) (10000, 4) (10000, 176)


In [None]:
model_scoring_cv(multi_model, Xtrain, Ytrain)

In [23]:
multi_model.fit(Xtrain, Ytrain)
preds=multi_model.predict(Xtest)

preds=pd.DataFrame(data=preds, columns=submission.columns, index=submission.index)
preds.head()

TypeError: Unknown type of parameter:params, got:dict

In [21]:
preds.to_csv('./Dacon_200616_1.csv')