In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.multioutput import MultiOutputRegressor
import optuna
from sklearn.model_selection import cross_val_score, train_test_split, cross_val_predict
from sklearn.metrics import mean_absolute_error
import shap

from lightgbm import LGBMRegressor, LGBMClassifier

from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor
import warnings ; warnings.filterwarnings('ignore')
import time
from sklearn.metrics import f1_score, roc_auc_score, classification_report


train=pd.read_csv('./data/train.csv', index_col='id')
test=pd.read_csv('./data/test.csv', index_col='id')
submission=pd.read_csv('./data/sample_submission.csv', index_col='id')

print(train.shape, test.shape, submission.shape)

(10000, 75) (10000, 71) (10000, 4)


In [2]:
feature_names=list(test)
target_names=list(submission)

Xtrain = train[feature_names]
Xtest = test[feature_names]

Ytrain=train[target_names]

In [3]:
Xtrain.head()

Unnamed: 0_level_0,rho,650_src,660_src,670_src,680_src,690_src,700_src,710_src,720_src,730_src,...,900_dst,910_dst,920_dst,930_dst,940_dst,950_dst,960_dst,970_dst,980_dst,990_dst
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,25,0.3795,0.42993,0.52076,0.57166,0.67818,0.75476,0.8358,0.93623,0.96333,...,,3.527371e-18,,6.455564e-19,,0.0,,1.067504e-18,5.9989490000000004e-18,4.3785130000000007e-17
1,10,0.0,0.0,0.01813,0.0,0.0,0.01974,0.00321,0.0,0.0,...,2.647633e-09,,5.23348e-09,1.264238e-08,1.343132e-08,6.112685e-09,2.130547e-09,,9.710091e-09,
2,25,0.0,0.03289,0.02416,0.0361,0.05843,0.09015,0.14944,0.18578,0.25584,...,4.6291250000000004e-18,1.409413e-18,3.23748e-18,0.0,0.0,0.0,0.0,0.0,1.329725e-18,
3,10,0.27503,0.31281,0.32898,0.41041,0.46587,0.52769,0.64369,0.73562,0.79865,...,1.274876e-10,6.118947e-11,,1.663888e-10,2.245998e-10,1.299511e-10,7.782625e-11,,4.088921e-10,
4,15,1.01521,1.00872,0.9893,0.98874,1.01773,1.01632,1.00009,0.98217,1.01564,...,0.0,,1.537619e-14,4.996522e-14,1.457955e-13,8.769053e-14,,1.330237e-13,,


In [4]:
Ytrain.head()

Unnamed: 0_level_0,hhb,hbo2,ca,na
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5.59,4.32,8.92,4.29
1,0.0,2.83,7.25,4.64
2,10.64,3.0,8.4,5.16
3,5.67,4.01,5.05,4.35
4,11.97,4.41,10.78,2.42


In [5]:
src_list=['650_src', '660_src', '670_src', '680_src', '690_src', '700_src', '710_src', '720_src', '730_src', 
          '740_src', '750_src', '760_src', '770_src', '780_src', '790_src', '800_src', '810_src', '820_src', 
          '830_src', '840_src', '850_src', '860_src', '870_src', '880_src', '890_src', '900_src', '910_src', 
          '920_src', '930_src', '940_src', '950_src', '960_src', '970_src', '980_src', '990_src']

dst_list=['650_dst', '660_dst', '670_dst', '680_dst', '690_dst', '700_dst', '710_dst', '720_dst', '730_dst', 
          '740_dst', '750_dst', '760_dst', '770_dst', '780_dst', '790_dst', '800_dst', '810_dst', '820_dst', 
          '830_dst', '840_dst', '850_dst', '860_dst', '870_dst', '880_dst', '890_dst', '900_dst', '910_dst', 
          '920_dst', '930_dst', '940_dst', '950_dst', '960_dst', '970_dst', '980_dst', '990_dst']

In [6]:
alpha=Xtrain[dst_list]
beta=Xtest[dst_list]

for i in tqdm(Xtrain.index):
    alpha.loc[i] = alpha.loc[i].interpolate()
    
for i in tqdm(Xtest.index):
    beta.loc[i] = beta.loc[i].interpolate()

100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [00:23<00:00, 425.54it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [00:22<00:00, 453.43it/s]


In [7]:
print(alpha.isnull().sum(), '\n', beta.isnull().sum())

650_dst    1948
660_dst     393
670_dst      78
680_dst      17
690_dst       5
700_dst       1
710_dst       0
720_dst       0
730_dst       0
740_dst       0
750_dst       0
760_dst       0
770_dst       0
780_dst       0
790_dst       0
800_dst       0
810_dst       0
820_dst       0
830_dst       0
840_dst       0
850_dst       0
860_dst       0
870_dst       0
880_dst       0
890_dst       0
900_dst       0
910_dst       0
920_dst       0
930_dst       0
940_dst       0
950_dst       0
960_dst       0
970_dst       0
980_dst       0
990_dst       0
dtype: int64 
 650_dst    2026
660_dst     402
670_dst      85
680_dst      22
690_dst       6
700_dst       1
710_dst       0
720_dst       0
730_dst       0
740_dst       0
750_dst       0
760_dst       0
770_dst       0
780_dst       0
790_dst       0
800_dst       0
810_dst       0
820_dst       0
830_dst       0
840_dst       0
850_dst       0
860_dst       0
870_dst       0
880_dst       0
890_dst       0
900_dst       0
910_dst  

In [8]:
alpha.loc[alpha['700_dst'].isnull(),'700_dst']=alpha.loc[alpha['700_dst'].isnull(),'710_dst']
alpha.loc[alpha['690_dst'].isnull(),'690_dst']=alpha.loc[alpha['690_dst'].isnull(),'700_dst']
alpha.loc[alpha['680_dst'].isnull(),'680_dst']=alpha.loc[alpha['680_dst'].isnull(),'690_dst']
alpha.loc[alpha['670_dst'].isnull(),'670_dst']=alpha.loc[alpha['670_dst'].isnull(),'680_dst']
alpha.loc[alpha['660_dst'].isnull(),'660_dst']=alpha.loc[alpha['660_dst'].isnull(),'670_dst']
alpha.loc[alpha['650_dst'].isnull(),'650_dst']=alpha.loc[alpha['650_dst'].isnull(),'660_dst']

beta.loc[beta['700_dst'].isnull(),'700_dst']=beta.loc[beta['700_dst'].isnull(),'710_dst']
beta.loc[beta['690_dst'].isnull(),'690_dst']=beta.loc[beta['690_dst'].isnull(),'700_dst']
beta.loc[beta['680_dst'].isnull(),'680_dst']=beta.loc[beta['680_dst'].isnull(),'690_dst']
beta.loc[beta['670_dst'].isnull(),'670_dst']=beta.loc[beta['670_dst'].isnull(),'680_dst']
beta.loc[beta['660_dst'].isnull(),'660_dst']=beta.loc[beta['660_dst'].isnull(),'670_dst']
beta.loc[beta['650_dst'].isnull(),'650_dst']=beta.loc[beta['650_dst'].isnull(),'660_dst']

In [9]:
Xtrain[dst_list] = np.array(alpha)
Xtest[dst_list] = np.array(beta)

In [10]:
for col in dst_list:
    Xtrain[col] = Xtrain[col] * (Xtrain['rho'] ** 2)
    Xtest[col] = Xtest[col] * (Xtest['rho']**2)

In [11]:
gap_feature_names=[]
for i in range(650, 1000, 10):
    gap_feature_names.append(str(i) + '_gap')

alpha=pd.DataFrame(np.array(Xtrain[src_list]) - np.array(Xtrain[dst_list]), columns=gap_feature_names, index=train.index)
beta=pd.DataFrame(np.array(Xtest[src_list]) - np.array(Xtest[dst_list]), columns=gap_feature_names, index=test.index)

Xtrain=pd.concat((Xtrain, alpha), axis=1)
Xtest=pd.concat((Xtest, beta), axis=1)

print(Xtrain.shape, Ytrain.shape, Xtest.shape)

(10000, 106) (10000, 4) (10000, 106)


In [12]:
epsilon=1e-10

for dst_col, src_col in zip(dst_list, src_list):
    dst_val=Xtrain[dst_col]
    src_val=Xtrain[src_col] + epsilon
    delta_ratio = dst_val / src_val
    Xtrain[dst_col + '_' + src_col + '_ratio'] = delta_ratio
    
    dst_val=Xtest[dst_col]
    src_val=Xtest[src_col] + epsilon
    
    delta_ratio = dst_val / src_val
    Xtest[dst_col + '_' + src_col + '_ratio'] = delta_ratio
    
print(Xtrain.shape, Xtest.shape)

(10000, 141) (10000, 141)


In [13]:
alpha_real=Xtrain[dst_list]
alpha_imag=Xtrain[dst_list]

beta_real=Xtest[dst_list]
beta_imag=Xtest[dst_list]

for i in tqdm(alpha_real.index):
    alpha_real.loc[i]=alpha_real.loc[i] - alpha_real.loc[i].mean()
    alpha_imag.loc[i]=alpha_imag.loc[i] - alpha_real.loc[i].mean()
    
    alpha_real.loc[i] = np.fft.fft(alpha_real.loc[i], norm='ortho').real
    alpha_imag.loc[i] = np.fft.fft(alpha_imag.loc[i], norm='ortho').imag

    
for i in tqdm(beta_real.index):
    beta_real.loc[i]=beta_real.loc[i] - beta_real.loc[i].mean()
    beta_imag.loc[i]=beta_imag.loc[i] - beta_imag.loc[i].mean()
    
    beta_real.loc[i] = np.fft.fft(beta_real.loc[i], norm='ortho').real
    beta_imag.loc[i] = np.fft.fft(beta_imag.loc[i], norm='ortho').imag
    
real_part=[]
imag_part=[]

for col in dst_list:
    real_part.append(col + '_fft_real')
    imag_part.append(col + '_fft_imag')
    
alpha_real.columns=real_part
alpha_imag.columns=imag_part
alpha = pd.concat((alpha_real, alpha_imag), axis=1)

beta_real.columns=real_part
beta_imag.columns=imag_part
beta=pd.concat((beta_real, beta_imag), axis=1)

100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [00:50<00:00, 198.95it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [00:51<00:00, 194.25it/s]


In [14]:
Xtrain=pd.concat((Xtrain, alpha), axis=1)
Xtest=pd.concat((Xtest, beta), axis=1)

print(Xtrain.shape, Ytrain.shape, Xtest.shape)

(10000, 211) (10000, 4) (10000, 211)


In [15]:
Xtrain=Xtrain.drop(columns=src_list)
Xtest=Xtest.drop(columns=src_list)

print(Xtrain.shape, Ytrain.shape, Xtest.shape)

(10000, 176) (10000, 4) (10000, 176)


In [16]:
def model_scoring_cv(model, x, y, cv=5):
    start=time.time()
    score=-cross_val_score(model, x, y, cv=cv, scoring='neg_mean_absolute_error').mean()
    stop=time.time()
    print(f"Validation Time : {round(stop-start, 3)} sec")
    return score

In [19]:
import xgboost as xgb                       # XGBoost 패키지
from sklearn.model_selection import KFold   # K-Fold CV

def train_model(x_data, y_data, k=5):
    models = []
    
    k_fold = KFold(n_splits=k, shuffle=True, random_state=123)
    
    for train_idx, val_idx in k_fold.split(x_data):
        x_train, y_train = x_data.iloc[train_idx], y_data[train_idx]
        x_val, y_val = x_data.iloc[val_idx], y_data[val_idx]
    
        d_train = xgb.DMatrix(data = x_train, label = y_train)
        d_val = xgb.DMatrix(data = x_val, label = y_val)
        
        wlist = [(d_train, 'train'), (d_val, 'eval')]
        
        params = {
            'max_depth' : 5,
            'min_child_weight' : 2,
            'eta' : 0.1,
            'objective': 'reg:squarederror',
            'eval_metric': 'mae',
            'seed':777
            }

        model = xgb.train(params=params, dtrain=d_train, num_boost_round=500, verbose_eval=500, evals=wlist)
        models.append(model)
    
    return models

In [20]:
models = {}
for label in Ytrain.columns:
    print('train column : ', label)
    models[label] = train_model(Xtrain, Ytrain[label])

train column :  hhb
[0]	train-mae:6.75730	eval-mae:6.75359
[499]	train-mae:0.22628	eval-mae:0.81891
[0]	train-mae:6.76800	eval-mae:6.71338
[499]	train-mae:0.23065	eval-mae:0.76022
[0]	train-mae:6.75927	eval-mae:6.75049
[499]	train-mae:0.22978	eval-mae:0.78576
[0]	train-mae:6.75207	eval-mae:6.77905
[499]	train-mae:0.22150	eval-mae:0.76769
[0]	train-mae:6.74862	eval-mae:6.79587
[499]	train-mae:0.22581	eval-mae:0.79170
train column :  hbo2
[0]	train-mae:3.15743	eval-mae:3.16901
[499]	train-mae:0.17403	eval-mae:0.55496
[0]	train-mae:3.15635	eval-mae:3.17448
[499]	train-mae:0.17047	eval-mae:0.53987
[0]	train-mae:3.16772	eval-mae:3.12580
[499]	train-mae:0.17340	eval-mae:0.55299
[0]	train-mae:3.15616	eval-mae:3.17588
[499]	train-mae:0.17269	eval-mae:0.54909
[0]	train-mae:3.16134	eval-mae:3.15286
[499]	train-mae:0.16981	eval-mae:0.56751
train column :  ca
[0]	train-mae:7.65823	eval-mae:7.74359
[499]	train-mae:0.61232	eval-mae:1.78455
[0]	train-mae:7.66649	eval-mae:7.70152
[499]	train-mae:0.604

In [None]:
multi_model.fit(Xtrain, Ytrain)
preds=multi_model.predict(Xtest)

preds=pd.DataFrame(data=preds, columns=submission.columns, index=submission.index)
preds.head()

In [47]:
for col in models:
    preds = []
    for model in models[col]:
        preds.append(model.predict(xgb.DMatrix(Xtest.loc[:, '650_dst':])))
    pred = np.mean(preds, axis=0)

    submission[col] = pred

ValueError: feature_names mismatch: ['rho', '650_dst', '660_dst', '670_dst', '680_dst', '690_dst', '700_dst', '710_dst', '720_dst', '730_dst', '740_dst', '750_dst', '760_dst', '770_dst', '780_dst', '790_dst', '800_dst', '810_dst', '820_dst', '830_dst', '840_dst', '850_dst', '860_dst', '870_dst', '880_dst', '890_dst', '900_dst', '910_dst', '920_dst', '930_dst', '940_dst', '950_dst', '960_dst', '970_dst', '980_dst', '990_dst', '650_gap', '660_gap', '670_gap', '680_gap', '690_gap', '700_gap', '710_gap', '720_gap', '730_gap', '740_gap', '750_gap', '760_gap', '770_gap', '780_gap', '790_gap', '800_gap', '810_gap', '820_gap', '830_gap', '840_gap', '850_gap', '860_gap', '870_gap', '880_gap', '890_gap', '900_gap', '910_gap', '920_gap', '930_gap', '940_gap', '950_gap', '960_gap', '970_gap', '980_gap', '990_gap', '650_dst_650_src_ratio', '660_dst_660_src_ratio', '670_dst_670_src_ratio', '680_dst_680_src_ratio', '690_dst_690_src_ratio', '700_dst_700_src_ratio', '710_dst_710_src_ratio', '720_dst_720_src_ratio', '730_dst_730_src_ratio', '740_dst_740_src_ratio', '750_dst_750_src_ratio', '760_dst_760_src_ratio', '770_dst_770_src_ratio', '780_dst_780_src_ratio', '790_dst_790_src_ratio', '800_dst_800_src_ratio', '810_dst_810_src_ratio', '820_dst_820_src_ratio', '830_dst_830_src_ratio', '840_dst_840_src_ratio', '850_dst_850_src_ratio', '860_dst_860_src_ratio', '870_dst_870_src_ratio', '880_dst_880_src_ratio', '890_dst_890_src_ratio', '900_dst_900_src_ratio', '910_dst_910_src_ratio', '920_dst_920_src_ratio', '930_dst_930_src_ratio', '940_dst_940_src_ratio', '950_dst_950_src_ratio', '960_dst_960_src_ratio', '970_dst_970_src_ratio', '980_dst_980_src_ratio', '990_dst_990_src_ratio', '650_dst_fft_real', '660_dst_fft_real', '670_dst_fft_real', '680_dst_fft_real', '690_dst_fft_real', '700_dst_fft_real', '710_dst_fft_real', '720_dst_fft_real', '730_dst_fft_real', '740_dst_fft_real', '750_dst_fft_real', '760_dst_fft_real', '770_dst_fft_real', '780_dst_fft_real', '790_dst_fft_real', '800_dst_fft_real', '810_dst_fft_real', '820_dst_fft_real', '830_dst_fft_real', '840_dst_fft_real', '850_dst_fft_real', '860_dst_fft_real', '870_dst_fft_real', '880_dst_fft_real', '890_dst_fft_real', '900_dst_fft_real', '910_dst_fft_real', '920_dst_fft_real', '930_dst_fft_real', '940_dst_fft_real', '950_dst_fft_real', '960_dst_fft_real', '970_dst_fft_real', '980_dst_fft_real', '990_dst_fft_real', '650_dst_fft_imag', '660_dst_fft_imag', '670_dst_fft_imag', '680_dst_fft_imag', '690_dst_fft_imag', '700_dst_fft_imag', '710_dst_fft_imag', '720_dst_fft_imag', '730_dst_fft_imag', '740_dst_fft_imag', '750_dst_fft_imag', '760_dst_fft_imag', '770_dst_fft_imag', '780_dst_fft_imag', '790_dst_fft_imag', '800_dst_fft_imag', '810_dst_fft_imag', '820_dst_fft_imag', '830_dst_fft_imag', '840_dst_fft_imag', '850_dst_fft_imag', '860_dst_fft_imag', '870_dst_fft_imag', '880_dst_fft_imag', '890_dst_fft_imag', '900_dst_fft_imag', '910_dst_fft_imag', '920_dst_fft_imag', '930_dst_fft_imag', '940_dst_fft_imag', '950_dst_fft_imag', '960_dst_fft_imag', '970_dst_fft_imag', '980_dst_fft_imag', '990_dst_fft_imag'] ['650_dst', '660_dst', '670_dst', '680_dst', '690_dst', '700_dst', '710_dst', '720_dst', '730_dst', '740_dst', '750_dst', '760_dst', '770_dst', '780_dst', '790_dst', '800_dst', '810_dst', '820_dst', '830_dst', '840_dst', '850_dst', '860_dst', '870_dst', '880_dst', '890_dst', '900_dst', '910_dst', '920_dst', '930_dst', '940_dst', '950_dst', '960_dst', '970_dst', '980_dst', '990_dst', '650_gap', '660_gap', '670_gap', '680_gap', '690_gap', '700_gap', '710_gap', '720_gap', '730_gap', '740_gap', '750_gap', '760_gap', '770_gap', '780_gap', '790_gap', '800_gap', '810_gap', '820_gap', '830_gap', '840_gap', '850_gap', '860_gap', '870_gap', '880_gap', '890_gap', '900_gap', '910_gap', '920_gap', '930_gap', '940_gap', '950_gap', '960_gap', '970_gap', '980_gap', '990_gap', '650_dst_650_src_ratio', '660_dst_660_src_ratio', '670_dst_670_src_ratio', '680_dst_680_src_ratio', '690_dst_690_src_ratio', '700_dst_700_src_ratio', '710_dst_710_src_ratio', '720_dst_720_src_ratio', '730_dst_730_src_ratio', '740_dst_740_src_ratio', '750_dst_750_src_ratio', '760_dst_760_src_ratio', '770_dst_770_src_ratio', '780_dst_780_src_ratio', '790_dst_790_src_ratio', '800_dst_800_src_ratio', '810_dst_810_src_ratio', '820_dst_820_src_ratio', '830_dst_830_src_ratio', '840_dst_840_src_ratio', '850_dst_850_src_ratio', '860_dst_860_src_ratio', '870_dst_870_src_ratio', '880_dst_880_src_ratio', '890_dst_890_src_ratio', '900_dst_900_src_ratio', '910_dst_910_src_ratio', '920_dst_920_src_ratio', '930_dst_930_src_ratio', '940_dst_940_src_ratio', '950_dst_950_src_ratio', '960_dst_960_src_ratio', '970_dst_970_src_ratio', '980_dst_980_src_ratio', '990_dst_990_src_ratio', '650_dst_fft_real', '660_dst_fft_real', '670_dst_fft_real', '680_dst_fft_real', '690_dst_fft_real', '700_dst_fft_real', '710_dst_fft_real', '720_dst_fft_real', '730_dst_fft_real', '740_dst_fft_real', '750_dst_fft_real', '760_dst_fft_real', '770_dst_fft_real', '780_dst_fft_real', '790_dst_fft_real', '800_dst_fft_real', '810_dst_fft_real', '820_dst_fft_real', '830_dst_fft_real', '840_dst_fft_real', '850_dst_fft_real', '860_dst_fft_real', '870_dst_fft_real', '880_dst_fft_real', '890_dst_fft_real', '900_dst_fft_real', '910_dst_fft_real', '920_dst_fft_real', '930_dst_fft_real', '940_dst_fft_real', '950_dst_fft_real', '960_dst_fft_real', '970_dst_fft_real', '980_dst_fft_real', '990_dst_fft_real', '650_dst_fft_imag', '660_dst_fft_imag', '670_dst_fft_imag', '680_dst_fft_imag', '690_dst_fft_imag', '700_dst_fft_imag', '710_dst_fft_imag', '720_dst_fft_imag', '730_dst_fft_imag', '740_dst_fft_imag', '750_dst_fft_imag', '760_dst_fft_imag', '770_dst_fft_imag', '780_dst_fft_imag', '790_dst_fft_imag', '800_dst_fft_imag', '810_dst_fft_imag', '820_dst_fft_imag', '830_dst_fft_imag', '840_dst_fft_imag', '850_dst_fft_imag', '860_dst_fft_imag', '870_dst_fft_imag', '880_dst_fft_imag', '890_dst_fft_imag', '900_dst_fft_imag', '910_dst_fft_imag', '920_dst_fft_imag', '930_dst_fft_imag', '940_dst_fft_imag', '950_dst_fft_imag', '960_dst_fft_imag', '970_dst_fft_imag', '980_dst_fft_imag', '990_dst_fft_imag']
expected rho in input data

In [None]:
submission.head()

In [21]:
submission.to_csv('Dacon_200613_2.csv', index=False)