## Importing the libraries

In [1]:
import pandas as pd
import random
import os
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

# added
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import make_scorer
import sklearn.metrics as metrics
import seaborn as sns
import warnings
# Suppress every warning raised from here on (no category or module filter is
# given, so deprecation notices from pandas / scikit-learn are hidden as well).
warnings.filterwarnings(action='ignore')

# Seed passed to seed_everything() and used as random_state for the LightGBM models.
SEED = 42

In [2]:
def seed_everything(seed):
    """Seed the random number generators this notebook relies on.

    Seeds Python's ``random`` module and NumPy's legacy global RNG, and
    exports ``PYTHONHASHSEED``. Nothing else is seeded here (TensorFlow, for
    instance, is not touched).

    Note: ``PYTHONHASHSEED`` is read by the interpreter at start-up, so
    setting it here only influences processes launched afterwards.

    Parameters
    ----------
    seed : int
        Value handed to every seeding call.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)
seed_everything(SEED) # Fix the seeds so repeated runs are reproducible

## Importing the train and test data

In [3]:
# Labelled data: the ID column is kept here; the X_* / Y_* columns are picked out later.
train = pd.read_csv('./train.csv')

# Unlabelled data: read it, then discard the identifier column.
test = pd.read_csv('test.csv')
test = test.drop(columns=['ID'])

## EDA

In [4]:
# train.shape

In [5]:
# train.describe(include='all')

In [6]:
# train.hist(figsize=(30, 30))

## Preprocessing

In [7]:
# # 방열재료 결측치(i.e., 0) 평균으로 치환
# for i in range(56):
#     if (train.iloc[:,i+1] == 0).sum() > 0:
#         mu = sum(train.iloc[:,i+1])/len(train.iloc[:,i+1])
#         train.iloc[:, i+1] = [mu if temp == 0 else temp for temp in train.iloc[:, i+1]]

In [8]:
# train.drop(['ID','X_04','X_10','X_11','X_23','X_47','X_48'], axis=1, inplace=True)

In [11]:
# Select columns by name pattern: every column whose name contains 'X' is an
# input feature, every column whose name contains 'Y' is a target.
X_train = train.filter(regex='X', axis='columns')  # Input  : X features
Y_train = train.filter(regex='Y', axis='columns')  # Output : Y features

In [12]:
# Carve a 20% hold-out set out of the labelled data for local evaluation.
# NOTE(review): random_state is the literal 1 here rather than SEED -- confirm
# that this is intentional.
x_train, x_test, y_train, y_test = tts(
    X_train,
    Y_train,
    test_size=0.2,
    random_state=1,
)

# Report the shape of each of the four pieces.
print(f'x_train : {x_train.shape}')
print(f'y_train : {y_train.shape}')
print(f'x_test  : {x_test.shape}')
print(f'y_test  : {y_test.shape}')

x_train : (31685, 56)
y_train : (31685, 14)
x_test  : (7922, 56)
y_test  : (7922, 14)


## Feature Scaling

In [15]:
# Standardise every X feature (zero mean, unit variance). The scaler is fitted
# on the training split only and then applied unchanged to the hold-out split.
# The earlier comment here described median/IQR scaling, i.e. RobustScaler; the
# code actually uses StandardScaler, so the comment now says so.
from sklearn.preprocessing import StandardScaler# RobustScaler
sc_X = StandardScaler()

# Build fresh float frames instead of writing through `.loc[:]`: x_train and
# x_test come out of train_test_split, and assigning scaled floats into them in
# place is a chained-assignment write that newer pandas also rejects for any
# integer-typed column. Index and column labels are carried over unchanged.
x_train = pd.DataFrame(sc_X.fit_transform(x_train),
                       index=x_train.index,
                       columns=x_train.columns)
x_test = pd.DataFrame(sc_X.transform(x_test),
                      index=x_test.index,
                      columns=x_test.columns)

## Iterating over the targets: fit one model per Y column and predict the hold-out set

In [16]:
def lg_nrmse(gt, preds):
    """Weighted sum of per-target normalised RMSE over the first 14 columns.

    For each of the first 14 columns the RMSE between ``gt`` and ``preds`` is
    divided by the mean absolute ground-truth value of that column. The first
    8 results (Y_01 .. Y_08) are weighted by 1.2, the remaining 6 by 1.0, and
    everything is summed.

    The RMSE is computed directly with NumPy rather than through
    ``sklearn.metrics.mean_squared_error(..., squared=False)``, whose
    ``squared`` argument is deprecated in recent scikit-learn releases.

    Parameters
    ----------
    gt : array-like, shape (n_samples, >=14)
        Ground-truth targets. Columns past the 14th are ignored.
    preds : array-like, shape (n_samples, >=14)
        Predicted targets, laid out like ``gt``.

    Returns
    -------
    numpy.float64
        The weighted score; lower is better. A column whose ground truth is
        all zeros makes its term (and the score) inf or nan.

    Raises
    ------
    IndexError
        If either input is not 2-D or has fewer than 14 columns.
    ValueError
        If the two inputs disagree on the number of rows.
    """
    n_targets = 14   # number of Y columns that are scored
    n_weighted = 8   # leading columns that receive the 1.2 weight

    gt = np.asarray(gt, dtype=float)
    preds = np.asarray(preds, dtype=float)
    for label, arr in (('gt', gt), ('preds', preds)):
        if arr.ndim != 2 or arr.shape[1] < n_targets:
            raise IndexError(
                '{} must be 2-D with at least {} columns, got shape {}'.format(
                    label, n_targets, arr.shape))

    gt = gt[:, :n_targets]
    preds = preds[:, :n_targets]
    if gt.shape != preds.shape:
        raise ValueError(
            'gt and preds must have the same number of rows, got {} and {}'.format(
                gt.shape[0], preds.shape[0]))

    # Column-wise RMSE, then normalise by the mean magnitude of the truth.
    rmse = np.sqrt(np.mean((gt - preds) ** 2, axis=0))
    nrmse = rmse / np.mean(np.abs(gt), axis=0)

    score = 1.2 * np.sum(nrmse[:n_weighted]) + 1.0 * np.sum(nrmse[n_weighted:])
    return score

In [17]:
# from sklearn.ensemble import GradientBoostingRegressor#, RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor
# from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [18]:
def feature_picker(xTrain, xTest, yVar):
    """Keep only the features that correlate positively with the target.

    For every column position of ``xTrain`` the Pearson correlation with
    ``yVar`` is computed via ``np.corrcoef``. A column is dropped when that
    correlation is ``<= 0``; the column at the same position is dropped from
    ``xTest``. A NaN correlation fails the ``<= 0`` comparison, so such a
    column is kept. The inputs themselves are not modified.

    Parameters
    ----------
    xTrain : pandas.DataFrame
        Training features; correlations are computed on this frame only.
    xTest : pandas.DataFrame
        Frame to prune in parallel, addressed by the same column positions.
    yVar : array-like
        Target values; ``.squeeze()`` is applied before correlating.

    Returns
    -------
    tuple
        ``(pruned_xTrain, pruned_xTest)`` as new DataFrames.
    """
    target = yVar.squeeze()
    train_rejects, test_rejects = [], []

    for pos in range(xTrain.shape[1]):
        pearson_r = np.corrcoef(xTrain.iloc[:, pos], target)[0, 1]
        if pearson_r <= 0:
            train_rejects.append(xTrain.columns[pos])
            test_rejects.append(xTest.columns[pos])

    return (xTrain.drop(columns=train_rejects), xTest.drop(columns=test_rejects))

In [19]:
# submit = pd.read_csv('./sample_submission.csv')
# Prediction table for the hold-out split: same shape and column names as
# y_test, pre-filled with zeros and overwritten column by column below.
submit = pd.DataFrame(np.zeros(y_test.shape), columns=y_test.columns)

# Only changes how NumPy arrays are printed; it does not depend on the loop
# variable, so it is set once here instead of on every iteration.
np.set_printoptions(precision=2)

# for y_grp in y_trains:
for i, target_name in enumerate(y_train.columns):
    print('Currently iterating through STEP {}'.format(i))
    
    # feature scaling: each target column gets its own scaler so predictions
    # can be mapped back to the original units afterwards
    sc_y = StandardScaler()
    y = sc_y.fit_transform(y_train.iloc[:, [i]])

    # feature selection: keep the X columns that correlate positively with
    # this (scaled) target
    X_train_tuned, X_test_tuned = feature_picker(x_train, x_test, y)
    print('The shape of fine-tuned train/test set: {}'.format(X_train_tuned.shape))
    
    # building the model
    # `y` has a single column, so the MultiOutputRegressor wraps exactly one
    # LightGBM model per iteration.
    # NOTE(review): per the LightGBM parameter docs, `subsample` only takes
    # effect when `subsample_freq` is > 0; it is not set here, so 0.75 is
    # presumably a no-op -- confirm before tuning it further.
    # regressor = GradientBoostingRegressor()
    regressor = MultiOutputRegressor(LGBMRegressor(n_estimators=100,
                                                   learning_rate=0.08, 
                                                   subsample=0.75, 
                                                   colsample_bytree = 1, 
                                                   max_depth=7,
                                                   random_state=SEED)).fit(X_train_tuned, y)
    print('Done')

    # applying k-Fold Cross Validation
    # accuracies = cross_val_score(estimator=regressor, X=X_train_tuned, y=y, cv = 10)
    # print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
    # print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))
    
    # applying Grid Search to find the best model and the best parameters    
    # param_grid = {'estimator__learning_rate': [0.02, 0.04, 0.06, 0.08], 
    #               'estimator__subsample': [0.9, 0.5, 0.2, 0.1], 
    #               'estimator__max_depth': [4, 6, 8, 10],
    #               'estimator__n_estimators': [100, 200, 300]}
    # param_grid = {'n_estimators': [10, 100],
    #                'max_depth': [6, 8, 10, 12],
    #                'min_samples_leaf': [8, 12, 18],
    #                'min_samples_split': [8, 16, 20],
    #                'random_state': [SEED]}
    
    # score = make_scorer(lg_nrmse, greater_is_better=False)
                
    # grid_search = GridSearchCV(estimator = regressor,
    #                         param_grid = param_grid,
    #                         scoring = score,
    #                         cv = 5,
    #                         n_jobs = -1)
    # grid_search.fit(X_train_tuned, y)
    # best_accuracy = grid_search.best_score_
    # best_parameters = grid_search.best_params_
    # print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
    # print("Best Parameters:", best_parameters)
    
    # predicting the hold-out set and undoing the target scaling; the scaler
    # works on (n, 1) arrays, so reshape for it and flatten the result back to
    # 1-D before storing it as a single DataFrame column
    y_pred = sc_y.inverse_transform(regressor.predict(X_test_tuned).reshape(-1, 1)).ravel()
    
    submit[target_name] = y_pred

Currently iterating through STEP 0
The shape of fine-tuned train/test set: (31685, 39)
Done
Currently iterating through STEP 1
The shape of fine-tuned train/test set: (31685, 33)
Done
Currently iterating through STEP 2
The shape of fine-tuned train/test set: (31685, 36)
Done
Currently iterating through STEP 3
The shape of fine-tuned train/test set: (31685, 34)
Done
Currently iterating through STEP 4
The shape of fine-tuned train/test set: (31685, 31)
Done
Currently iterating through STEP 5
The shape of fine-tuned train/test set: (31685, 33)
Done
Currently iterating through STEP 6
The shape of fine-tuned train/test set: (31685, 30)
Done
Currently iterating through STEP 7
The shape of fine-tuned train/test set: (31685, 31)
Done
Currently iterating through STEP 8
The shape of fine-tuned train/test set: (31685, 31)
Done
Currently iterating through STEP 9
The shape of fine-tuned train/test set: (31685, 32)
Done
Currently iterating through STEP 10
The shape of fine-tuned train/test set: (316

In [20]:
# Display the hold-out prediction table (one column per Y target).
submit

Unnamed: 0,Y_01,Y_02,Y_03,Y_04,Y_05,Y_06,Y_07,Y_08,Y_09,Y_10,Y_11,Y_12,Y_13,Y_14
0,1.331263,1.040245,1.071137,13.366290,31.749152,16.663896,3.180849,-26.655418,-26.687541,-22.493505,24.228144,-26.664750,-26.602101,-26.615097
1,1.355288,1.120943,1.070534,13.604819,31.717266,16.210062,2.957023,-26.376816,-26.437224,-22.568321,24.348663,-26.289684,-26.312175,-26.406764
2,1.475754,1.137980,1.059735,13.304909,31.311082,16.703450,3.158081,-26.343459,-26.287661,-22.244416,24.201715,-26.200131,-26.280003,-26.257308
3,1.504937,1.151621,1.088582,11.687712,31.420172,16.479761,3.206696,-26.313362,-26.203038,-22.261686,24.233184,-26.262760,-26.263392,-26.273183
4,1.330788,1.037377,1.019081,14.379860,31.458388,16.656484,3.088314,-26.242353,-26.272885,-22.178540,24.459081,-26.180321,-26.182759,-26.223657
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7917,1.324017,1.033374,0.933173,13.116851,30.586112,10.785748,3.155340,-26.265811,-26.393521,-22.581383,24.281746,-26.267697,-26.343305,-26.290933
7918,1.341221,1.055606,1.022187,14.175937,31.615707,16.598968,3.166487,-26.192026,-26.279263,-22.257713,24.438033,-26.097147,-26.115196,-26.151928
7919,1.505247,1.146289,1.059502,13.525079,31.126953,16.497605,3.155518,-26.195782,-26.234218,-22.307266,24.248645,-26.232695,-26.138702,-26.171879
7920,1.467617,1.144425,1.080249,14.181007,31.619080,16.626479,3.164676,-26.142361,-26.159286,-22.329922,24.394163,-26.072239,-26.086272,-26.124523


In [21]:
# Score the hold-out predictions against the true hold-out targets with the
# weighted NRMSE defined above (lower is better).
lg_nrmse(y_test.to_numpy(), submit.to_numpy())

1.9758966234411843

## Submit

In [22]:
# Write the prediction table to disk without the DataFrame index.
# NOTE(review): `submit` holds predictions for x_test (the 20% hold-out carved
# out of train.csv), not for the `test` frame read from test.csv, and it has no
# ID column -- confirm this is really the file meant for submission.
submit.to_csv('./submit.csv', index=False)