## Importing the libraries

In [1]:
import pandas as pd
import random
import os
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor

# import xgboost as xgb

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

## Importing the dataset

In [3]:
train_df = pd.read_csv('./train_master.csv')

In [4]:
X = train_df.filter(regex='X') # Input : X Featrue
y = train_df.filter(regex='Y') # Output : Y Feature

## Splitting the dataset into the Training set and Test set

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

print(X_train.shape)
print(X_test.shape)

(31685, 56)
(7922, 56)


In [6]:
X_test.describe()

Unnamed: 0,X_01,X_02,X_03,X_04,X_05,X_06,X_07,X_08,X_09,X_10,...,X_47,X_48,X_49,X_50,X_51,X_52,X_53,X_54,X_55,X_56
count,7922.0,7922.0,7922.0,7922.0,7922.0,7922.0,7922.0,7922.0,7922.0,7922.0,...,7922.0,7922.0,7922.0,7922.0,7922.0,7922.0,7922.0,7922.0,7922.0,7922.0
mean,68.380588,103.320168,68.897897,1.0,102.329946,70.598623,29.307811,167.596115,224.739763,0.001578,...,1.0,1.0,16810.521548,130.830347,131.430669,138.650715,128.014893,127.978723,137.885702,128.451127
std,2.635664,0.000374,5.199738,0.0,0.545356,2.267708,6.444187,229.252687,66.478693,0.070322,...,0.0,0.0,8802.561486,6.113569,6.056716,6.563279,5.846209,5.50984,6.702075,5.564762
min,59.327,103.32,56.47,1.0,101.791,62.746,23.99,42.53,37.58,0.0,...,1.0,1.0,3382.63,21.8,21.91,23.1,21.33,21.34,22.98,21.41
25%,66.465,103.32,65.17,1.0,101.949,68.864,27.87,106.1475,188.54,0.0,...,1.0,1.0,13073.105,126.939235,127.648589,134.472969,124.379084,124.617884,133.723875,125.186485
50%,68.504,103.32,67.27,1.0,102.005,69.884,28.82,115.08,234.33,0.0,...,1.0,1.0,15286.23,130.792524,131.314144,138.631429,128.010119,128.092915,137.943318,128.40356
75%,69.524,103.32,71.97,1.0,103.144,71.923,29.86,132.895,263.955,0.0,...,1.0,1.0,17634.73,134.629124,135.116899,142.75854,131.629347,131.442326,142.149664,131.851822
max,84.82,103.321,86.67,1.0,103.16,85.18,163.86,2387.44,633.89,3.3,...,1.0,1.0,114563.63,162.619458,194.513195,173.438623,152.40663,149.941395,170.15598,155.277538


## Feature Scaling

In [7]:
# from sklearn.preprocessing import MinMaxScaler
# sc_X = MinMaxScaler()
# sc_y = MinMaxScaler()
# X_train = sc_X.fit_transform(X_train)
# y_train = sc_y.fit_transform(y_train)

# 중앙값과 IQR 사용하여 아웃라이어의 영향 최소화
from sklearn.preprocessing import RobustScaler

rbst_scaler=RobustScaler()
X_train=rbst_scaler.fit_transform(X_train)
X_test=rbst_scaler.transform(X_test)

## Building the ANN

In [8]:
# from scikeras.wrappers import KerasRegressor
# from keras.models import Sequential 
# from keras.layers import Dense, Dropout
# from keras.optimizers import Adam

In [9]:
# def build_nn():
#     ann = Sequential()
#     ann.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
#     ann.add(Dense(64, activation='relu'))
#     ann.add(Dropout(0.2))
#     ann.add(Dense(1))
#     ann.compile(optimizer = 'adam', loss = 'mse', metrics = ['accuracy'])
    
#     return ann

In [10]:
# random_state = 42

# early_stop = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=1, verbose=0)

# k_regressor = KerasRegressor(model = build_nn, optimizer="adam", 
#     optimizer__learning_rate=0.001, epochs=100, verbose=0, callbacks=[early_stop])

## Building the model

In [11]:
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR

In [12]:
estimators = []

# 일반적으로 높은 성능을 보이는 모델들
estimators.append(('lasso', Lasso()))
estimators.append(('elasticnet', ElasticNet()))
# estimators.append(('rf', RandomForestRegressor())) 1h ↑
# estimators.append(('gbr', GradientBoostingRegressor())) 1h ↑
# estimators.append(('xgb', XGBRegressor())) # 20m
# estimators.append(('lgbm', LGBMRegressor())) ?
estimators.append(('linear', LinearRegression()))
# estimators.append(('svr', SVR())) 1h ↑
# estimators.append(('ann', k_regressor)) 20m ↑

In [13]:
# lg_nrmse 수치 비교
# 전체 0.7133729798071168
# svr 0.680729299180608
# linear 0.6759044272755277 (1.0645019615447864)
# lgbm 0.6946691154605198
# xgb 0.6908788450193392
# gbr 0.7008013449344662
# rf 0.6931997759450955
# elasticnet 0.6772944259379241 (1.0430991032855124)

# elasticnet + linear = 0.6747423563888522 (1.064536665606918)
# elasticnet + lasso = (1.0431273649624482)
# 전체 데이터셋 우승자: lasso


In [14]:
stacker = StackingRegressor(estimators=estimators, final_estimator = XGBRegressor(n_estimators=100, learning_rate=0.08, gamma = 0, subsample=0.75, colsample_bytree = 1, max_depth=7))
regressor = MultiOutputRegressor(estimator=stacker)

regressor.fit(X_train, y_train)
print('Done.')

Done.


## Predicting the Test set results

In [15]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1,14), y_test.values.reshape(len(y_test),1,14)),1))

[[[  1.32   1.08   0.94 ... -26.15 -26.06 -26.11]
  [  1.32   1.1    0.96 ... -25.69 -25.74 -25.71]]

 [[  1.36   1.08   1.06 ... -26.13 -26.12 -26.16]
  [  2.5    1.69   2.26 ... -26.18 -26.12 -25.96]]

 [[  1.22   0.99   0.93 ... -26.18 -26.16 -26.21]
  [  1.97   1.43   1.65 ... -24.73 -24.8  -24.51]]

 ...

 [[  1.34   1.     1.01 ... -26.24 -26.28 -26.31]
  [  1.54   1.11   1.4  ... -26.55 -26.64 -26.38]]

 [[  1.33   0.96   0.96 ... -26.39 -26.39 -26.38]
  [  1.07   0.24   0.7  ... -27.48 -27.48 -27.58]]

 [[  1.4    1.11   1.13 ... -26.19 -26.25 -26.24]
  [  1.68   1.39   1.5  ... -25.17 -25.04 -25.02]]]


## Evaluating the Model Performance

###  Defining loss function

In [16]:
def lg_nrmse(gt, preds):
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    all_nrmse = []
    for idx in range(1,15): # ignore 'ID'
        rmse = metrics.mean_squared_error(gt[:,idx], preds[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:15])
    return score

In [17]:
import sklearn.metrics as metrics

print(lg_nrmse(y_test.values.reshape(1, len(y_test), 14), y_pred.reshape(1, len(y_pred), 14)))

1.0600093093784524


### KFold Cross Validation

In [18]:
# from sklearn.model_selection import cross_val_score
# accuracies = cross_val_score(estimator = regressor, X = X_train, y = y_train, cv = 5)
# print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
# print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

### Applying Grid Search to find the best model and the best parameters

In [19]:
# from sklearn.multioutput import MultiOutputRegressor
# from sklearn.svm import SVR
# from sklearn.model_selection import GridSearchCV
# from sklearn.pipeline import Pipeline

# # pipe_svr = Pipeline([('reg', MultiOutputRegressor(SVR()))])
# pipe_svr = Pipeline([('linear', LinearRegression()),
#                      ('svr', MultiOutputRegressor(SVR()))])
# # pipe_svr = Pipeline([('reg', regressor)])

# grid_param_svr = {
#     #'reg__estimator__C': [0, 0.1, 10]
# }

# gs_svr = (GridSearchCV(estimator=pipe_svr, 
#                       param_grid=grid_param_svr, 
#                       cv=2,
#                       scoring = lg_nrmse,#'neg_mean_squared_error',
#                       n_jobs = -1))

# gs_svr = gs_svr.fit(X_train,y_train)
# gs_svr.best_estimator_    

# Pipeline(steps=[('linear', LinearRegression(copy_X=False, fit_intercept=False, n_jobs=1, normalize=False, positive=False)),
#                 ('svr', SVR(C=10, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto', kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))])
# # dict_keys(['C', 'cache_size', 'coef0', 'degree', 'epsilon', 'gamma', 'kernel', 'max_iter', 'shrinking', 'tol', 'verbose'])
# print("Best Accuracy: {:.2f} %".format(gs_svr.best_score_*100))
# print("Best Parameters:", gs_svr.best_params_)
# print("Best Estimator:", gs_svr.best_estimator_)

## Visualizing the Test set results

In [20]:
# from matplotlib.colors import ListedColormap
# X_set, y_set = X_test, y_test
# X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
#                      np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
# plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
#              alpha = 0.75, cmap = ListedColormap(('red', 'green')))
# plt.xlim(X1.min(), X1.max())
# plt.ylim(X2.min(), X2.max())
# for i, j in enumerate(np.unique(y_set)):
#     plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
#                 c = ListedColormap(('red', 'green'))(i), label = j)
# plt.title('Kernel SVM (Test set)')
# plt.xlabel('Age')
# plt.ylabel('Estimated Salary')
# plt.legend()
# plt.show()

## Inference

In [21]:
test_x = pd.read_csv('./test_master.csv').drop(columns=['ID'])

In [22]:
preds = regressor.predict(rbst_scaler.transform(test_x))
print('Done.')

Done.


## Submit

In [23]:
submit = pd.read_csv('./sample_submission.csv')

In [24]:
for idx, col in enumerate(submit.columns):
    if col=='ID':
        continue
    submit[col] = preds[:,idx-1]
print('Done.')

Done.


In [25]:
submit.to_csv('./submit.csv', index=False)