## Importing the libraries

In [None]:
import pandas as pd
import random
import os
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor

# added
import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore')

SEED = 42

In [None]:
def seed_everything(seed):
    """Seed the random sources used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and seeds
    both Python's ``random`` module and NumPy's legacy global RNG with ``seed``.

    Note: ``PYTHONHASHSEED`` is read by Python at interpreter start-up, so
    setting it here influences child processes rather than the running kernel.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
seed_everything(SEED) # Fix the random seeds so runs are repeatable

## Importing the dataset

In [None]:
# Read the training and test tables from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))

In [None]:
# Missing values: print the per-column count, then show how many columns contain any.
print(train.isna().sum())
train.isna().any().sum()

## EDA

In [None]:
# (number of rows, number of columns) of the training table
train.shape

In [None]:
# Summary statistics; include='all' reports on every column, not only numeric ones
train.describe(include='all')

In [None]:
# One histogram per numeric column. The trailing semicolon keeps the cell from
# echoing the returned array of Axes objects underneath the figure.
train.hist(figsize=(30, 30));

## Preprocessing

In [None]:
# Columns are selected by name: every column whose label contains 'X' is an
# input feature, every column whose label contains 'Y' is an output target.
X_train = train.filter(like='X')  # inputs
y_train = train.filter(like='Y')  # outputs

X_test = test.filter(like='X')  # inputs of the test table (no targets selected for it)

## Feature Scaling

In [None]:
# from sklearn.preprocessing import MinMaxScaler
# sc_X = MinMaxScaler()
# sc_y = MinMaxScaler()
# X_train = sc_X.fit_transform(X_train)
# y_train = sc_y.fit_transform(y_train)

# Scale with the median and IQR (RobustScaler) to minimise the influence of outliers.
from sklearn.preprocessing import RobustScaler

# Each scaler is fitted on the training data only; the feature scaler is then
# reused unchanged on the test features.
sc_X = RobustScaler().fit(X_train)
sc_y = RobustScaler().fit(y_train)

X_train, y_train = sc_X.transform(X_train), sc_y.transform(y_train)
X_test = sc_X.transform(X_test)

# # X = X.copy() # solving thrown error

# X.loc[:] = sc_x.fit_transform(X.loc[:]) # X.loc[:, ~X.columns.isin(['X_04', 'X_23', 'X_47', 'X_48'])] = sc_x.fit_transform(X.loc[:, ~X.columns.isin(['X_04', 'X_23', 'X_47', 'X_48'])]) # excluding categorical columns
# y.loc[:] = sc_y.fit_transform(y.loc[:])

In [None]:
# Count fully duplicated feature rows.
# The check runs on the feature columns of the raw `train` frame: no variable
# named `X` exists in this notebook, and `X_train` has already been replaced by
# the scaler's NumPy output, which has no `.duplicated()` method.
train.filter(regex='X').duplicated().sum()

In [None]:
# Preview the feature columns (first rows only) instead of echoing an undefined `X`.
train.filter(regex='X').head()

## Splitting the dataset into the Training set and Test set

In [None]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# print(X_train.shape)
# print(X_test.shape)

## Unifying the Training set to have a single dependent variable

In [None]:
# Build one training frame per target: all X feature columns plus exactly one
# Y column (Y_01 .. Y_14), so each frame has a single dependent variable.
# The frames are built from the raw `train` table because `X_train` / `y_train`
# are NumPy arrays after the scaling cell and no longer carry column names.
feature_frame = train.filter(regex='X')
target_frame = train.filter(regex='Y')

X_trains = []
for i in range(1, 15):
    y_name = 'Y_' + str(i).zfill(2)
    X_trains.append(pd.concat([feature_frame, target_frame[y_name]], axis=1))

# Show only the first rows of the first frame as a sanity check.
X_trains[0].head()

## Iterating through the loop to find the best output

In [None]:
!pip install --pre pycaret

In [None]:
def lg_nrmse(gt, preds):
    """Weighted sum of per-target normalised RMSE (lower is better).

    For each of the 14 target columns the RMSE between ``gt`` and ``preds`` is
    divided by the mean absolute ground-truth value of that column. The first
    8 targets (Y_01 .. Y_08) are weighted by 1.2, the remaining 6 by 1.0.

    Parameters
    ----------
    gt, preds : 2-D array-like with at least 15 columns
        Column 0 is skipped (the 'ID' column); columns 1..14 hold the targets.
        Both inputs must have the same number of rows.

    Returns
    -------
    numpy.float64
        The weighted NRMSE score.

    Raises
    ------
    IndexError
        If either input has fewer than 15 columns.
    ValueError
        If the two inputs do not have the same number of rows.
    """
    # np.asarray lets callers pass DataFrames as well as arrays; only the target
    # columns are cast to float so a non-numeric ID column does not matter.
    gt_y = np.asarray(gt)[:, 1:15].astype(float)
    pred_y = np.asarray(preds)[:, 1:15].astype(float)

    if gt_y.shape[1] != 14 or pred_y.shape[1] != 14:
        raise IndexError("gt and preds need an ID column followed by 14 target columns")
    if gt_y.shape != pred_y.shape:
        raise ValueError("gt and preds must have the same number of rows")

    # RMSE is computed directly with NumPy: the notebook never imports
    # `sklearn.metrics`, and `mean_squared_error(squared=False)` has been
    # removed from recent scikit-learn releases.
    rmse = np.sqrt(np.mean((gt_y - pred_y) ** 2, axis=0))
    all_nrmse = rmse / np.mean(np.abs(gt_y), axis=0)

    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:])
    return score

In [None]:
from sklearn.ensemble import GradientBoostingRegressor #, RandomForestRegressor
# from xgboost import XGBRegressor
# from lightgbm import LGBMRegressor

# from tensorflow.keras.utils import plot_model
from pycaret.regression import *

In [None]:
# GradientBoostingRegressor's `loss` accepts only the names of its built-in
# losses, so passing the `lg_nrmse` function is rejected; `lg_nrmse` is an
# evaluation metric, not a per-sample training loss. The default loss is used
# here and the estimator is seeded so repeated fits give the same model.
regressor = GradientBoostingRegressor(random_state=SEED)

In [None]:
# Train one pycaret regression model per target and collect its test-set
# predictions. This loop must run: the Submit section concatenates `results`,
# which no other cell defines.
results = []  # one (n_test_rows, 1) prediction column per target, in target order

feature_frame = train.filter(regex='X')
target_frame = train.filter(regex='Y')
test_x = test.drop(columns=['ID'])  # test features, prepared once instead of re-reading the CSV per target

# Iterate over the targets; each gets its own experiment and model.
for y_col in target_frame.columns:
    print("Currently processing '{}' Dependent Variable".format(y_col))

    # Training frame for this target: all X features plus the single Y column.
    target_train = pd.concat([feature_frame, target_frame[y_col]], axis=1)

    # setup() starts a fresh experiment; normalisation is done by pycaret here,
    # so the raw (unscaled) feature values are passed in.
    setup(target_train, target=y_col, normalize=True, normalize_method='minmax',
          fold=5, fold_shuffle=True, use_gpu=True, session_id=SEED)

    best_model = compare_models(include=['gbr'], n_select=1)  # only gradient boosting is compared
    print("Best Model:", best_model)

    clf = create_model(estimator=best_model, fold=5, cross_validation=True)
    tuned_clf = tune_model(clf)  # hyper-parameter search

    print("Evaluating the model performance...")
    # NOTE(review): `use_train_data` may not be accepted by every pycaret release — confirm against the installed version.
    evaluate_model(tuned_clf, use_train_data=True)

    # predict_model() returns the input frame with the prediction as its last
    # column; keep it as an (n, 1) column so the columns can be concatenated later.
    results.append(np.expand_dims(predict_model(tuned_clf, data=test_x).iloc[:, -1], axis=1))

## ~~Evaluating the model~~

In [None]:
# def lg_nrmse(gt, preds):
#     # 각 Y Feature별 NRMSE 총합
#     # Y_01 ~ Y_08 까지 20% 가중치 부여
#     all_nrmse = []
#     for idx in range(1,15): # ignore 'ID'
#         rmse = metrics.mean_squared_error(gt[:,idx], preds[:,idx], squared=False)
#         nrmse = rmse/np.mean(np.abs(gt[:,idx]))
#         all_nrmse.append(nrmse)
#     score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:15])
#     return score

In [None]:
# evaluate_model(tuned_clf, use_train_data=True)

In [None]:
# # !pip install shap
# interpret_model(tuned_clf, plot='summary')

## Submit

In [None]:
# Load the sample submission file; its non-'ID' columns are overwritten with predictions further down
submit = pd.read_csv('./sample_submission.csv')

In [None]:
# Integrating the multiple results: join the per-target prediction columns side by side
# (axis=1), giving one column per entry of `results`.
# The inverse-scaling variant below is disabled; presumably the predictions are
# already on the original target scale — TODO confirm.
# preds = sc_y.inverse_transform(np.concatenate(results, axis=1))
preds = np.concatenate(results, axis=1)

In [None]:
# Copy the predictions into the submission frame, one prediction column per
# non-'ID' submission column. Target columns are picked by name so the mapping
# does not depend on 'ID' being the first column.
target_cols = [col for col in submit.columns if col != 'ID']

# Fail loudly on a shape mismatch instead of writing misaligned columns.
if preds.shape != (len(submit), len(target_cols)):
    raise ValueError(
        "preds has shape {}, expected {}".format(preds.shape, (len(submit), len(target_cols)))
    )

for idx, col in enumerate(target_cols):
    submit[col] = preds[:, idx]
print('Done.')

In [None]:
# Write the filled submission to disk; index=False keeps the DataFrame index out of the CSV
submit.to_csv('./submit.csv', index=False)