In [1]:
# # Colab execute.
# from google.colab import drive


# drive.mount('/content/drive')
# _path = '/content/drive/MyDrive/antenna-performance-prediction/antenna-performance-prediction'
# %cd /content/drive/MyDrive/antenna-performance-prediction/antenna-performance-prediction/

In [2]:
import time
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.multioutput import RegressorChain

from bayes_opt import BayesianOptimization
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from mylibrary import manipulation
from mylibrary import visualization as visual

# Local execute.
origin_train = pd.read_csv('dataset/train.csv')
origin_test = pd.read_csv('dataset/test.csv')
x_info = pd.read_csv('dataset/meta/x_feature_info.csv')
y_info = pd.read_csv('dataset/meta/y_feature_info.csv')
submission = pd.read_csv("dataset/sample_submission.csv")

# Colab execute.
# origin_train = pd.read_csv(f'{_path}/dataset/train.csv')
# origin_test = pd.read_csv(f'{_path}/dataset/test.csv')
# x_info = pd.read_csv(f'{_path}/dataset/meta/x_feature_info.csv')
# y_info = pd.read_csv(f'{_path}/dataset/meta/y_feature_info.csv')
# submission = pd.read_csv(f'{_path}/dataset/sample_submission.csv')


x_train = origin_train.filter(regex='X')
y_train = origin_train.filter(regex='Y')

x_test = origin_test.drop(columns=['ID'])

x_name = manipulation.make_namegroup(x_train, x_info)
y_name = manipulation.make_namegroup(y_train, y_info)

visual.print_namegroup(x_name)
print()
visual.print_namegroup(y_name)

ImportError: cannot import name 'mean_squared_error' from 'sklearn.model_selection' (C:\Anaconda\lib\site-packages\sklearn\model_selection\__init__.py)

In [None]:
from sklearn.preprocessing import RobustScaler


# n차 검사 통과 여부
# 모든 값이 1임.
x_train = x_train.drop(columns=x_name['n차 검사 통과 여부'])
x_test = x_test.drop(columns=x_name['n차 검사 통과 여부'])

# 방열 재료 n 무게
# 'X_10','X_11'은 결측값이 절대 다수이고,
# 결측값과 아닌 값 사이의 뚜렷한 차이가 나타나지 않음.
x_train = x_train.drop(columns=['X_10','X_11'])
x_test = x_test.drop(columns=['X_10','X_11'])

In [None]:
pd.DataFrame(x_train)

In [None]:
x_test

### Reference
[Comparing random forests and the multi-output meta estimator](https://scikit-learn.org/stable/auto_examples/ensemble/plot_random_forest_regression_multioutput.html#sphx-glr-auto-examples-ensemble-plot-random-forest-regression-multioutput-py)  
[Multiclass and multioutput algorithms](https://scikit-learn.org/stable/modules/multiclass.html#regressorchain)

In [None]:
start_t = time.time()


def lg_nrmse(gt, preds):
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    all_nrmse = []
    for idx in range(1, 15): # ignore 'ID'
        rmse = mean_squared_error(gt[:,idx], preds[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:14])
    return score


def rf_cv(n_estimators=100, max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=1.0, max_leaf_nodes=None):
    
    model = RandomForestRegressor(
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        max_leaf_nodes=max_leaf_nodes,
    )

    model.fit(x_t, y_t)
    y_pred = model.predict(x_eval)

    error = lg_nrmse(y_eval, y_pred)
    return -1 * error


x_t, x_eval, y_t, y_eval = train_test_split(x_train, y_train, test_size=0.33, random_state=42)

pbounds = {
    'n_estimators':(10, 100),
    'max_depth':(1, 100),
#     'min_samples_split':,
#     'min_samples_leat':,
#     'min_weight_fraction_leaf':,
#     'max_features':,
#     'max_leaf_nodes':
}

bo = BayesianOptimization(f=rf_cv, pbounds=pbounds, verbose=2, random_state=1)
bo.maximize(init_points=2, n_iter=10, acq='ei', xi=0.01)
print(bo.max)

end_t = time.time()
t = time.gmtime(end_t - start_t)
print(f"[Training time] => {t.tm_min}(min) {t.tm_sec}(sec)")

In [None]:
#visual.show_pred(origin_train.filter(regex='Y'), y_pred)

In [None]:
for idx, col in enumerate(submission.columns):
    if col == 'ID':
        continue
    submission[col] = y_pred[:, idx-1]
    

submission.to_csv(f"submissions/{time.asctime().replace(':', '-')}.csv", index=False)
print('Done.')