In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import make_scorer
from sklearn  import metrics
from sklearn.ensemble import RandomForestRegressor

In [2]:
# train csv
zip_dir = '/Data1/Radar'
meta_dir = os.path.join(zip_dir, 'meta')
x_feature = pd.read_csv(meta_dir+'/x_feature_info.csv')
train = pd.read_csv(os.path.join(zip_dir,'train.csv'))
test = pd.read_csv(os.path.join(zip_dir,'test.csv'))
submission = pd.read_csv(zip_dir+'/sample_submission.csv')

In [3]:
def lg_nrmse(gt, preds):
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    all_nrmse = []
    for idx in range(14): # ignore 'ID'
        rmse = metrics.mean_squared_error(gt[:,idx], preds[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:14])
    return score

In [13]:
train[train.columns[-14:]]

Unnamed: 0,Y_01,Y_02,Y_03,Y_04,Y_05,Y_06,Y_07,Y_08,Y_09,Y_10,Y_11,Y_12,Y_13,Y_14
0,2.056,1.456,1.680,10.502,29.632,16.083,4.276,-25.381,-25.529,-22.769,23.792,-25.470,-25.409,-25.304
1,1.446,1.184,1.268,18.507,33.179,16.736,3.229,-26.619,-26.523,-22.574,24.691,-26.253,-26.497,-26.438
2,1.251,0.665,0.782,14.082,31.801,17.080,2.839,-26.238,-26.216,-22.169,24.649,-26.285,-26.215,-26.370
3,1.464,1.079,1.052,16.975,34.503,17.143,3.144,-25.426,-25.079,-21.765,24.913,-25.254,-25.021,-25.345
4,0.983,0.646,0.689,15.047,32.602,17.569,3.138,-25.376,-25.242,-21.072,25.299,-25.072,-25.195,-24.974
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39602,1.382,1.215,1.263,10.874,29.194,16.582,3.410,-26.486,-26.581,-22.772,24.261,-26.491,-26.584,-26.580
39603,1.482,0.606,1.083,8.759,29.859,15.659,3.406,-27.308,-27.203,-24.674,23.427,-27.250,-27.334,-27.325
39604,1.117,1.154,0.993,13.159,24.720,16.823,3.215,-26.502,-26.687,-22.577,24.301,-26.388,-26.425,-26.601
39605,0.895,0.187,0.477,9.123,26.412,15.757,4.216,-26.760,-26.634,-24.066,23.305,-26.536,-26.751,-26.635


In [14]:
train_x = np.array(train[train.columns[1:-14]])
print(f'train_x : {train_x.shape}')
train_y = np.array(train[train.columns[-14:]])
print(f'train_y : {train_y.shape}')

train_x : (39607, 56)
train_y : (39607, 14)


In [15]:
scaler = StandardScaler()
scaler.fit(train_x)
scaled_x = scaler.transform(train_x)
scaled_x.shape

(39607, 56)

In [16]:
label = np.array(train_y)
label.shape

(39607, 14)

# Split the dataset

In [20]:
x_train, x_test, y_train, y_test = tts(scaled_x, label, test_size = 0.2, random_state = 1)
print(f'x_train : {x_train.shape}')
print(f'y_train : {y_train.shape}')
print(f'x_test  : {x_test.shape}')
print(f'y_test  : {y_test.shape}')

x_train : (31685, 56)
y_train : (31685, 14)
x_test  : (7922, 56)
y_test  : (7922, 14)


# Regressor

In [21]:
score = make_scorer(lg_nrmse, greater_is_better=False)

regr_multirf = MultiOutputRegressor(
    RandomForestRegressor(random_state=1)
)
params ={
    'estimator__bootstrap' : [True, False],
    'estimator__n_estimators':[100,200,300],
    'estimator__max_depth':[6,8,10,12,20,30],
    
    'estimator__min_samples_leaf':[4,8,12,18,20,24],
    'estimator__min_samples_split':[4,8,16,20,30]
}


clf = RandomizedSearchCV(regr_multirf,params,random_state=1,cv=5,n_jobs=-1, scoring = score, n_iter =20)

In [22]:
clf.fit(x_train, y_train)

In [23]:
print('최적 하이퍼파라미터: ', clf.best_params_)

최적 하이퍼파라미터:  {'estimator__n_estimators': 300, 'estimator__min_samples_split': 16, 'estimator__min_samples_leaf': 8, 'estimator__max_depth': 30, 'estimator__bootstrap': True}


In [24]:
clf.cv_results_

{'mean_fit_time': array([2441.32268658, 2878.86258521, 3731.91776466, 1078.40191531,
        2224.33826375, 1076.99882541, 1613.06192656, 2314.3531981 ,
        1926.28417034, 1571.87794971, 4088.5959064 , 1209.10318108,
        1713.71395526, 1837.40143595, 4238.7576479 , 3153.15856113,
        2006.01848426, 2825.99409027, 1290.08510847, 2143.96793981]),
 'std_fit_time': array([216.72770691,  80.14541631,  11.73229571,  15.43763533,
         24.4575893 ,  23.15416594,   7.71111177, 226.80578029,
         22.92846066,  14.48598093,  61.09065167, 150.36279527,
        191.12856065,  32.39143316,  99.77684365, 183.07796568,
         34.28887938,  25.38374233,  19.63166177,  14.69654931]),
 'mean_score_time': array([2.62054057, 1.67498631, 4.99031062, 1.89709692, 1.59153328,
        1.93520498, 2.19253101, 2.30100455, 2.15102143, 2.69209757,
        3.21518154, 1.66351342, 2.62607923, 1.52631154, 3.29973812,
        3.69521461, 1.39085207, 1.66147404, 1.96038094, 2.37208023]),
 'std_scor

In [25]:
clf.best_score_

-1.9549430103870287

In [26]:
params = clf.cv_results_['params']
mean_test_score = clf.cv_results_['mean_test_score']
rank_test_score = clf.cv_results_['rank_test_score']

In [27]:
clf_results = pd.DataFrame({
    'params' : params,
    'mean_test_score' : mean_test_score,
    'rank_test_score' : rank_test_score});clf_results
clf_results.sort_values(by=['rank_test_score'],inplace=True)
clf_results

Unnamed: 0,params,mean_test_score,rank_test_score
2,"{'estimator__n_estimators': 300, 'estimator__m...",-1.954943,1
15,"{'estimator__n_estimators': 300, 'estimator__m...",-1.957041,2
0,"{'estimator__n_estimators': 200, 'estimator__m...",-1.957693,3
12,"{'estimator__n_estimators': 100, 'estimator__m...",-1.958644,4
9,"{'estimator__n_estimators': 100, 'estimator__m...",-1.959126,5
7,"{'estimator__n_estimators': 200, 'estimator__m...",-1.960273,6
18,"{'estimator__n_estimators': 100, 'estimator__m...",-1.96131,7
11,"{'estimator__n_estimators': 100, 'estimator__m...",-1.961957,8
3,"{'estimator__n_estimators': 100, 'estimator__m...",-1.964121,9
8,"{'estimator__n_estimators': 200, 'estimator__m...",-1.964239,10


In [28]:
clf.best_params_

{'estimator__n_estimators': 300,
 'estimator__min_samples_split': 16,
 'estimator__min_samples_leaf': 8,
 'estimator__max_depth': 30,
 'estimator__bootstrap': True}

# Inferenece

In [29]:
my_test = my_preprocessing(test)

NameError: name 'my_preprocessing' is not defined

In [None]:
scaled_x = scaler.transform(my_test)
scaled_x.shape

In [None]:
res = clf.predict(scaled_x)

In [None]:
label_names = submission.columns;label_names

In [None]:
submit = pd.concat([submission[['ID']],pd.DataFrame(res)],axis=1)
submit.columns = label_names
submit.to_csv('m0820a.csv', index=False)