In [13]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import make_scorer
from sklearn  import metrics
from sklearn.ensemble import RandomForestRegressor

In [2]:
# train csv
zip_dir = '/Data1/Radar'
meta_dir = os.path.join(zip_dir, 'meta')
x_feature = pd.read_csv(meta_dir+'/x_feature_info.csv')
train = pd.read_csv(os.path.join(zip_dir,'train.csv'))
test = pd.read_csv(os.path.join(zip_dir,'test.csv'))
submission = pd.read_csv(zip_dir+'/sample_submission.csv')

In [9]:
def lg_nrmse(gt, preds):
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    all_nrmse = []
    for idx in range(14): # ignore 'ID'
        rmse = metrics.mean_squared_error(gt[:,idx], preds[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:14])
    return score

In [3]:
def my_preprocessing(dataset) :
    preprocessed = dataset.drop(['ID','X_04','X_10','X_11','X_23','X_47','X_48'],axis=1)
    preprocessed['X_021'] = dataset['X_02'].apply(lambda x : 1.0 if x > 103.320 else 0.0)
    preprocessed['X_03'] = dataset['X_03'].apply(lambda x : np.log(x))
    preprocessed['X_051'] = dataset['X_05'].apply(lambda x : 1.0 if x <=102.25 else 0.0)
    preprocessed['X_07'] = dataset['X_07'].apply(lambda x : 40 if x > 40 else x)
    preprocessed['X_08'] = dataset['X_08'].apply(lambda x : 6.1 if np.log(x) > 6 else np.log(x))
    preprocessed['X_09'] = dataset['X_09'].apply(lambda x : np.log(x))
    preprocessed['X_25'] = dataset['X_25'].apply(lambda x : 2.2 if x > 2.2 else x)
    preprocessed['X_26'] = dataset['X_26'].apply(lambda x : 2.2 if x > 2.2 else x)
    preprocessed['X_27'] = dataset['X_27'].apply(lambda x : 2.2 if x > 2.2 else x)
    preprocessed['X_28'] = dataset['X_28'].apply(lambda x : 2.25 if x > 2.25 else x)
    preprocessed['X_29'] = dataset['X_29'].apply(lambda x : 2.3 if x > 2.3 else x)
    preprocessed['X_31'] = dataset['X_31'].apply(lambda x : 2.0 if x > 2.0 else x)
    preprocessed['X_33'] = dataset['X_33'].apply(lambda x : 2.0 if x > 2.0 else x)
    preprocessed['X_38'] = dataset['X_38'].apply(lambda x : -2.66 if x > -2.65 else x)
    preprocessed['X_39'] = dataset['X_39'].apply(lambda x : -2.66 if x > -2.65 else x)
    preprocessed['X_49'] = dataset['X_49'].apply(lambda x : 20000.0 if x > 20000 else x)
    preprocessed['X_50'] = dataset['X_50'].apply(lambda x : 120.0 if x < 120 else x)
    preprocessed['X_51'] = dataset['X_51'].apply(lambda x : 120.0 if x < 120 else x)    
    preprocessed['X_51'] = dataset['X_51'].apply(lambda x : 150.0 if x > 150 else x)
    preprocessed['X_52'] = dataset['X_52'].apply(lambda x : 120.0 if x < 120 else x)
    preprocessed['X_53'] = dataset['X_53'].apply(lambda x : 120.0 if x < 120 else x)
    preprocessed['X_54'] = dataset['X_54'].apply(lambda x : 120.0 if x < 120 else x)    
    preprocessed['X_54'] = dataset['X_54'].apply(lambda x : 150.0 if x > 150 else x)
    preprocessed['X_55'] = dataset['X_55'].apply(lambda x : 120.0 if x < 120 else x)    
    preprocessed['X_56'] = dataset['X_56'].apply(lambda x : 120.0 if x < 120 else x)
    
    preprocessed['X_57'] = dataset['X_30'] * dataset['X_34'] # 스크류삽입깊이 x 스크류 체결시 분당 회전수
    preprocessed['X_58'] = dataset['X_31'] * dataset['X_35']
    preprocessed['X_59'] = dataset['X_32'] * dataset['X_36']
    preprocessed['X_60'] = dataset['X_33'] * dataset['X_37']
    preprocessed = preprocessed.drop(['X_30','X_31','X_32','X_33','X_34','X_35','X_36','X_37'],axis=1)
    
    return preprocessed[sorted(preprocessed.T.index)]

In [4]:
processed = my_preprocessing(train)
train_x = processed[processed.columns[:-14]]
print(f'train_x : {train_x.shape}')
train_y = processed[processed.columns[-14:]]
print(f'train_y : {train_y.shape}')

train_x : (39607, 48)
train_y : (39607, 14)


In [5]:
scaler = StandardScaler()
scaler.fit(train_x)
scaled_x = scaler.transform(train_x)
scaled_x.shape

(39607, 48)

In [6]:
label = np.array(train_y)
label.shape

(39607, 14)

# Split the dataset

In [7]:
x_train, x_test, y_train, y_test = tts(scaled_x, label, test_size = 0.2, random_state = 1)
print(f'x_train : {x_train.shape}')
print(f'y_train : {y_train.shape}')
print(f'x_test  : {x_test.shape}')
print(f'y_test  : {y_test.shape}')

x_train : (31685, 48)
y_train : (31685, 14)
x_test  : (7922, 48)
y_test  : (7922, 14)


# Regressor

In [28]:
score = make_scorer(lg_nrmse, greater_is_better=False)

regr_multirf = MultiOutputRegressor(
    RandomForestRegressor(random_state=1)
)
params ={
    'estimator__bootstrap' : [True, False],
    'estimator__n_estimators':[100,200,300],
    'estimator__max_depth':[6,8,10,12,20,30],
    
    'estimator__min_samples_leaf':[4,8,12,18,20,24],
    'estimator__min_samples_split':[4,8,16,20,30]
}


clf = RandomizedSearchCV(regr_multirf,params,random_state=1,cv=5,n_jobs=-1, scoring = score, n_iter =20)

In [29]:
clf.fit(x_train, y_train)

In [30]:
print('최적 하이퍼파라미터: ', clf.best_params_)

최적 하이퍼파라미터:  {'estimator__n_estimators': 300, 'estimator__min_samples_split': 16, 'estimator__min_samples_leaf': 8, 'estimator__max_depth': 30, 'estimator__bootstrap': True}


In [34]:
clf.cv_results_

{'mean_fit_time': array([1871.11258693, 2272.40145974, 3014.11152782,  752.72114658,
        1625.5865077 ,  746.90554352, 1140.44539309, 1634.79637942,
        1350.14022627, 1112.04590445, 3267.13111396,  898.43676815,
        1346.26615763, 1337.1903842 , 3448.34120288, 2593.35906835,
        1519.00604849, 2198.72180114,  907.93825793, 1591.12790399]),
 'std_fit_time': array([124.82038126,  45.50992639,  40.13703371,   5.73994428,
         73.54497348,  40.08011333,   9.49872099, 156.28950012,
        101.10033273,  40.24399961,  67.18402198,   7.29099164,
         17.99615982,  31.8933243 ,  49.58112871,  45.87946463,
         19.60921376,  47.96649901,   7.21705498,  35.64755425]),
 'mean_score_time': array([2.46722398, 1.68465137, 4.50647888, 1.24915543, 1.39781761,
        1.33572006, 1.74307628, 2.00661712, 1.81186419, 1.73011403,
        3.01402302, 1.41183562, 2.1767169 , 1.16178565, 3.19397511,
        3.63193355, 1.17964172, 1.62506456, 1.40955634, 1.95341573]),
 'std_scor

In [35]:
clf.best_score_

-1.9547894595887

In [55]:
params = clf.cv_results_['params']
mean_test_score = clf.cv_results_['mean_test_score']
rank_test_score = clf.cv_results_['rank_test_score']

In [73]:
clf_results = pd.DataFrame({
    'params' : params,
    'mean_test_score' : mean_test_score,
    'rank_test_score' : rank_test_score});clf_results
clf_results.sort_values(by=['rank_test_score'],inplace=True)
clf_results

Unnamed: 0,params,mean_test_score,rank_test_score
2,"{'estimator__n_estimators': 300, 'estimator__m...",-1.954789,1
15,"{'estimator__n_estimators': 300, 'estimator__m...",-1.95664,2
0,"{'estimator__n_estimators': 200, 'estimator__m...",-1.957267,3
12,"{'estimator__n_estimators': 100, 'estimator__m...",-1.958113,4
9,"{'estimator__n_estimators': 100, 'estimator__m...",-1.9585,5
7,"{'estimator__n_estimators': 200, 'estimator__m...",-1.959759,6
18,"{'estimator__n_estimators': 100, 'estimator__m...",-1.960634,7
11,"{'estimator__n_estimators': 100, 'estimator__m...",-1.961239,8
8,"{'estimator__n_estimators': 200, 'estimator__m...",-1.963826,9
3,"{'estimator__n_estimators': 100, 'estimator__m...",-1.963832,10


In [74]:
clf.best_params_

{'estimator__n_estimators': 300,
 'estimator__min_samples_split': 16,
 'estimator__min_samples_leaf': 8,
 'estimator__max_depth': 30,
 'estimator__bootstrap': True}

# Inferenece

In [38]:
my_test = my_preprocessing(test)

In [40]:
scaled_x = scaler.transform(my_test)
scaled_x.shape

(39608, 48)

In [77]:
res = clf.predict(scaled_x)

In [80]:
label_names = submission.columns;label_names

Index(['ID', 'Y_01', 'Y_02', 'Y_03', 'Y_04', 'Y_05', 'Y_06', 'Y_07', 'Y_08',
       'Y_09', 'Y_10', 'Y_11', 'Y_12', 'Y_13', 'Y_14'],
      dtype='object')

In [84]:
submit = pd.concat([submission[['ID']],pd.DataFrame(res)],axis=1)
submit.columns = label_names
submit.to_csv('m0818.csv', index=False)