In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import make_scorer
from sklearn  import metrics

In [2]:
# train csv
zip_dir = '/Data1/Radar'
meta_dir = os.path.join(zip_dir, 'meta')
x_feature = pd.read_csv(meta_dir+'/x_feature_info.csv')
train = pd.read_csv(os.path.join(zip_dir,'train.csv'))
test = pd.read_csv(os.path.join(zip_dir,'test.csv'))
submission = pd.read_csv(zip_dir+'/sample_submission.csv')

In [3]:
def lg_nrmse(gt, preds):
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    all_nrmse = []
    for idx in range(14): # ignore 'ID'
        rmse = metrics.mean_squared_error(gt[:,idx], preds[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:14])
    return score

In [4]:
def my_preprocessing(dataset) :
    preprocessed = dataset.drop(['ID','X_04','X_10','X_11','X_23','X_47','X_48'],axis=1)
    preprocessed['X_021'] = dataset['X_02'].apply(lambda x : 1.0 if x > 103.320 else 0.0)
    preprocessed['X_03'] = dataset['X_03'].apply(lambda x : np.log(x))
    preprocessed['X_051'] = dataset['X_05'].apply(lambda x : 1.0 if x <=102.25 else 0.0)
    preprocessed['X_07'] = dataset['X_07'].apply(lambda x : 40 if x > 40 else x)
    preprocessed['X_08'] = dataset['X_08'].apply(lambda x : 6.1 if np.log(x) > 6 else np.log(x))
    preprocessed['X_09'] = dataset['X_09'].apply(lambda x : np.log(x))
    preprocessed['X_25'] = dataset['X_25'].apply(lambda x : 2.2 if x > 2.2 else x)
    preprocessed['X_26'] = dataset['X_26'].apply(lambda x : 2.2 if x > 2.2 else x)
    preprocessed['X_27'] = dataset['X_27'].apply(lambda x : 2.2 if x > 2.2 else x)
    preprocessed['X_28'] = dataset['X_28'].apply(lambda x : 2.25 if x > 2.25 else x)
    preprocessed['X_29'] = dataset['X_29'].apply(lambda x : 2.3 if x > 2.3 else x)
    preprocessed['X_31'] = dataset['X_31'].apply(lambda x : 2.0 if x > 2.0 else x)
    preprocessed['X_33'] = dataset['X_33'].apply(lambda x : 2.0 if x > 2.0 else x)
    preprocessed['X_38'] = dataset['X_38'].apply(lambda x : -2.66 if x > -2.65 else x)
    preprocessed['X_39'] = dataset['X_39'].apply(lambda x : -2.66 if x > -2.65 else x)
    preprocessed['X_49'] = dataset['X_49'].apply(lambda x : 20000.0 if x > 20000 else x)
    preprocessed['X_50'] = dataset['X_50'].apply(lambda x : 120.0 if x < 120 else x)
    preprocessed['X_51'] = dataset['X_51'].apply(lambda x : 120.0 if x < 120 else x)    
    preprocessed['X_51'] = dataset['X_51'].apply(lambda x : 150.0 if x > 150 else x)
    preprocessed['X_52'] = dataset['X_52'].apply(lambda x : 120.0 if x < 120 else x)
    preprocessed['X_53'] = dataset['X_53'].apply(lambda x : 120.0 if x < 120 else x)
    preprocessed['X_54'] = dataset['X_54'].apply(lambda x : 120.0 if x < 120 else x)    
    preprocessed['X_54'] = dataset['X_54'].apply(lambda x : 150.0 if x > 150 else x)
    preprocessed['X_55'] = dataset['X_55'].apply(lambda x : 120.0 if x < 120 else x)    
    preprocessed['X_56'] = dataset['X_56'].apply(lambda x : 120.0 if x < 120 else x)
    
    preprocessed['X_57'] = dataset['X_30'] * dataset['X_34'] # 스크류삽입깊이 x 스크류 체결시 분당 회전수
    preprocessed['X_58'] = dataset['X_31'] * dataset['X_35']
    preprocessed['X_59'] = dataset['X_32'] * dataset['X_36']
    preprocessed['X_60'] = dataset['X_33'] * dataset['X_37']
    preprocessed = preprocessed.drop(['X_30','X_31','X_32','X_33','X_34','X_35','X_36','X_37'],axis=1)
    
    return preprocessed[sorted(preprocessed.T.index)]

In [5]:
processed = my_preprocessing(train)
train_x = processed[processed.columns[:-14]]
print(f'train_x : {train_x.shape}')
train_y = processed[processed.columns[-14:]]
print(f'train_y : {train_y.shape}')

train_x : (39607, 48)
train_y : (39607, 14)


In [6]:
scaler = StandardScaler()
scaler.fit(train_x)
scaled_x = scaler.transform(train_x)
scaled_x.shape

(39607, 48)

In [7]:
label = np.array(train_y)
label.shape

(39607, 14)

# Split the dataset

In [8]:
x_train, x_test, y_train, y_test = tts(scaled_x, label, test_size = 0.2, random_state = 1)
print(f'x_train : {x_train.shape}')
print(f'y_train : {y_train.shape}')
print(f'x_test  : {x_test.shape}')
print(f'y_test  : {y_test.shape}')

x_train : (31685, 48)
y_train : (31685, 14)
x_test  : (7922, 48)
y_test  : (7922, 14)


# Regressor

In [9]:
import xgboost

  from pandas import MultiIndex, Int64Index


In [10]:
multi_xgb = MultiOutputRegressor(xgboost.XGBRegressor(random_state=1 ))

params ={
    'estimator__n_estimators' : [100,200,300,400,500],
    'estimator__max_depth' : [6,8,10,12,20,30],
    'estimator__learning_rate' : [0.01,0.03,0.05,0.07,0.1,0.3],
    'estimator__gamma': [0.1,0.2,0.3,0.4], 
    'estimator__subsample' : [0.6,0.5,0.4],
    'estimator__verbosity' : [1]
}

score = make_scorer(lg_nrmse, greater_is_better=False)

In [11]:
RS = RandomizedSearchCV(
    multi_xgb, param_distributions=params, random_state=1, n_iter=20, cv=5, verbose=1, n_jobs=-1, return_train_score=True)

In [12]:
RS.fit(x_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex,

In [13]:
print('최적 하이퍼파라미터: ', RS.best_params_)

최적 하이퍼파라미터:  {'estimator__verbosity': 1, 'estimator__subsample': 0.6, 'estimator__n_estimators': 400, 'estimator__max_depth': 10, 'estimator__learning_rate': 0.03, 'estimator__gamma': 0.1}


In [14]:
RS.cv_results_

{'mean_fit_time': array([2634.0067946 , 1741.35986962,  171.12424192,  413.70248332,
         925.2353703 , 1723.21665597, 1404.5105598 ,  279.24950194,
        1334.2922236 ,  592.29904499, 4663.89269018, 3370.51538072,
         898.25665059,  650.33492608,  758.42551975, 4811.28804936,
        4350.80118117,  223.54172974,  527.63026433, 1257.39346633]),
 'std_fit_time': array([154.49613512, 132.6015325 ,   7.27171701,  21.43193734,
          6.07364207,  84.01007498, 108.74205645,   2.67408702,
        121.07493013,  24.62262544, 171.68361672, 222.11876762,
         66.57281528,  26.46880777,  41.15100105,  17.30469653,
         19.41109492,   1.89126555,  12.65445953,  84.89590941]),
 'mean_score_time': array([ 4.06092529,  8.40743146,  0.83237739,  1.81046839,  3.6446826 ,
         8.44790497,  6.06742039,  1.45458984,  5.86746006,  2.16514916,
        13.28606691, 15.14692016,  3.72267895,  2.28320723,  2.70083814,
         7.63020306, 15.42563343,  1.03868208,  1.89906459,  4.06

In [15]:
RS.best_score_

0.06816641312225895

In [16]:
params = RS.cv_results_['params']
mean_test_score = RS.cv_results_['mean_test_score']
rank_test_score = RS.cv_results_['rank_test_score']

In [17]:
RS_results = pd.DataFrame({
    'params' : params,
    'mean_test_score' : mean_test_score,
    'rank_test_score' : rank_test_score})
RS_results.sort_values(by=['rank_test_score'],inplace=True)
RS_results

Unnamed: 0,params,mean_test_score,rank_test_score
6,"{'estimator__verbosity': 1, 'estimator__subsam...",0.068166,1
9,"{'estimator__verbosity': 1, 'estimator__subsam...",0.067848,2
5,"{'estimator__verbosity': 1, 'estimator__subsam...",0.063774,3
12,"{'estimator__verbosity': 1, 'estimator__subsam...",0.062056,4
14,"{'estimator__verbosity': 1, 'estimator__subsam...",0.061305,5
8,"{'estimator__verbosity': 1, 'estimator__subsam...",0.042964,6
11,"{'estimator__verbosity': 1, 'estimator__subsam...",0.040627,7
1,"{'estimator__verbosity': 1, 'estimator__subsam...",0.037466,8
3,"{'estimator__verbosity': 1, 'estimator__subsam...",0.034739,9
16,"{'estimator__verbosity': 1, 'estimator__subsam...",0.033341,10


# Inferenece

In [18]:
my_test = my_preprocessing(test)

In [19]:
scaled_x = scaler.transform(my_test)
scaled_x.shape

(39608, 48)

In [20]:
res = RS.predict(scaled_x)

In [22]:
label_names = submission.columns;label_names

Index(['ID', 'Y_01', 'Y_02', 'Y_03', 'Y_04', 'Y_05', 'Y_06', 'Y_07', 'Y_08',
       'Y_09', 'Y_10', 'Y_11', 'Y_12', 'Y_13', 'Y_14'],
      dtype='object')

In [23]:
submit = pd.concat([submission[['ID']],pd.DataFrame(res)],axis=1)
submit.columns = label_names
submit.to_csv('m0819b.csv', index=False)