In [437]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.multioutput import MultiOutputRegressor
from sklearn  import metrics
from sklearn.ensemble import RandomForestRegressor

In [392]:
# train csv
zip_dir = '/Data1/Radar'
meta_dir = os.path.join(zip_dir, 'meta')
x_feature = pd.read_csv(meta_dir+'/x_feature_info.csv')
train = pd.read_csv(os.path.join(zip_dir,'train.csv'))
test = pd.read_csv(os.path.join(zip_dir,'test.csv'))

Unnamed: 0,ID,X_01,X_02,X_03,X_04,X_05,X_06,X_07,X_08,X_09,...,Y_05,Y_06,Y_07,Y_08,Y_09,Y_10,Y_11,Y_12,Y_13,Y_14
0,TRAIN_00001,70.544,103.32,67.47,1,101.892,74.983,29.45,62.38,245.71,...,29.632,16.083,4.276,-25.381,-25.529,-22.769,23.792,-25.47,-25.409,-25.304
1,TRAIN_00002,69.524,103.321,65.17,1,101.944,72.943,28.73,61.23,233.61,...,33.179,16.736,3.229,-26.619,-26.523,-22.574,24.691,-26.253,-26.497,-26.438
2,TRAIN_00003,72.583,103.32,64.07,1,103.153,72.943,28.81,105.77,272.2,...,31.801,17.08,2.839,-26.238,-26.216,-22.169,24.649,-26.285,-26.215,-26.37
3,TRAIN_00004,71.563,103.32,67.57,1,101.971,77.022,28.92,115.21,255.36,...,34.503,17.143,3.144,-25.426,-25.079,-21.765,24.913,-25.254,-25.021,-25.345
4,TRAIN_00005,69.524,103.32,63.57,1,101.981,70.904,29.68,103.38,241.46,...,32.602,17.569,3.138,-25.376,-25.242,-21.072,25.299,-25.072,-25.195,-24.974


In [399]:
def my_preprocessing(dataset) :
    preprocessed = dataset.drop(['ID','X_04','X_10','X_11','X_23','X_47','X_48'],axis=1)
    preprocessed['X_021'] = dataset['X_02'].apply(lambda x : 1.0 if x > 103.320 else 0.0)
    preprocessed['X_03'] = dataset['X_03'].apply(lambda x : np.log(x))
    preprocessed['X_051'] = dataset['X_05'].apply(lambda x : 1.0 if x <=102.25 else 0.0)
    preprocessed['X_07'] = dataset['X_07'].apply(lambda x : 40 if x > 40 else x)
    preprocessed['X_08'] = dataset['X_08'].apply(lambda x : 6.1 if np.log(x) > 6 else np.log(x))
    preprocessed['X_09'] = dataset['X_09'].apply(lambda x : np.log(x))
    preprocessed['X_25'] = dataset['X_25'].apply(lambda x : 2.2 if x > 2.2 else x)
    preprocessed['X_26'] = dataset['X_26'].apply(lambda x : 2.2 if x > 2.2 else x)
    preprocessed['X_27'] = dataset['X_27'].apply(lambda x : 2.2 if x > 2.2 else x)
    preprocessed['X_28'] = dataset['X_28'].apply(lambda x : 2.25 if x > 2.25 else x)
    preprocessed['X_29'] = dataset['X_29'].apply(lambda x : 2.3 if x > 2.3 else x)
    preprocessed['X_31'] = dataset['X_31'].apply(lambda x : 2.0 if x > 2.0 else x)
    preprocessed['X_33'] = dataset['X_33'].apply(lambda x : 2.0 if x > 2.0 else x)
    preprocessed['X_38'] = dataset['X_38'].apply(lambda x : -2.66 if x > -2.65 else x)
    preprocessed['X_39'] = dataset['X_39'].apply(lambda x : -2.66 if x > -2.65 else x)
    preprocessed['X_49'] = dataset['X_49'].apply(lambda x : 20000.0 if x > 20000 else x)
    preprocessed['X_50'] = dataset['X_50'].apply(lambda x : 120.0 if x < 120 else x)
    preprocessed['X_51'] = dataset['X_51'].apply(lambda x : 120.0 if x < 120 else x)    
    preprocessed['X_51'] = dataset['X_51'].apply(lambda x : 150.0 if x > 150 else x)
    preprocessed['X_52'] = dataset['X_52'].apply(lambda x : 120.0 if x < 120 else x)
    preprocessed['X_53'] = dataset['X_53'].apply(lambda x : 120.0 if x < 120 else x)
    preprocessed['X_54'] = dataset['X_54'].apply(lambda x : 120.0 if x < 120 else x)    
    preprocessed['X_54'] = dataset['X_54'].apply(lambda x : 150.0 if x > 150 else x)
    preprocessed['X_55'] = dataset['X_55'].apply(lambda x : 120.0 if x < 120 else x)    
    preprocessed['X_56'] = dataset['X_56'].apply(lambda x : 120.0 if x < 120 else x)
    
    preprocessed['X_57'] = dataset['X_30'] * dataset['X_34'] # 스크류삽입깊이 x 스크류 체결시 분당 회전수
    preprocessed['X_58'] = dataset['X_31'] * dataset['X_35']
    preprocessed['X_59'] = dataset['X_32'] * dataset['X_36']
    preprocessed['X_60'] = dataset['X_33'] * dataset['X_37']
    preprocessed = preprocessed.drop(['X_30','X_31','X_32','X_33','X_34','X_35','X_36','X_37'],axis=1)
    
    return preprocessed[sorted(preprocessed.T.index)]

In [417]:
processed = my_preprocessing(train)
train_x = processed[processed.columns[:-14]]
print(f'train_x : {train_x.shape}')
train_y = processed[processed.columns[-14:]]
print(f'train_y : {train_y.shape}')

train_x : (39607, 48)
train_y : (39607, 14)


In [412]:
scaler = StandardScaler()
scaler.fit(train_x)
scaled_x = scaler.transform(train_x)
scaled_x.shape

(39607, 48)

In [425]:
label = np.array(train_y)
label.shape

(39607, 14)

# Split the dataset

In [432]:
x_train, x_test, y_train, y_test = tts(scaled_x, label, test_size = 0.2, random_state = 1)
print(f'x_train : {x_train.shape}')
print(f'y_train : {y_train.shape}')
print(f'x_test  : {x_test.shape}')
print(f'y_test  : {y_test.shape}')

x_train : (31685, 48)
y_train : (31685, 14)
x_test  : (7922, 48)
y_test  : (7922, 14)


In [438]:
max_depth = 30
regr_multirf = MultiOutputRegressor(
    RandomForestRegressor(n_estimators=100, max_depth=max_depth, random_state=0)
)
regr_multirf.fit(x_train, y_train)

In [439]:
predicted = regr_multirf.predict(x_test)

In [469]:
from sklearn  import metrics

def lg_nrmse(gt, preds):
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    all_nrmse = []
    for idx in range(14): # ignore 'ID'
        rmse = metrics.mean_squared_error(gt[:,idx], preds[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:14])
    return score

In [470]:
lg_nrmse(y_test,predicted)

1.9617460626059677

# Inference

In [472]:
my_test = my_preprocessing(test)

In [473]:
my_test

Unnamed: 0,X_01,X_02,X_021,X_03,X_05,X_051,X_06,X_07,X_08,X_09,...,X_51,X_52,X_53,X_54,X_55,X_56,X_57,X_58,X_59,X_60
0,68.504,103.321,1.0,4.339510,101.867,1.0,73.963,30.51,4.152142,5.479805,...,129.460682,141.506570,133.427229,129.711498,133.138096,121.859684,17.8848,19.8135,17.2501,19.7064
1,67.485,103.320,0.0,4.239454,101.992,1.0,67.845,28.03,4.762088,5.242963,...,128.266277,145.911745,131.196417,132.411480,133.629025,124.178623,17.5984,20.1084,17.4200,19.9640
2,69.524,103.320,0.0,4.233672,101.884,1.0,77.022,29.65,5.326322,5.370312,...,119.988804,132.099908,120.450155,130.051708,128.252972,120.000000,19.3950,21.8010,19.1394,22.2471
3,69.524,103.320,0.0,4.187683,101.866,1.0,73.963,28.15,4.638412,5.197391,...,125.069180,147.507669,123.142653,125.963665,139.666592,126.589253,19.3551,21.4170,18.8486,24.6240
4,73.603,103.321,1.0,4.199755,101.891,1.0,74.983,29.92,4.265493,5.446436,...,135.116192,138.600473,127.173033,137.252712,134.411335,124.020016,19.2548,22.2132,18.9654,21.7267
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39603,68.504,103.320,0.0,4.158414,103.157,0.0,68.864,29.49,4.756603,5.649537,...,126.494312,139.119905,125.271109,128.284572,140.176945,128.292843,17.7826,21.5800,17.5576,20.1240
39604,68.504,103.320,0.0,4.116921,103.137,0.0,68.864,32.29,4.756001,5.607308,...,124.062809,138.238664,120.000000,127.322529,137.312047,131.570614,17.5712,21.2380,17.6664,21.8400
39605,69.524,103.320,0.0,4.153713,103.149,0.0,69.884,30.00,4.727830,5.688804,...,124.774037,138.659624,123.999571,126.075542,135.656132,127.671108,18.1860,20.8656,17.5635,22.1020
39606,67.485,103.321,1.0,4.123418,103.148,0.0,67.845,32.05,4.745367,5.588222,...,125.471699,134.989984,120.889578,129.296909,132.673977,131.882893,17.8986,20.2800,17.7963,20.5110


In [474]:
scaler = StandardScaler()
scaler.fit(my_test)
scaled_x = scaler.transform(my_test)
scaled_x.shape

(39608, 48)

In [475]:
res = regr_multirf.predict(scaled_x)

In [482]:
submission = pd.read_csv(zip_dir+'/sample_submission.csv')
submission

Unnamed: 0,ID,Y_01,Y_02,Y_03,Y_04,Y_05,Y_06,Y_07,Y_08,Y_09,Y_10,Y_11,Y_12,Y_13,Y_14
0,TEST_00001,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,TEST_00002,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,TEST_00003,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,TEST_00004,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,TEST_00005,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39603,TEST_39604,0,0,0,0,0,0,0,0,0,0,0,0,0,0
39604,TEST_39605,0,0,0,0,0,0,0,0,0,0,0,0,0,0
39605,TEST_39606,0,0,0,0,0,0,0,0,0,0,0,0,0,0
39606,TEST_39607,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [503]:
label_names = submission.columns

In [505]:
Submission = submission[['ID']]
Submission

Unnamed: 0,ID
0,TEST_00001
1,TEST_00002
2,TEST_00003
3,TEST_00004
4,TEST_00005
...,...
39603,TEST_39604
39604,TEST_39605
39605,TEST_39606
39606,TEST_39607


In [509]:
submit = pd.concat([Submission,pd.DataFrame(res)],axis=1)
submit.columns = label_names

In [510]:
submit

Unnamed: 0,ID,Y_01,Y_02,Y_03,Y_04,Y_05,Y_06,Y_07,Y_08,Y_09,Y_10,Y_11,Y_12,Y_13,Y_14
0,TEST_00001,1.425995,1.167701,1.092097,14.331664,30.790934,16.792551,3.256440,-26.123937,-26.184573,-22.192406,24.512267,-26.190614,-26.098461,-26.171951
1,TEST_00002,1.549340,1.269528,1.259582,13.747636,31.042620,16.547202,3.116079,-26.194255,-26.180664,-22.295026,24.323267,-26.069488,-26.077022,-26.215481
2,TEST_00003,1.472770,1.182350,0.987030,14.288100,31.999580,16.861020,3.125070,-26.138940,-26.025900,-22.675070,24.195870,-25.921640,-25.952260,-25.898580
3,TEST_00004,1.458310,1.182791,1.009234,14.631919,31.876521,17.059750,3.079645,-25.764700,-25.612370,-21.823819,24.744110,-25.665120,-25.606760,-25.672290
4,TEST_00005,1.398585,1.113478,0.976050,14.493420,31.250440,16.982935,3.021211,-25.621900,-25.686430,-22.113840,24.677363,-25.585560,-25.521120,-25.589530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39603,TEST_39604,1.261933,0.943686,0.990748,12.889711,30.991424,16.470064,3.185807,-26.593566,-26.574533,-22.962814,24.243815,-26.535821,-26.474717,-26.518638
39604,TEST_39605,1.194200,0.858389,0.952068,14.437760,31.299963,16.711231,3.192984,-26.396878,-26.453103,-22.945464,24.328219,-26.397309,-26.421423,-26.472749
39605,TEST_39606,1.213846,0.903726,0.994370,13.658564,31.005685,16.575390,3.102225,-26.526237,-26.481274,-22.959979,24.176828,-26.494906,-26.453989,-26.476438
39606,TEST_39607,1.153255,0.827964,0.955401,13.490748,31.489842,16.615558,3.164698,-26.471083,-26.499830,-22.983776,24.473589,-26.422789,-26.352722,-26.439194


In [511]:
submit.to_csv('submit_randomforest.csv', index=False)