In [6]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import make_scorer
from sklearn  import metrics
from sklearn.ensemble import RandomForestRegressor

In [7]:
# train csv
zip_dir = '/Data1/Radar'
meta_dir = os.path.join(zip_dir, 'meta')
x_feature = pd.read_csv(meta_dir+'/x_feature_info.csv')
train = pd.read_csv(os.path.join(zip_dir,'train.csv'))
test = pd.read_csv(os.path.join(zip_dir,'test.csv'))
submission = pd.read_csv(zip_dir+'/sample_submission.csv')

In [8]:
def lg_nrmse(gt, preds):
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    all_nrmse = []
    for idx in range(14): # ignore 'ID'
        rmse = metrics.mean_squared_error(gt[:,idx], preds[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:14])
    return score


In [10]:
def my_preprocessing(dataset, mul=True) :
    preprocessed = dataset.drop(['ID','X_04','X_10','X_11','X_23','X_47','X_48'],axis=1)
    preprocessed['X_021'] = dataset['X_02'].apply(lambda x : 1.0 if x > 103.320 else 0.0)
    preprocessed['X_03'] = dataset['X_03'].apply(lambda x : np.log(x))
    preprocessed['X_051'] = dataset['X_05'].apply(lambda x : 1.0 if x <=102.25 else 0.0)
    preprocessed['X_07'] = dataset['X_07'].apply(lambda x : 40 if x > 40 else x)
    preprocessed['X_08'] = dataset['X_08'].apply(lambda x : 6.1 if np.log(x) > 6 else np.log(x))
    preprocessed['X_09'] = dataset['X_09'].apply(lambda x : np.log(x))
    preprocessed['X_25'] = dataset['X_25'].apply(lambda x : 2.2 if x > 2.2 else x)
    preprocessed['X_26'] = dataset['X_26'].apply(lambda x : 2.2 if x > 2.2 else x)
    preprocessed['X_27'] = dataset['X_27'].apply(lambda x : 2.2 if x > 2.2 else x)
    preprocessed['X_28'] = dataset['X_28'].apply(lambda x : 2.25 if x > 2.25 else x)
    preprocessed['X_29'] = dataset['X_29'].apply(lambda x : 2.3 if x > 2.3 else x)
    preprocessed['X_31'] = dataset['X_31'].apply(lambda x : 2.0 if x > 2.0 else x)
    preprocessed['X_33'] = dataset['X_33'].apply(lambda x : 2.0 if x > 2.0 else x)
    preprocessed['X_38'] = dataset['X_38'].apply(lambda x : -2.66 if x > -2.65 else x)
    preprocessed['X_39'] = dataset['X_39'].apply(lambda x : -2.66 if x > -2.65 else x)
    preprocessed['X_49'] = dataset['X_49'].apply(lambda x : 20000.0 if x > 20000 else x)
    preprocessed['X_50'] = dataset['X_50'].apply(lambda x : 120.0 if x < 120 else x)
    preprocessed['X_51'] = dataset['X_51'].apply(lambda x : 120.0 if x < 120 else x)    
    preprocessed['X_51'] = dataset['X_51'].apply(lambda x : 150.0 if x > 150 else x)
    preprocessed['X_52'] = dataset['X_52'].apply(lambda x : 120.0 if x < 120 else x)
    preprocessed['X_53'] = dataset['X_53'].apply(lambda x : 120.0 if x < 120 else x)
    preprocessed['X_54'] = dataset['X_54'].apply(lambda x : 120.0 if x < 120 else x)    
    preprocessed['X_54'] = dataset['X_54'].apply(lambda x : 150.0 if x > 150 else x)
    preprocessed['X_55'] = dataset['X_55'].apply(lambda x : 120.0 if x < 120 else x)    
    preprocessed['X_56'] = dataset['X_56'].apply(lambda x : 120.0 if x < 120 else x)
    
    if mul :
        preprocessed['X_57'] = dataset['X_30'] * dataset['X_34'] # 스크류삽입깊이 x 스크류 체결시 분당 회전수
        preprocessed['X_58'] = dataset['X_31'] * dataset['X_35']
        preprocessed['X_59'] = dataset['X_32'] * dataset['X_36']
        preprocessed['X_60'] = dataset['X_33'] * dataset['X_37']
        preprocessed = preprocessed.drop(['X_30','X_31','X_32','X_33','X_34','X_35','X_36','X_37'],axis=1)
    
    return preprocessed[sorted(preprocessed.T.index)]

In [11]:
my_train = my_preprocessing(train, mul=False) ; my_train

Unnamed: 0,X_01,X_02,X_021,X_03,X_05,X_051,X_06,X_07,X_08,X_09,...,Y_05,Y_06,Y_07,Y_08,Y_09,Y_10,Y_11,Y_12,Y_13,Y_14
0,70.544,103.320,0.0,4.211683,101.892,1.0,74.983,29.45,4.133245,5.504152,...,29.632,16.083,4.276,-25.381,-25.529,-22.769,23.792,-25.470,-25.409,-25.304
1,69.524,103.321,1.0,4.176999,101.944,1.0,72.943,28.73,4.114637,5.453653,...,33.179,16.736,3.229,-26.619,-26.523,-22.574,24.691,-26.253,-26.497,-26.438
2,72.583,103.320,0.0,4.159976,103.153,0.0,72.943,28.81,4.661267,5.606537,...,31.801,17.080,2.839,-26.238,-26.216,-22.169,24.649,-26.285,-26.215,-26.370
3,71.563,103.320,0.0,4.213164,101.971,1.0,77.022,28.92,4.746757,5.542674,...,34.503,17.143,3.144,-25.426,-25.079,-21.765,24.913,-25.254,-25.021,-25.345
4,69.524,103.320,0.0,4.152142,101.981,1.0,70.904,29.68,4.638412,5.486704,...,32.602,17.569,3.138,-25.376,-25.242,-21.072,25.299,-25.072,-25.195,-24.974
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39602,66.465,103.320,0.0,4.131480,103.150,0.0,66.825,30.20,4.354527,5.697261,...,29.194,16.582,3.410,-26.486,-26.581,-22.772,24.261,-26.491,-26.584,-26.580
39603,66.465,103.321,1.0,4.139477,102.021,1.0,66.825,29.21,4.627421,5.600900,...,29.859,15.659,3.406,-27.308,-27.203,-24.674,23.427,-27.250,-27.334,-27.325
39604,68.504,103.320,0.0,4.169297,103.144,0.0,68.864,29.96,4.630935,5.288621,...,24.720,16.823,3.215,-26.502,-26.687,-22.577,24.301,-26.388,-26.425,-26.601
39605,66.465,103.320,0.0,4.153713,102.025,1.0,67.845,30.30,4.723842,5.618660,...,26.412,15.757,4.216,-26.760,-26.634,-24.066,23.305,-26.536,-26.751,-26.635


In [13]:
train_x = np.array(my_train[my_train.columns[1:-14]])
print(f'train_x : {train_x.shape}')
train_y = np.array(my_train[my_train.columns[-14:]])
print(f'train_y : {train_y.shape}')

train_x : (39607, 51)
train_y : (39607, 14)


In [14]:
scaler = StandardScaler()
scaler.fit(train_x)
scaled_x = scaler.transform(train_x)
scaled_x.shape

(39607, 51)

In [15]:
label = np.array(train_y)
label.shape

(39607, 14)

# PCA

In [32]:
from sklearn.decomposition import PCA
n = 30
pca = PCA(n_components=n) # 주성분을 몇개로 할지 결정
printcipalComponents = pca.fit_transform(scaled_x)
principalDf = pd.DataFrame(data=printcipalComponents, columns = [f'PC{i+1}' for i in range(n)])
# 주성분으로 이루어진 데이터 프레임 구성

In [33]:
principalDf

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC21,PC22,PC23,PC24,PC25,PC26,PC27,PC28,PC29,PC30
0,-0.149098,-5.142664,2.699059,0.086521,1.893257,0.720828,3.507182,0.866531,-3.129937,1.278513,...,0.509641,0.127973,0.503565,0.654921,0.785313,0.904366,1.768699,-0.425966,0.217485,0.649172
1,2.497773,-1.110035,4.880530,-1.119249,2.354790,3.403576,-0.347193,1.159808,-1.652805,1.556064,...,0.816678,0.419503,0.748604,1.227848,-0.090523,1.527923,0.300557,-0.471839,-1.056934,-0.091342
2,0.007550,-2.856722,3.126643,-0.392989,4.938258,0.841353,0.108258,-1.629329,-1.213619,-0.127835,...,0.433799,0.529180,0.929004,0.361871,0.778536,1.348836,1.175766,-0.960408,-0.141437,0.334335
3,-2.364906,-5.042139,4.831369,-1.312083,3.506066,0.284227,2.239029,1.009102,-1.752589,1.773151,...,0.863185,0.115353,-0.732658,0.129951,0.170006,-0.240797,-0.071625,-0.567703,-0.108065,-0.269966
4,0.976025,-3.200923,4.818436,-0.684380,2.619760,1.794157,3.183879,-1.391901,-2.277832,0.655370,...,-0.753377,0.648838,-0.976394,0.339971,1.588093,0.743877,1.011482,1.220465,-0.615874,0.454165
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39602,1.791823,-0.713808,0.419404,1.472083,1.950025,-1.110043,-1.877975,-0.781139,0.750818,-1.176897,...,-0.594131,-1.053722,-0.428416,0.559563,-0.393600,0.620956,1.121505,0.013312,0.180267,-0.179819
39603,2.199036,-0.350607,3.167808,0.942584,0.030272,2.758903,-2.080190,0.515082,1.960279,-0.302441,...,1.451108,-0.119741,1.280556,0.473695,0.788522,1.617688,-0.117758,-0.779879,0.496761,-0.572681
39604,1.355994,-1.103364,-0.201077,0.267251,1.930896,-0.829773,-1.416306,-0.712020,-0.024916,-1.022397,...,-1.959564,0.750646,0.407337,0.394510,0.290212,0.483772,-0.387965,0.644850,-0.805536,0.278158
39605,-1.285571,-2.924364,1.661957,1.442748,0.593821,-0.766493,-1.212452,-1.885431,0.738973,0.898837,...,-0.148343,0.394603,0.790548,0.364769,1.255030,0.487887,-0.009740,0.202414,1.080258,0.262404


In [34]:
pca.explained_variance_ratio_.sum()

0.9197500577174019

In [36]:
x = np.array(principalDf)
y = label

print(x.shape, y.shape)

(39607, 30) (39607, 14)


# Split the dataset

In [37]:
x_train, x_test, y_train, y_test = tts(x, y, test_size = 0.2, random_state = 1)
print(f'x_train : {x_train.shape}')
print(f'y_train : {y_train.shape}')
print(f'x_test  : {x_test.shape}')
print(f'y_test  : {y_test.shape}')

x_train : (31685, 30)
y_train : (31685, 14)
x_test  : (7922, 30)
y_test  : (7922, 14)


# Regressor

In [39]:
score = make_scorer(lg_nrmse, greater_is_better=False)

regr_multirf = MultiOutputRegressor(RandomForestRegressor(
    n_estimators = 300,
    min_samples_split = 16,
    min_samples_leaf = 8,
    max_depth = 30,
    random_state=1))

regr_multirf.fit(x_train,y_train)

In [41]:
pred = regr_multirf.predict(x_test)
lg_nrmse(y_test, pred)

1.9806027025934967

# Inference

In [40]:
my_test = my_preprocessing(test, mul=False) ; my_test

Unnamed: 0,X_01,X_02,X_021,X_03,X_05,X_051,X_06,X_07,X_08,X_09,...,X_45,X_46,X_49,X_50,X_51,X_52,X_53,X_54,X_55,X_56
0,68.504,103.321,1.0,4.339510,101.867,1.0,73.963,30.51,4.152142,5.479805,...,0.16,1469,17227.63,138.130429,129.460682,141.506570,133.427229,129.711498,133.138096,121.859684
1,67.485,103.320,0.0,4.239454,101.992,1.0,67.845,28.03,4.762088,5.242963,...,0.27,1462,17134.53,136.148839,128.266277,145.911745,131.196417,132.411480,133.629025,124.178623
2,69.524,103.320,0.0,4.233672,101.884,1.0,77.022,29.65,5.326322,5.370312,...,0.14,1469,14860.83,120.447446,119.988804,132.099908,120.450155,130.051708,128.252972,120.000000
3,69.524,103.320,0.0,4.187683,101.866,1.0,73.963,28.15,4.638412,5.197391,...,0.13,1469,15252.53,133.994695,125.069180,147.507669,123.142653,125.963665,139.666592,126.589253
4,73.603,103.321,1.0,4.199755,101.891,1.0,74.983,29.92,4.265493,5.446436,...,0.09,1469,10752.23,137.918202,135.116192,138.600473,127.173033,137.252712,134.411335,124.020016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39603,68.504,103.320,0.0,4.158414,103.157,0.0,68.864,29.49,4.756603,5.649537,...,0.11,1469,20000.00,127.741246,126.494312,139.119905,125.271109,128.284572,140.176945,128.292843
39604,68.504,103.320,0.0,4.116921,103.137,0.0,68.864,32.29,4.756001,5.607308,...,0.19,1469,20000.00,127.767377,124.062809,138.238664,120.000000,127.322529,137.312047,131.570614
39605,69.524,103.320,0.0,4.153713,103.149,0.0,69.884,30.00,4.727830,5.688804,...,0.12,1469,20000.00,128.593640,124.774037,138.659624,123.999571,126.075542,135.656132,127.671108
39606,67.485,103.321,1.0,4.123418,103.148,0.0,67.845,32.05,4.745367,5.588222,...,0.11,1469,20000.00,121.110646,125.471699,134.989984,120.889578,129.296909,132.673977,131.882893


In [51]:
scaled_x = scaler.transform(test_x)
scaled_x.shape

(39608, 56)

In [42]:
printcipalComponents = pca.fit_transform(scaled_x)

In [43]:
test_x = np.array(printcipalComponents)

In [44]:
res = regr_multirf.predict(test_x)

In [45]:
label_names = submission.columns;label_names

Index(['ID', 'Y_01', 'Y_02', 'Y_03', 'Y_04', 'Y_05', 'Y_06', 'Y_07', 'Y_08',
       'Y_09', 'Y_10', 'Y_11', 'Y_12', 'Y_13', 'Y_14'],
      dtype='object')

In [46]:
submit = pd.concat([submission[['ID']],pd.DataFrame(res)],axis=1)
submit.columns = label_names
submit.to_csv('m0823b.csv', index=False)