In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import make_scorer
from sklearn  import metrics
from sklearn.ensemble import RandomForestRegressor

In [2]:
# train csv
zip_dir = '/Data1/Radar'
meta_dir = os.path.join(zip_dir, 'meta')
x_feature = pd.read_csv(meta_dir+'/x_feature_info.csv')
train = pd.read_csv(os.path.join(zip_dir,'train.csv'))
test = pd.read_csv(os.path.join(zip_dir,'test.csv'))
submission = pd.read_csv(zip_dir+'/sample_submission.csv')

In [3]:
def lg_nrmse(gt, preds):
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    all_nrmse = []
    for idx in range(14): # ignore 'ID'
        rmse = metrics.mean_squared_error(gt[:,idx], preds[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:14])
    return score

In [4]:
def my_preprocessing(dataset) :
    preprocessed = dataset.drop(['ID','X_04','X_10','X_11','X_23','X_47','X_48'],axis=1)
    preprocessed['X_021'] = dataset['X_02'].apply(lambda x : 1.0 if x > 103.320 else 0.0)
    preprocessed['X_03'] = dataset['X_03'].apply(lambda x : np.log(x))
    preprocessed['X_051'] = dataset['X_05'].apply(lambda x : 1.0 if x <=102.25 else 0.0)
    preprocessed['X_07'] = dataset['X_07'].apply(lambda x : 40 if x > 40 else x)
    preprocessed['X_08'] = dataset['X_08'].apply(lambda x : 6.1 if np.log(x) > 6 else np.log(x))
    preprocessed['X_09'] = dataset['X_09'].apply(lambda x : np.log(x))
    preprocessed['X_25'] = dataset['X_25'].apply(lambda x : 2.2 if x > 2.2 else x)
    preprocessed['X_26'] = dataset['X_26'].apply(lambda x : 2.2 if x > 2.2 else x)
    preprocessed['X_27'] = dataset['X_27'].apply(lambda x : 2.2 if x > 2.2 else x)
    preprocessed['X_28'] = dataset['X_28'].apply(lambda x : 2.25 if x > 2.25 else x)
    preprocessed['X_29'] = dataset['X_29'].apply(lambda x : 2.3 if x > 2.3 else x)
    preprocessed['X_31'] = dataset['X_31'].apply(lambda x : 2.0 if x > 2.0 else x)
    preprocessed['X_33'] = dataset['X_33'].apply(lambda x : 2.0 if x > 2.0 else x)
    preprocessed['X_38'] = dataset['X_38'].apply(lambda x : -2.66 if x > -2.65 else x)
    preprocessed['X_39'] = dataset['X_39'].apply(lambda x : -2.66 if x > -2.65 else x)
    preprocessed['X_49'] = dataset['X_49'].apply(lambda x : 20000.0 if x > 20000 else x)
    preprocessed['X_50'] = dataset['X_50'].apply(lambda x : 120.0 if x < 120 else x)
    preprocessed['X_51'] = dataset['X_51'].apply(lambda x : 120.0 if x < 120 else x)    
    preprocessed['X_51'] = dataset['X_51'].apply(lambda x : 150.0 if x > 150 else x)
    preprocessed['X_52'] = dataset['X_52'].apply(lambda x : 120.0 if x < 120 else x)
    preprocessed['X_53'] = dataset['X_53'].apply(lambda x : 120.0 if x < 120 else x)
    preprocessed['X_54'] = dataset['X_54'].apply(lambda x : 120.0 if x < 120 else x)    
    preprocessed['X_54'] = dataset['X_54'].apply(lambda x : 150.0 if x > 150 else x)
    preprocessed['X_55'] = dataset['X_55'].apply(lambda x : 120.0 if x < 120 else x)    
    preprocessed['X_56'] = dataset['X_56'].apply(lambda x : 120.0 if x < 120 else x)
    
    preprocessed['X_57'] = dataset['X_30'] * dataset['X_34'] # 스크류삽입깊이 x 스크류 체결시 분당 회전수
    preprocessed['X_58'] = dataset['X_31'] * dataset['X_35']
    preprocessed['X_59'] = dataset['X_32'] * dataset['X_36']
    preprocessed['X_60'] = dataset['X_33'] * dataset['X_37']
    preprocessed = preprocessed.drop(['X_30','X_31','X_32','X_33','X_34','X_35','X_36','X_37'],axis=1)
    
    return preprocessed[sorted(preprocessed.T.index)]

In [5]:
processed = my_preprocessing(train)
train_x = processed[processed.columns[:-14]]
print(f'train_x : {train_x.shape}')
train_y = processed[processed.columns[-14:]]
print(f'train_y : {train_y.shape}')

train_x : (39607, 48)
train_y : (39607, 14)


In [6]:
scaler = StandardScaler()
scaler.fit(train_x)
scaled_x = scaler.transform(train_x)
scaled_x.shape

(39607, 48)

In [7]:
label = np.array(train_y)
label.shape

(39607, 14)

# Split the dataset

In [8]:
x_train, x_test, y_train, y_test = tts(scaled_x, label, test_size = 0.2, random_state = 1)
print(f'x_train : {x_train.shape}')
print(f'y_train : {y_train.shape}')
print(f'x_test  : {x_test.shape}')
print(f'y_test  : {y_test.shape}')

x_train : (31685, 48)
y_train : (31685, 14)
x_test  : (7922, 48)
y_test  : (7922, 14)


# Regressor

In [10]:
rf = RandomForestRegressor(random_state=1,
                          n_estimators = 300,
                          min_samples_split = 16,
                          min_samples_leaf = 8,
                          max_depth = 30)
rf.fit(x_train ,y_train )

In [14]:
feauture_importances = rf.feature_importances_

In [18]:
feauture_importances.shape

(48,)

In [17]:
x_train.shape

(31685, 48)

In [39]:
feature_df = pd.DataFrame({
    'importances' : feauture_importances
})

In [40]:
most_performance_index = feature_df.sort_values(by = 'importances', ascending=False).iloc[:14].index #importances > 0.03

In [43]:
new_x_train = pd.DataFrame(x_train)[most_performance_index]
new_x_test = pd.DataFrame(x_test)[most_performance_index]

In [None]:
new_x_train

In [44]:
new_rf = RandomForestRegressor(random_state=1,
                          n_estimators = 300,
                          min_samples_split = 16,
                          min_samples_leaf = 8,
                          max_depth = 30)
new_rf.fit(new_x_train ,y_train)

In [45]:
predicted = new_rf.predict(new_x_test)

In [46]:
lg_nrmse(y_test,predicted)

1.9688205509022627

# Inferenece

In [38]:
my_test = my_preprocessing(test)

In [40]:
scaled_x = scaler.transform(my_test)
scaled_x.shape

(39608, 48)

In [77]:
res = clf.predict(scaled_x)

In [80]:
label_names = submission.columns;label_names

Index(['ID', 'Y_01', 'Y_02', 'Y_03', 'Y_04', 'Y_05', 'Y_06', 'Y_07', 'Y_08',
       'Y_09', 'Y_10', 'Y_11', 'Y_12', 'Y_13', 'Y_14'],
      dtype='object')

In [84]:
submit = pd.concat([submission[['ID']],pd.DataFrame(res)],axis=1)
submit.columns = label_names
submit.to_csv('m0818.csv', index=False)