In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score


%matplotlib inline

# Data loading
# Read the train and test CSV files; the first column (Seq) is consumed as
# the row index, so it does not appear among the data columns.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '/Users/kdlee/Downloads/vehicle/Out_1.csv',
        '/Users/kdlee/Downloads/vehicle/Out_2.csv',
    )
)

In [10]:
# Columns to exclude from the feature matrix: the 24 columns that are later
# used one by one as regression targets, plus 'Time'.
fm6andTime = ['FLFX', 'FLFY', 'FLFZ', 'FLMX', 'FLMY', 'FLMZ', 'FRFX', 'FRFY', 'FRFZ', 'FRMX', 'FRMY', 'FRMZ',
              'RLFX', 'RLFY', 'RLFZ', 'RLMX', 'RLMY', 'RLMZ', 'RRFX', 'RRFY', 'RRFZ', 'RRMX', 'RRMY', 'RRMZ', 'Time']

# Drop the excluded columns to obtain the feature frames.
# The names must be X_train / X_test (capital X): everything below refers to
# them, and the previous lowercase spelling raised NameError on a fresh kernel.
X_train = train.drop(fm6andTime, axis=1)
X_test = test.drop(fm6andTime, axis=1)
print("X_train.shape: {}, X_test.shape: {}".format(X_train.shape, X_test.shape))

# Standardize the features. The scaler is fitted on the training features only
# and the same fitted transform is then applied to the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Sanity check: overall mean / standard deviation of the scaled arrays,
# followed by summary statistics of the unscaled feature frames.
print(np.mean(X_train_scaled), np.std(X_train_scaled))
print(np.mean(X_test_scaled), np.std(X_test_scaled))
print(X_train.describe())
print(X_test.describe())



X_train.shape: (222227, 41), X_test.shape: (90821, 41)


In [12]:
# Target columns; a separate model is trained and evaluated for each one.
fm6 = [
    'FLFX', 'FLFY', 'FLFZ', 'FLMX', 'FLMY', 'FLMZ',
    'FRFX', 'FRFY', 'FRFZ', 'FRMX', 'FRMY', 'FRMZ',
    'RLFX', 'RLFY', 'RLFZ', 'RLMX', 'RLMY', 'RLMZ',
    'RRFX', 'RRFY', 'RRFZ', 'RRMX', 'RRMY', 'RRMZ',
]

# Apply a random forest to one target column at a time.
for case in fm6:

    # Target vectors for the current column
    y_train, y_test = train[case], test[case]
    print("y_train.shape: {}, y_test.shape: {}".format(y_train.shape, y_test.shape))

    # Build the model.
    #   n_estimators: number of decision trees; more is generally better but
    #                 needs more memory and training time.
    #   n_jobs:       number of cores to use; -1 uses every core.
    #   max_features: controls how random each tree is; a smaller value reduces
    #                 overfitting (not set here, so the library default applies).
    RF = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
    print(RF)

    # Train on the scaled training features
    RF.fit(X_train_scaled, y_train)
    print(RF)

    # Predict on the scaled test features
    predictions = RF.predict(X_test_scaled)
    print(predictions.shape)

    # Error metrics: MSE, RMSE (square root of MSE), MAE and R^2
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    # Squared Pearson correlation: put actual values (column 0) and predictions
    # (column 1) side by side, then square the correlation matrix element-wise.
    predict_data = pd.DataFrame(data = [y_test.values.reshape(-1), predictions]).transpose()
    pc2 = predict_data.corr(method='pearson') ** 2

    # Report every metric; pc2[1][0] is the off-diagonal (actual vs. predicted) entry
    for label, value in (
        ("MSE: ", mse),
        ("RMSE: ", rmse),
        ("MAE: ", mae),
        ("R^2: ", r2),
        ("PearsonCorrelation^2: ", pc2[1][0]),
    ):
        print(label, value)

    # Save the actual / predicted pairs for this target as CSV
    col = ["y_test", "predictions"]
    predict_data.columns = col
    predict_data.to_csv("/home/iotadmin/notebooks/RandomForest_FINAL/RandomForest_{}.csv".format(case))

    print("___________________________finished_{}".format(case))


Unnamed: 0_level_0,FLSD,FLSARF,FLSBLF,FLLCABFX,FLLCABFY,FLTBF,FRSD,FRSARF,FRSBLF,FRLCABFX,...,Rolling,VS,BRAKE_ACT,SAS_Angle,SAS_Speed,PV_AV_CAN,WHL_SPD_FL,WHL_SPD_FR,WHL_SPD_RL,WHL_SPD_RR
Seq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,376.9357,24.46212,-3.528690,11.83236,19.266850,-64.46651,355.7152,-18.63433,-0.429658,16.637950,...,-0.488891,0.0,2.0,6553.0,0.0,0.0,0.0,0.0,0.0,0.0
2,376.9435,25.05665,-3.457477,11.06294,21.817470,-65.31600,355.7256,-18.10647,-0.429658,16.199060,...,-0.488891,0.0,2.0,6553.0,0.0,0.0,0.0,0.0,0.0,0.0
3,376.9070,21.62867,-3.452931,10.73560,22.880840,-65.52584,355.7334,-19.62172,-0.492160,15.972750,...,-0.488891,0.0,2.0,6553.0,0.0,0.0,0.0,0.0,0.0,0.0
4,376.9123,21.68386,-3.490810,11.47127,20.620780,-64.93395,355.7322,-20.26447,-0.398407,16.528230,...,-0.494384,0.0,2.0,6553.0,0.0,0.0,0.0,0.0,0.0,0.0
5,376.9357,24.65653,-3.508992,11.87623,20.032680,-64.59241,355.7152,-18.80447,-0.429658,16.853970,...,-0.494384,0.0,2.0,6553.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90817,377.6777,22.19310,-9.059075,-36.61396,8.209370,-32.74905,358.9059,-24.90766,2.164187,5.651866,...,-1.774291,0.0,2.0,6552.7,8.0,0.0,0.0,0.0,0.0,0.0
90818,377.7141,25.42040,-9.059075,-36.00653,6.882755,-32.15861,358.8956,-22.61296,2.164187,6.351354,...,-1.785277,0.0,2.0,6552.7,8.0,0.0,0.0,0.0,0.0,0.0
90819,377.7123,25.53328,-9.096954,-36.70508,9.163319,-32.94731,358.9163,-21.89751,2.190439,5.994752,...,-1.785277,0.0,2.0,6552.7,8.0,0.0,0.0,0.0,0.0,0.0
90820,377.6671,22.30975,-9.086349,-37.16066,10.267690,-33.28016,358.9265,-24.21983,2.132936,5.579859,...,-1.785277,0.0,2.0,6552.7,8.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Re-score every saved prediction file and collect the results:
#   scoring_list - one row per metric, one column per target
#   df_all       - actual / predicted series of all targets side by side
metric_names = ["MSE", "RMSE", "MAE", "PearsonCorrelation^2"]

fm6 = ['FLFX', 'FLFY', 'FLFZ', 'FLMX', 'FLMY', 'FLMZ', 'FRFX', 'FRFY', 'FRFZ', 'FRMX', 'FRMY', 'FRMZ',
       'RLFX', 'RLFY', 'RLFZ', 'RLMX', 'RLMY', 'RLMZ', 'RRFX', 'RRFY', 'RRFZ', 'RRMX', 'RRMY', 'RRMZ']

scores = {}   # target name -> [mse, rmse, mae, squared Pearson correlation]
frames = []   # per-target frames, concatenated once after the loop

for i in fm6:
    # Prediction file for this target (columns 'y_test' and 'predictions'),
    # read relative to the working directory.
    # NOTE(review): the training loop writes these files under
    # /home/iotadmin/notebooks/RandomForest_FINAL/ - confirm the notebook is
    # run from that directory.
    df = pd.read_csv('RandomForest_{}.csv'.format(i), index_col=0)

    y_test = df['y_test']
    predictions = df['predictions']

    # MSE | mean squared error
    mse = mean_squared_error(y_test, predictions)

    # RMSE
    rmse = np.sqrt(mse)

    # MAE | mean absolute error
    mae = mean_absolute_error(y_test, predictions)

    # Squared Pearson correlation between actual and predicted values
    pc2 = df[['y_test', 'predictions']].corr(method='pearson') ** 2
    scores[i] = [mse, rmse, mae, pc2.loc['y_test', 'predictions']]

    print("__________________________{}".format(i))
    print(df.head())

    # Actual vs. predicted for this target; the figure is closed afterwards so
    # that 24 large figures do not stay alive in memory.
    fig, ax = plt.subplots(figsize=(40, 20))
    ax.plot(df.index, y_test, label="y_test")
    ax.plot(df.index, predictions, label="predictions")
    ax.set_title(i)
    ax.legend()
    plt.show()
    plt.close(fig)

    # Give both columns target-specific names; previously every frame contributed
    # a column literally named "predictions", so df_all ended up with 24
    # identically labelled columns.
    frames.append(df.rename(columns={
        'y_test': "{}_actual".format(i),
        'predictions': "{}_predictions".format(i),
    }))

# Build the combined frames once instead of growing them inside the loop.
df_all = pd.concat(frames, axis=1)
print(df_all.head())

# The former trailing `scoring.columns = []` always raised ValueError (length
# mismatch) and `scoring` only held the last target's metrics; the accumulated
# table is what should be shown.
scoring_list = pd.DataFrame(scores, columns=fm6)
scoring_list.insert(0, "metrics", metric_names)
scoring_list
