### 모델링 시작

In [4]:
# 전처리 과정 가져오기
import pandas as pd

data = pd.read_csv('../DATA/scale_2_to_4.csv')
data.head()

Unnamed: 0,E_scr_pv,E_scr_sv,c_temp_pv,k_rpm_pv,k_rpm_sv,n_temp_pv,n_temp_sv,scale_pv,s_temp_pv,scr_dv,ctemp_dv,rpm_dv,ntemp_dv,stemp_dv,scale_pv_diff,weekday
0,8,8,69.6,189,180,67.2,70,3.01,67.1,0,0.4,-9,2.8,2.9,0.0,3
1,8,8,69.8,189,180,67.2,70,3.01,67.0,0,0.2,-9,2.8,3.0,0.0,3
2,8,8,69.7,189,180,67.9,70,3.08,65.9,0,0.3,-9,2.1,4.1,0.07,3
3,8,8,69.7,189,180,67.8,70,3.08,65.9,0,0.3,-9,2.2,4.1,0.0,3
4,8,8,69.7,189,180,67.8,70,3.08,65.9,0,0.3,-9,2.2,4.1,0.0,3


In [5]:
# # 다중공선성 파악
# from statsmodels.stats.outliers_influence import variance_inflation_factor

# vif = pd.DataFrame()
# vif["VIF Factor"] = [variance_inflation_factor(data.values, i) for i in range(data.shape[1])]
# vif["features"] = data.columns
# vif

### Modeling

In [10]:
# Multiple Linear Regression
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

# 데이터 분할
X = data.drop('scale_pv', axis=1)
y = data['scale_pv']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 모델 학습
model = LinearRegression()
model.fit(X_train, y_train)

# 예측
pred = model.predict(X_test)

# 평가
mae = mean_absolute_error(y_test, pred)
mape = mean_absolute_percentage_error(y_test, pred)
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, pred)

print('Multiple Linear Regression')
print(f'coef : {model.coef_}')
print(f'MAE: {mae:.4f}')
print(f'MAPE: {mape:.4f}')
print(f'MSE: {mse:.4f}')
print(f'RMSE: {rmse:.4f}')
print(f'R2 Score: {r2:.4f}')
print()

# Random Forest
from sklearn.ensemble import RandomForestRegressor

# 모델 학습
model = RandomForestRegressor()
model.fit(X_train, y_train)

# 예측
pred = model.predict(X_test)

# 평가
mae = mean_absolute_error(y_test, pred)
mape = mean_absolute_percentage_error(y_test, pred)
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, pred)

print('Random Forest')
print(f'MAE: {mae:.4f}')
print(f'MAPE: {mape:.4f}')
print(f'MSE: {mse:.4f}')
print(f'RMSE: {rmse:.4f}')
print(f'R2 Score: {r2:.4f}')
print()

# LightGBM
from lightgbm import LGBMRegressor

# 모델 학습
model = LGBMRegressor()
model.fit(X_train, y_train)

# 예측
pred = model.predict(X_test)

# 평가
mae = mean_absolute_error(y_test, pred)
mape = mean_absolute_percentage_error(y_test, pred)
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, pred)

print('LightGBM')
print(f'MAE: {mae:.4f}')
print(f'MAPE: {mape:.4f}')
print(f'MSE: {mse:.4f}')
print(f'RMSE: {rmse:.4f}')
print(f'R2 Score: {r2:.4f}')

Multiple Linear Regression
coef : [ 7.97232923e+09 -7.97232923e+09  1.38964754e+10  1.62237782e+09
 -1.62237782e+09  1.55292264e+09 -1.55292264e+09  3.20831262e+09
  7.97232923e+09  1.38964754e+10  1.62237782e+09  1.55292264e+09
  3.20831262e+09  4.83414944e-01 -1.05601549e-03]
MAE: 0.0259
MAPE: 0.0085
MSE: 0.0015
RMSE: 0.0383
R2 Score: 0.1524

Random Forest
MAE: 0.0193
MAPE: 0.0063
MSE: 0.0009
RMSE: 0.0305
R2 Score: 0.4650

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000521 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 530
[LightGBM] [Info] Number of data points in the train set: 22499, number of used features: 13
[LightGBM] [Info] Start training from score 3.041821
LightGBM
MAE: 0.0227
MAPE: 0.0075
MSE: 0.0011
RMSE: 0.0327
R2 Score: 0.3848


### Parameter Tuning

In [14]:
# Random Forest Tuning
from sklearn.model_selection import GridSearchCV

# 하이퍼파라미터 그리드
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, 50, 100],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 3],
}

# 모델 정의
model = RandomForestRegressor()

# 그리드 탐색
grid = GridSearchCV(model, param_grid, cv=3, verbose=3)
grid.fit(X_train, y_train)

# 최적 파라미터
print('Random Forest Tuning')
print(f'Best Parameters: {grid.best_params_}')

# 최적 모델
best_model = grid.best_estimator_

# 예측
pred = best_model.predict(X_test)

# 평가
mae = mean_absolute_error(y_test, pred)
mape = mean_absolute_percentage_error(y_test, pred)
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, pred)

print(f'MAE: {mae:.4f}')
print(f'MAPE: {mape:.4f}')
print(f'MSE: {mse:.4f}')
print(f'RMSE: {rmse:.4f}')
print(f'R2 Score: {r2:.4f}')
print()

Fitting 3 folds for each of 72 candidates, totalling 216 fits
[CV 1/3] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.310 total time=   2.5s
[CV 2/3] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.282 total time=   2.5s
[CV 3/3] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.352 total time=   2.5s
[CV 1/3] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.311 total time=   5.1s
[CV 2/3] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.281 total time=   5.1s
[CV 3/3] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.347 total time=   5.1s
[CV 1/3] END max_depth=10, min_samples_leaf=1, min_samples_split=4, n_estimators=100;, score=0.309 total time=   2.5s
[CV 2/3] END max_depth=10, min_samples_leaf=1, min_samples_split=4, n_estimators=100;, score=0.284 total time=  