In [3]:
### 데이터 및 모듈 로딩
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

train_file = '../data_final/Train_data.csv'
train_data = pd.read_csv(train_file)
print(train_data.describe())

           c_temp_pv       k_rpm_pv      n_temp_pv       scale_pv  \
count  198228.000000  198228.000000  198228.000000  198228.000000   
mean       70.282977     179.860867      67.831982       0.563432   
std         0.516095       7.198430       1.094014       1.181768   
min        68.000000     118.000000      65.700000       0.000000   
25%        70.000000     176.000000      67.000000       0.000000   
50%        70.300000     180.000000      67.400000       0.000000   
75%        70.600000     185.000000      68.600000       0.000000   
max        71.900000     198.000000      71.200000       3.490000   

           s_temp_pv  
count  198228.000000  
mean       67.628409  
std         1.258855  
min        65.500000  
25%        66.600000  
50%        67.500000  
75%        68.400000  
max        72.400000  


In [4]:
### 모델 생성을 위한 Train 데이터 증강 by KNN
# scale_pv가 0이 아닌 데이터와 0인 데이터로 분리
non_zero_data = train_data[train_data['scale_pv'] != 0]
zero_data = train_data[train_data['scale_pv'] == 0]
print(non_zero_data.shape, zero_data.shape)


# KNN을 위한 피처와 타겟 설정 (scale_pv가 0이 아닌 데이터)
features = non_zero_data.drop(columns=['scale_pv'])
target = non_zero_data['scale_pv']


# 데이터 스케일링
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
scaled_zero_features = scaler.transform(zero_data.drop(columns=['scale_pv']))


# KNN 모델 적합
knn = KNeighborsRegressor(n_neighbors=50) # 설정 이웃 수 2->1->5->10->100->50->25->50
knn.fit(scaled_features, target)


# 미측정 데이터에 대한 예측값 생성 (scale_pv가 0인 데이터, 소수점 아래 둘째자리까지)
predicted_scale_pv = np.round(knn.predict(scaled_zero_features), 2)


# 예측값을 미측정 데이터에 추가
zero_data.loc[:, 'scale_pv'] = predicted_scale_pv


# 원본 데이터와 새로 생성된 데이터를 결합하여 인덱스 기준으로 재정렬
augmented_data = pd.concat([non_zero_data, zero_data]).sort_index()
print(augmented_data.shape)
augmented_data = augmented_data.loc[train_data.index]
print(augmented_data.shape)

(36720, 5) (161508, 5)
(198228, 5)
(198228, 5)


In [5]:
### 모델링 및  Train 평가
# 중복값 제거
augmented_data.drop_duplicates(inplace=True)


# 피처와 타겟 분리
X = augmented_data.drop(columns=['scale_pv'])
y = augmented_data['scale_pv']


# 학습 데이터와 테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# 모델 학습 및 평가 함수
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, scaler):
    model.fit(X_train, y_train)
    y_train_pred_scaled = model.predict(X_train)
    y_test_pred_scaled = model.predict(X_test)
    
    # 역스케일링
    y_train_pred = scaler.inverse_transform(y_train_pred_scaled.reshape(-1, 1))
    y_test_pred = scaler.inverse_transform(y_test_pred_scaled.reshape(-1, 1))
    y_train_original = scaler.inverse_transform(y_train.values.reshape(-1, 1))
    y_test_original = scaler.inverse_transform(y_test.values.reshape(-1, 1))
    
    train_mae = mean_absolute_error(y_train_original, y_train_pred)
    test_mae = mean_absolute_error(y_test_original, y_test_pred)
    train_mape = mean_absolute_percentage_error(y_train_original, y_train_pred)
    test_mape = mean_absolute_percentage_error(y_test_original, y_test_pred)
    
    return train_mae, test_mae, train_mape, test_mape, y_train_pred, y_test_pred

# 타겟 스케일링
target_scaler = StandardScaler()
target_scaler.fit(target.values.reshape(-1, 1))


# Multiple Regression
lr_model = LinearRegression()
lr_train_mae, lr_test_mae, lr_train_mape, lr_test_mape, lr_y_train_pred, lr_y_test_pred = train_and_evaluate_model(lr_model, X_train, X_test, y_train, y_test, target_scaler)


# Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_train_mae, rf_test_mae, rf_train_mape, rf_test_mape, rf_y_train_pred, rf_y_test_pred = train_and_evaluate_model(rf_model, X_train, X_test, y_train, y_test, target_scaler)


# LightGBM
lgb_model = lgb.LGBMRegressor(n_estimators=100, random_state=42)
lgb_train_mae, lgb_test_mae, lgb_train_mape, lgb_test_mape, lgb_y_train_pred, lgb_y_test_pred = train_and_evaluate_model(lgb_model, X_train, X_test, y_train, y_test, target_scaler)


# 결과 출력
print(f"Linear Regression - Train MAE: {lr_train_mae}, Train MAPE: {lr_train_mape*100}")
print(f"Linear Regression - Test MAE: {lr_test_mae}, Test MAPE: {lr_test_mape*100}")
print()
print(f"Random Forest - Train MAE: {rf_train_mae}, Train MAPE: {rf_train_mape*100}")
print(f"Random Forest - Test MAE: {rf_test_mae}, Test MAPE: {rf_test_mape*100}")
print()
print(f"LightGBM - Train MAE: {lgb_train_mae}, Train MAPE: {lgb_train_mape*100}")
print(f"LightGBM - Test MAE: {lgb_test_mae}, Test MAPE: {lgb_test_mape*100}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003594 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 226
[LightGBM] [Info] Number of data points in the train set: 104170, number of used features: 4
[LightGBM] [Info] Start training from score 3.043966
Linear Regression - Train MAE: 0.0005044547973532573, Train MAPE: 0.01596356110706408
Linear Regression - Test MAE: 0.0005078379969068374, Test MAPE: 0.016070703792725342

Random Forest - Train MAE: 0.00019386937021284733, Train MAPE: 0.006135114240261651
Random Forest - Test MAE: 0.00041560089178695767, Test MAPE: 0.013152007630517652

LightGBM - Train MAE: 0.00041225551686836434, Train MAPE: 0.013045828284349859
LightGBM - Test MAE: 0.0004222054436115086, Test MAPE: 0.013360800773748086


In [6]:
### Test 평가
test_file = '../data_final/Test_data.csv'
test_data = pd.read_csv(test_file)


# scale_pv 값이 2초과 4미만인 데이터 선택
test_data_filtered = test_data[(test_data['scale_pv'] > 2) & (test_data['scale_pv'] < 4)]


# 피처와 타겟 분리
X_test_final = test_data_filtered.drop(columns=['scale_pv'])
y_test_final = test_data_filtered['scale_pv']


# 최종 테스트 데이터 예측 및 평가
def final_evaluate_model(model, X_test, y_test, scaler):
    y_test_pred_scaled = model.predict(X_test)
    
    # 역스케일링
    y_test_pred = scaler.inverse_transform(y_test_pred_scaled.reshape(-1, 1))
    y_test_original = scaler.inverse_transform(y_test.values.reshape(-1, 1))
    
    test_mae = mean_absolute_error(y_test_original, y_test_pred)
    test_mape = mean_absolute_percentage_error(y_test_original, y_test_pred)
    
    return test_mae, test_mape, y_test_pred


# 최종 평가 결과
lr_test_mae_final, lr_test_mape_final, lr_y_test_pred_final = final_evaluate_model(lr_model, X_test_final, y_test_final, target_scaler)
rf_test_mae_final, rf_test_mape_final, rf_y_test_pred_final = final_evaluate_model(rf_model, X_test_final, y_test_final, target_scaler)
lgb_test_mae_final, lgb_test_mape_final, lgb_y_test_pred_final = final_evaluate_model(lgb_model, X_test_final, y_test_final, target_scaler)

print(f"Final Test - Linear Regression MAE: {lr_test_mae_final}, MAPE: {lr_test_mape_final*100}")
print(f"Final Test - Random Forest MAE: {rf_test_mae_final}, MAPE: {rf_test_mape_final*100}")
print(f"Final Test - LightGBM MAE: {lgb_test_mae_final}, MAPE: {lgb_test_mape_final*100}")

Final Test - Linear Regression MAE: 0.0009103039918948051, MAPE: 0.02880019210423469
Final Test - Random Forest MAE: 0.0010451038655071077, MAPE: 0.03306669498185942
Final Test - LightGBM MAE: 0.0009440230667424337, MAPE: 0.02986812973042499
