In [1]:
### 데이터 및 모듈 로딩
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

train_file = '../data_final/Train_data.csv'
train_data = pd.read_csv(train_file)

In [2]:
### 모델 생성을 위한 Train 데이터 증강 by KNN
# scale_pv가 0이 아닌 데이터와 0인 데이터로 분리
non_zero_data = train_data[train_data['scale_pv'] != 0]
zero_data = train_data[train_data['scale_pv'] == 0]


# KNN을 위한 피처와 타겟 설정 (scale_pv가 0이 아닌 데이터)
features = non_zero_data.drop(columns=['scale_pv'])
target = non_zero_data['scale_pv']


# 데이터 스케일링
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
scaled_zero_features = scaler.transform(zero_data.drop(columns=['scale_pv']))


# KNN 모델 적합
knn = KNeighborsRegressor(n_neighbors=50) # 설정 이웃 수 2->1->5->10->100->50->25->50
knn.fit(scaled_features, target)


# 미측정 데이터에 대한 예측값 생성 (scale_pv가 0인 데이터)
predicted_scale_pv = knn.predict(scaled_zero_features)


# 예측값을 미측정 데이터에 추가
zero_data.loc[:, 'scale_pv'] = predicted_scale_pv


# 원본 데이터와 새로 생성된 데이터를 결합하여 인덱스 기준으로 재정렬
augmented_data = pd.concat([non_zero_data, zero_data]).sort_index()
augmented_data = augmented_data.loc[zero_data.index]

In [3]:
### 모델링 및 평가
# 중복값 제거
augmented_data.drop_duplicates(inplace=True)


# 피처와 타겟 분리
X = augmented_data.drop(columns=['scale_pv'])
y = augmented_data['scale_pv']


# 학습 데이터와 테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# 모델 학습 및 평가 함수
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, scaler):
    model.fit(X_train, y_train)
    y_train_pred_scaled = model.predict(X_train)
    y_test_pred_scaled = model.predict(X_test)
    
    # 역스케일링
    y_train_pred = scaler.inverse_transform(y_train_pred_scaled.reshape(-1, 1))
    y_test_pred = scaler.inverse_transform(y_test_pred_scaled.reshape(-1, 1))
    y_train_original = scaler.inverse_transform(y_train.values.reshape(-1, 1))
    y_test_original = scaler.inverse_transform(y_test.values.reshape(-1, 1))
    
    train_mae = mean_absolute_error(y_train_original, y_train_pred)
    test_mae = mean_absolute_error(y_test_original, y_test_pred)
    train_mape = mean_absolute_percentage_error(y_train_original, y_train_pred)
    test_mape = mean_absolute_percentage_error(y_test_original, y_test_pred)
    
    return train_mae, test_mae, train_mape, test_mape, y_train_pred, y_test_pred

# 타겟 스케일링
target_scaler = StandardScaler()
target_scaler.fit(target.values.reshape(-1, 1))


# Multiple Regression
lr_model = LinearRegression()
lr_train_mae, lr_test_mae, lr_train_mape, lr_test_mape, lr_y_train_pred, lr_y_test_pred = train_and_evaluate_model(lr_model, X_train, X_test, y_train, y_test, target_scaler)


# Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_train_mae, rf_test_mae, rf_train_mape, rf_test_mape, rf_y_train_pred, rf_y_test_pred = train_and_evaluate_model(rf_model, X_train, X_test, y_train, y_test, target_scaler)


# LightGBM
lgb_model = lgb.LGBMRegressor(n_estimators=100, random_state=42)
lgb_train_mae, lgb_test_mae, lgb_train_mape, lgb_test_mape, lgb_y_train_pred, lgb_y_test_pred = train_and_evaluate_model(lgb_model, X_train, X_test, y_train, y_test, target_scaler)


# 결과 출력
print(f"Linear Regression - Train MAE: {lr_train_mae}, Train MAPE: {lr_train_mape*100}")
print(f"Linear Regression - Test MAE: {lr_test_mae}, Test MAPE: {lr_test_mape*100}")
print()
print(f"Random Forest - Train MAE: {rf_train_mae}, Train MAPE: {rf_train_mape*100}")
print(f"Random Forest - Test MAE: {rf_test_mae}, Test MAPE: {rf_test_mape*100}")
print()
print(f"LightGBM - Train MAE: {lgb_train_mae}, Train MAPE: {lgb_train_mape*100}")
print(f"LightGBM - Test MAE: {lgb_test_mae}, Test MAPE: {lgb_test_mape*100}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003237 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 223
[LightGBM] [Info] Number of data points in the train set: 79432, number of used features: 4
[LightGBM] [Info] Start training from score 3.044623
Linear Regression - Train MAE: 0.00031141501145188317, Train MAPE: 0.009854309598823335
Linear Regression - Test MAE: 0.00031504865216259623, Test MAPE: 0.009969341446794783

Random Forest - Train MAE: 3.2814606881516294e-05, Train MAPE: 0.0010383804481109346
Random Forest - Test MAE: 8.786275978682431e-05, Test MAPE: 0.0027803282048494897

LightGBM - Train MAE: 0.00019598107879385286, Train MAPE: 0.006201629746673207
LightGBM - Test MAE: 0.000200997876260598, Test MAPE: 0.006360405745252052


In [4]:
### Test 평가
test_file = '../data_final/Test_data.csv'
test_data = pd.read_csv(test_file)


# scale_pv 값이 2초과 4미만인 데이터 선택
test_data_filtered = test_data[(test_data['scale_pv'] > 2) & (test_data['scale_pv'] < 4)]


# 피처와 타겟 분리
X_test_final = test_data_filtered.drop(columns=['scale_pv'])
y_test_final = test_data_filtered['scale_pv']


# 최종 테스트 데이터 예측 및 평가
def final_evaluate_model(model, X_test, y_test, scaler):
    y_test_pred_scaled = model.predict(X_test)
    
    # 역스케일링
    y_test_pred = scaler.inverse_transform(y_test_pred_scaled.reshape(-1, 1))
    y_test_original = scaler.inverse_transform(y_test.values.reshape(-1, 1))
    
    test_mae = mean_absolute_error(y_test_original, y_test_pred)
    test_mape = mean_absolute_percentage_error(y_test_original, y_test_pred)
    
    return test_mae, test_mape, y_test_pred


# 최종 평가 결과
lr_test_mae_final, lr_test_mape_final, lr_y_test_pred_final = final_evaluate_model(lr_model, X_test_final, y_test_final, target_scaler)
rf_test_mae_final, rf_test_mape_final, rf_y_test_pred_final = final_evaluate_model(rf_model, X_test_final, y_test_final, target_scaler)
lgb_test_mae_final, lgb_test_mape_final, lgb_y_test_pred_final = final_evaluate_model(lgb_model, X_test_final, y_test_final, target_scaler)

print(f"Final Test - Linear Regression MAE: {lr_test_mae_final}, MAPE: {lr_test_mape_final*100}")
print(f"Final Test - Random Forest MAE: {rf_test_mae_final}, MAPE: {rf_test_mape_final*100}")
print(f"Final Test - LightGBM MAE: {lgb_test_mae_final}, MAPE: {lgb_test_mape_final*100}")

Final Test - Linear Regression MAE: 0.0009071171996330224, MAPE: 0.0286994529013356
Final Test - Random Forest MAE: 0.0009728531653467838, MAPE: 0.03077964999324502
Final Test - LightGBM MAE: 0.0009319051489468961, MAPE: 0.02948381780318609


In [5]:
# Grid Search CV
from sklearn.model_selection import GridSearchCV

# Random Forest
param_grid = {
    'n_estimators': [50, 100, 150],     # default=100
    'max_depth': [None, 5, 10],         # default=None
    'min_samples_split': [2, 4, 8],     # default=2
    'min_samples_leaf': [1, 2, 4]       # default=1
}

rf_model = RandomForestRegressor(random_state=42)
rf_grid = GridSearchCV(rf_model, param_grid, cv=5, n_jobs=3, verbose=1)


### 모델링 및 평가
# 중복값 제거
augmented_data.drop_duplicates(inplace=True)


# 피처와 타겟 분리
X = augmented_data.drop(columns=['scale_pv'])
y = augmented_data['scale_pv']


# 학습 데이터와 테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




rf_grid.fit(X_train, y_train)

# 최적의 하이퍼파라미터
print(rf_grid.best_params_)
print(rf_grid.best_score_)
print(rf_grid.best_estimator_)
print(rf_grid.cv_results_)
print(rf_grid.cv_results_['mean_test_score'])



Fitting 5 folds for each of 81 candidates, totalling 405 fits


KeyboardInterrupt: 

### Scaling 오류 수정

In [None]:
# 타겟 스케일링
target_scaler = StandardScaler()
# - 수정 전 : target_scaler.fit(target.values.reshape(-1, 1))
# - 수정 : y_train과 y_test를 나눈 휘에 scaling
# - 이유 : y_train과 y_test를 나누기 전에 scaling을 진행하면 y_train과 y_test의 scaling 기준이 달라질 수 있음 (같은 기준으로 scaling을 진행되버림)
y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1)).flatten()
y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1)).flatten()

# Multiple Regression
lr_model = LinearRegression()
# - 수정 전 : lr_train_mae, lr_test_mae, lr_train_mape, lr_test_mape, lr_y_train_pred, lr_y_test_pred = train_and_evaluate_model(lr_model, X_train, X_test, y_train, y_test, target_scaler)
# - 수정 : y_train_scaled, y_test_scaled로 변경
# - 이유 : 나눠진 scaling을 반영
lr_train_mae, lr_test_mae, lr_train_mape, lr_test_mape, lr_y_train_pred, lr_y_test_pred = train_and_evaluate_model(lr_model, X_train, X_test, y_train_scaled, y_test_scaled, target_scaler)


# 모델 학습 및 평가 함수
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, target_scaler):
    model.fit(X_train, y_train)
    y_train_pred_scaled = model.predict(X_train)
    y_test_pred_scaled = model.predict(X_test)
    
    # 역스케일링
    y_train_pred = target_scaler.inverse_transform(y_train_pred_scaled.reshape(-1, 1)).flatten()
    y_test_pred = target_scaler.inverse_transform(y_test_pred_scaled.reshape(-1, 1)).flatten()
    y_train_original = target_scaler.inverse_transform(y_train.reshape(-1, 1)).flatten()
    y_test_original = target_scaler.inverse_transform(y_test.reshape(-1, 1)).flatten()
    
    train_mae = mean_absolute_error(y_train_original, y_train_pred)
    test_mae = mean_absolute_error(y_test_original, y_test_pred)
    train_mape = mean_absolute_percentage_error(y_train_original, y_train_pred)
    test_mape = mean_absolute_percentage_error(y_test_original, y_test_pred)
    
    return train_mae, test_mae, train_mape, test_mape, y_train_pred, y_test_pred


### 수정된 결과

In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb

# 데이터 준비 및 분리
non_zero_data = train_data[train_data['scale_pv'] != 0]
zero_data = train_data[train_data['scale_pv'] == 0]

features = non_zero_data.drop(columns=['scale_pv'])
target = non_zero_data['scale_pv']

# 피처 스케일링
feature_scaler = StandardScaler()
scaled_features = feature_scaler.fit_transform(features)
scaled_zero_features = feature_scaler.transform(zero_data.drop(columns=['scale_pv']))

# KNN 모델 적합
knn = KNeighborsRegressor(n_neighbors=50)
knn.fit(scaled_features, target)

# 예측값 생성
predicted_scale_pv = knn.predict(scaled_zero_features)

# 예측값 추가
zero_data.loc[:, 'scale_pv'] = predicted_scale_pv

# 데이터 결합 및 정렬
augmented_data = pd.concat([non_zero_data, zero_data]).sort_index()
augmented_data = augmented_data.loc[zero_data.index]

# 중복값 제거
augmented_data.drop_duplicates(inplace=True)

# 피처와 타겟 분리
X = augmented_data.drop(columns=['scale_pv'])
y = augmented_data['scale_pv']

# 학습 데이터와 테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 타겟 스케일링
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1)).flatten()
y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1)).flatten()

# 모델 학습 및 평가 함수
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, target_scaler):
    model.fit(X_train, y_train)
    y_train_pred_scaled = model.predict(X_train)
    y_test_pred_scaled = model.predict(X_test)
    
    # 역스케일링
    y_train_pred = target_scaler.inverse_transform(y_train_pred_scaled.reshape(-1, 1)).flatten()
    y_test_pred = target_scaler.inverse_transform(y_test_pred_scaled.reshape(-1, 1)).flatten()
    y_train_original = target_scaler.inverse_transform(y_train.reshape(-1, 1)).flatten()
    y_test_original = target_scaler.inverse_transform(y_test.reshape(-1, 1)).flatten()
    
    train_mae = mean_absolute_error(y_train_original, y_train_pred)
    test_mae = mean_absolute_error(y_test_original, y_test_pred)
    train_mape = mean_absolute_percentage_error(y_train_original, y_train_pred)
    test_mape = mean_absolute_percentage_error(y_test_original, y_test_pred)
    
    return train_mae, test_mae, train_mape, test_mape, y_train_pred, y_test_pred

# Multiple Regression
lr_model = LinearRegression()
lr_train_mae, lr_test_mae, lr_train_mape, lr_test_mape, lr_y_train_pred, lr_y_test_pred = train_and_evaluate_model(lr_model, X_train, X_test, y_train_scaled, y_test_scaled, target_scaler)

# Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_train_mae, rf_test_mae, rf_train_mape, rf_test_mape, rf_y_train_pred, rf_y_test_pred = train_and_evaluate_model(rf_model, X_train, X_test, y_train_scaled, y_test_scaled, target_scaler)

# LightGBM
lgb_model = lgb.LGBMRegressor(n_estimators=100, random_state=42)
lgb_train_mae, lgb_test_mae, lgb_train_mape, lgb_test_mape, lgb_y_train_pred, lgb_y_test_pred = train_and_evaluate_model(lgb_model, X_train, X_test, y_train_scaled, y_test_scaled, target_scaler)

# 결과 출력
print(f"Linear Regression - Train MAE: {lr_train_mae}, Train MAPE: {lr_train_mape*100}")
print(f"Linear Regression - Test MAE: {lr_test_mae}, Test MAPE: {lr_test_mape*100}")
print()
print(f"Random Forest - Train MAE: {rf_train_mae}, Train MAPE: {rf_train_mape*100}")
print(f"Random Forest - Test MAE: {rf_test_mae}, Test MAPE: {rf_test_mape*100}")
print()
print(f"LightGBM - Train MAE: {lgb_train_mae}, Train MAPE: {lgb_train_mape*100}")
print(f"LightGBM - Test MAE: {lgb_test_mae}, Test MAPE: {lgb_test_mape*100}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000740 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 223
[LightGBM] [Info] Number of data points in the train set: 79432, number of used features: 4
[LightGBM] [Info] Start training from score -0.000000
Linear Regression - Train MAE: 0.008000065747821082, Train MAPE: 0.2626429065684918
Linear Regression - Test MAE: 0.008093411808610328, Test MAPE: 0.2657457067460429

Random Forest - Train MAE: 0.0008429934786987631, Train MAPE: 0.027679116457716316
Random Forest - Test MAE: 0.002256703761518715, Test MAPE: 0.0741064297789399

LightGBM - Train MAE: 0.005034636909243942, Train MAPE: 0.16533452532913664
LightGBM - Test MAE: 0.0051635154239708545, Test MAPE: 0.16958519675386313
