### 승민님 분석 정리하기

In [2]:
# 1. Load the data
import pandas as pd

data = pd.read_csv('../DATA/raw_2023051820231018_경대기업맞춤형.csv')
data.tail()

Unnamed: 0,time,E_scr_pv,E_scr_sv,c_temp_pv,c_temp_sv,k_rpm_pv,k_rpm_sv,n_temp_pv,n_temp_sv,scale_pv,s_temp_pv,s_temp_sv,Unnamed: 12
235408,2023-10-16T09:34:12.837701Z,8,8,69.8,70,191,1910,67.5,70,0.0,67.0,70,0.0
235409,2023-10-16T09:34:13.915681Z,8,8,69.7,70,191,1910,67.4,70,0.0,66.8,70,0.0
235410,2023-10-16T09:34:14.978068Z,8,8,69.7,70,191,1910,67.3,70,0.0,66.7,70,0.0
235411,2023-10-16T09:34:16.040468Z,8,8,69.8,70,191,1910,67.2,70,0.0,66.6,70,0.0
235412,2023-10-16T09:34:17.087220Z,8,8,69.8,70,0,1910,67.3,70,0.0,66.6,70,


In [3]:
# 2. Preprocess the data
data.drop('Unnamed: 12', axis=1, inplace=True)
data['time'] = pd.to_datetime(data['time']).dt.tz_localize(None)

# 2.1 Split the data
oct_1 = pd.Timestamp('2023-10-01')
before_data = data[data['time'] < oct_1]
after_data = data[data['time'] >= oct_1]
mydata = before_data.drop('time', axis=1)

# 무의미 sv 제거
mydata.drop(['s_temp_sv', 'c_temp_sv'], axis=1, inplace=True)

# E_scr_sv가 8인 데이터만 남기기
mydata = mydata[mydata['E_scr_sv']==8]


# E_scr_sv, n_temp_sv 제거
mydata.drop(['E_scr_sv', 'n_temp_sv'], axis=1, inplace=True)

# scale_pv < 4
mydata2 = mydata[mydata['scale_pv'] < 4]

# c_temp_pv >= 68
# - 65.1 일때 scale_pv는 모두 0
mydata2 = mydata2[mydata2['c_temp_pv'] >= 68]

# k_rpm_sv, E_scr_sv 제거 
mydata2.drop('k_rpm_sv', axis=1, inplace=True)
mydata2.drop('E_scr_pv', axis=1, inplace=True)

# filtering 분석 : k_rpm_pv < 50 확인
# - 결과 : 2.5이상이 22개, 해당 구간 삭제
mydata2 = mydata2[mydata2['k_rpm_pv'] >= 50]

# scale_pv : 0 초과 2.5 미만 삭제
# - .unique()보단 hist나 box로 보는 게 이상치 판단에 도움될 듯!
# - 결과 : 0 초과 2.5 미만인 데이터 삭제
mydata2 = mydata2[(mydata2['scale_pv'] <= 0) | (mydata2['scale_pv'] >= 2.5)]

# 이상치로 판단해 k_rpm_pv < 162.5 제거
mydata2 = mydata2[mydata2['k_rpm_pv'] >= 162.5]

# save the data
mydata2.to_csv('../DATA/before_data.csv', index=False)

- 확인할 부분에 filter 변수를 만들어 그래프 확인하는 방법은 정말 좋습니다
- 두 가지 조건을 &로 묶어서 필터링하기도 했네요
- 

In [4]:
# 3. KNN 증강
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

data = pd.read_csv('../DATA/before_data.csv')

non_zero_data = data[data['scale_pv'] != 0]
zero_data = data[data['scale_pv'] == 0]

features = non_zero_data.drop(columns=['scale_pv'])
target = non_zero_data['scale_pv']

scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
scaled_zero_features = scaler.transform(zero_data.drop(columns=['scale_pv']))

knn = KNeighborsRegressor(n_neighbors=10)
knn.fit(scaled_features, target)

predicted_scale_pv = knn.predict(scaled_zero_features)

zero_data.loc[:, 'scale_pv'] = predicted_scale_pv

augmented_data = pd.concat([non_zero_data, zero_data]).sort_index()
augmented_data = augmented_data.loc[data.index]

augmented_data.drop_duplicates(inplace=True)

4. 학습

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [6]:
X = augmented_data.drop(columns=['scale_pv'])
y = augmented_data['scale_pv']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

- 스케일링 & 역스케일링?

In [7]:
# 모델 학습 및 평가 함수
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, scaler):
    model.fit(X_train, y_train)
    y_train_pred_scaled = model.predict(X_train)
    y_test_pred_scaled = model.predict(X_test)
    
    # 역스케일링
    y_train_pred = scaler.inverse_transform(y_train_pred_scaled.reshape(-1, 1))
    y_test_pred = scaler.inverse_transform(y_test_pred_scaled.reshape(-1, 1))
    y_train_original = scaler.inverse_transform(y_train.values.reshape(-1, 1))
    y_test_original = scaler.inverse_transform(y_test.values.reshape(-1, 1))
    
    train_mae = mean_absolute_error(y_train_original, y_train_pred)
    test_mae = mean_absolute_error(y_test_original, y_test_pred)
    train_mape = mean_absolute_percentage_error(y_train_original, y_train_pred)
    test_mape = mean_absolute_percentage_error(y_test_original, y_test_pred)
    
    return train_mae, test_mae, train_mape, test_mape, y_train_pred, y_test_pred

# 역스케일링
target_scaler = StandardScaler()
target_scaler.fit(target.values.reshape(-1, 1))

In [8]:
# Multiple Regression
lr_model = LinearRegression()
lr_train_mae, lr_test_mae, lr_train_mape, lr_test_mape, lr_y_train_pred, lr_y_test_pred = train_and_evaluate_model(lr_model, X_train, X_test, y_train, y_test, target_scaler)

# Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_train_mae, rf_test_mae, rf_train_mape, rf_test_mape, rf_y_train_pred, rf_y_test_pred = train_and_evaluate_model(rf_model, X_train, X_test, y_train, y_test, target_scaler)

# LightGBM
lgb_model = lgb.LGBMRegressor(n_estimators=100, random_state=42)
lgb_train_mae, lgb_test_mae, lgb_train_mape, lgb_test_mape, lgb_y_train_pred, lgb_y_test_pred = train_and_evaluate_model(lgb_model, X_train, X_test, y_train, y_test, target_scaler)

print(f"Linear Regression - Train MAE: {lr_train_mae}, Train MAPE: {lr_train_mape*100}")
print(f"Linear Regression - Test MAE: {lr_test_mae}, Test MAPE: {lr_test_mape*100}")
print()
print(f"Random Forest - Train MAE: {rf_train_mae}, Train MAPE: {rf_train_mape*100}")
print(f"Random Forest - Test MAE: {rf_test_mae}, Test MAPE: {rf_test_mape*100}")
print()
print(f"LightGBM - Train MAE: {lgb_train_mae}, Train MAPE: {lgb_train_mape*100}")
print(f"LightGBM - Test MAE: {lgb_test_mae}, Test MAPE: {lgb_test_mape*100}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001537 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 190
[LightGBM] [Info] Number of data points in the train set: 103004, number of used features: 4
[LightGBM] [Info] Start training from score 3.042988
Linear Regression - Train MAE: 0.0005966546560440541, Train MAPE: 0.01893030573511891
Linear Regression - Test MAE: 0.0005914186866800776, Test MAPE: 0.018763765599087552

Random Forest - Train MAE: 0.00017786994794854595, Train MAPE: 0.0056433573507702894
Random Forest - Test MAE: 0.00037988455880022466, Test MAPE: 0.01205248917724515

LightGBM - Train MAE: 0.0005026233173996973, Train MAPE: 0.015946624900180676
LightGBM - Test MAE: 0.0005059680618125167, Test MAPE: 0.016052509584770225


5. 테스트 : 10월 데이터

- 전처리

In [11]:
test_data = pd.read_csv('../DATA/raw_2023051820231018_경대기업맞춤형.csv')
test_data.drop('Unnamed: 12', axis=1, inplace=True)

# 10월 1일 이후 데이터
oct_1 = pd.Timestamp('2023-10-01')
test_data['time'] = pd.to_datetime(test_data['time']).dt.tz_localize(None)
test_data = test_data[test_data['time'] >= oct_1]

# 2 < scale_pv < 4
test_data = test_data[(test_data['scale_pv'] > 2) & (test_data['scale_pv'] < 4)]

# 컬럼 제거
test_data = test_data.drop(columns=['time', 's_temp_sv', 'c_temp_sv', 'E_scr_sv', 'n_temp_sv', 'k_rpm_sv', 'E_scr_pv'])

test_data.head()

Unnamed: 0,c_temp_pv,k_rpm_pv,n_temp_pv,scale_pv,s_temp_pv
205770,69.6,168,70.1,3.06,68.0
205771,69.5,169,70.1,3.06,68.0
205784,69.6,169,70.6,3.16,69.4
205785,69.6,173,70.5,3.16,69.5
205824,69.6,180,69.3,3.17,68.0


- 예측

In [12]:
# Predict without KNN

X_oct = test_data.drop(columns=['scale_pv'])
y_oct = test_data['scale_pv']

# rf_model
y_oct_pred_scaled = rf_model.predict(X_oct)
y_oct_pred = target_scaler.inverse_transform(y_oct_pred_scaled.reshape(-1, 1))

oct_mae = mean_absolute_error(y_oct, y_oct_pred)
oct_mape = mean_absolute_percentage_error(y_oct, y_oct_pred)

print(f"October - MAE: {oct_mae}, MAPE: {oct_mape*100}")

October - MAE: 0.10530561946370168, MAPE: 3.465242154006414
