### Pipeline으로 전처리 - 모델링 - 예측까지 한번에

In [64]:
# scikit-learn update
# %conda install -c conda-forge scikit-learn

In [65]:
# 1. Read the raw process data
import pandas as pd

data = pd.read_csv('../DATA/바웰공정데이터.csv')

# 2. Preprocessing: the cleaning steps carried over from the work done through Thursday
# A row is kept only when all three conditions hold:
#   (1) 2 < scale_pv < 4
#   (2) k_rpm_pv is above 100
#   (3) n_temp_sv is not 0
scale_in_range = data['scale_pv'].gt(2) & data['scale_pv'].lt(4)
rpm_above_100 = data['k_rpm_pv'].gt(100)
n_temp_sv_nonzero = data['n_temp_sv'].ne(0)
data = data[scale_in_range & rpm_above_100 & n_temp_sv_nonzero]

# (4) Remove the *_sv columns and the time column
unused_columns = ['E_scr_sv', 'c_temp_sv', 'n_temp_sv', 's_temp_sv', "k_rpm_sv", 'time']
data = data.drop(columns=unused_columns)

data.head()

Unnamed: 0,E_scr_pv,c_temp_pv,k_rpm_pv,n_temp_pv,scale_pv,s_temp_pv
0,8,69.6,189,67.2,3.01,67.1
1,8,69.8,189,67.2,3.01,67.0
2,8,69.7,189,67.9,3.08,65.9
3,8,69.7,189,67.8,3.08,65.9
4,8,69.7,189,67.8,3.08,65.9


In [66]:
# 2-2. Preprocessing : 추가 전처리


In [67]:
# Count the fully duplicated rows before removal
duplicates_before = data.duplicated().sum()
print(duplicates_before)

# Drop the duplicates, then print the count again to confirm none remain
data = data.drop_duplicates()
duplicates_after = data.duplicated().sum()
print(duplicates_after)

2519
0


In [68]:
# 

In [69]:
# 3. Feature Engineering
# - Handle scaling and modelling together in a single Pipeline
# - Scaling : MinMaxScaler, StandardScaler, RobustScaler, plus one run without scaling
# - Model : LinearRegression, ElasticNet, RandomForest, LightGBM
# - Evaluation : MAE, MAPE, R2
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score

# 3-1. Split the data: scale_pv is the target, every remaining column is a feature
X = data.drop('scale_pv', axis=1)
y = data['scale_pv']

# 80/20 split with a fixed seed so the same rows land in each split on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# One pipeline per (scaler, model) pair, keyed '<ScalerClass>_<ModelClass>'.
# Models are the outer loop so the dict order (and therefore the fit and print
# order below) is grouped by model: all LinearRegression variants first, etc.
# NOTE: RandomForestRegressor() is created without random_state, so its
# scores can differ between runs.
scaler_classes = (MinMaxScaler, StandardScaler, RobustScaler)
model_classes = (LinearRegression, ElasticNet, RandomForestRegressor, LGBMRegressor)
pipelines = {
    f'{scaler_cls.__name__}_{model_cls.__name__}': Pipeline(
        [('scaler', scaler_cls()), ('model', model_cls())]
    )
    for model_cls in model_classes
    for scaler_cls in scaler_classes
}

# Fit every pipeline on the training split
for pipeline in pipelines.values():
    pipeline.fit(X_train, y_train)

# Evaluate every fitted pipeline on the held-out test split
for name, pipeline in pipelines.items():
    y_pred = pipeline.predict(X_test)
    print(f'{name} - MAE : {mean_absolute_error(y_test, y_pred):.4f}')
    print(f'{name} - MAPE : {mean_absolute_percentage_error(y_test, y_pred):.6f}')
    print(f'{name} - R2 : {r2_score(y_test, y_pred):.4f}')
    print('----------------------------------')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000205 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 210
[LightGBM] [Info] Number of data points in the train set: 20462, number of used features: 5
[LightGBM] [Info] Start training from score 3.042010
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000157 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 214
[LightGBM] [Info] Number of data points in the train set: 20462, number of used features: 5
[LightGBM] [Info] Start training from score 3.042010
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000463 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2

In [70]:
# Pipelines without a scaling step: each one wraps only the estimator,
# keyed 'NoScaler_<ModelClass>'
pipelines = {
    f'NoScaler_{model_cls.__name__}': Pipeline([('model', model_cls())])
    for model_cls in (LinearRegression, ElasticNet, RandomForestRegressor, LGBMRegressor)
}

# Train all pipelines on the training split before any evaluation
for pipeline in pipelines.values():
    pipeline.fit(X_train, y_train)

# Report each metric on the test split; every entry is (label, function, float format)
metric_table = (
    ('MAE', mean_absolute_error, '.4f'),
    ('MAPE', mean_absolute_percentage_error, '.6f'),
    ('R2', r2_score, '.4f'),
)
for name, pipeline in pipelines.items():
    y_pred = pipeline.predict(X_test)
    for metric_label, metric_fn, number_format in metric_table:
        print(f'{name} - {metric_label} : {metric_fn(y_test, y_pred):{number_format}}')
    print('----------------------------------')
    

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000109 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 214
[LightGBM] [Info] Number of data points in the train set: 20462, number of used features: 5
[LightGBM] [Info] Start training from score 3.042010
NoScaler_LinearRegression - MAE : 0.0282
NoScaler_LinearRegression - MAPE : 0.009285
NoScaler_LinearRegression - R2 : 0.0344
----------------------------------
NoScaler_ElasticNet - MAE : 0.0288
NoScaler_ElasticNet - MAPE : 0.009469
NoScaler_ElasticNet - R2 : -0.0000
----------------------------------
NoScaler_RandomForestRegressor - MAE : 0.0225
NoScaler_RandomForestRegressor - MAPE : 0.007421
NoScaler_RandomForestRegressor - R2 : 0.4203
----------------------------------
NoScaler_LGBMRegressor - MAE : 0.0256
NoScaler_LGBMRegressor - MAPE : 0.008420
NoScaler_LGBMRegressor - R2 : 0.2656
--

In [76]:
# Final model choice: StandardScaler followed by RandomForestRegressor
final_steps = [('scaler', StandardScaler()), ('model', RandomForestRegressor())]
final_pipeline = Pipeline(final_steps)

# Pipeline.fit returns the pipeline itself, so fitting and predicting can be chained
y_pred = final_pipeline.fit(X_train, y_train).predict(X_test)

# Score the final model on the test split
final_mae = mean_absolute_error(y_test, y_pred)
final_mape = mean_absolute_percentage_error(y_test, y_pred)
final_r2 = r2_score(y_test, y_pred)
print(f'Final Model - MAE : {final_mae:.4f}')
print(f'Final Model - MAPE : {final_mape:.6f}')
print(f'Final Model - R2 : {final_r2:.4f}')
print('----------------------------------')

Final Model - MAE : 0.0225
Final Model - MAPE : 0.007392
Final Model - R2 : 0.4266
----------------------------------


In [80]:
# LSTM modelling
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, LSTM
from tensorflow.keras.callbacks import EarlyStopping

# Data preprocessing: the scaler is fitted on the training split only and
# then applied unchanged to the test split
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# LSTM model: each sample is fed as a sequence of n_features steps with one
# value per step. The input shape is declared with an explicit Input layer
# rather than `input_shape=` on the LSTM, which Keras 3 warns about.
model = Sequential()
model.add(Input(shape=(X_train_scaled.shape[1], 1)))
model.add(LSTM(64))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')

# EarlyStopping: stop once val_loss has not improved for 10 consecutive epochs
es = EarlyStopping(monitor='val_loss', mode='min', patience=10, verbose=1)

# Reshape (samples, features) -> (samples, features, 1) to match the declared input
X_train_scaled = X_train_scaled.reshape(X_train_scaled.shape[0], X_train_scaled.shape[1], 1)
X_test_scaled = X_test_scaled.reshape(X_test_scaled.shape[0], X_test_scaled.shape[1], 1)

# Fit: 20% of the training data is held out for validation; epochs=1000 is
# only an upper bound because early stopping normally ends training sooner
model.fit(X_train_scaled, y_train, validation_split=0.2, epochs=1000, batch_size=32, callbacks=[es])

# Predict, then flatten the (samples, 1) output to 1-D for the metric functions
y_pred = model.predict(X_test_scaled)
y_pred = y_pred.reshape(-1)

# Evaluate on the test split
print(f'LSTM - MAE : {mean_absolute_error(y_test, y_pred):.4f}')
print(f'LSTM - MAPE : {mean_absolute_percentage_error(y_test, y_pred):.6f}')
print(f'LSTM - R2 : {r2_score(y_test, y_pred):.4f}')
print('----------------------------------')

Epoch 1/1000


  super().__init__(**kwargs)


[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 1.6856 - val_loss: 0.0141
Epoch 2/1000
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0119 - val_loss: 0.0038
Epoch 3/1000
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0033 - val_loss: 0.0021
Epoch 4/1000
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0022 - val_loss: 0.0023
Epoch 5/1000
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0018 - val_loss: 0.0018
Epoch 6/1000
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0020 - val_loss: 0.0018
Epoch 7/1000
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0019 - val_loss: 0.0020
Epoch 8/1000
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0019 - val_loss: 0.0017
Epoch 9/1000
[1m512/512[0m [32m━━━

In [72]:
# 4. Hyperparameter Tuning