### Pipeline으로 전처리 - 모델링 - 예측까지 한번에

In [12]:
# scikitlearn update
# %conda install -c conda-forge scikit-learn

In [13]:
# 1. Load the data
import pandas as pd

data = pd.read_csv('../DATA/바웰공정데이터.csv')

# 2. Preprocessing : 목요일까지의 전처리
# (1) 2 < scale_pv < 3.6
data = data[(data['scale_pv'] > 2) & (data['scale_pv'] < 3.6)]

# (2) k_rpm_dv 생성
data['k_rpm_dv'] = data['k_rpm_sv'] - data['k_rpm_pv']

# (3) n_temp_sv=0 인 행 제거
data = data[data['n_temp_sv'] != 0]

# (4) 컬럼 제거 : E_scr_sv, c_temp_sv, n_temp_sv, k_rpm_sv, k_rpm_pv, s_temp_sv
data = data.drop(['E_scr_sv', 'c_temp_sv', 'n_temp_sv', 'k_rpm_sv', 'k_rpm_pv', 's_temp_sv'], axis=1)

data.head()

Unnamed: 0,time,E_scr_pv,c_temp_pv,n_temp_pv,scale_pv,s_temp_pv,k_rpm_dv
0,2023-05-18T15:09:19.018900Z,8,69.6,67.2,3.01,67.1,-9
1,2023-05-18T15:09:20.128150Z,8,69.8,67.2,3.01,67.0,-9
2,2023-05-18T15:12:38.819460Z,8,69.7,67.9,3.08,65.9,-9
3,2023-05-18T15:12:39.958188Z,8,69.7,67.8,3.08,65.9,-9
4,2023-05-18T15:12:41.050178Z,8,69.7,67.8,3.08,65.9,-9


In [14]:
# 2-2. Preprocessing : 추가 전처리
# (1) 시간 데이터 변환 - Month만 유의미했음
data['time'] = pd.to_datetime(data['time'])
data['Month'] = data['time'].dt.month
data.drop('time', axis=1, inplace=True)
data.reset_index(drop=True, inplace=True)

# (2) scale_pv 변화량 생성
data['scale_pv_dv'] = data['scale_pv'].diff().fillna(0)
data.head(10)


Unnamed: 0,E_scr_pv,c_temp_pv,n_temp_pv,scale_pv,s_temp_pv,k_rpm_dv,Month,scale_pv_dv
0,8,69.6,67.2,3.01,67.1,-9,5,0.0
1,8,69.8,67.2,3.01,67.0,-9,5,0.0
2,8,69.7,67.9,3.08,65.9,-9,5,0.07
3,8,69.7,67.8,3.08,65.9,-9,5,0.0
4,8,69.7,67.8,3.08,65.9,-9,5,0.0
5,8,69.7,67.4,3.01,65.8,-9,5,-0.07
6,8,69.8,67.4,3.01,66.0,-9,5,0.0
7,8,69.8,66.7,3.02,68.1,-9,5,0.01
8,8,69.9,66.8,3.02,67.9,-9,5,0.0
9,8,69.7,67.8,3.04,66.2,-9,5,0.02


In [20]:
# 극단적인 전처리
# (1) rpm_dv가 100 이상인 데이터 제거
data = data[data['k_rpm_dv'] < 100]

# (2) scr=7을 분리할까했지만 과대적합이 우려되어 제거

# (3) temp끼리, 혹은 scale 곱하기 등을 했지만 큰 상관관계가 없어 제거

# (4) temp끼리 더하기 : temp들과의 상관관게는 높음

# (5) temp에 패턴이 보이는 게 시계열 데이터로서 의미가 있을 수 있음
# 시계열 분석 : ARIMA, Prophet, LSTM 등

Unnamed: 0,E_scr_pv,c_temp_pv,n_temp_pv,scale_pv,s_temp_pv,k_rpm_dv,Month,scale_pv_dv,temp_sum,temp_mul
E_scr_pv,1.0,0.04258,-0.028117,0.025773,-0.01298,0.065748,0.167831,-0.000218,-0.010322,-0.010876
c_temp_pv,0.04258,1.0,0.264383,0.011624,0.137873,0.243408,0.410217,-0.003836,0.438271,0.428598
n_temp_pv,-0.028117,0.264383,1.0,0.143362,0.58564,0.039726,-0.113377,0.001125,0.8597,0.861643
scale_pv,0.025773,0.011624,0.143362,1.0,0.073616,0.053707,-0.069813,0.291902,0.111123,0.111962
s_temp_pv,-0.01298,0.137873,0.58564,0.073616,1.0,0.037655,-0.089807,-0.018845,0.873684,0.875944
k_rpm_dv,0.065748,0.243408,0.039726,0.053707,0.037655,1.0,0.476403,0.028634,0.097599,0.095311
Month,0.167831,0.410217,-0.113377,-0.069813,-0.089807,0.476403,1.0,-4.2e-05,-0.00638,-0.011118
scale_pv_dv,-0.000218,-0.003836,0.001125,0.291902,-0.018845,0.028634,-4.2e-05,1.0,-0.011122,-0.010989
temp_sum,-0.010322,0.438271,0.8597,0.111123,0.873684,0.097599,-0.00638,-0.011122,1.0,0.999893
temp_mul,-0.010876,0.428598,0.861643,0.111962,0.875944,0.095311,-0.011118,-0.010989,0.999893,1.0


In [21]:
# data set
X = data.drop('scale_pv', axis=1)
y = data['scale_pv']

# 3. Train-Test Split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

### Pipeline 만들기

In [23]:
# 3. Pipeline
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.ensemble import RandomForestRegressor


MAE : 0.0204451424700744


In [31]:
for i in Pipeline.__getattribute__():
    print(i)

TypeError: descriptor '__getattribute__' of 'object' object needs an argument

In [24]:

pipeline = make_pipeline(RandomForestRegressor())
pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)

print('MAE :', mean_absolute_error(y_test, pred))


MAE : 0.02032199083493613
