### 빠르게 1차 데이터 전처리 방식대로 구현

In [1]:
# Load Data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Data Preprocessing : 1st 분석 결과
# - 05.27 수정 : 학습용 위해 2~4 미규제, E_scr_pv == 8, k_rpm_pv > 100 제거, 10월 기준 분리
data = pd.read_csv('../DATA/raw_2023051820231018_경대기업맞춤형.csv')
data.drop('Unnamed: 12', axis=1, inplace=True)  # Unnamed: 12 컬럼 제거
print('기본 데이터 shape :', data.shape)
# 1) 행 제거 (규제) 
# - 2 < scale_pv < 4 이외 데이터 제거 => 증강용으로 보류
# - scale_pv < 5 만 남김
# - E_scr_pv != 8 외 데이터 제거
# - k_rpm_pv < 100 제거
# data = data[(data['scale_pv'] > 2) & (data['scale_pv'] < 4)]
data = data[data['scale_pv'] < 5]  # 약 1800개 제거
print('scale_pv < 5 shape :', data.shape)
data = data[data['E_scr_pv'] == 8]  # 약 3800개 제거
print('E_scr_pv == 8 shape :', data.shape)
data = data[data['k_rpm_pv'] > 100] # 약 170개 제거
print('k_rpm_pv > 100 shape :', data.shape)

# 2) 컬럼 제거
# - E_scr_sv, c_temp_sv, n_temp_sv, s_temp_sv, k_rpm_sv, n_temp_sv 제거
data.drop(['E_scr_sv', 'E_scr_pv', 'c_temp_sv', 's_temp_sv', 'k_rpm_sv', 'n_temp_sv'], axis=1, inplace=True)
# data

# 3) oct_data, train_data 분리
data['time'] = pd.to_datetime(data['time'])

oct_data = data[data['time'].dt.month == 10]
oct_data = oct_data.drop('time', axis=1)
print('oct_data shape :', oct_data.shape)

train_data = data[data['time'].dt.month != 10]
train_data = train_data.drop('time', axis=1)
print('train_data shape :', train_data.shape)

# 4) 데이터 저장
oct_data.to_csv('../DATA/oct_data.csv', index=False)
train_data.to_csv('../DATA/train_data.csv', index=False)

기본 데이터 shape : (235413, 12)
scale_pv < 5 shape : (233676, 12)
E_scr_pv == 8 shape : (229983, 12)
k_rpm_pv > 100 shape : (229810, 12)
oct_data shape : (29651, 5)
train_data shape : (200159, 5)


### 증강 없이 학습 & 예측
1. Oct_data : scale만 조정한 10월 데이터
2. train_data : scale, k_rpm_pv만 조정한 10월 이전 데이터

In [3]:
# 데이터 규제 : 2 < scale_pv < 4
train_data = train_data[(train_data['scale_pv'] > 2) & (train_data['scale_pv'] < 4)]
oct_data = oct_data[(oct_data['scale_pv'] > 2) & (oct_data['scale_pv'] < 4)]
print('train_data shape :', train_data.shape)
print('oct_data shape :', oct_data.shape)

oct_data.scale_pv.describe()

train_data shape : (36720, 5)
oct_data shape : (1405, 5)


count    1405.000000
mean        3.047580
std         0.032548
min         2.850000
25%         3.030000
50%         3.050000
75%         3.060000
max         3.280000
Name: scale_pv, dtype: float64

In [4]:
# 증강 없이 학습
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score

X = train_data.drop('scale_pv', axis=1)
y = train_data['scale_pv']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
# param_grid = {
#     'n_estimators': [100, 200],
#     'min_samples_split': [2, 5],
#     'min_samples_leaf': [1, 2]
# }
# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, n_jobs=-1, verbose=3, scoring='neg_mean_absolute_error')
# grid_search.fit(X_train, y_train)
# print(grid_search.best_params_)

# rf = grid_search.best_estimator_
y_pred = rf.predict(X_test)

print('MAE :', mean_absolute_error(y_test, y_pred))
print('MAPE :', mean_absolute_percentage_error(y_test, y_pred)*100)
print('R2 :', r2_score(y_test, y_pred))

# 증강 없이 예측
X_oct = oct_data.drop('scale_pv', axis=1)
y_oct = oct_data['scale_pv']
y_oct_pred = rf.predict(X_oct)

print('MAE :', mean_absolute_error(y_oct, y_oct_pred))
print('MAPE :', mean_absolute_percentage_error(y_oct, y_oct_pred)*100)
print('R2 :', r2_score(y_oct, y_oct_pred))

# - rf에서 criterion='absolute_error'로 설정하면 너무 오래걸림

MAE : 0.020739688478127995
MAPE : 0.6832850969599515
R2 : 0.4061375618241577
MAE : 0.027396616076884737
MAPE : 0.8969436832994633
R2 : -0.2623783404725153


In [5]:
# 사실상 위 값은 피처 공학적 요소가 거의 없었다;

### 다중공선성 확인

### 시명님의 증강 방법

In [6]:
from datetime import timedelta

# DF1 = data.copy()

# DF1['time'] = pd.to_datetime(DF1['time'])

# for idx, row in DF1.iterrows():
#     if 2 <= row['scale_pv'] <= 4:
#         target_time6 = row['time'] - timedelta(seconds=10)
#         mask = (DF1['time'] <= row['time']) & (DF1['time'] >= target_time6)
#         previous_rows = DF1[mask]
        
#         if len(previous_rows) > 1:
#             previous_row = previous_rows.iloc[-2]  # 조건을 만족하는 가장 마지막에서 두 번째 행
#             if previous_row['scale_pv'] <= 1:
#                 # 조건을 만족하는 모든 행의 scale_pv 값을 패딩
#                 for i in range(len(previous_rows)):
#                     if previous_rows.iloc[i]['scale_pv'] <= 1:
#                         DF1.loc[previous_rows.index[i], 'scale_pv'] = row['scale_pv']
# DF1_2to4 = DF1[DF1.scale_pv.between(2,4)]
# DF1_2to4

# gpt가 수정 ================================
# data = pd.read_csv('../DATA/raw_2023051820231018_경대기업맞춤형.csv')

# DF1 = data.copy()
# DF1['time'] = pd.to_datetime(DF1['time'])
# DF1 = DF1.sort_values('time')  # 시간 순으로 정렬

# # scale_pv가 2에서 4 사이인지 아닌지에 대한 boolean mask 생성
# mask_2to4 = DF1['scale_pv'].between(2, 4)

# # 10초 이내의 이전 행들 중 scale_pv가 1 이하인 행을 찾기 위한 mask 생성
# mask_10s = (DF1['time'].values - DF1['time'].values[:, None]) <= np.timedelta64(10, 's')
# mask_le1 = DF1['scale_pv'].values <= 1
# mask_prev_le1 = np.tril(mask_10s & mask_le1)

# # scale_pv가 2에서 4 사이인 행의 이전 행들 중 scale_pv가 1 이하인 행을 찾아서 업데이트
# for idx in np.where(mask_2to4)[0]:
#     prev_rows = np.where(mask_prev_le1[idx])[0]
#     if len(prev_rows) > 0:
#         DF1.loc[DF1.index[prev_rows], 'scale_pv'] = DF1.loc[DF1.index[idx], 'scale_pv']

# DF1_2to4 = DF1[mask_2to4]
# DF1_2to4

In [7]:
# 전처리 진행
# data_2to4, oct_data_2to4, train_data_2to4 = prep_1st(DF1_2to4)

# print(data_2to4.shape, oct_data_2to4.shape, train_data_2to4.shape)

==> 메모리 문제,,

In [8]:
# 증강하여 학습
# - train_data의 1 미만인 값에서 KNN으로 증강
from sklearn.impute import KNNImputer

data = pd.read_csv('../DATA/raw_2023051820231018_경대기업맞춤형.csv')

# 10월 이전 데이터만 사용
data['time'] = pd.to_datetime(data['time'])
train_data = data[data['time'].dt.month != 10]
oct_data = data[data['time'].dt.month == 10]

# scale_pv < 4 데이터만 사용
train_data = train_data[train_data['scale_pv'] < 4]

# n_temp_sv == 0 제거
train_data = train_data[train_data['n_temp_sv'] != 0]

# 100 < k_rpm_pv
train_data = train_data[train_data['k_rpm_pv'] > 100]

# scale_pv < 2 => NaN
train_data['scale_pv'] = train_data['scale_pv'].apply(lambda x: np.nan if x < 2 else x)
print('NaN 개수 :', train_data['scale_pv'].isnull().sum())

# drop columns
train_data.drop(['Unnamed: 12', 'E_scr_sv', 'c_temp_sv', 'n_temp_sv', 's_temp_sv', 'k_rpm_sv', 'time'], axis=1, inplace=True)

# KNN Imputer : target은 scale_pv, NaN은 2 미만인 값
X = train_data.drop('scale_pv', axis=1)
y = train_data['scale_pv']

# scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X)

# imputer
imputer = KNNImputer(n_neighbors=5)
# y = imputer.fit_transform(y.values.reshape(-1, 1)).reshape(-1)
print('NaN 개수 :', np.isnan(y).sum())

# 증강된 데이터로 학습
train_data['scale_pv'] = y

X = train_data.drop('scale_pv', axis=1)
y = train_data['scale_pv']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print('MAE :', mean_absolute_error(y_test, y_pred))
print('MAPE :', mean_absolute_percentage_error(y_test, y_pred)*100)
print('R2 :', r2_score(y_test, y_pred))

# 증강된 데이터로 예측
# - oct_data : 2 < scale_pv < 4
oct_data = oct_data[(oct_data['scale_pv'] > 2) & (oct_data['scale_pv'] < 4)]
oct_data.drop(['Unnamed: 12', 'E_scr_sv', 'c_temp_sv', 'n_temp_sv', 's_temp_sv', 'k_rpm_sv', 'time'], axis=1, inplace=True)

X_oct = oct_data.drop('scale_pv', axis=1)
y_oct = oct_data['scale_pv']
y_oct_pred = rf.predict(X_oct)

print('MAE :', mean_absolute_error(y_oct, y_oct_pred))
print('MAPE :', mean_absolute_percentage_error(y_oct, y_oct_pred)*100)
print('R2 :', r2_score(y_oct, y_oct_pred))

NaN 개수 : 163440
NaN 개수 : 163440


ValueError: Input y contains NaN.

KNN 증강에서 오류, 수정해야함