In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns
import pickle
import shap

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1. 데이터 전처리 관련 모듈 import
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures, LabelEncoder, OrdinalEncoder

# 2. 데이터 분할
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, KFold, cross_val_predict, cross_validate, StratifiedKFold

# 3. 사용할 알고리즘 import
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.inspection import partial_dependence, PartialDependenceDisplay

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from pdpbox import pdp, info_plots

# 4. 성능 평가
from sklearn.metrics import r2_score, root_mean_squared_error, root_mean_squared_log_error, mean_absolute_percentage_error

In [3]:
def getScore(name, y_test, y_pred):
    """Print R2, MAPE and RMSE for one set of predictions on a single line.

    Args:
        name: Label printed at the start of the line (e.g. the model name).
        y_test: Ground-truth target values.
        y_pred: Predicted target values, scored against ``y_test``.

    Returns:
        None. The three metrics are only printed, each with three decimals.
    """
    # Evaluate the metrics in a fixed order: R2, then RMSE, then MAPE.
    # RMSLE (root_mean_squared_log_error) is not computed here.
    r2, rmse, mape = (
        metric(y_test, y_pred)
        for metric in (
            r2_score,
            root_mean_squared_error,
            mean_absolute_percentage_error,
        )
    )
    print(f'{name}, r2: {r2:.3f}, mape: {mape:.3f}, rmse: {rmse:.3f}')

In [4]:
# Load the dataset and use the 'Datetime' column as the row index.
df = pd.read_csv('./df_23rdtrial.csv', index_col = 'Datetime')

# Select features and target by column NAME. Taking "every column but the last"
# by position is only correct while 'SHFT_avg' happens to be the final column;
# otherwise the target would silently leak into the feature matrix.
x = df.loc[:, df.columns != 'SHFT_avg']
y = df['SHFT_avg']

# Last expression of the cell: show the three shapes as a sanity check.
df.shape, x.shape, y.shape

((2167, 30), (2167, 29), (2167,))

In [None]:
# 오토 인코더

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np

# Generate sample data: 2000 rows x 30 features drawn uniformly from [0, 1).
# A seeded generator keeps the sample (and every result derived from it)
# identical across "Restart & Run All".
rng = np.random.default_rng(42)
X = rng.random((2000, 30))

# Split BEFORE scaling so the scaler never sees the held-out rows.
X_train_raw, X_test_raw = train_test_split(X, test_size=0.2, random_state=42)

# Fit the min-max scaling on the training rows only, then apply the same
# transformation to the test rows. Test values may fall slightly outside [0, 1].
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Full scaled matrix, kept under its original name for any later cell that uses it.
X_scaled = scaler.transform(X)

# Convert both splits to float32 tensors for PyTorch.
X_train = torch.tensor(X_train_scaled, dtype=torch.float32)
X_test = torch.tensor(X_test_scaled, dtype=torch.float32)

# 오토인코더 모델 정의
class Autoencoder(nn.Module):
    """Fully connected autoencoder with a single hidden (bottleneck) layer.

    The encoder maps ``input_dim`` features to ``hidden_dim`` units through a
    linear layer followed by ReLU. The decoder maps them back to ``input_dim``
    features through a linear layer followed by a sigmoid, so every
    reconstructed value lies in the open interval (0, 1).

    Args:
        input_dim: Number of features per sample; also the width of the
            reconstruction.
        hidden_dim: Width of the bottleneck layer. Defaults to 16, the value
            that was previously hard-coded, so ``Autoencoder(input_dim)``
            builds the same architecture as before.
    """

    def __init__(self, input_dim, hidden_dim=16):
        super().__init__()
        # Encoder: input_dim -> hidden_dim, ReLU activation.
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
        )
        # Decoder: hidden_dim -> input_dim, sigmoid squashes outputs into (0, 1).
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, input_dim),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Encode ``x`` and return its reconstruction (same trailing width as the input)."""
        return self.decoder(self.encoder(x))

# Seed PyTorch so the random weight initialisation, and therefore the whole
# loss curve, is reproducible across kernel restarts.
torch.manual_seed(42)

# Initialise the model with one input unit per feature column.
input_dim = X_train.shape[1]
model = Autoencoder(input_dim)

# Loss function and optimiser: mean squared reconstruction error, Adam.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Train the model: full-batch gradient descent, one optimiser step per epoch.
num_epochs = 50
model.train()  # nothing in the loop changes the mode, so set it once up front
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X_train)
    # The autoencoder's target is its own input.
    loss = criterion(outputs, X_train)
    loss.backward()
    optimizer.step()

    # Report the training loss every 10th epoch.
    if (epoch+1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

# Compute per-sample reconstruction errors (squared error averaged over the
# feature axis) for both splits, with gradient tracking switched off.
model.eval()
with torch.no_grad():
    # Training-set errors serve as the baseline for the threshold below.
    train_outputs = model(X_train)
    train_reconstruction_error = torch.mean((X_train - train_outputs) ** 2, dim=1).numpy()

    test_outputs = model(X_test)
    reconstruction_error = torch.mean((X_test - test_outputs) ** 2, dim=1).numpy()

# Threshold: mean + 3 standard deviations of the TRAINING reconstruction error.
# Deriving it from the test errors instead would let outliers in the test set
# raise their own threshold.
threshold = train_reconstruction_error.mean() + 3 * train_reconstruction_error.std()

# Flag test samples whose reconstruction error exceeds the threshold.
anomalies = reconstruction_error > threshold
print("임계치를 벗어나는 데이터 개수:", np.sum(anomalies))
print("임계치를 벗어나는 인덱스:", np.where(anomalies)[0])