In [None]:
# 세팅
!pip install numpy pandas datetime torch torchvision scikit-learn matplotlib pytz google

In [1]:
# 라이브러리
import numpy as np
import pandas as pd
# from datetime import datetime
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
from pytz import timezone
import os
from google.colab import drive
import copy

In [2]:
# 데이터 가져오기
drive.mount('/content/drive')

path = "/content/drive/MyDrive/4-1_다변량통계분석/final_dataset_complete.csv"

# drive.mount('VGG')

Mounted at /content/drive


In [19]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
WINDOW_SIZE = 168
BATCH_SIZE = 128
LEARNING_RATE = 1e-2
EPOCHS = 100
PATIENCE = 10
MODEL_NAME = "vgg_model.h5"
MODEL_PATH = f"/content/drive/MyDrive/4-1_다변량통계분석/models/{MODEL_NAME}"

In [4]:
df = pd.read_csv(path)

print(df.iloc[:168].isnull().values.any()) # nan 존재
# print(df.iloc[165].isnull().values.any()) # 임의의 행에 nan 존재하는지 확인
print(df.iloc[169:].isnull().values.any()) # 그다음 행부터는 nan 존재하지 않음 확인

True
False


In [26]:
df = pd.read_csv(path)

# 1-1. 테스트셋(2021년도) 분리
df['datetime'] = pd.to_datetime(df['datetime'])
df_test = df[df['datetime'] >= '2021-01-01'].copy().reset_index(drop=True)
df = df[df['datetime']<'2021-01-01'].reset_index(drop=True)
print(f"train&eval 데이터 개수: {len(df)}")
print(f"test 데이터셋 개수: {len(df_test)}")

if 'datetime' in df.columns: df = df.drop(columns=['datetime']) # datetime 제거
Y = '청주 지역공급량(Gcal)'

# 1-2. 첫 168행 삭제
df = df.iloc[168:].reset_index(drop=True)
print(f"Nan 존재여부: {df.isnull().values.any()}") # 전체 데이터에 nan 있는지 확인

# 1-3. Y-X 형태로
cols = [Y] + [c for c in df.columns if c!=Y]
data_vals = df[cols].values
test_vals = df_test[cols].values
# y_test = df_test.iloc[:,0].values
# x_test = df_test.iloc[:,2:].values

# 1-4. train/val 분할
val_size = int(len(data_vals) * 0.2)
val_raw = data_vals[:val_size]
train_raw = data_vals[val_size:]

# 1-5. scaling - leakage 방지하기 위해 분할 먼저
scaler = StandardScaler()
train_data = scaler.fit_transform(train_raw)
val_data = scaler.transform(val_raw)
test_scaled = scaler.transform(test_vals)

train&eval 데이터 개수: 35064
test 데이터셋 개수: 8760
Nan 존재여부: False


In [None]:
df.head(10)

In [None]:
df.tail(10)

In [29]:
print(f"scaled_df 모양: {data_vals.shape}; scaled_df 길이: {data_vals.size}")
print(f"train_raw 모양: {train_raw.shape}; train_raw 길이: {train_raw.size}")

print(f"val_size: {val_size}")
print(f"cols size: {len(cols)}")
print(f"train size: {train_data.shape}")

scaled_df 모양: (34896, 503); scaled_df 길이: 17552688
train_raw 모양: (27917, 503); train_raw 길이: 14042251
val_size: 6979
cols size: 503
train size: (27917, 503)


In [17]:
# Sliding window dataset
class SlidingWindowDataset(Dataset):
  def __init__(self, data, window_size):
    self.data = data
    self.window_size = window_size

  def __len__(self):
    return len(self.data) - self.window_size

  def __getitem__(self, idx):
    # 입력 범위: idx ~ idx + window_size (과거 데이터)
    # 타겟 시점: idx + window_size (맞춰야 할 현재 시점)

    # 1. 입력 시퀀스 추출 (과거 window_size 개 + 현재 시점 1개)
    sequence = self.data[idx : idx + self.window_size + 1].copy() # shape: (window_size + 1, features)

    # 2. 정답 추출 (마지막 시점의 0번째 컬럼 = 지역공급량)
    target = sequence[-1, 0] # Y

    # 3. 마스크 채널
    # mask = np.ones((sequence.shape[0],1)) # (169, 1) 꼴의 행렬 [1,1,1, ..., 1]
    # 마지막 시점 마스킹 (데이터 0, 마스크 0)
    sequence[-1,0] = 0.0
    # mask[-1,0] = 0.0
    # 데이터와 마스크 합치기 (Concatenate)
    # sequence_masked = np.concatenate([sequence, mask], axis=1) # 결과 shape: (169, feature+1)

    sequence_tensor = torch.tensor(sequence.transpose(), dtype=torch.float32) # PyTorch Conv1d는 (Channel, Length) 순서를 원하므로 Transpose
    target_tensor = torch.tensor(target, dtype=torch.float32) # 결과 shape: (Features, Window_Size + 1)

    return sequence_tensor, target_tensor

# VGG 모델
class TimeSeriesVGG(nn.Module):
    def __init__(self, num_features):
        super(TimeSeriesVGG, self).__init__()

        # 입력: (Batch, Features, Time_Steps)
        # Features: 변수 개수 (500개 등)
        # Time_Steps: window_size + 1 (25개 등)

        self.features = nn.Sequential(
            # Block 1
            nn.Conv1d(in_channels=num_features, out_channels=64, kernel_size=3, padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            # 시계열 길이가 짧으면 Pooling은 조심해서 써야 함 (정보 손실)
            # 여기서는 차원 유지를 위해 Pooling 대신 Stride를 쓰거나 생략 가능
            # VGG 스타일을 살려 MaxPool 적용
            # nn.MaxPool1d(kernel_size=2, stride=2),

            # Block 2
            nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, padding=1),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),

            # Block 3
            nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding=1),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1) # 시간 차원을 1로 압축 (Global Pooling)
        )

        self.regressor = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(128, 1) # 최종 예측값 1개
        )

    def forward(self, x):
        x = self.features(x)
        x = self.regressor(x)
        return x

In [20]:
# Early Stopping 구현
best_val_loss = float('inf')
patience_counter = 0
best_model_weights = None
loss_history = {'train':[], 'val':[]}

# 데이터셋 구성
train_dataset = SlidingWindowDataset(train_data, WINDOW_SIZE)
val_dataset = SlidingWindowDataset(val_data, WINDOW_SIZE)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# 모델 학습
model = TimeSeriesVGG(num_features=data_vals.shape[1]).to(DEVICE)
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

print(f"학습 시작 (Max Epochs: {EPOCHS}, Patience: {PATIENCE})")

for epoch in range(EPOCHS): # Epoch 수 조절
    model.train()
    train_loss = 0.0

    for inputs, targets in train_loader:
        inputs, targets = inputs.to(DEVICE), targets.to(DEVICE).unsqueeze(1)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    loss_history['train'].append(avg_train_loss)

    # 검증
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE).unsqueeze(1)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)
    loss_history['val'].append(avg_val_loss)

    print(f"Epoch {epoch+1:03d} | Train Loss: {avg_train_loss:.5f} | Val Loss: {avg_val_loss:.5f}")

    if avg_val_loss < best_val_loss:
        # 성능이 좋아졌을 때
        best_val_loss = avg_val_loss
        patience_counter = 0
        # 현재 모델의 가중치를 깊은 복사(Deep Copy)로 저장해둠
        best_model_weights = copy.deepcopy(model.state_dict())
        # (선택) 파일로 저장하고 싶다면:
        torch.save(model.state_dict(), MODEL_PATH)
        print("  <-- Best Model Saved")
    else:
        # 성능이 안 좋아졌거나 같을 때
        patience_counter += 1
        print(f"  | Patience {patience_counter}/{PATIENCE}")

        if patience_counter >= PATIENCE:
            print(f"\n[Early Stopping] {epoch+1} 에폭에서 학습 조기 종료.")
            break

print("학습 완료.")
# 학습이 끝난 후(혹은 중단된 후), 저장해둔 최고의 가중치로 모델을 되돌립니다.
if best_model_weights is not None:
    model.load_state_dict(best_model_weights)
    print("최적의 검증 성능을 낸 모델로 가중치를 복구했습니다.")



학습 시작 (Max Epochs: 100, Patience: 10)
Epoch 001 | Train Loss: 0.72015 | Val Loss: 0.78732
  <-- Best Model Saved
Epoch 002 | Train Loss: 0.78381 | Val Loss: 0.71770
  <-- Best Model Saved
Epoch 003 | Train Loss: 0.87238 | Val Loss: 0.68218
  <-- Best Model Saved
Epoch 004 | Train Loss: 0.87108 | Val Loss: 0.67375
  <-- Best Model Saved
Epoch 005 | Train Loss: 0.87070 | Val Loss: 0.67184
  <-- Best Model Saved
Epoch 006 | Train Loss: 0.87032 | Val Loss: 0.67155
  <-- Best Model Saved
Epoch 007 | Train Loss: 0.87007 | Val Loss: 0.67181
  | Patience 1/10
Epoch 008 | Train Loss: 0.86978 | Val Loss: 0.67207
  | Patience 2/10
Epoch 009 | Train Loss: 0.86958 | Val Loss: 0.67227
  | Patience 3/10
Epoch 010 | Train Loss: 0.86942 | Val Loss: 0.67233
  | Patience 4/10
Epoch 011 | Train Loss: 0.86934 | Val Loss: 0.67258
  | Patience 5/10
Epoch 012 | Train Loss: 0.86921 | Val Loss: 0.67250
  | Patience 6/10
Epoch 013 | Train Loss: 0.86918 | Val Loss: 0.67272
  | Patience 7/10
Epoch 014 | Train Loss

In [31]:
# 1. 전체 데이터를 한 번에 가져오기 위해 배치를 통째로 설정
test_dataset = SlidingWindowDataset(test_scaled, WINDOW_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# 2. 데이터 딱 한 번만 꺼내기 (iter, next 사용)
# inputs, targets = next(iter(full_batch_loader))

predictions = []
actuals = []

# 3. 모델 예측
model.eval()
with torch.no_grad():
  for inputs, targets in test_loader:
    inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
    outputs = model(inputs)
    predictions.extend(outputs.cpu().numpy().flatten())
    actuals.extend(targets.cpu().numpy().flatten())

# 4. MAE 계산
predictions = np.array(predictions)
actuals = np.array(actuals)

mae = mean_absolute_error(predictions, actuals)

print(f"최종 MAE: {mae:.4f}")

# y_mean = scaler.mean_[0]
# y_std = scaler.scale_[0]

# # 공식: (예측값 * 표준편차) + 평균
# y_pred_real = (predictions * y_std) + y_mean
# y_true_real = (actuals * y_std) + y_mean

# real_mae = mean_absolute_error(y_true_real, y_pred_real)
# print(f"원래 단위(Gcal) 기준 MAE: {real_mae:.4f}")

# (데이터 개수, 전체 변수 개수) 모양의 0으로 된 빈 행렬 생성
dummy_pred = np.zeros((len(predictions), scaler.n_features_in_))
dummy_pred_2 = np.zeros((len(predictions), scaler.n_features_in_))

# 0번째 열에만 예측값 집어넣기
dummy_pred[:, 0] = predictions
dummy_pred_2[:, 0] = actuals

# 전체를 역변환한 뒤, 다시 0번째 열만 뽑아내기
y_pred_real = scaler.inverse_transform(dummy_pred)[:, 0]
y_true_real = scaler.inverse_transform(dummy_pred_2)[:, 0]

real_mae = mean_absolute_error(y_true_real, y_pred_real)
print(f"원래 단위(Gcal) 기준 MAE: {real_mae:.4f}")

원래 단위(Gcal) 기준 MAE: 42.5672
