In [None]:
!pip install catboost

In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

# -------------------------------------------------
# 0. Split sizes (in rows)
# -------------------------------------------------
TRAIN_ROWS = 35064     # leading rows of the file used for training / validation
WARMUP_ROWS = 168      # leading rows dropped from that span (presumably incomplete lag features -- confirm)
HOLDOUT_ROWS = 8760    # trailing rows of the file kept aside for the final evaluation

scaler = StandardScaler()


# -------------------------------------------------
# 1. Read the CSV and carve out the training span and the holdout span
# -------------------------------------------------
file_path = "/content/final_heatmap_lag.csv"
sd = pd.read_csv(file_path)

# head() and tail() are taken from the same frame; if the file is too short the
# two spans share rows and the holdout score is optimistic.
if len(sd) < TRAIN_ROWS + HOLDOUT_ROWS:
    print(
        f"Warning: file has {len(sd)} rows, fewer than {TRAIN_ROWS + HOLDOUT_ROWS}; "
        "training rows and holdout rows overlap."
    )

# First TRAIN_ROWS rows of the file ...
df = sd.head(TRAIN_ROWS).reset_index(drop=True)
# ... minus the first WARMUP_ROWS rows.
df = df.iloc[WARMUP_ROWS:].reset_index(drop=True)

# Holdout: the last HOLDOUT_ROWS rows of the file (note: `sd` is rebound to this slice).
sd = sd.tail(HOLDOUT_ROWS)
y_t = sd.iloc[:, 0].values
X_t = sd.iloc[:, 2:].values

# -------------------------------------------------
# 2. Column 0 is the target y; columns 2.. are the features X (column 1 is skipped)
# -------------------------------------------------
y = df.iloc[:, 0].values
X = df.iloc[:, 2:].values


print("X shape:", X.shape)
print("y shape:", y.shape)

# -------------------------------------------------
# 3. Train / validation split (the first quarter is the validation part)
# -------------------------------------------------
# NOTE: X_test / y_test are only used as the early-stopping eval set below;
# the final score is computed on the holdout X_t / y_t.
test_size = len(df) // 4

X_train, X_test = X[test_size:], X[:test_size]
y_train, y_test = y[test_size:], y[:test_size]

# Fit the scaler on the training part only, then apply it to both parts.
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# -------------------------------------------------
# 4. CatBoost model (MAE objective + early stopping)
# -------------------------------------------------
model = CatBoostRegressor(
    iterations=2000,
    learning_rate=0.03,
    depth=8,

    # Optimise and monitor MAE
    loss_function='MAE',
    eval_metric='MAE',

    random_seed=42,
    l2_leaf_reg=3,
    subsample=0.8,
    bootstrap_type='Bernoulli',

    # Early stopping
    od_type='Iter',
    od_wait=200,
    verbose=200
)

# -------------------------------------------------
# 5. Fit
# -------------------------------------------------
train_pool = Pool(X_train, y_train)
test_pool = Pool(X_test, y_test)

model.fit(
    train_pool,
    eval_set=test_pool,
    use_best_model=True
)

# -------------------------------------------------
# 6. Predict on the holdout rows
# -------------------------------------------------
X_t = scaler.transform(X_t)
y_pred = model.predict(X_t)

print("Sample predictions:", y_pred[:5])

mae = mean_absolute_error(y_t, y_pred)
# The holdout is a fixed number of trailing rows, not a percentage of the data.
print(f"Holdout MAE (last {HOLDOUT_ROWS} rows):", mae)

model.save_model("/content/catboost_model.cbm")

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation

# Imported here so this cell also runs on a fresh kernel, without relying on
# names leaked from the CatBoost cell.
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

# -------------------------------------------------
# 0. Split sizes (in rows)
# -------------------------------------------------
TRAIN_ROWS = 35064     # leading rows of the file used for training / validation
WARMUP_ROWS = 168      # leading rows dropped from that span (presumably incomplete lag features -- confirm)
HOLDOUT_ROWS = 8760    # trailing rows of the file kept aside for the final evaluation

# -------------------------------------------------
# 1. Read the CSV and carve out the training span and the holdout span
# -------------------------------------------------
file_path = "/content/final_dataset_complete.csv"
sd = pd.read_csv(file_path)

# head() and tail() are taken from the same frame; if the file is too short the
# two spans share rows and the holdout score is optimistic.
if len(sd) < TRAIN_ROWS + HOLDOUT_ROWS:
    print(
        f"Warning: file has {len(sd)} rows, fewer than {TRAIN_ROWS + HOLDOUT_ROWS}; "
        "training rows and holdout rows overlap."
    )

df = sd.head(TRAIN_ROWS).reset_index(drop=True)
df = df.iloc[WARMUP_ROWS:].reset_index(drop=True)

# Holdout: the last HOLDOUT_ROWS rows of the file (note: `sd` is rebound to this slice).
sd = sd.tail(HOLDOUT_ROWS)
y_t = sd.iloc[:, 0].values
X_t = sd.iloc[:, 2:].values

# -------------------------------------------------
# 2. Column 0 is the target y; columns 2.. are the features X (column 1 is skipped)
# -------------------------------------------------
y = df.iloc[:, 0].values
X = df.iloc[:, 2:].values

# -------------------------------------------------
# 3. Train / validation split (the first quarter is the validation part)
# -------------------------------------------------
# NOTE: X_test / y_test are only used for early stopping below;
# the final score is computed on the holdout X_t / y_t.
test_size = len(df) // 4
X_train, X_test = X[test_size:], X[:test_size]
y_train, y_test = y[test_size:], y[:test_size]

# Fit the scaler on the training part only, then apply it to both parts.
scaler2 = StandardScaler()
scaler2.fit(X_train)

X_train = scaler2.transform(X_train)
X_test = scaler2.transform(X_test)

# -------------------------------------------------
# 4. Build the LightGBM datasets
# -------------------------------------------------
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test)

# -------------------------------------------------
# 5. Parameters
# -------------------------------------------------
params = {
    "objective": "regression_l1",   # train on MAE
    "metric": "l1",
    "learning_rate": 0.03,
    "num_leaves": 64,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "lambda_l2": 2.0,
    "random_state": 42,
}

# -------------------------------------------------
# 6. Train (callback-based early stopping)
# -------------------------------------------------
model = lgb.train(
    params,
    train_data,
    num_boost_round=5000,
    valid_sets=[valid_data],

    callbacks=[
        early_stopping(stopping_rounds=200),  # stop after 200 rounds without improvement
        log_evaluation(200)                   # log every 200 iterations
    ]
)

# -------------------------------------------------
# 7. Predict on the holdout rows
# -------------------------------------------------
# Use scaler2, the scaler fitted on THIS dataset's training rows. The previous
# code used `scaler`, which belongs to the CatBoost cell and was fitted on a
# different CSV.
X_t = scaler2.transform(X_t)
y_pred = model.predict(X_t)

print("Sample predictions:", y_pred[:5])

mae = mean_absolute_error(y_t, y_pred)
# The holdout is a fixed number of trailing rows, not a percentage of the data.
print(f"Holdout MAE (last {HOLDOUT_ROWS} rows):", mae)

# NOTE(review): ".cbm" is the extension used for the CatBoost model above; the
# path is kept unchanged so anything that already loads it keeps working, but
# the file holds a LightGBM model.
model.save_model("/content/lightgbm_model.cbm")

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import joblib

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# =========================
# Configuration
# =========================

# --- Files ---
DATA_CSV = "/content/final_dataset_complete.csv"  # replace with the name of the combined CSV file
MODEL_PATH = "/content/best_cnn_lstm.pt"
SCALER_PATH = "/content/scaler.pkl"

# --- Data layout ---
WINDOW_SIZE = 168
SCALER_FIT_END = 34_896      # 35064 - 168: the scaler is fitted on rows [0, SCALER_FIT_END)
NUM_TEST_SAMPLES = 8_760     # the last 8760 samples form the test set

# --- Training ---
BATCH_SIZE = 64
NUM_EPOCHS = 400
LEARNING_RATE = 0.001
PATIENCE = 20                # early stopping patience
MIN_DELTA = 0.0001           # minimum improvement that counts as progress

# --- Device ---
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


# =========================
# Dataset / Model / EarlyStopping 정의
# =========================
class TimeSeriesDataset(Dataset):
    """In-memory dataset that pairs each input sample with its target.

    Both inputs are copied into float32 tensors at construction time; the
    targets gain a trailing axis of size 1 (e.g. (N,) becomes (N, 1)).
    """

    def __init__(self, X, y):
        inputs = torch.tensor(X, dtype=torch.float32)
        targets = torch.tensor(y, dtype=torch.float32)
        self.X = inputs
        self.y = targets.unsqueeze(dim=-1)

    def __len__(self):
        # Number of samples = length of the leading axis of the inputs.
        return len(self.X)

    def __getitem__(self, idx):
        sample = (self.X[idx], self.y[idx])
        return sample


class CNNLSTM(nn.Module):
    """Conv1d -> BatchNorm -> ReLU -> MaxPool front end feeding an LSTM and a linear head.

    Input is (batch, seq_len, num_features); output is (batch, 1), computed
    from the final hidden state of the last LSTM layer.
    """

    def __init__(self, num_features, cnn_channels=64, lstm_hidden=64,
                 lstm_layers=1, dropout=0.2):
        super().__init__()

        # Convolution over the time axis; padding=1 with kernel 3 keeps seq_len.
        self.conv1 = nn.Conv1d(num_features, cnn_channels, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(cnn_channels)
        self.pool = nn.MaxPool1d(2)

        # The LSTM consumes one conv-channel vector per (pooled) time step.
        self.lstm = nn.LSTM(
            cnn_channels,
            lstm_hidden,
            num_layers=lstm_layers,
            batch_first=True,
        )

        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(lstm_hidden, 1)

    def forward(self, x):
        # Conv1d expects channels first: (batch, seq_len, F) -> (batch, F, seq_len)
        feats = torch.transpose(x, 1, 2)

        # (batch, cnn_channels, seq_len) -> pooled along time with kernel 2
        feats = self.pool(torch.relu(self.bn1(self.conv1(feats))))

        # Back to (batch, seq_len', cnn_channels) for the batch_first LSTM
        feats = torch.transpose(feats, 1, 2)

        _, (hidden, _) = self.lstm(feats)

        # hidden[-1]: final hidden state of the top LSTM layer, (batch, lstm_hidden)
        return self.fc(self.dropout(hidden[-1]))


class EarlyStopping:
    """Checkpoint the model on validation improvement and signal when to stop.

    A step counts as an improvement only when the loss drops below the best
    loss seen so far by more than `min_delta`. Each improvement saves the
    model's state_dict to MODEL_PATH and resets the counter.
    """

    def __init__(self, patience=PATIENCE, min_delta=MIN_DELTA):
        self.patience = patience
        self.min_delta = min_delta
        self.best_loss = np.inf
        self.counter = 0

    def step(self, val_loss, model):
        """Record one validation loss; return True when training should stop."""
        threshold = self.best_loss - self.min_delta

        if not (val_loss < threshold):
            # Includes the NaN case: a NaN loss never compares as smaller.
            self.counter += 1
            print(f"  -> No improvement. EarlyStopping counter = {self.counter}/{self.patience}")
            return self.counter >= self.patience

        self.best_loss = val_loss
        self.counter = 0
        torch.save(model.state_dict(), MODEL_PATH)
        print(f"  -> Validation loss improved. Model saved to {MODEL_PATH}")
        return self.counter >= self.patience


# =========================
# 윈도우 생성 함수
# =========================
def make_sequences(X_matrix, y_array, window_size):
    """Build overlapping sliding windows over the rows of `X_matrix`.

    Window k covers rows k .. k + window_size - 1 and is paired with the
    target at the window's last row, y_array[k + window_size - 1].

    Args:
        X_matrix: array of shape (M, num_features).
        y_array: targets aligned with the rows of X_matrix; at least M long.
        window_size: number of consecutive rows per window (e.g. 168).

    Returns:
        X_seq: (M - window_size + 1, window_size, num_features), a fresh array.
        y_seq: (M - window_size + 1,), a fresh array.

    Raises:
        ValueError: if window_size < 1, if X_matrix has fewer than window_size
            rows, or if y_array is shorter than X_matrix.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}.")

    M = X_matrix.shape[0]
    if M < window_size:
        raise ValueError(
            f"Need at least window_size={window_size} rows to build one window, got {M}."
        )

    targets = np.asarray(y_array)
    if targets.shape[0] < M:
        raise ValueError(
            f"y_array has {targets.shape[0]} entries but X_matrix has {M} rows."
        )

    num_samples = M - window_size + 1

    # row_idx[k, j] = k + j: the j-th row of window k. Indexing with this
    # integer array gathers all windows in one vectorised copy instead of a
    # Python loop over every end position.
    row_idx = np.arange(num_samples)[:, None] + np.arange(window_size)[None, :]
    X_seq = X_matrix[row_idx]

    # Target of window k is the value at its last row.
    y_seq = targets[window_size - 1:M].copy()
    return X_seq, y_seq


# =========================
# 메인
# =========================
def _train_one_epoch(model, loader, criterion, optimizer):
    """Run one optimisation pass over `loader`; return the sample-weighted mean loss."""
    model.train()
    loss_sum, n_seen = 0.0, 0

    for X_batch, y_batch in loader:
        X_batch = X_batch.to(DEVICE)
        y_batch = y_batch.to(DEVICE)

        optimizer.zero_grad()
        loss = criterion(model(X_batch), y_batch)
        loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller final batch does not skew the mean.
        n_batch = y_batch.shape[0]
        loss_sum += loss.item() * n_batch
        n_seen += n_batch

    return loss_sum / n_seen if n_seen else float("nan")


def _evaluate(model, loader, criterion):
    """Return the sample-weighted mean of `criterion` over `loader`, without gradients."""
    model.eval()
    loss_sum, n_seen = 0.0, 0

    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch = X_batch.to(DEVICE)
            y_batch = y_batch.to(DEVICE)

            n_batch = y_batch.shape[0]
            loss_sum += criterion(model(X_batch), y_batch).item() * n_batch
            n_seen += n_batch

    return loss_sum / n_seen if n_seen else float("nan")


def main():
    """Train the CNN-LSTM on windowed, standardised data and print the test MSE.

    Pipeline: read DATA_CSV and drop its second column; fit a StandardScaler on
    the first SCALER_FIT_END rows and apply it to every row (saved to
    SCALER_PATH); build one-step-ahead inputs/targets; cut them into windows of
    WINDOW_SIZE rows; hold out the last NUM_TEST_SAMPLES windows as the test
    set and the first quarter of the remainder as the validation set; train
    with early stopping; evaluate the best checkpoint on the test set.

    All losses are computed on the standardised target, so the reported MSE is
    in scaled units.

    Raises:
        ValueError: if the file has fewer rows than SCALER_FIT_END, or if
            windowing yields no more than NUM_TEST_SAMPLES samples.
    """
    # ---------- 1. Load data ----------
    df = pd.read_csv(DATA_CSV)
    col_to_drop = df.columns[1]

    # The second column is used neither as a feature nor as the target.
    df = df.drop(columns=col_to_drop)
    print("Original data shape:", df.shape)  # (N, num_cols)
    num_rows = df.shape[0]

    if SCALER_FIT_END > num_rows:
        raise ValueError(f"SCALER_FIT_END({SCALER_FIT_END})가 전체 행 개수({num_rows})보다 큼.")

    # ---------- 2. StandardScaler fit & transform ----------
    # Fit on rows [0, SCALER_FIT_END) only, then scale every row with those statistics.
    fit_df = df.iloc[:SCALER_FIT_END]
    scaler = StandardScaler()
    scaler.fit(fit_df.values)

    scaled_values = scaler.transform(df.values)            # (N, num_cols)
    joblib.dump(scaler, SCALER_PATH)
    print(f"Scaler fitted on first {SCALER_FIT_END} rows and saved to {SCALER_PATH}")

    # ---------- 3. Targets ----------
    # y_all[i] is the (scaled) first column at row i + 1.
    y_all = scaled_values[1:, 0]       # (N-1,)
    M = y_all.shape[0]

    # ---------- 4. Input matrix ----------
    # Column 0 is taken one row earlier than the target (row i), so the model
    # sees the previous value of the target column, never the value it predicts.
    col0 = scaled_values[:-1, 0:1]     # (N-1, 1)

    # The remaining columns are taken at the target's own row (row i + 1).
    others = scaled_values[1:, 1:]     # (N-1, num_cols-1)

    X_matrix = np.concatenate([col0, others], axis=1)   # (N-1, num_cols)
    assert X_matrix.shape[0] == M, "X와 y의 길이가 맞지 않습니다."

    print("After shift, X_matrix shape:", X_matrix.shape)  # (N-1, num_cols)
    print("After shift, y_all shape    :", y_all.shape)     # (N-1,)

    # ---------- 5. Sliding windows ----------
    X_seq, y_seq = make_sequences(X_matrix, y_all, WINDOW_SIZE)
    num_samples, _, num_features = X_seq.shape
    print("Sequence X shape:", X_seq.shape)  # (num_samples, 168, num_features)
    print("Sequence y shape:", y_seq.shape)  # (num_samples,)

    if num_samples <= NUM_TEST_SAMPLES:
        raise ValueError(f"윈도우로 만든 샘플 수({num_samples})가 NUM_TEST_SAMPLES({NUM_TEST_SAMPLES})보다 적음.")

    # ---------- 6. Train / validation / test split ----------
    # The last NUM_TEST_SAMPLES windows are the test set.
    X_test = X_seq[-NUM_TEST_SAMPLES:]
    y_test = y_seq[-NUM_TEST_SAMPLES:]

    X_trainval = X_seq[:-NUM_TEST_SAMPLES]
    y_trainval = y_seq[:-NUM_TEST_SAMPLES]

    # The first quarter of the remainder is the validation set.
    n_val = X_trainval.shape[0] // 4

    X_val, y_val = X_trainval[:n_val], y_trainval[:n_val]
    X_train, y_train = X_trainval[n_val:], y_trainval[n_val:]

    print("Train samples:", X_train.shape[0])
    print("Val samples  :", X_val.shape[0])
    print("Test samples :", X_test.shape[0])

    # ---------- 7. Dataset / DataLoader ----------
    # Seed torch so weight initialisation and batch shuffling repeat across runs.
    torch.manual_seed(42)

    train_dataset = TimeSeriesDataset(X_train, y_train)
    val_dataset   = TimeSeriesDataset(X_val, y_val)
    test_dataset  = TimeSeriesDataset(X_test, y_test)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader   = DataLoader(val_dataset,   batch_size=BATCH_SIZE, shuffle=False)
    test_loader  = DataLoader(test_dataset,  batch_size=BATCH_SIZE, shuffle=False)

    # ---------- 8. Model / loss / optimiser ----------
    model = CNNLSTM(num_features=num_features).to(DEVICE)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    early_stopping = EarlyStopping(patience=PATIENCE, min_delta=MIN_DELTA)

    # ---------- 9. Training loop with early stopping ----------
    for epoch in range(1, NUM_EPOCHS + 1):
        avg_train_loss = _train_one_epoch(model, train_loader, criterion, optimizer)
        avg_val_loss = _evaluate(model, val_loader, criterion)
        print(f"[Epoch {epoch:03d}] Train Loss: {avg_train_loss:.6f} | Val Loss: {avg_val_loss:.6f}")

        # step() saves the model whenever the validation loss improves.
        if early_stopping.step(avg_val_loss, model):
            print("Early stopping triggered.")
            break

    # ---------- 10. Load the best checkpoint and report the test MSE ----------
    # best_loss only becomes finite when step() saved a checkpoint, so this
    # guard avoids loading a stale file left at MODEL_PATH by an earlier run.
    saved_this_run = bool(np.isfinite(early_stopping.best_loss))
    if saved_this_run and os.path.exists(MODEL_PATH):
        model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
        print(f"Best model loaded from {MODEL_PATH}")
    else:
        print("Warning: no checkpoint was saved during this run. Using last epoch model.")

    avg_test_loss = _evaluate(model, test_loader, criterion)
    print(f"Final Test MSE: {avg_test_loss:.6f}")


# Run the full pipeline when this code is executed directly rather than imported.
if __name__ == "__main__":
    main()
