센서 컬럼 자동 탐색
통계 기반 이상치(outlier) 플래그
Z-score 정규화
슬라이딩 윈도우 + 라벨
시계열 분할
WeightedRandomSampler 기반 배치 불균형 보정
LSTM 분류기 학습
PR-Curve 기반 임계치 결정 → 최종 평가

하이퍼파라미터(W, S, hidden_dim, batch_size 등)만 조정

In [27]:
# 전체 파이프라인 코드 (지정 센서 컬럼 + SMOTE 오버샘플링 + RandomUnderSampler)

import pandas as pd
import numpy as np
from scipy import stats
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, classification_report, average_precision_score
from scipy.special import expit
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# 1. 데이터 로드 및 정렬
df = pd.read_csv('../train2.csv', sep=';', parse_dates=['time'])
df = df.sort_values('time').reset_index(drop=True)

# 2. 사용할 센서 컬럼 명시
sensor_cols = [
    'P1_LCV01D', 'P1_B2016', 'P1_PIT01',
    'P3_LCV01D', 'P2_SIT01', 'P1_LIT01',
    'P1_FT03',   'P1_PCV01D'
]

# 3. 통계 기반 이상치 플래그 (|z| ≥ 3)
zscores = df[sensor_cols].apply(stats.zscore)
for col in sensor_cols:
    df[f'{col}_outlier'] = (zscores[col].abs() >= 3).astype(float)

# 4. Z-score 정규화
for col in sensor_cols:
    μ, σ = df[col].mean(), df[col].std()
    df[f'{col}_norm'] = (df[col] - μ) / σ

# 5. 슬라이딩 윈도우 생성 및 노이즈 윈도우 제거
W, S = 100, 10
feat_cols = [f'{c}_norm'    for c in sensor_cols] + \
            [f'{c}_outlier' for c in sensor_cols]

windows, labels = [], []
L = len(df)
for i in range(0, L - W + 1, S):
    win = df.iloc[i:i+W]
    # 노이즈 윈도우: attack==0 이면서 any outlier True
    if (win['attack'] == 0).all() and win[[f'{c}_outlier' for c in sensor_cols]].any(axis=None):
        continue
    windows.append(win[feat_cols].values.astype(np.float32))
    labels.append(int(win['attack'].any()))

X = np.stack(windows)                # (n_windows, W, 2*len(sensor_cols))
y = np.array(labels, dtype=np.int64) # 0=정상, 1=실제 이상

# 6. Stratified train/val/test 분할 (70/15/15)
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=42
)
val_frac = 0.15 / 0.85
X_train_raw, X_val, y_train_raw, y_val = train_test_split(
    X_temp, y_temp, test_size=val_frac, stratify=y_temp, random_state=42
)

# 7. SMOTE + RandomUnderSampler 파이프라인
n_train, Wc, Fc = X_train_raw.shape
X_flat = X_train_raw.reshape(n_train, Wc * Fc)
imb_pipeline = ImbPipeline([
    ('smote', SMOTE(sampling_strategy=0.5, random_state=42)),
    ('under', RandomUnderSampler(sampling_strategy=0.6, random_state=42))
])
X_res, y_res = imb_pipeline.fit_resample(X_flat, y_train_raw)
X_train = X_res.reshape(-1, Wc, Fc)
y_train = y_res.astype(np.int64)

# 8. PyTorch Dataset & DataLoader
class SensorDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X)
        self.y = torch.from_numpy(y)
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_loader = DataLoader(SensorDataset(X_train, y_train), batch_size=64, shuffle=True)
val_loader   = DataLoader(SensorDataset(X_val,   y_val),   batch_size=64, shuffle=False)
test_loader  = DataLoader(SensorDataset(X_test,  y_test),  batch_size=64, shuffle=False)

# 9. LSTM 분류기 정의
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers,
                            batch_first=True, bidirectional=True)
        self.fc   = nn.Linear(hidden_dim*2, 1)
    def forward(self, x):
        _, (h_n, _) = self.lstm(x)
        h = torch.cat([h_n[-2], h_n[-1]], dim=1)
        return self.fc(h).squeeze(1)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model  = LSTMClassifier(len(feat_cols)).to(device)

# 10. Loss & Optimizer 설정
counts = np.bincount(y_train, minlength=2)
pos_weight = torch.tensor([counts[0]/counts[1]], device=device) if counts[1]>0 else torch.tensor([1.0], device=device)
criterion  = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer  = torch.optim.Adam(model.parameters(), lr=1e-3)

# 11. 학습 루프
for epoch in range(1, 21):
    model.train()
    total_loss = 0.0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.float().to(device)
        logits = model(xb)
        loss   = criterion(logits, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * xb.size(0)
    print(f'Epoch {epoch:02d}  Loss: {total_loss/len(train_loader.dataset):.6f}')

# 12. Validation → Threshold Tuning
model.eval()
val_logits = []
with torch.no_grad():
    for xb, _ in val_loader:
        xb = xb.to(device)
        val_logits.extend(model(xb).cpu().numpy())

val_logits = np.array(val_logits)
mask = np.isfinite(val_logits)
val_probs = expit(val_logits[mask])
y_val_filt = y_val[mask]
prec, rec, th = precision_recall_curve(y_val_filt, val_probs)
f1_scores = 2*prec*rec/(prec+rec+1e-8)
best_thresh = th[np.nanargmax(f1_scores)]
print(f'Best threshold = {best_thresh:.4f}')

# 13. Test 평가
test_probs, y_pred = [], []
with torch.no_grad():
    for xb, _ in test_loader:
        xb = xb.to(device)
        logits = model(xb).cpu().numpy()
        probs = expit(logits)
        test_probs.extend(probs)
        y_pred.extend((probs >= best_thresh).astype(int))

test_probs = np.array(test_probs)
print(classification_report(y_test, y_pred, digits=4))
print("Test PR-AUC:", average_precision_score(y_test, test_probs))


Epoch 01  Loss: 0.082406
Epoch 02  Loss: 0.007162
Epoch 03  Loss: 0.018146
Epoch 04  Loss: 0.013875
Epoch 05  Loss: 0.012732
Epoch 06  Loss: 0.004660
Epoch 07  Loss: 0.003619
Epoch 08  Loss: 0.000752
Epoch 09  Loss: 0.000101
Epoch 10  Loss: 0.000051
Epoch 11  Loss: 0.000032
Epoch 12  Loss: 0.000023
Epoch 13  Loss: 0.000018
Epoch 14  Loss: 0.000015
Epoch 15  Loss: 0.000013
Epoch 16  Loss: 0.000010
Epoch 17  Loss: 0.000009
Epoch 18  Loss: 0.000008
Epoch 19  Loss: 0.000007
Epoch 20  Loss: 0.000006
Best threshold = 1.0000
              precision    recall  f1-score   support

           0     0.9994    1.0000    0.9997      1743
           1     1.0000    0.9333    0.9655        15

    accuracy                         0.9994      1758
   macro avg     0.9997    0.9667    0.9826      1758
weighted avg     0.9994    0.9994    0.9994      1758

Test PR-AUC: 0.9958333333333333


추가로 5-fold함

In [28]:
import pandas as pd
import numpy as np
from scipy import stats
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import recall_score, classification_report
from scipy.special import expit
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# 1. 데이터 로드 및 정렬
df = pd.read_csv('../train2.csv', sep=';', parse_dates=['time'])
df = df.sort_values('time').reset_index(drop=True)

# 2. 사용할 센서 컬럼 명시
sensor_cols = [
    'P1_LCV01D','P1_B2016','P1_PIT01',
    'P3_LCV01D','P2_SIT01','P1_LIT01',
    'P1_FT03','P1_PCV01D'
]

# 3. 이상치 플래그 및 정규화
zscores = df[sensor_cols].apply(stats.zscore)
for c in sensor_cols:
    df[f'{c}_outlier'] = (zscores[c].abs() >= 3).astype(float)
    mu, sigma = df[c].mean(), df[c].std()
    df[f'{c}_norm'] = (df[c] - mu) / sigma

# 4. 슬라이딩 윈도우 생성 및 노이즈 제거
W, S = 100, 10
feat_cols = [f'{c}_norm' for c in sensor_cols] + [f'{c}_outlier' for c in sensor_cols]
windows, labels = [], []
L = len(df)
for i in range(0, L - W + 1, S):
    win = df.iloc[i:i+W]
    if (win['attack'] == 0).all() and win[[f'{c}_outlier' for c in sensor_cols]].any(axis=None):
        continue
    windows.append(win[feat_cols].values.astype(np.float32))
    labels.append(int(win['attack'].any()))

X = np.stack(windows)
y = np.array(labels, dtype=int)

# 5. Test 분리 (15%)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=42)

# 6. 5-Fold Stratified CV
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_recalls = []

# LSTM 모델 정의
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, 1)
    def forward(self, x):
        _, (h, _) = self.lstm(x)
        h_cat = torch.cat([h[-2], h[-1]], dim=1)
        return self.fc(h_cat).squeeze(1)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
for fold, (train_idx, val_idx) in enumerate(skf.split(X_temp, y_temp), 1):
    X_tr, y_tr = X_temp[train_idx], y_temp[train_idx]
    X_val, y_val = X_temp[val_idx], y_temp[val_idx]

    # SMOTE + RandomUnderSampler
    n, w, f = X_tr.shape
    X_flat = X_tr.reshape(n, w * f)
    imb = ImbPipeline([
        ('smote', SMOTE(sampling_strategy=0.5, random_state=42)),
        ('under', RandomUnderSampler(sampling_strategy=0.6, random_state=42))
    ])
    X_res, y_res = imb.fit_resample(X_flat, y_tr)
    X_train = X_res.reshape(-1, w, f)
    y_train = y_res.astype(int)

    # Dataset & DataLoader
    class SSData(Dataset):
        def __init__(self, X, y):
            self.X = torch.from_numpy(X)
            self.y = torch.from_numpy(y)
        def __len__(self):
            return len(self.y)
        def __getitem__(self, idx):
            return self.X[idx], self.y[idx]

    train_loader = DataLoader(SSData(X_train, y_train), batch_size=64, shuffle=True)
    val_loader = DataLoader(SSData(X_val, y_val), batch_size=64, shuffle=False)

    # Model, Loss, Optimizer
    model = LSTMClassifier(len(feat_cols)).to(device)
    counts = np.bincount(y_train, minlength=2)
    pw = counts[0] / counts[1] if counts[1] > 0 else 1.0
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pw], device=device))
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    # Training
    for epoch in range(1, 11):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.float().to(device)
            logits = model(xb)
            loss = criterion(logits, yb)
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
        #     total_loss += loss.item() * xb.size(0)

        # # 배치 루프가 끝난 후, 여기서 한 번만 출력
        # avg_loss = total_loss / len(train_loader.dataset)
        # print(f'Epoch {epoch:02d}  Loss: {avg_loss:.6f}')

    # Validation (Threshold 0.5)
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            probs = expit(model(xb.to(device)).cpu().numpy())
            preds = (probs >= 0.5).astype(int)
            y_true.extend(yb.numpy())
            y_pred.extend(preds)
    r = recall_score(y_true, y_pred)
    cv_recalls.append(r)
    print(f'Fold {fold} Recall: {r:.4f}')

print(f'CV Recall: {np.mean(cv_recalls):.4f} ± {np.std(cv_recalls):.4f}')

# 7. Final Test Evaluation (Threshold 0.5)
SS_dataset = SSData(X_test, y_test)
test_loader = DataLoader(SS_dataset, batch_size=64, shuffle=False)
y_pred = []
with torch.no_grad():
    for xb, _ in test_loader:
        probs = expit(model(xb.to(device)).cpu().numpy())
        y_pred.extend((probs >= 0.5).astype(int))
 
print(classification_report(y_test, y_pred, digits=4))

# 학습 루프 끝난 뒤
torch.save(scalers, 'scalers.pth')
torch.save(model.state_dict(), 'model_final.pth')
print("Saved scalers.pth and model_final.pth")



cuda
Fold 1 Recall: 1.0000
Fold 2 Recall: 1.0000
Fold 3 Recall: 0.9375
Fold 4 Recall: 1.0000
Fold 5 Recall: 0.9412
CV Recall: 0.9757 ± 0.0297
              precision    recall  f1-score   support

           0     1.0000    0.9994    0.9997      1743
           1     0.9375    1.0000    0.9677        15

    accuracy                         0.9994      1758
   macro avg     0.9688    0.9997    0.9837      1758
weighted avg     0.9995    0.9994    0.9994      1758

Saved scalers.pth and model_final.pth


In [None]:
import pandas as pd
import numpy as np
import torch
from scipy import stats
from torch.utils.data import Dataset, DataLoader
from scipy.special import expit

# 1. 학습 때 사용한 센서+윈도우 파라미터
sensor_cols = [
    'P1_LCV01D','P1_B2016','P1_PIT01',
    'P3_LCV01D','P2_SIT01','P1_LIT01',
    'P1_FT03','P1_PCV01D'
]
W, S = 100, 10
feat_cols = [f'{c}_norm' for c in sensor_cols] + [f'{c}_outlier' for c in sensor_cols]
THRESH = 0.5  # 이상 판단 임계치

# 2. 스케일러와 모델 불러오기
scalers = torch.load('scalers.pth', weights_only=False)    # {sensor: (mu, sigma), ...}
model = LSTMClassifier(len(feat_cols), hidden_dim=128, num_layers=2)
model.load_state_dict(torch.load('model_final.pth'))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device).eval()

# 3. test1.csv 전처리
df = pd.read_csv('../test1.csv', sep=';', parse_dates=['time']) \
       .sort_values('time') \
       .reset_index(drop=True)

# 이상치 플래그 & 정규화
zscores = df[sensor_cols].apply(stats.zscore)
for c in sensor_cols:
    df[f'{c}_outlier'] = (zscores[c].abs() >= 3).astype(float)
    mu, sigma = scalers[c]
    df[f'{c}_norm'] = (df[c] - mu) / sigma

# 4. 슬라이딩 윈도우 Dataset
class TestDataset(Dataset):
    def __init__(self, df, feat_cols, W, S):
        windows, times = [], []
        L = len(df)
        for i in range(0, L - W + 1, S):
            win = df.iloc[i:i+W]
            windows.append(win[feat_cols].values.astype(np.float32))
            # Timestamp → ISO string 변환
            times.append(win['time'].iloc[-1].isoformat())
        self.X = torch.from_numpy(np.stack(windows))  # (num_windows, W, num_features)
        self.times = times

    def __len__(self):
        return len(self.times)

    def __getitem__(self, idx):
        return self.X[idx], self.times[idx]

test_ds = TestDataset(df, feat_cols, W, S)
test_loader = DataLoader(test_ds, batch_size=64, shuffle=False)

# 5. 추론
results = []
with torch.no_grad():
    for xb, times in test_loader:
        xb = xb.to(device)
        logits = model(xb).cpu().numpy()
        probs  = expit(logits).squeeze(-1)   # shape: (batch,)
        preds  = (probs >= THRESH).astype(int)
        for t, p, pr in zip(times, preds, probs):
            results.append({
                'time':       t,
                'anomaly':    int(p),
                'probability': float(pr)
            })

# 6. 결과 저장
df_out = pd.DataFrame(results)
df_out.to_csv('test1_predictions.csv', index=False)
print(df_out.head())


                  time  anomaly  probability
0  2019-10-29 11:01:39        0     0.000130
1  2019-10-29 11:01:49        0     0.000143
2  2019-10-29 11:01:59        0     0.000230
3  2019-10-29 11:02:09        0     0.000119
4  2019-10-29 11:02:19        0     0.000115
