In [21]:
import pandas as pd

# CSV 읽기 (세미콜론 구분, time 컬럼을 datetime)
df = pd.read_csv('../train2.csv', sep=';', parse_dates=['time'])
# 시간 오름차순 정렬
df = df.sort_values('time').reset_index(drop=True)

df[df['attack'] == 1]


Unnamed: 0,time,P1_B2004,P1_B2016,P1_B3004,P1_B3005,P1_B4002,P1_B4005,P1_B400B,P1_B4022,P1_FCV01D,...,P4_ST_FD,P4_ST_LD,P4_ST_PO,P4_ST_PS,P4_ST_PT01,P4_ST_TT01,attack,attack_P1,attack_P2,attack_P3
70285,2019-11-02 15:31:25,0.1017,1.191700,401.9155,1111.3052,32.3673,100.0000,2858.8655,35.5922,100.0000,...,-0.00245,266.5835,277.1539,0.0,10026.0,27582.0,1,1,0,0
70286,2019-11-02 15:31:26,0.1017,1.143164,401.9155,1111.3052,32.3673,100.0000,2857.4136,35.5888,100.0000,...,-0.00270,265.8962,276.7289,0.0,10026.0,27584.0,1,1,0,0
70287,2019-11-02 15:31:27,0.1017,1.141243,401.9155,1111.3052,32.3673,100.0000,2859.7896,35.5838,100.0000,...,-0.00140,265.4080,275.0470,0.0,10026.0,27580.0,1,1,0,0
70288,2019-11-02 15:31:28,0.1017,1.087857,401.9155,1111.3052,32.3673,100.0000,2858.2058,35.5640,100.0000,...,0.00030,265.3356,274.2287,0.0,10026.0,27580.0,1,1,0,0
70289,2019-11-02 15:31:29,0.1017,1.089331,401.9155,1111.3052,32.3673,100.0000,2859.5254,35.5682,100.0000,...,0.00090,263.9070,273.4104,0.0,10027.0,27575.0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157069,2019-11-03 15:37:49,0.0997,1.359031,393.2480,1092.0895,32.1553,0.0038,34.9568,35.5473,0.7154,...,-0.00110,318.4136,299.3345,0.0,10053.0,27627.0,1,1,0,0
157070,2019-11-03 15:37:50,0.0997,1.369522,393.2480,1092.0895,32.1553,0.0000,33.9638,35.5717,0.6942,...,-0.00450,320.5657,301.3419,0.0,10053.0,27627.0,1,1,0,0
157071,2019-11-03 15:37:51,0.0997,1.324430,393.2480,1092.0895,32.1553,0.0090,33.9638,35.5885,0.6942,...,-0.00160,322.2837,302.8972,0.0,10053.0,27627.0,1,1,0,0
157072,2019-11-03 15:37:52,0.0997,1.336914,393.2480,1092.0895,32.1553,0.0191,33.9638,35.6186,0.7195,...,0.00030,323.3868,305.0492,0.0,10053.0,27627.0,1,1,0,0


In [12]:
sensor_ranges = {
    'P1_LCV01D': (0.0, 100.0),
    'P1_B2016':  (0.0,  10.0),
    'P1_PIT01':  (0.0,  10.0),
    # 'P3_LCP01':(,),
    'P3_LCV01D':(0, 27648),
    'P2_SIT01':(0, 3200),
    'P1_LIT01':(0, 720),
    'P1_FT03':(0, 2500),
    'P1_PCV01D':( 0, 100)
}

# ① 이상치 플래그 생성 (attack==0 구간에서만 범위 벗어나면 True)
for s, (mn, mx) in sensor_ranges.items():
    df[f'{s}_outlier'] = (
        (~df[s].between(mn, mx))
        & (df['attack'] == 0)
    )

# ② 라벨: 윈도우 생성 전에 attack 컬럼 그대로 사용하셔도 되고,
#    윈도우 단위 라벨링 시 “윈도우 안 outlier 하나라도 True → 이상(1)” 로 재정의


In [13]:
scalers = {}
for s in sensor_ranges:
    μ, σ = df[s].mean(), df[s].std()
    df[f'{s}_norm'] = (df[s] - μ) / σ
    scalers[s] = (μ, σ)


In [14]:
import numpy as np

W, S = 100, 10
feat_cols    = [f'{s}_norm'    for s in sensor_ranges] \
             + [f'{s}_outlier' for s in sensor_ranges]

# 4. 슬라이딩 윈도우 & 라벨 생성 (이상치 제거 추가)
windows, labels = [], []
L = len(df)
for i in range(0, L - W + 1, S):
    win_df = df.iloc[i:i+W]
    # ① 윈도우 내 'attack==0 이면서 센서 outlier==True' 가 하나라도 있으면 스킵
    #    즉, noise로 정의한 이상치가 섞여 있는 윈도우는 건너뛴다
    if win_df[[f'{s}_outlier' for s in sensor_ranges]].any(axis=None):
        continue

    # ② 여기는 정상 윈도우와 실제 attack(1) 윈도우만 남음
    windows.append(win_df[feat_cols].values)
    # ③ 라벨은 원래 attack 컬럼 기준 (0 또는 1)
    labels.append(int(win_df['attack'].any()))

X = np.stack(windows).astype(np.float32)
y = np.array(labels, dtype=np.int64)


In [15]:
n = len(X)
i1, i2 = int(n*0.7), int(n*0.85)
X_train, y_train = X[:i1], y[:i1]
X_val,   y_val   = X[i1:i2], y[i1:i2]
X_test,  y_test  = X[i2:],   y[i2:]

In [16]:
import torch
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

class SensorWindowDataset(Dataset):
    def __init__(self, X, y):
        self.X, self.y = torch.from_numpy(X), torch.from_numpy(y)
    def __len__(self):
        return len(self.y)
    def __getitem__(self, i):
        return self.X[i], self.y[i]

train_ds = SensorWindowDataset(X_train, y_train)

# 클래스별 inverse frequency weight
counts = np.bincount(y_train)
weight_per_class = {i: 1.0/count for i, count in enumerate(counts)}
sample_weights = [weight_per_class[int(l)] for l in y_train]

sampler = WeightedRandomSampler(sample_weights,
                                num_samples=len(sample_weights),
                                replacement=True)

train_loader = DataLoader(train_ds,
                          batch_size=64,
                          sampler=sampler)
val_loader   = DataLoader(SensorWindowDataset(X_val, y_val),
                          batch_size=64, shuffle=False)


In [17]:
import torch.nn as nn

class LSTMClassifier(nn.Module):
    def __init__(self, in_dim, hid_dim=128, num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(in_dim, hid_dim, num_layers,
                            batch_first=True, bidirectional=True)
        self.fc   = nn.Linear(hid_dim*2, 1)  # 이진 분류
    def forward(self, x):
        _, (h_n, _) = self.lstm(x)  # h_n: (num_layers*2, B, hid_dim)
        h = torch.cat([h_n[-2], h_n[-1]], dim=1)  # 마지막 layer 양방향
        return self.fc(h).squeeze(1)  # 로짓 반환


In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMClassifier(X.shape[2]).to(device)
device

device(type='cuda')

In [19]:
# pos_weight = N_neg/N_pos
pos_weight = torch.tensor([counts[0]/counts[1]], device=device)
criterion  = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer  = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(1, 21):
    model.train()
    total_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.float().to(device)
        logits = model(xb)
        loss   = criterion(logits, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * xb.size(0)
    print(f'Epoch {epoch}, Loss={total_loss/len(train_ds):.4f}')

Epoch 1, Loss=2.0475
Epoch 2, Loss=0.1860
Epoch 3, Loss=0.1525
Epoch 4, Loss=0.1199
Epoch 5, Loss=0.0356
Epoch 6, Loss=0.0296
Epoch 7, Loss=0.0718
Epoch 8, Loss=0.0118
Epoch 9, Loss=0.0602
Epoch 10, Loss=0.0454
Epoch 11, Loss=0.0219
Epoch 12, Loss=0.0072
Epoch 13, Loss=0.0092
Epoch 14, Loss=0.0068
Epoch 15, Loss=0.0420
Epoch 16, Loss=0.0171
Epoch 17, Loss=0.0502
Epoch 18, Loss=0.0106
Epoch 19, Loss=0.0079
Epoch 20, Loss=0.0073


In [20]:
import numpy as np
from sklearn.metrics import precision_recall_curve, f1_score

# ① Val set에서 확률(p = sigmoid(logit)) 수집
model.eval()
probs = []
with torch.no_grad():
    for xb, _ in val_loader:
        logits = model(xb.to(device))
        probs.extend(torch.sigmoid(logits).cpu().numpy())

# ② PR-curve & 최적 F1 threshold 찾기
prec, rec, th = precision_recall_curve(y_val, probs)
f1_scores = 2*prec*rec/(prec+rec)
best_t = th[np.nanargmax(f1_scores)]

# ③ Test 평가
test_ds = SensorWindowDataset(X_test, y_test)
test_loader = DataLoader(test_ds, batch_size=64, shuffle=False)
y_pred = []
with torch.no_grad():
    for xb, _ in test_loader:
        prob = torch.sigmoid(model(xb.to(device))).cpu().numpy()
        y_pred.extend((prob >= best_t).astype(int))

from sklearn.metrics import classification_report, average_precision_score
print(classification_report(y_test, y_pred))
print("PR-AUC:", average_precision_score(y_test, probs))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00    1447.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00    1447.0
   macro avg       0.00      0.00      0.00    1447.0
weighted avg       0.00      0.00      0.00    1447.0

PR-AUC: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
