In [1]:
# Contrastive Learning + Zero-Day Detection for IoMT IDS (improving SPOOFING detection)
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
def load_dataset_from_structure(root_path):
    """Load every CSV found at ``root_path/<category>/<attack>/*.csv`` into one DataFrame.

    The two directory levels above each file supply its labels, which are
    added as three extra columns:

    * ``category`` - name of the grandparent directory (e.g. ``DDoS``).
    * ``attack``   - name of the parent directory (e.g. ``DDoS ICMP``).
    * ``class``    - ``'Benign'`` when the category equals ``BENIGN``
      (case-insensitive), otherwise ``'Attack'``.

    Parameters
    ----------
    root_path : str or pathlib.Path
        Root directory of one dataset split.

    Returns
    -------
    pandas.DataFrame
        All readable files concatenated with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no CSV file under ``root_path`` could be read (wrong path, empty
        directory, or every file failed to parse).
    """
    root_path = Path(root_path)
    frames = []

    # Sorted so the row order (and therefore any seeded sampling done later)
    # does not depend on the filesystem's directory-listing order.
    for file in sorted(root_path.glob('*/*/*.csv')):
        try:
            df = pd.read_csv(file)
        except Exception as e:
            # Best effort: report the unreadable file and keep loading the rest.
            print(f"[ERROR] Failed to read file {file}: {e}")
            continue

        category = file.parents[1].name  # DDoS, DoS, etc.
        attack = file.parent.name        # DDoS ICMP, DoS TCP, etc.

        df['category'] = category
        df['attack'] = attack
        df['class'] = 'Benign' if category.upper() == 'BENIGN' else 'Attack'
        frames.append(df)

    if not frames:
        # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
        raise ValueError(
            f"No readable CSV files found under {root_path} "
            "(expected <category>/<attack>/*.csv)"
        )
    return pd.concat(frames, ignore_index=True)

# Locate the two dataset splits (paths are relative to the notebook).
train_root = Path('../../../Data/CICIoMT2024/train')
test_root = Path('../../../Data/CICIoMT2024/test')

# Load the train split first, then the test split.
train_df, test_df = (
    load_dataset_from_structure(split_root)
    for split_root in (train_root, test_root)
)

# Summary check: shape and row count per (category, attack, class) for each split.
for split_name, split_df in (("Train set:", train_df), ("\nTest set:", test_df)):
    print(split_name, split_df.shape)
    print(split_df[['category', 'attack', 'class']].value_counts())

# Merge train and test into a single DataFrame.
# NOTE(review): the later cells sample and fit the scaler on this merged frame,
# so the test split is not held out - confirm this is intended.
df = pd.concat((train_df, test_df), ignore_index=True)

Train set: (7160831, 48)
category  attack              class 
DDoS      DDoS UDP            Attack    1635956
          DDoS ICMP           Attack    1537476
          DDoS TCP            Attack     804465
          DDoS SYN            Attack     801962
DoS       DoS UDP             Attack     566950
          DoS SYN             Attack     441903
          DoS ICMP            Attack     416292
          DoS TCP             Attack     380384
BENIGN    BENIGN              Benign     192732
MQTT      DDoS Connect Flood  Attack     173036
RECON     Port Scan           Attack      83981
MQTT      DoS Publish Flood   Attack      44376
          DDoS Publish Flood  Attack      27623
RECON     OS Scan             Attack      16832
SPOOFING  SPOOFING            Attack      16047
MQTT      DoS Connect Flood   Attack      12773
          Malformed Data      Attack       5130
RECON     Recon VulScan       Attack       2173
          Ping Sweep          Attack        740
Name: count, dtype: int64


In [3]:
# Contrastive Learning + Zero-Day Detection for IoMT IDS (Triplet Loss for Better Embedding)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Lambda
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, roc_auc_score
from scipy.spatial import distance

# === 1. Triplet Loss Function ===
def triplet_loss(margin=1.0, embedding_dim=32):
    """Build a triplet loss over a concatenated ``[anchor | positive | negative]`` output.

    The returned function has the Keras ``loss(y_true, y_pred)`` signature.
    ``y_true`` is ignored. Each row of ``y_pred`` is split into an anchor
    (first ``embedding_dim`` columns), a positive (next ``embedding_dim``
    columns) and a negative (all remaining columns).

    Parameters
    ----------
    margin : float, default 1.0
        Minimum gap required between the anchor-negative and the
        anchor-positive squared distances before a triplet stops contributing.
    embedding_dim : int, default 32
        Width of one embedding inside ``y_pred``. Must match the output width
        of the encoder whose three outputs were concatenated.

    Returns
    -------
    callable
        ``loss(y_true, y_pred)`` returning the batch mean of
        ``max(d(a, p) - d(a, n) + margin, 0)``, where ``d`` is the squared
        Euclidean distance.

    Raises
    ------
    ValueError
        If ``embedding_dim`` is not a positive integer.
    """
    embedding_dim = int(embedding_dim)
    if embedding_dim <= 0:
        raise ValueError(f"embedding_dim must be a positive integer, got {embedding_dim}")

    def loss(y_true, y_pred):
        anchor = y_pred[:, :embedding_dim]
        positive = y_pred[:, embedding_dim:2 * embedding_dim]
        # Open-ended on purpose: if the encoder width does not match
        # embedding_dim, the subtraction below fails on the shape mismatch
        # instead of silently dropping columns.
        negative = y_pred[:, 2 * embedding_dim:]
        pos_dist = K.sum(K.square(anchor - positive), axis=1)
        neg_dist = K.sum(K.square(anchor - negative), axis=1)
        return K.mean(K.maximum(pos_dist - neg_dist + margin, 0.0))

    return loss

# === 2. Load data and sample the triplets ===
if not {'class', 'attack'}.issubset(df.columns):
    raise ValueError("Dataset harus memiliki kolom 'class' dan 'attack'")

# Normalise the label text in place so the comparisons below ignore case
# and surrounding whitespace.
df['attack'] = df['attack'].str.upper().str.strip()
df['class'] = df['class'].str.upper().str.strip()

# Three disjoint row groups: benign traffic, spoofing, and every other attack.
benign_df = df.loc[df['class'].eq('BENIGN')].copy()
spoof_df = df.loc[df['attack'].eq('SPOOFING')].copy()
attack_df = df.loc[df['class'].eq('ATTACK') & df['attack'].ne('SPOOFING')].copy()

# The smallest group caps the number of triplets; each group is sampled
# independently with the same seed.
triplet_size = min(map(len, (benign_df, spoof_df, attack_df)))
triplet_benign, triplet_attack, triplet_spoof = (
    group.sample(n=triplet_size, random_state=42)
    for group in (benign_df, attack_df, spoof_df)
)

# === 3. Preprocessing ===
# Only numeric columns are used as features; the scaler is fitted on the
# whole merged frame.
feature_cols = df.select_dtypes(include='number').columns
scaler = StandardScaler().fit(df[feature_cols])

# Triplet roles: anchor = benign, positive = non-spoofing attack,
# negative = spoofing.
# NOTE(review): the "positive" is drawn from a different class than the
# anchor - confirm this pairing is the intended training signal.
anchor, positive, negative = (
    scaler.transform(group[feature_cols])
    for group in (triplet_benign, triplet_attack, triplet_spoof)
)

# Side-by-side copy of the three feature blocks, plus a placeholder target
# (the triplet loss ignores y_true).
X_triplet = np.hstack((anchor, positive, negative))
y_dummy = np.zeros(len(X_triplet))

# === 4. Build Triplet Network ===
def build_base_network(input_shape):
    """Create the shared encoder that maps a feature vector to a 32-d embedding.

    Architecture: Dense(128, relu) -> Dropout(0.3) -> Dense(64, relu)
    -> Dropout(0.3) -> Dense(32, linear).

    Parameters
    ----------
    input_shape : int
        Number of input features per sample.

    Returns
    -------
    Model
        Keras model from the input tensor to the 32-unit linear output.
    """
    features_in = Input(shape=(input_shape,))

    hidden = features_in
    # Two identical hidden stages that differ only in width.
    for width in (128, 64):
        hidden = Dense(width, activation='relu')(hidden)
        hidden = Dropout(0.3)(hidden)

    # Linear head: the embedding is left unbounded and un-normalised.
    embedding = Dense(32, activation='linear')(hidden)
    return Model(features_in, embedding)

# One encoder instance is shared by all three branches, so anchor, positive
# and negative are embedded with the same weights.
input_shape = anchor.shape[1]
base_network = build_base_network(input_shape)

anchor_input, positive_input, negative_input = (
    Input(shape=(input_shape,), name=branch_name)
    for branch_name in ('anchor_input', 'positive_input', 'negative_input')
)

encoded_anchor, encoded_positive, encoded_negative = map(
    base_network, (anchor_input, positive_input, negative_input)
)

# Concatenate the three embeddings column-wise; triplet_loss slices them
# apart again by position.
merged_output = Lambda(lambda parts: K.concatenate(parts, axis=1))(
    [encoded_anchor, encoded_positive, encoded_negative]
)
triplet_model = Model(
    inputs=[anchor_input, positive_input, negative_input],
    outputs=merged_output,
)

triplet_model.compile(optimizer=Adam(0.001), loss=triplet_loss(margin=1.0))

# y_dummy only satisfies the fit() signature; the loss never reads y_true.
triplet_model.fit(
    x=[anchor, positive, negative],
    y=y_dummy,
    batch_size=64,
    epochs=15,
)

# === 5. Extract embeddings with the trained encoder ===
# `encoder` is just another name for the shared base network; nothing is
# written to disk in this cell.
encoder = base_network

# Embed every spoofing row first, then every benign row (not only the
# sampled triplets), using the already-fitted scaler.
embedding_spoof, embedding_benign = (
    encoder.predict(scaler.transform(rows[feature_cols]))
    for rows in (spoof_df, benign_df)
)

# === 6. Visual evaluation & detection logic (to be continued) ===
print("✅ Triplet embedding training selesai.")



Epoch 1/15
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 2.6897
Epoch 2/15
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.4531
Epoch 3/15
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.4116
Epoch 4/15
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.2259
Epoch 5/15
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.1633
Epoch 6/15
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.1555
Epoch 7/15
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.1086
Epoch 8/15
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0708
Epoch 9/15
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0688
Epoch 10/15
[1m278/278[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - l

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Lambda
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, roc_auc_score
from scipy.spatial import distance
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
# === 6. Anomaly Detection: Isolation Forest, SVM, Mahalanobis ===
# Every detector is fitted on the benign embeddings only and then scores the
# benign rows followed by the spoofing rows. Flags use 1 = anomaly, 0 = normal.
# NOTE(review): the benign rows scored here are the same rows the detectors
# were fitted on.

# --- Isolation Forest (sklearn marks outliers with -1) ---
iso = IsolationForest(contamination=0.05, random_state=42).fit(embedding_benign)
iso_raw = np.concatenate([iso.predict(embedding_benign), iso.predict(embedding_spoof)])
# NOTE(review): iso_pred is computed but is not part of ensemble_pred below.
iso_pred = np.where(iso_raw == -1, 1, 0).tolist()

# --- One-Class SVM (same -1 = outlier convention) ---
svm = OneClassSVM(kernel='rbf', gamma='auto').fit(embedding_benign)
svm_raw = np.concatenate([svm.predict(embedding_benign), svm.predict(embedding_spoof)])
svm_pred = np.where(svm_raw == -1, 1, 0).tolist()

# --- Mahalanobis distance to the benign centroid ---
mean_vec = embedding_benign.mean(axis=0)
# Pseudo-inverse keeps this usable when the covariance matrix is singular.
cov_inv = np.linalg.pinv(np.cov(embedding_benign, rowvar=False))
d_mahal_benign = [distance.mahalanobis(row, mean_vec, cov_inv) for row in embedding_benign]
d_mahal_spoof = [distance.mahalanobis(row, mean_vec, cov_inv) for row in embedding_spoof]
# Anything beyond the 95th percentile of benign distances is flagged.
thresh = np.percentile(d_mahal_benign, 95)
mahal_pred = [int(dist > thresh) for dist in d_mahal_benign + d_mahal_spoof]

# --- Ensemble: flag only when SVM and Mahalanobis both flag ---
ensemble_pred = [
    int(svm_flag == 1 and mahal_flag == 1)
    for svm_flag, mahal_flag in zip(svm_pred, mahal_pred)
]
y_true = [0] * len(embedding_benign) + [1] * len(embedding_spoof)

print("\n[Ensemble Detection Evaluation]")
print(classification_report(y_true, ensemble_pred, target_names=['Benign', 'Spoofing']))
# NOTE(review): roc_auc_score is given the 0/1 ensemble labels here, not a
# continuous anomaly score.
print("ROC-AUC:", roc_auc_score(y_true, ensemble_pred))
