# Data Preprocessing for ToN-IoT Network Dataset
Tiền xử lý dữ liệu cho các mô hình phát hiện tấn công.

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [None]:
# ---------- 1. Read & basic cleaning ----------
df = pd.read_csv("D:/vhproj/Ton_IoT/data/Ton_IoT_Network.csv")
# Strip whitespace BEFORE looking for the '-' placeholder, otherwise padded
# placeholders such as ' - ' would survive as ordinary values.
# Only non-null cells are stringified, so real NaNs stay NaN instead of
# becoming the literal string 'nan'.
for c in df.select_dtypes(include=['object']).columns:
    df[c] = df[c].where(df[c].isna(), df[c].astype(str).str.strip())
# '-' is treated as a missing-value placeholder in every column.
df.replace('-', np.nan, inplace=True)

In [None]:
# ---------- 2. Make sure label is numeric ----------
if df['label'].dtype == object:
    label_text = df['label'].astype(str).str.strip()
    # errors='coerce' does not raise on unparseable text, it yields NaN.
    # A try/except fallback would therefore never run, so the textual
    # mapping is applied explicitly to the cells that failed to parse.
    label_numeric = pd.to_numeric(label_text, errors='coerce')
    label_mapped = label_text.str.lower().map({'normal': 0, 'attack': 1})
    df['label'] = label_numeric.fillna(label_mapped)
# Rows whose label is neither numeric nor a known class name are discarded.
df = df[df['label'].notna()].copy()
df['label'] = df['label'].astype(int)

In [None]:
# ---------- 3. Feature selection: drop noisy columns ----------
# 'ts' is always scheduled for removal; the optional columns are only
# scheduled when the loaded frame actually contains them.
possible_drop = [
    'src_ip', 'dst_ip', 'http_uri', 'http_user_agent', 'ssl_subject',
    'ssl_issuer', 'weird_name', 'weird_addl', 'weird_notice',
]
drop_cols = ['ts'] + [name for name in possible_drop if name in df.columns]
# errors='ignore' skips any scheduled column that is absent (e.g. 'ts').
df = df.drop(columns=drop_cols, errors='ignore')

In [None]:
# ---------- 4. Convert numeric-like columns to numeric ----------
for col in df.columns:
    if col == 'label' or df[col].dtype != object:
        continue
    # Missing cells may be present either as real NaN or as the string 'nan'
    # (a side effect of astype(str)); normalise both to NaN.
    values = df[col].replace('nan', np.nan)
    parsed = pd.to_numeric(values, errors='coerce')
    # Measure the numeric share over OBSERVED cells only. Counting missing
    # cells as "non-numeric" would classify a purely numeric column with
    # more than 30% missing values as categorical.
    observed = values.notna()
    frac_numeric = parsed[observed].notna().mean() if observed.any() else 0.0
    if frac_numeric >= 0.7:
        df[col] = parsed
    else:
        df[col] = values

In [None]:
# ---------- 5. Split features / label ----------
y = df['label']
X = df.drop(columns=['label'])

# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both partitions, and the fixed seed makes
# the split reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print('Train size:', X_train_raw.shape, 'Test size:', X_test_raw.shape)
print('Label distribution (train):\n', y_train.value_counts(normalize=True))
print('Label distribution (test):\n', y_test.value_counts(normalize=True))

In [None]:
# ---------- 6. Preprocessing functions ----------
def preprocess_fit_transform(X_train, max_onehot_cardinality=20):
    """Fit the preprocessing steps on the training features and apply them.

    Numeric columns are median-imputed and then scaled with StandardScaler.
    Object columns are split by cardinality: columns with at most
    ``max_onehot_cardinality`` distinct non-missing values are one-hot
    encoded, the others are replaced by the relative frequency of each value
    in the training data. Missing categorical cells are treated as the
    category ``'::MISSING::'``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features (without the label column).
    max_onehot_cardinality : int, default 20
        Largest number of distinct values for which a categorical column is
        one-hot encoded rather than frequency encoded.

    Returns
    -------
    X_proc : pandas.DataFrame
        Scaled numeric columns, one-hot columns and ``<col>_freq`` columns,
        concatenated in that order and indexed like ``X_train``.
    meta : dict
        Fitted state needed to transform other data: ``numeric_cols``,
        ``num_imputer``, ``scaler``, ``low_card_cols``, ``high_card_cols``,
        ``freq_maps`` and ``onehot_columns``. ``num_imputer`` and ``scaler``
        are ``None`` when there is no usable numeric column.
    """
    X = X_train.copy()
    cat_cols = X.select_dtypes(include=['object']).columns.tolist()
    # SimpleImputer discards columns that have no observed value at fit time,
    # which would make its output narrower than the column list used to
    # rebuild the DataFrame. Such columns carry no information, so they are
    # excluded up front.
    numeric_cols = [
        c for c in X.select_dtypes(include=[np.number]).columns
        if X[c].notna().any()
    ]

    num_imputer = None
    scaler = None
    if numeric_cols:
        num_imputer = SimpleImputer(strategy='median')
        scaler = StandardScaler()
        X_num = pd.DataFrame(
            num_imputer.fit_transform(X[numeric_cols]),
            columns=numeric_cols,
            index=X.index,
        )
        X_num_scaled = pd.DataFrame(
            scaler.fit_transform(X_num),
            columns=numeric_cols,
            index=X.index,
        )
    else:
        # Nothing to impute or scale; the estimators reject 0-feature input.
        X_num_scaled = pd.DataFrame(index=X.index)

    # Missing categorical values become an explicit category.
    X_cat = X[cat_cols].fillna('::MISSING::')
    low_card_cols = []
    high_card_cols = []
    freq_maps = {}
    for c in cat_cols:
        # Cardinality is measured on the real values, not the placeholder.
        if X[c].nunique(dropna=True) <= max_onehot_cardinality:
            low_card_cols.append(c)
        else:
            high_card_cols.append(c)
            freq_maps[c] = X_cat[c].value_counts(normalize=True).to_dict()

    if low_card_cols:
        X_low = pd.get_dummies(X_cat[low_card_cols], dummy_na=False, drop_first=False)
    else:
        X_low = pd.DataFrame(index=X.index)

    X_high = pd.DataFrame(
        {c + '_freq': X_cat[c].map(freq_maps[c]).fillna(0.0) for c in high_card_cols},
        index=X.index,
    )

    X_proc = pd.concat([X_num_scaled, X_low, X_high], axis=1)
    meta = {
        'numeric_cols': numeric_cols,
        'num_imputer': num_imputer,
        'scaler': scaler,
        'low_card_cols': low_card_cols,
        'high_card_cols': high_card_cols,
        'freq_maps': freq_maps,
        'onehot_columns': X_low.columns.tolist(),
    }
    return X_proc, meta

In [None]:
def preprocess_transform(X_test, meta):
    """Apply preprocessing fitted by ``preprocess_fit_transform`` to new data.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Features to transform. Columns that were seen at fit time but are
        absent here are treated as entirely missing.
    meta : dict
        The state dictionary returned by ``preprocess_fit_transform``.

    Returns
    -------
    pandas.DataFrame
        Scaled numeric columns, one-hot columns (exactly
        ``meta['onehot_columns']``, in that order) and ``<col>_freq``
        columns, indexed like ``X_test``.
    """
    X = X_test.copy()
    numeric_cols = meta['numeric_cols']
    low_card_cols = meta['low_card_cols']
    high_card_cols = meta['high_card_cols']

    # Back-fill every fitted column that is absent so that all three feature
    # groups degrade the same way instead of only the numeric one.
    for c in numeric_cols:
        if c not in X.columns:
            X[c] = np.nan
    for c in low_card_cols + high_card_cols:
        if c not in X.columns:
            # object dtype so the column is handled as categorical below.
            X[c] = pd.Series(np.nan, index=X.index, dtype=object)

    if numeric_cols:
        X_num = pd.DataFrame(
            meta['num_imputer'].transform(X[numeric_cols]),
            columns=numeric_cols,
            index=X.index,
        )
        X_num_scaled = pd.DataFrame(
            meta['scaler'].transform(X_num),
            columns=numeric_cols,
            index=X.index,
        )
    else:
        # No numeric feature was fitted; the estimators are not used.
        X_num_scaled = pd.DataFrame(index=X.index)

    if low_card_cols:
        X_low = pd.get_dummies(
            X[low_card_cols].fillna('::MISSING::'), dummy_na=False, drop_first=False
        )
        # Align with the training layout: categories unseen at fit time are
        # dropped, categories absent from this data become all-zero columns.
        X_low = X_low.reindex(columns=meta['onehot_columns'], fill_value=0)
    else:
        X_low = pd.DataFrame(index=X.index)

    # Values never seen at fit time get frequency 0.0.
    X_high = pd.DataFrame(
        {
            c + '_freq': X[c].fillna('::MISSING::')
            .map(meta['freq_maps'].get(c, {}))
            .fillna(0.0)
            for c in high_card_cols
        },
        index=X.index,
    )

    X_proc = pd.concat([X_num_scaled, X_low, X_high], axis=1)
    return X_proc