In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE

# ---------- 1. Read & basic cleaning ----------
# Raw string: the path contains backslashes ("\T", "\d") that are invalid escape
# sequences in a normal literal (SyntaxWarning on recent Python versions).
# The resulting path value is unchanged.
df = pd.read_csv(r"D:/vhproj\Ton_IoT\data\Ton_IoT_Network.csv")

# Strip leading/trailing whitespace from string cells only. Non-string cells
# (including NaN) are passed through untouched; casting the whole column to
# str would turn every missing value into the literal text 'nan'.
for c in df.select_dtypes(include=['object']).columns:
    df[c] = df[c].map(lambda v: v.strip() if isinstance(v, str) else v)

# Replace '-' placeholders with NaN (very important). Done after stripping so
# that padded placeholders such as ' - ' are caught as well.
df = df.replace('-', np.nan)

# ---------- 2. Make sure label is numeric ----------
# Header: ... , label, type  -> label is expected to be numeric (0/1), "type" is text.
# If label arrives as text, parse numbers first and fall back to the common
# 'normal'/'attack' names for cells that are not numeric. (to_numeric with
# errors='coerce' never raises, so the fallback must be applied explicitly.)
if df['label'].dtype == object:
    label_text_map = {'normal': 0, 'attack': 1}
    label_numeric = pd.to_numeric(df['label'], errors='coerce')
    label_from_text = df['label'].map(
        lambda v: label_text_map.get(v.lower(), np.nan) if isinstance(v, str) else np.nan
    ).astype(float)
    df['label'] = label_numeric.fillna(label_from_text)

# Drop rows where label is missing after conversion
df = df[df['label'].notna()].copy()
df['label'] = df['label'].astype(int)

# ---------- 3. Feature selection decisions ----------
# IPs, URIs, certificates and user agents are high-cardinality, noisy text.
# They could be frequency-encoded; here they are simply removed.
drop_cols = ['ts']  # raw timestamp is removed too (optional)
# Candidate noisy textual columns (customize as you wish); only the ones that
# actually exist in the frame are queued for removal.
possible_drop = [
    'src_ip', 'dst_ip', 'http_uri', 'http_user_agent', 'ssl_subject', 'ssl_issuer',
    'weird_name', 'weird_addl', 'weird_notice',
]
drop_cols += [name for name in possible_drop if name in df.columns]

# 'ts' itself may be absent, so filter once more before dropping.
df = df.drop(columns=[name for name in drop_cols if name in df.columns])

# ---------- 4. Convert numeric-like columns to numeric (coerce errors -> NaN) ----------
# An object column is treated as numeric when at least this fraction of its
# *observed* (non-missing) values parse as numbers. Measuring over all rows
# would misclassify sparse numeric columns as text just because they are
# mostly missing.
NUMERIC_FRACTION_THRESHOLD = 0.7

all_missing_cols = []
for col in df.columns:
    if col == 'label' or df[col].dtype != object:
        continue
    # The literal text 'nan' (left behind by str-casting) counts as missing.
    values = df[col].replace('nan', np.nan)
    observed = values.notna()
    if not observed.any():
        # Nothing to learn from a column with no observed values.
        all_missing_cols.append(col)
        continue
    parsed = pd.to_numeric(values, errors='coerce')
    frac_numeric = parsed[observed].notna().mean()
    if frac_numeric >= NUMERIC_FRACTION_THRESHOLD:
        # treat as numeric (unparseable cells become NaN)
        df[col] = parsed
    else:
        # leave as object (categorical/text) with real NaN for missing cells
        df[col] = values

# Entirely-empty columns carry no signal and break median imputation downstream.
if all_missing_cols:
    df = df.drop(columns=all_missing_cols)

# ---------- 5. Split features / label (stratified) ----------
# `type` is the textual attack-category column that sits next to `label` in the
# header. Per the TON_IoT dataset description it determines the binary label
# ('normal' vs. an attack name), so keeping it as a feature leaks the target
# and yields trivially perfect scores. It is excluded from X but left in `df`.
leak_cols = [c for c in ('type',) if c in df.columns]

X = df.drop(columns=['label'] + leak_cols)
y = df['label']

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train size:", X_train_raw.shape, "Test size:", X_test_raw.shape)
print("Label distribution (train):\n", y_train.value_counts(normalize=True))
print("Label distribution (test):\n", y_test.value_counts(normalize=True))

# ---------- 6. Preprocessing functions ----------
def preprocess_fit_transform(X_train, max_onehot_cardinality=20):
    """
    Fit the preprocessing on the training features and return them encoded.

    Strategy:
      - Numeric columns: median imputation followed by standard scaling.
        Columns with no observed value at all are skipped, because the median
        imputer would silently drop them and the column count would no longer
        match.
      - Object (categorical) columns, with missing values replaced by the
        sentinel '::MISSING::':
          * at most `max_onehot_cardinality` distinct values -> one-hot
            (pandas.get_dummies, float 0/1 columns)
          * otherwise -> frequency encoding (category -> share of training rows)

    Parameters:
      X_train: DataFrame of raw training features.
      max_onehot_cardinality: largest number of distinct non-missing values a
        categorical column may have and still be one-hot encoded (default 20).

    Returns:
      (X_proc, meta): the encoded training DataFrame (same index as X_train)
      and a dict with everything needed to encode another frame identically:
      'numeric_cols', 'num_imputer', 'scaler', 'low_card_cols',
      'high_card_cols', 'freq_maps', 'onehot_columns'. 'num_imputer' and
      'scaler' are None when there is no usable numeric column.
    """
    X = X_train.copy()
    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = X.select_dtypes(include=['object']).columns.tolist()

    # Keep only numeric columns that have at least one observed value.
    numeric_cols = [c for c in numeric_cols if X[c].notna().any()]

    # numeric imputer + scaler (skipped entirely when there is nothing to fit)
    if numeric_cols:
        num_imputer = SimpleImputer(strategy='median')
        X_num = pd.DataFrame(
            num_imputer.fit_transform(X[numeric_cols]),
            columns=numeric_cols,
            index=X.index
        )
        scaler = StandardScaler()
        X_num_scaled = pd.DataFrame(
            scaler.fit_transform(X_num),
            columns=numeric_cols,
            index=X.index
        )
    else:
        num_imputer = None
        scaler = None
        X_num_scaled = pd.DataFrame(index=X.index)

    # categorical handling: fill missing once, then route each column by cardinality
    X_cat = X[cat_cols].fillna('::MISSING::')
    low_card_cols = []
    high_card_cols = []
    freq_maps = {}

    for c in cat_cols:
        # cardinality is measured on the original column, i.e. without the sentinel
        if X[c].nunique(dropna=True) <= max_onehot_cardinality:
            low_card_cols.append(c)
        else:
            high_card_cols.append(c)
            freq_maps[c] = X_cat[c].value_counts(normalize=True).to_dict()

    # one-hot for low-cardinality; explicit float dtype keeps the output
    # numeric regardless of the pandas version's default dummy dtype
    if low_card_cols:
        X_low = pd.get_dummies(
            X_cat[low_card_cols], dummy_na=False, drop_first=False, dtype=float
        )
    else:
        X_low = pd.DataFrame(index=X.index)

    # frequency encoding for high-cardinality columns
    X_high = pd.DataFrame(
        {c + "_freq": X_cat[c].map(freq_maps[c]).fillna(0.0) for c in high_card_cols},
        index=X.index
    )

    # concat all
    X_proc = pd.concat([X_num_scaled, X_low, X_high], axis=1)
    # Save metadata
    meta = {
        'numeric_cols': numeric_cols,
        'num_imputer': num_imputer,
        'scaler': scaler,
        'low_card_cols': low_card_cols,
        'high_card_cols': high_card_cols,
        'freq_maps': freq_maps,
        'onehot_columns': X_low.columns.tolist()
    }
    return X_proc, meta

def preprocess_transform(X_test, meta):
    """
    Encode a feature frame with the preprocessing fitted on the training set.

    Parameters:
      X_test: DataFrame of raw features to encode.
      meta: dict returned by preprocess_fit_transform; the keys read here are
        'numeric_cols', 'num_imputer', 'scaler', 'low_card_cols',
        'high_card_cols', 'freq_maps' and 'onehot_columns'.

    Returns:
      DataFrame indexed like X_test with columns in the order
      numeric, one-hot, frequency-encoded.

    Columns listed in `meta` but absent from X_test are created as all-missing
    (numeric and categorical alike). Categories not seen during fitting get
    all-zero one-hot columns / a frequency of 0.0.
    """
    X = X_test.copy()
    numeric_cols = meta['numeric_cols']

    # Ensure numeric_cols exist in test (if missing, create NaN)
    for c in numeric_cols:
        if c not in X.columns:
            X[c] = np.nan

    # Same guarantee for categorical columns; object dtype so the string
    # sentinel can be filled in below without a dtype conflict.
    low_card_cols = meta['low_card_cols']
    high_card_cols = meta['high_card_cols']
    for c in list(low_card_cols) + list(high_card_cols):
        if c not in X.columns:
            X[c] = pd.Series(np.nan, index=X.index, dtype=object)

    if numeric_cols:
        X_num = pd.DataFrame(
            meta['num_imputer'].transform(X[numeric_cols]),
            columns=numeric_cols,
            index=X.index
        )
        X_num_scaled = pd.DataFrame(
            meta['scaler'].transform(X_num),
            columns=numeric_cols,
            index=X.index
        )
    else:
        # nothing numeric was fitted, so there is nothing to impute or scale
        X_num_scaled = pd.DataFrame(index=X.index)

    # low-card one-hot: align to the training one-hot columns in one step
    # (adds absent columns as 0.0, drops unseen categories, fixes the order)
    if low_card_cols:
        X_low = pd.get_dummies(
            X[low_card_cols].fillna('::MISSING::'),
            dummy_na=False, drop_first=False, dtype=float
        )
        X_low = X_low.reindex(columns=meta['onehot_columns'], fill_value=0.0)
    else:
        X_low = pd.DataFrame(index=X.index)

    # high-card frequency encoding using the training frequencies
    X_high = pd.DataFrame(
        {
            c + "_freq": X[c].fillna('::MISSING::').map(meta['freq_maps'].get(c, {})).fillna(0.0)
            for c in high_card_cols
        },
        index=X.index
    )

    X_proc = pd.concat([X_num_scaled, X_low, X_high], axis=1)
    return X_proc

# ---------- 7. Fit preprocess on train and transform test ----------
# Encoders are fitted on the training split only; the test split is encoded
# with the fitted metadata.
X_train_proc, meta = preprocess_fit_transform(X_train_raw)
X_test_proc = preprocess_transform(X_test_raw, meta)

print("Processed feature counts:", len(X_train_proc.columns))

# ---------- 8. Balance the training set (SMOTE on numerical features) ----------
# SMOTE needs a numeric matrix, which the preprocessing step provides.
# Only the training split is resampled; SMOTE is imported at the top of the cell.
sm = SMOTE(random_state=42)   # n_jobs argument removed
X_train_bal, y_train_bal = sm.fit_resample(X_train_proc, y_train)

print("After SMOTE, train shape:", X_train_bal.shape)
print("Label distribution after SMOTE:", pd.Series(y_train_bal).value_counts())






KeyError: "['ts', 'ssl_subject', 'ssl_issuer', 'http_uri', 'http_user_agent'] not found in axis"

In [None]:
# ---------- 9. Train RandomForest ----------
# Trained on the SMOTE-balanced training set, evaluated on the untouched test set.
rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf.fit(X_train_bal, y_train_bal)
y_pred_rf = rf.predict(X_test_proc)
print("\nRandom Forest results:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))
# NOTE(review): a perfect score on this kind of data usually means a feature
# leaks the label - check the feature set before trusting the numbers.
try:
    print("ROC-AUC:", roc_auc_score(y_test, rf.predict_proba(X_test_proc)[:, 1]))
except (ValueError, IndexError) as exc:
    # ROC-AUC is undefined when only one class is present; report instead of
    # silently skipping so a missing metric is visible in the output.
    print("ROC-AUC unavailable:", exc)


Random Forest results:
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     60000
           1       1.00      1.00      1.00     32209

    accuracy                           1.00     92209
   macro avg       1.00      1.00      1.00     92209
weighted avg       1.00      1.00      1.00     92209

ROC-AUC: 1.0


In [None]:
# ---------- 10. Train XGBoost ----------
# `use_label_encoder` is no longer passed: the installed XGBoost ignores it and
# only emits a "Parameters ... are not used" warning.
xgb = XGBClassifier(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)
xgb.fit(X_train_bal, y_train_bal)
y_pred_xgb = xgb.predict(X_test_proc)
print("\nXGBoost results:")
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))
try:
    print("ROC-AUC:", roc_auc_score(y_test, xgb.predict_proba(X_test_proc)[:, 1]))
except (ValueError, IndexError) as exc:
    # ROC-AUC is undefined when only one class is present; report instead of
    # silently skipping so a missing metric is visible in the output.
    print("ROC-AUC unavailable:", exc)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



XGBoost results:
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     60000
           1       1.00      1.00      1.00     32209

    accuracy                           1.00     92209
   macro avg       1.00      1.00      1.00     92209
weighted avg       1.00      1.00      1.00     92209

ROC-AUC: 1.0


In [None]:
# ---------- 11. Confusion matrices ----------
# Rows are true labels, columns are predicted labels (sklearn convention).
for model_tag, model_preds in (("RF", y_pred_rf), ("XGB", y_pred_xgb)):
    print(f"{model_tag} conf matrix:\n", confusion_matrix(y_test, model_preds))


RF conf matrix:
 [[60000     0]
 [    0 32209]]
XGB conf matrix:
 [[60000     0]
 [    0 32209]]
