In [9]:
import pandas as pd
import numpy as np
import os

In [10]:
# --- CONFIG ---
DATA_PATH = "../data/raw"
PROCESSED_PATH = "../data/processed"
os.makedirs(PROCESSED_PATH, exist_ok=True)

In [11]:
# Correct UNSW-NB15 Column Names
COL_NAMES = [
    "srcip", "sport", "dstip", "dsport", "proto", "state", "dur", "sbytes", "dbytes", 
    "sttl", "dttl", "sloss", "dloss", "service", "Sload", "Dload", "Spkts", "Dpkts", 
    "swin", "dwin", "stcpb", "dtcpb", "smeansz", "dmeansz", "trans_depth", "res_bdy_len", 
    "Sjit", "Djit", "Stime", "Ltime", "Sintpkt", "Dintpkt", "tcprtt", "synack", "ackdat", 
    "is_sm_ips_ports", "ct_state_ttl", "ct_flw_http_mthd", "is_ftp_login", "ct_ftp_cmd", 
    "ct_srv_src", "ct_srv_dst", "ct_dst_ltm", "ct_src_ltm", "ct_src_dport_ltm", 
    "ct_dst_sport_ltm", "ct_dst_src_ltm", "attack_cat", "Label"
]
print("Loading Data...")

Loading Data...


In [12]:
# Load all 4 CSV files
files = [f"{DATA_PATH}/UNSW-NB15_{i}.csv" for i in range(1, 5)]
dfs = [pd.read_csv(f, header=None, names=COL_NAMES) for f in files]
df = pd.concat(dfs, ignore_index=True)

  dfs = [pd.read_csv(f, header=None, names=COL_NAMES) for f in files]
  dfs = [pd.read_csv(f, header=None, names=COL_NAMES) for f in files]


In [13]:
# --- CRITICAL CLEANING ---
# 1. Drop High Cardinality Features (IPs and Times cause overfitting)
# We drop 'sport' and 'dsport' because they are just numbers, but often act like categories. 
# For a robust model, we focus on flow metrics (dur, sbytes, etc.)
drop_cols = ['srcip', 'dstip', 'Stime', 'Ltime', 'sport', 'dsport']
df.drop(columns=drop_cols, errors='ignore', inplace=True)

In [14]:
# 2. Clean Text
# Replace NaNs in text columns with "None" so the encoder doesn't crash
df['attack_cat'] = df['attack_cat'].fillna('Normal').astype(str).str.strip().str.lower()
df['proto'] = df['proto'].fillna('none').astype(str).str.lower()
df['service'] = df['service'].fillna('none').astype(str).str.lower()
df['state'] = df['state'].fillna('none').astype(str).str.lower()

In [15]:
# --- 3. Numeric Cleanup ---

# We need to force these specific columns to be Integers.
# 'ct_ftp_cmd' is the one causing your crash. 
# We use 'errors=coerce' to turn weird text into NaN, then fill with 0.
numeric_cols_to_clean = ['ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd']

for col in numeric_cols_to_clean:
    # 1. Force convert to number (invalid strings become NaN)
    df[col] = pd.to_numeric(df[col], errors='coerce')
    # 2. Fill empty values with 0
    df[col] = df[col].fillna(0)
    # 3. Convert to clean integer
    df[col] = df[col].astype(int)

# Double check that no object columns remain that should be numbers
print("Data types after cleanup:")
print(df.dtypes)

Data types after cleanup:
proto                object
state                object
dur                 float64
sbytes                int64
dbytes                int64
sttl                  int64
dttl                  int64
sloss                 int64
dloss                 int64
service              object
Sload               float64
Dload               float64
Spkts                 int64
Dpkts                 int64
swin                  int64
dwin                  int64
stcpb                 int64
dtcpb                 int64
smeansz               int64
dmeansz               int64
trans_depth           int64
res_bdy_len           int64
Sjit                float64
Djit                float64
Sintpkt             float64
Dintpkt             float64
tcprtt              float64
synack              float64
ackdat              float64
is_sm_ips_ports       int64
ct_state_ttl          int64
ct_flw_http_mthd      int32
is_ftp_login          int32
ct_ftp_cmd            int32
ct_srv_src            

In [16]:
# 4. Save
print(f"Saving Cleaned Data ({df.shape})...")
df.to_parquet(f"{PROCESSED_PATH}/cleaned_data.parquet")
print("Done.")

Saving Cleaned Data ((2540047, 43))...
Done.
