In [None]:
import os, numpy as np, pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [None]:
# Directory whose *.csv files are read and concatenated by the loading cell below.
ROOT = "/scratch/Malware/iot23/mal_family_capture"

In [None]:
# Full set of column names considered, in the order they should appear.
all_column = [
    'ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p',
    'proto', 'service', 'duration', 'orig_bytes', 'resp_bytes',
    'conn_state', 'local_orig', 'local_resp', 'missed_bytes', 'history',
    'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes',
    'tunnel_parents', 'label', 'detailed-label', 'Label',
]

# Columns that are left out of the reduced frame.
exclude_column = [
    'ts', 'uid', 'id.orig_h', 'id.resp_h', 'local_orig', 'local_resp',
    'detailed-label', 'tunnel_parents', 'service', 'history',
]

# Filter all_column against the exclusion list while preserving its ordering.
_excluded = set(exclude_column)
column_names = [name for name in all_column if name not in _excluded]

In [None]:
# Load every CSV found directly under ROOT and stack them into one frame.
#
# os.listdir() returns entries in arbitrary order, so the names are sorted:
# a stable row order is what makes the later seeded train/valid/test split
# reproducible across machines and filesystems.
csv_files = sorted(name for name in os.listdir(ROOT) if name.endswith(".csv"))
if not csv_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No .csv files found in {ROOT!r}")

dfs = []
for csv_name in csv_files:  # not named `csv`, which would shadow the stdlib module
    df = pd.read_csv(os.path.join(ROOT, csv_name))
    dfs.append(df)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
data = pd.concat(dfs, ignore_index=True)

In [None]:
# Target dtypes for the reduced frame. These are pandas nullable extension
# dtypes, so missing entries stay pd.NA after the cast.
features_dict = {
    'id.orig_p': 'Int64',
    'id.resp_p': 'Int64',
    'proto': 'string',
    'duration': 'Float64',
    'conn_state': 'string',
    'missed_bytes': 'Int64',
    'orig_pkts': 'Int64',
    'orig_ip_bytes': 'Int64',
    'resp_pkts': 'Int64',
    'resp_ip_bytes': 'Int64',
    'label': 'string',
    'Label': 'string'
}

# Keep only rows whose 'label' value is exactly 'Malicious'.
data = data[data['label'] == 'Malicious']

# Restrict to the selected columns and turn the '-' placeholder into a real
# missing value.
reduced_iot_dataset = data.loc[:, column_names].replace('-', pd.NA)

# orig_bytes / resp_bytes are not listed in features_dict; coerce them first so
# anything unparsable becomes NA, then store them as nullable integers.
for col in ['orig_bytes', 'resp_bytes']:
    reduced_iot_dataset[col] = pd.to_numeric(reduced_iot_dataset[col], errors='coerce')
    reduced_iot_dataset[col] = reduced_iot_dataset[col].astype('Int64')

reduced_iot_dataset = reduced_iot_dataset.astype(features_dict)

# Deduplicate only AFTER the dtypes are normalised. On raw object columns a
# value read as the string '80' and the same value read as the integer 80 do
# not compare equal, so duplicate rows would otherwise survive.
before = len(reduced_iot_dataset)
reduced_iot_dataset = reduced_iot_dataset.drop_duplicates()
after = len(reduced_iot_dataset)

removed = before - after
# Guard the ratio: with zero matching rows the division would raise.
reduction_rate = removed / before if before else 0.0

print(f"before exclude duplicates: {before:,} rows")
print(f"after excluding duplicates: {after:,} rows")
print(f"before - after: {removed:,} rows")
print(f"rate of reduction: {reduction_rate:.2%}")

In [None]:
# Missing-value count per column, shown with the largest counts first.
na_counts = reduced_iot_dataset.isna().sum()
print(na_counts.sort_values(ascending=False))


In [None]:
# Columns to probe for entries that cannot be parsed as numbers.
target_columns = ['orig_bytes', 'resp_bytes', 'missed_bytes']

for col in target_columns:
    series = reduced_iot_dataset[col]
    # Rows that come back missing from a coercing numeric parse are either
    # already missing or not numeric.
    unparsable = pd.to_numeric(series, errors='coerce').isna()
    non_numeric = series[unparsable].unique()
    print(f"Non-numeric in '{col}':", non_numeric)


In [None]:
# Columns that features_dict declares as 'Int64'. orig_bytes / resp_bytes are
# not in features_dict, so they are not part of this summary.
int_columns = [name for name in features_dict if features_dict[name] == 'Int64']

print("Int64 column info:")
for col in int_columns:
    col_data = reduced_iot_dataset[col]
    lo, hi, avg = col_data.min(), col_data.max(), col_data.mean()
    n_missing = col_data.isna().sum()
    n_unique = col_data.nunique()
    print(f"{col:<15} | min: {lo:>10} | max: {hi:>10} | mean: {avg:>10.2f} | "
          f"NA: {n_missing:>6} | unique: {n_unique:>6}")


In [None]:
# Apply log(1 + x) to the listed columns of reduced_iot_dataset, in place.
log_cols = ['id.orig_p', 'orig_pkts', 'orig_ip_bytes', 'resp_ip_bytes']
for col in log_cols:
    # features_dict casts these columns to 'Int64' and log1p produces floats,
    # so an integer dtype means the column has not been transformed yet.
    # The guard makes the cell safe to re-run: without it, every extra run
    # would apply log1p again on top of the previous result.
    if pd.api.types.is_integer_dtype(reduced_iot_dataset[col]):
        reduced_iot_dataset[col] = np.log1p(reduced_iot_dataset[col])


In [None]:
# Missing-value counts for the log-transformed column names, taken from `data`
# (the filtered frame, not reduced_iot_dataset).
cols_to_log = ['id.orig_p', 'orig_pkts', 'orig_ip_bytes', 'resp_ip_bytes']
na_per_col = data[cols_to_log].isna().sum()
print(na_per_col)

In [None]:
# Number of rows per distinct value of the 'Label' column.
label_counts = reduced_iot_dataset['Label'].value_counts()
print(label_counts)

In [None]:
# Distinct non-missing values of the two categorical columns, read from `data`.
for cat_col in ('proto', 'conn_state'):
    print(f"{cat_col} unique values:", data[cat_col].dropna().unique())


In [None]:
# Features: every column except the two label columns.
X = reduced_iot_dataset.drop(columns=['label', 'Label'])
# Target: the 'Label' column, encoded to integer class ids below.
y = reduced_iot_dataset['Label']

le = LabelEncoder()
y_enc = le.fit_transform(y)

# Quick sanity output for the feature matrix and the encoded target.
print("X shape", X.shape)  # label previously misspelled as "X_shapr"
print("y_enc shape", y_enc.shape)
print(X.head(10))
print("X feature:", X.shape[1])
print(y_enc[:10])

In [None]:
# One-hot encode the two categorical feature columns.
X_encoded = pd.get_dummies(X, columns=['proto', 'conn_state'])

# Report the resulting shape and the names of the generated indicator columns.
print("One-hot encoded shape:", X_encoded.shape)
dummy_cols = [
    name for name in X_encoded.columns
    if any(tag in name for tag in ('proto_', 'conn_state_'))
]
print("New columns:", dummy_cols)

In [None]:
# Full list of column names after one-hot encoding.
print(list(X_encoded.columns))

In [None]:
# Split 80 / 20 into train and a held-out pool, stratified on the encoded label.
X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(
    X_encoded,
    y_enc,
    test_size=0.2,
    random_state=42,
    stratify=y_enc,
)

# Halve the held-out pool into validation and test, again stratified.
X_valid_raw, X_test_raw, y_valid, y_test = train_test_split(
    X_temp_raw,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

# Mean imputation: statistics are learned from the training split only and
# then applied unchanged to the validation and test splits.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_raw)
X_valid_imputed = imputer.transform(X_valid_raw)
X_test_imputed = imputer.transform(X_test_raw)

# Standardisation: likewise fitted on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_valid_scaled = scaler.transform(X_valid_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

In [None]:
# Show the first five rows of the encoded (not yet imputed or scaled) frame.
preview = X_encoded.head(n=5)
print(preview)

In [None]:
# Save the scaled feature matrices and encoded targets as .npy files.
out_dir = "/scratch/Malware/iot23/data"
os.makedirs(out_dir, exist_ok=True)  # np.save does not create missing directories

np.save(os.path.join(out_dir, "X_train.npy"), X_train_scaled)
np.save(os.path.join(out_dir, "y_train.npy"), y_train)
np.save(os.path.join(out_dir, "X_valid.npy"), X_valid_scaled)
np.save(os.path.join(out_dir, "y_valid.npy"), y_valid)
np.save(os.path.join(out_dir, "X_test.npy"), X_test_scaled)
np.save(os.path.join(out_dir, "y_test.npy"), y_test)


print("Train shape :", X_train_scaled.shape, "   Test shape :", X_test_scaled.shape)
print("Classes:", le.classes_)

print("X_train type:", type(X_train_scaled))
print("y_train type:", type(y_train))
print("X_test  type:", type(X_test_scaled))
print("y_test  type:", type(y_test))
# These two lines previously reused the X_test / y_test labels.
print("X_valid type:", type(X_valid_scaled))
print("y_valid type:", type(y_valid))

# Feature names & name matching.
# The saved matrices are built from X_encoded, so its columns are the feature
# names. column_names holds the pre-encoding names and still includes the
# label columns, so it does not line up with the saved arrays.
feature_names = X_encoded.columns.tolist()
if len(feature_names) != X_train_scaled.shape[1]:
    # SimpleImputer drops columns that are entirely missing at fit time by
    # default, which would shift every index listed below.
    print(f"WARNING: {len(feature_names)} feature names but "
          f"{X_train_scaled.shape[1]} saved columns")

print("\n=== feature index and name matching ===")
for i, feature_name in enumerate(feature_names):
    print(f"Index {i:3d}: {feature_name}")

print("end")