#  Clean and Prepare IoT Malware Dataset

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.exceptions import ConvergenceWarning
import warnings

# Load and Clean Data

In [2]:
file_path = "/Users/ayda/code/aydaafsh/FlowGuard/FlowGuard/raw_data/merged_data.csv"
df = pd.read_csv(file_path, sep='|', low_memory=False, usecols=['label', 'proto', 'service', 'duration', 'orig_bytes', 'resp_bytes', 'conn_state'])

# Drop duplicates and reset index
df_cleaned = df.drop_duplicates().reset_index(drop=True)

# Drop irrelevant columns
# drop_cols = ['uid', 'history', 'tunnel_parents', 'detailed-label', 'local_resp', 'local_orig', 'missed_bytes']
# df_cleaned.drop(columns=drop_cols, inplace=True, errors='ignore')

# Drop rows with missing critical fields
df_cleaned.dropna(subset=['label', 'proto', 'service', 'duration', 'orig_bytes', 'resp_bytes'], inplace=True)

# Clean numeric fields
for col in ['duration', 'orig_bytes', 'resp_bytes']:
    df_cleaned[col] = df_cleaned[col].replace('-', np.nan)
    df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')
    df_cleaned[col].fillna(0, inplace=True)

# Final duplicate check
df_cleaned = df_cleaned.drop_duplicates().reset_index(drop=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned[col].fillna(0, inplace=True)


In [4]:
df.label

0           Malicious   C&C
1           Malicious   C&C
2           Malicious   C&C
3                    Benign
4                    Benign
                 ...       
25010998             Benign
25010999             Benign
25011000             Benign
25011001             Benign
25011002             Benign
Name: label, Length: 25011003, dtype: object

# Check Zero Ratios

In [5]:
columns = list(df_cleaned.columns)
zero_ratios = {col: (df_cleaned[col] == 0).mean() for col in columns}

for col, ratio in zero_ratios.items():
    print(f"{col}: {ratio:.2%} zeros")


proto: 0.00% zeros
service: 0.00% zeros
duration: 0.00% zeros
orig_bytes: 90.97% zeros
resp_bytes: 97.93% zeros
conn_state: 0.00% zeros
label: 0.00% zeros


# Explore Categorical Features

In [6]:
# Number of unique values
categorical_features = ['proto', 'conn_state']
for col in categorical_features:
    print(f"{col}: {df_cleaned[col].nunique()} unique values")

# Percentage distribution
for col in categorical_features:
    print(f"\nValue percentages for '{col}':")
    value_counts = df_cleaned[col].value_counts(normalize=True) * 100
    print(value_counts.round(2).to_string())


proto: 3 unique values
conn_state: 13 unique values

Value percentages for 'proto':
proto
tcp     98.35
udp      1.52
icmp     0.13

Value percentages for 'conn_state':
conn_state
RSTOS0    79.66
S0        10.76
OTH        5.96
SF         1.82
REJ        1.36
S3         0.21
RSTR       0.16
RSTO       0.05
S2         0.01
S1         0.01
RSTRH      0.00
SH         0.00
SHR        0.00


# Encode Labels

In [7]:
df_cleaned['label'] = df_cleaned['label'].apply(lambda x: 0 if x == 'Benign' else 1)

In [8]:
df_cleaned.label.unique()

array([1, 0])

# Train and Split Data

In [9]:
from sklearn.model_selection import train_test_split

X = df_cleaned.drop("label", axis=1)
y = df_cleaned["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

print(f"\nTrain set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print("\nTrain class distribution:\n", y_train.value_counts())
print("\nTest class distribution:\n", y_test.value_counts())


Train set: (787983, 6)
Test set: (337708, 6)

Train class distribution:
 label
1    726193
0     61790
Name: count, dtype: int64

Test class distribution:
 label
1    311226
0     26482
Name: count, dtype: int64


# Build Preprocessing Pipeline

In [10]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Pipeline for numeric features
num_transformer = make_pipeline(
    StandardScaler()
)

# Show the pipeline
num_transformer

# ColumnTransformer → Parallel Processing

In [11]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector

# One-hot encoder for categorical features
cat_transformer = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Parallel processing of features
preprocessor = make_column_transformer(
    (num_transformer, make_column_selector(dtype_include=np.number)),
    (cat_transformer, make_column_selector(dtype_include=['object', 'bool'])),
    remainder='passthrough'  # Keep other columns
)

In [12]:
from sklearn import set_config
set_config(display='diagram')  # Visualize nicely in notebooks

preprocessor

# Transform Train

Best Params: {'C': 1, 'penalty': 'l1', 'solver': 'saga'}

In [13]:
#  Train final model on the full training set using optimized params

# Extract and clean up only relevant hyperparameters
final_params = {
    'penalty':'l1',
    'C':1,
    'solver':'saga',
    'random_state': 42,
    'max_iter': 500,            # Reduce to speed up final training
    'tol': 0.05,                # Relax tolerance further
    'class_weight': 'balanced' # Maintain balance handling
}

# Train the final model
final_lr_model = LogisticRegression(**final_params)

In [14]:
# Add estimator
flowguard_pipeline = make_pipeline(preprocessor, final_lr_model)
flowguard_pipeline

# Model Training: Logistic Regression

In [15]:
# Fit the pipeline
flowguard_pipeline.fit(X_train, y_train)



# Evaluate the Model

In [16]:
from sklearn.metrics import accuracy_score, classification_report

# Predict
y_pred = flowguard_pipeline.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.934683217454132
Classification Report:
               precision    recall  f1-score   support

           0       0.55      1.00      0.71     26482
           1       1.00      0.93      0.96    311226

    accuracy                           0.93    337708
   macro avg       0.77      0.96      0.83    337708
weighted avg       0.96      0.93      0.94    337708



# Save Pipeline

In [17]:
import pickle

# Save your full preprocessing + model pipeline
with open("flowguard_pipeline.pkl", "wb") as file:
    pickle.dump(flowguard_pipeline, file)

# Load it back anytime
# loaded_pipeline = pickle.load(open("flowguard_pipeline.pkl", "rb"))

In [18]:
# Load it back anytime
loaded_pipeline = pickle.load(open("flowguard_pipeline.pkl", "rb"))