## Training a model on a DNS over HTTPS (DoH) dataset to detect malicious network traffic
## Made by: K Raja Sundar (26/11/2025)

### Loading and Analyzing Data

In [2]:
import pandas as pd

# Read the four dataset CSVs and stack them into one frame with a fresh 0..n-1 index.
# NOTE(review): the test-set supports printed by the evaluation cell (DoH = Benign + Malicious)
# suggest l1-doh.csv holds the same flows as the two l2 files under a different label.
# Verify; if so, the four-class target below is contradictory for those rows.
df1, df2, df3, df4 = (
    pd.read_csv(csv_name)
    for csv_name in ("l1-doh.csv", "l1-nondoh.csv", "l2-benign.csv", "l2-malicious.csv")
)
data = pd.concat((df1, df2, df3, df4), ignore_index=True)

# Integer class id for every row; any label missing from the mapping becomes NaN.
data["LabelEncoded"] = data["Label"].map({"DoH": 0, "NonDoH": 1, "Benign": 2, "Malicious": 3})

# Parse timestamps; values that cannot be parsed become NaT instead of raising.
# NOTE(review): the cell output echoes this statement, which looks like a pandas parsing
# warning. Confirm that dayfirst=True really matches the timestamp format in the CSVs.
data["TimeStamp"] = pd.to_datetime(data["TimeStamp"], dayfirst=True, errors="coerce")

  data["TimeStamp"] = pd.to_datetime(data["TimeStamp"], dayfirst=True, errors='coerce')


In [75]:
# (rows, columns) of the combined frame.
data.shape

(1436779, 36)

In [76]:
# Summary statistics, transposed so that each described column becomes a row.
data.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
SourcePort,1436779.0,46790.079847,443.0,41584.0,51288.0,57254.0,65534.0,16145.935903
DestinationPort,1436779.0,4361.509343,443.0,443.0,443.0,443.0,65534.0,13260.887715
TimeStamp,1436779.0,2020-02-01 07:57:49.643911168,2019-12-09 21:53:15,2019-12-19 23:48:36,2020-01-13 20:18:54,2020-03-27 08:49:27,2020-04-01 23:50:38,
Duration,1436779.0,26.601489,0.0,0.129604,4.542526,34.069895,179.021144,39.006605
FlowBytesSent,1436779.0,26362.967547,0.0,305.0,1677.0,2746.0,15975607.0,123752.212098
FlowSentRate,1436779.0,9490.247342,-1.0,67.836913,799.288792,3623.188406,94360360.36036,238367.856082
FlowBytesReceived,1436779.0,53793.862462,0.0,223.0,4699.0,8080.0,52680110.0,330408.905769
FlowReceivedRate,1436779.0,20405.198886,-1.0,147.450338,1633.28466,5534.277032,30444444.444444,171964.801413
PacketLengthVariance,1436779.0,304249.050461,0.0,319.41,126668.501561,282083.65766,81092992.897377,987988.498964
PacketLengthStandardDeviation,1436779.0,359.29634,0.0,17.872045,355.905186,531.115484,9005.16479,418.515606


In [77]:
# All column names of the combined frame.
data.columns

Index(['SourceIP', 'DestinationIP', 'SourcePort', 'DestinationPort',
       'TimeStamp', 'Duration', 'FlowBytesSent', 'FlowSentRate',
       'FlowBytesReceived', 'FlowReceivedRate', 'PacketLengthVariance',
       'PacketLengthStandardDeviation', 'PacketLengthMean',
       'PacketLengthMedian', 'PacketLengthMode', 'PacketLengthSkewFromMedian',
       'PacketLengthSkewFromMode', 'PacketLengthCoefficientofVariation',
       'PacketTimeVariance', 'PacketTimeStandardDeviation', 'PacketTimeMean',
       'PacketTimeMedian', 'PacketTimeMode', 'PacketTimeSkewFromMedian',
       'PacketTimeSkewFromMode', 'PacketTimeCoefficientofVariation',
       'ResponseTimeTimeVariance', 'ResponseTimeTimeStandardDeviation',
       'ResponseTimeTimeMean', 'ResponseTimeTimeMedian',
       'ResponseTimeTimeMode', 'ResponseTimeTimeSkewFromMedian',
       'ResponseTimeTimeSkewFromMode',
       'ResponseTimeTimeCoefficientofVariation', 'Label', 'LabelEncoded'],
      dtype='object')

### Feature Engineering

In [3]:
# Columns fed to the model, in the exact order the model sees them:
# ports and flow volume/rate first, then packet-length statistics, then packet-time statistics.
CORE_FEATURES = (
    [
        "SourcePort",
        "DestinationPort",
        "Duration",
        "FlowBytesSent",
        "FlowSentRate",
        "FlowBytesReceived",
        "FlowReceivedRate",
    ]
    + [
        "PacketLengthMean",
        "PacketLengthVariance",
        "PacketLengthStandardDeviation",
        "PacketLengthCoefficientofVariation",
    ]
    + [
        "PacketTimeMean",
        "PacketTimeVariance",
        "PacketTimeStandardDeviation",
        "PacketTimeCoefficientofVariation",
    ]
)

# Names of the CSV files that make up the dataset.
DATA_FILES = ["l1-doh.csv", "l1-nondoh.csv", "l2-benign.csv", "l2-malicious.csv"]

# Class name -> integer id; ids follow the order in which the names are listed.
LABEL_MAP = {
    class_name: class_id
    for class_id, class_name in enumerate(("DoH", "NonDoH", "Benign", "Malicious"))
}


In [4]:
# Preview five randomly chosen rows; the fixed random_state keeps the sample repeatable.
data.sample(n=5, random_state=42)

Unnamed: 0,SourceIP,DestinationIP,SourcePort,DestinationPort,TimeStamp,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,...,ResponseTimeTimeVariance,ResponseTimeTimeStandardDeviation,ResponseTimeTimeMean,ResponseTimeTimeMedian,ResponseTimeTimeMode,ResponseTimeTimeSkewFromMedian,ResponseTimeTimeSkewFromMode,ResponseTimeTimeCoefficientofVariation,Label,LabelEncoded
166,192.168.20.191,1.1.1.1,59658,443,2020-01-14 00:30:37,90.055391,165,1.832206,198,2.198647,...,1.932667e-09,4.4e-05,0.015218,0.015245,0.015156,-1.842496,1.410306,0.002889,DoH,0
1353219,192.168.20.208,9.9.9.11,35970,443,2020-03-31 00:21:25,33.320979,1875,56.270856,4897,146.964469,...,5.622135e-05,0.007498,0.009254,0.015141,4e-06,-2.35528,1.233688,0.810228,Malicious,3
336748,192.168.20.191,169.61.15.1,60881,443,2020-01-12 09:20:31,0.306559,1825,5953.177039,9844,32111.273849,...,0.0005026722,0.02242,0.049979,0.058749,3.4e-05,-1.173376,2.227678,0.448592,NonDoH,1
850249,192.168.20.191,23.33.241.6,50708,443,2020-01-13 15:51:10,5.84483,217,37.126828,211,36.10028,...,1.7956e-08,0.000134,0.024329,0.024329,0.024195,0.0,1.0,0.005508,NonDoH,1
1185075,176.103.130.130,192.168.20.113,443,41618,2019-12-20 06:48:00,177.001386,315,1.779647,618,3.491498,...,4.666667e-12,2e-06,3.1e-05,3.2e-05,2.8e-05,-1.38873,1.38873,0.069685,Benign,2


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import os

# Build the model inputs: the CORE_FEATURES columns as numbers (X) and the integer target (y).
# Selecting CORE_FEATURES by name already leaves out the label, timestamp and IP columns,
# so no separate drop step (and no extra copy of the whole frame) is needed.
missing = [c for c in CORE_FEATURES if c not in data.columns]
if missing:
    raise ValueError(f"Missing required feature columns: {missing}")

# Anything that cannot be read as a number becomes NaN and is filtered out below.
X = data[CORE_FEATURES].apply(pd.to_numeric, errors="coerce")
y = data["LabelEncoded"].copy()

# Keep only rows whose features are all numeric and whose label was mapped to a class id.
before = len(X)
mask = X.notna().all(axis=1) & y.notna()
X = X[mask]
y = y[mask]
print(f"Filtered out {before - len(X)} rows with NaN values")

# Sanity check: identical feature vectors that appear under more than one label cannot be
# separated by any classifier. Exact duplicates (same features, same label) are removed first,
# so every remaining duplicate on the feature columns is a genuine label conflict.
labelled = X.assign(LabelEncoded=y).drop_duplicates()
n_conflicting = int(labelled.duplicated(subset=CORE_FEATURES, keep=False).sum())
del labelled  # free the temporary copy
if n_conflicting:
    print(
        f"Warning: {n_conflicting} distinct (features, label) combinations share their "
        "feature values with another label"
    )



Filtered out 0 rows with NaN values


In [6]:
# First five rows of the feature matrix.
X.head()

Unnamed: 0,SourcePort,DestinationPort,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,PacketLengthMean,PacketLengthVariance,PacketLengthStandardDeviation,PacketLengthCoefficientofVariation,PacketTimeMean,PacketTimeVariance,PacketTimeStandardDeviation,PacketTimeCoefficientofVariation
0,50749,443,95.08155,62311,655.342703,65358,687.388878,135.673751,7474.676771,86.456213,0.637236,45.065277,670.585814,25.895672,0.574626
1,50749,443,122.309318,93828,767.136973,101232,827.672018,141.245474,10458.118598,102.264943,0.724023,52.287903,708.465878,26.617022,0.509047
2,50749,443,120.958413,38784,320.639127,38236,316.108645,133.715278,7300.293933,85.441758,0.638983,50.316114,1358.911235,36.863413,0.732636
3,50749,443,110.50108,61993,561.017141,69757,631.278898,139.123548,8499.282518,92.191553,0.66266,51.693726,1118.135436,33.438532,0.646859
4,443,50749,54.229891,83641,1542.341289,76804,1416.266907,138.91342,8052.745751,89.737092,0.645993,36.435619,341.696613,18.485038,0.507334


In [82]:
# Column dtypes and non-null counts of the feature matrix.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1436779 entries, 0 to 1436778
Data columns (total 15 columns):
 #   Column                              Non-Null Count    Dtype  
---  ------                              --------------    -----  
 0   SourcePort                          1436779 non-null  int64  
 1   DestinationPort                     1436779 non-null  int64  
 2   Duration                            1436779 non-null  float64
 3   FlowBytesSent                       1436779 non-null  int64  
 4   FlowSentRate                        1436779 non-null  float64
 5   FlowBytesReceived                   1436779 non-null  int64  
 6   FlowReceivedRate                    1436779 non-null  float64
 7   PacketLengthMean                    1436779 non-null  float64
 8   PacketLengthVariance                1436779 non-null  float64
 9   PacketLengthStandardDeviation       1436779 non-null  float64
 10  PacketLengthCoefficientofVariation  1436779 non-null  float64
 11  PacketTimeM

### Outlier Analysis

In [7]:
import numpy as np
# Flag every value lying more than three sample standard deviations from its column mean.
numeric_df = X.select_dtypes(include=["number"])
z_scores = numeric_df.sub(numeric_df.mean()).div(numeric_df.std()).abs()
outlier_mask = z_scores.gt(3)
# Same shape as numeric_df: flagged values are kept, everything else becomes NaN.
outliers = numeric_df.where(outlier_mask)
# Number of flagged values per column.
outlier_counts = outlier_mask.sum()

In [8]:
# Number of |z| > 3 values per feature.
outlier_counts

SourcePort                                0
DestinationPort                       73881
Duration                               3199
FlowBytesSent                         28970
FlowSentRate                           2075
FlowBytesReceived                      8325
FlowReceivedRate                       8794
PacketLengthMean                      29357
PacketLengthVariance                  20701
PacketLengthStandardDeviation         25953
PacketLengthCoefficientofVariation      190
PacketTimeMean                        20766
PacketTimeVariance                    21317
PacketTimeStandardDeviation            9445
PacketTimeCoefficientofVariation      19444
dtype: int64

SourcePort                                0
DestinationPort                       73881
Duration                               3199
FlowBytesSent                         28970
FlowSentRate                           2075
FlowBytesReceived                      8325
FlowReceivedRate                       8794
PacketLengthMean                      29357
PacketLengthVariance                  20701
PacketLengthStandardDeviation         25953
PacketLengthCoefficientofVariation      190
PacketTimeMean                        20766
PacketTimeVariance                    21317
PacketTimeStandardDeviation            9445
PacketTimeCoefficientofVariation      19444

In [9]:
import numpy as np

def cap_outliers(df, lower=0.01, upper=0.99):
    """Return a copy of ``df`` with each column clipped to its own quantile range.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are capped. The input itself is left untouched.
    lower, upper : float
        Quantile levels (defaults 0.01 and 0.99). For every column, values below the
        ``lower`` quantile are raised to it and values above the ``upper`` quantile
        are lowered to it.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    capped = df.copy()
    for name in capped.columns:
        column = capped[name]
        # Both bounds are taken from the column before it is overwritten.
        capped[name] = column.clip(
            lower=column.quantile(lower),
            upper=column.quantile(upper),
        )
    return capped

# Cap every feature with the function's default quantile levels (0.01 and 0.99).
X_capped = cap_outliers(X)
X_capped.head()

Unnamed: 0,SourcePort,DestinationPort,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,PacketLengthMean,PacketLengthVariance,PacketLengthStandardDeviation,PacketLengthCoefficientofVariation,PacketTimeMean,PacketTimeVariance,PacketTimeStandardDeviation,PacketTimeCoefficientofVariation
0,50749,443,95.08155,62311,655.342703,65358,687.388878,135.673751,7474.676771,86.456213,0.637236,45.065277,670.585814,25.895672,0.574626
1,50749,443,121.99283,93828,767.136973,101232,827.672018,141.245474,10458.118598,102.264943,0.724023,52.287903,708.465878,26.617022,0.509047
2,50749,443,120.958413,38784,320.639127,38236,316.108645,133.715278,7300.293933,85.441758,0.638983,50.316114,1358.911235,36.863413,0.732636
3,50749,443,110.50108,61993,561.017141,69757,631.278898,139.123548,8499.282518,92.191553,0.66266,51.693726,1118.135436,33.438532,0.646859
4,443,50749,54.229891,83641,1542.341289,76804,1416.266907,138.91342,8052.745751,89.737092,0.645993,36.435619,341.696613,18.485038,0.507334


In [10]:
# From here on the capped frame is the working feature matrix.
X = X_capped

# Recount |z| > 3 values on the capped data. Capping also shrinks each column's standard
# deviation, so the three-sigma band narrows and the counts can rise rather than fall.
numeric_df = X.select_dtypes(include=["number"])
z_scores = numeric_df.sub(numeric_df.mean()).div(numeric_df.std()).abs()
outlier_mask = z_scores.gt(3)
# Flagged values are kept, everything else becomes NaN.
outliers = numeric_df.where(outlier_mask)
outlier_counts = outlier_mask.sum()
outlier_counts

SourcePort                                0
DestinationPort                       76523
Duration                                  0
FlowBytesSent                         45359
FlowSentRate                          39391
FlowBytesReceived                     44033
FlowReceivedRate                      28882
PacketLengthMean                      43178
PacketLengthVariance                  32046
PacketLengthStandardDeviation         30186
PacketLengthCoefficientofVariation        0
PacketTimeMean                        22636
PacketTimeVariance                    29883
PacketTimeStandardDeviation               0
PacketTimeCoefficientofVariation      22287
dtype: int64

### Scaling Numerical Features

In [11]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature to zero mean and unit variance.
#
# The scaler is fitted exactly once, on the capped but still unscaled frame, so `scaler`
# keeps the statistics of the real feature values. Fitting it a second time on data that
# is already standardised would overwrite them with a mean of ~0 and a scale of ~1.
# The result goes into a new frame rather than being written back column by column, so
# X_capped stays unscaled and re-running this cell always gives the same X.
#
# NOTE(review): the quantile caps and the scaler are both computed on all rows before the
# train/test split further down, so test rows influence the preprocessing statistics.
# Fit them on the training split only if a strictly held-out evaluation is required.
num_cols = X_capped.columns

scaler = StandardScaler()
X = pd.DataFrame(
    scaler.fit_transform(X_capped[num_cols]),
    columns=num_cols,
    index=X_capped.index,
)

In [12]:
# First five rows of the feature matrix after standardisation.
X.head()

Unnamed: 0,SourcePort,DestinationPort,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,PacketLengthMean,PacketLengthVariance,PacketLengthStandardDeviation,PacketLengthCoefficientofVariation,PacketTimeMean,PacketTimeVariance,PacketTimeStandardDeviation,PacketTimeCoefficientofVariation
0,0.245402,-0.29591,1.780927,0.451039,-0.508163,0.18414,-0.297785,-0.511576,-0.434552,-0.70409,-0.542547,1.866613,1.232077,1.539735,-0.815492
1,0.245402,-0.29591,2.479223,0.814505,-0.484621,0.465549,-0.294351,-0.489125,-0.429556,-0.662245,-0.427174,2.25852,1.331172,1.603107,-0.934221
2,0.245402,-0.29591,2.452382,0.179716,-0.578646,-0.028615,-0.306872,-0.519468,-0.434844,-0.706775,-0.540225,2.151529,3.032745,2.503263,-0.529417
3,0.245402,-0.29591,2.181034,0.447371,-0.528026,0.218647,-0.299158,-0.497675,-0.432836,-0.688909,-0.50875,2.226279,2.402873,2.202384,-0.684716
4,-2.870878,3.524133,0.720904,0.697024,-0.321376,0.273927,-0.279944,-0.498522,-0.433584,-0.695406,-0.530906,1.398359,0.371698,0.888703,-0.937323


### Model Training and Evaluation

In [15]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class proportions the
# same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:

# Baseline random forest: 300 trees, depth capped at 18, trained on all available cores
# (n_jobs=-1) with a fixed seed so repeated runs build the same forest.
rf_params = dict(n_estimators=300, max_depth=18, n_jobs=-1, random_state=42)
model = RandomForestClassifier(**rf_params)

print("Training model...")
model.fit(X_train, y_train)

In [93]:
print("Evaluating on test set...")
y_pred = model.predict(X_test)

# Class ids in ascending order, with their names in the same order. Passing the ids as
# `labels` pins the report rows and the confusion-matrix rows/columns to LABEL_MAP, so the
# names stay aligned even if a class happens to be absent from y_test or y_pred.
inv_label_map = {v: k for k, v in LABEL_MAP.items()}
class_ids = sorted(inv_label_map)
target_names = [inv_label_map[i] for i in class_ids]

print(classification_report(y_test, y_pred, labels=class_ids, target_names=target_names))
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred, labels=class_ids))

# NOTE(review): in the recorded run the DoH support (80893) equals Benign (5942) plus
# Malicious (74951), and DoH rows are mostly predicted as Malicious and vice versa. That
# pattern suggests the DoH file contains the same flows as the Benign/Malicious files under
# a second label. Verify this before tuning; if it holds, no model can separate those classes.

# NOTE(review): the model was trained on capped and standardised features, but neither the
# clipping thresholds nor the fitted scaler is stored in this bundle, so the preprocessing
# cannot be reproduced from the saved file alone.
bundle = {
    "model": model,
    "label_map": LABEL_MAP,
    "core_features": CORE_FEATURES,
}
joblib.dump(bundle, "doh_optimized_model.joblib")
print("Saved model to doh_optimized_model.joblib")

Evaluating on test set...
              precision    recall  f1-score   support

         DoH       0.32      0.32      0.32     80893
      NonDoH       1.00      1.00      1.00    269248
      Benign       0.26      0.26      0.26      5942
   Malicious       0.31      0.30      0.30     74951

    accuracy                           0.74    431034
   macro avg       0.47      0.47      0.47    431034
weighted avg       0.74      0.74      0.74    431034

Confusion matrix:
[[ 26146    641   4378  49728]
 [   201 268937     68     42]
 [  3870    506   1565      1]
 [ 52419    126      0  22406]]
Saved model to doh_optimized_model.joblib


### Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Candidate values sampled by the randomized search.
param_dist = {
    "n_estimators": [200, 300, 400, 600],              # number of trees
    "max_depth": [None, 10, 14, 18, 22],               # None = grow until leaves are pure
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", 0.3, 0.5, None],  # floats are fractions; None = all
    "class_weight": [None, "balanced", "balanced_subsample"],
}

# 30 sampled configurations x 3 CV folds = 90 forest fits, ranked by macro-averaged F1 so
# that every class counts equally regardless of its size.
# The estimator gets its own fixed seed: `random_state` on the search only fixes which
# parameter combinations are drawn, while an unseeded forest would still give different
# CV scores (and possibly a different winner) on every run.
search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=30,
    scoring="f1_macro",
    n_jobs=-1,
    cv=3,
    verbose=2,
    random_state=42
)

search.fit(X_train, y_train)

print("Best macro F1 on CV:", search.best_score_)
print("Best params:")
for k, v in search.best_params_.items():
    print(f"  {k}: {v}")


Fitting 3 folds for each of 30 candidates, totalling 90 fits
