In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from pytorch_tabnet.tab_model import TabNetClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
df = pd.read_csv("WSdata.csv")
df.head()

Unnamed: 0,No.,Time,Source,Destination,Protocol,Length,Info
0,1,0.0,192.167.7.43,192.167.7.45,ICMP,98,"Echo (ping) request id=0x35a7, seq=119/30464,..."
1,2,0.160909,192.167.7.43,192.167.7.45,UDP,162,8130 > 12345 Len=120
2,3,1.024022,192.167.7.43,192.167.7.45,ICMP,98,"Echo (ping) request id=0x35a7, seq=120/30720,..."
3,4,1.161161,192.167.7.43,192.167.7.45,UDP,162,8131 > 12345 Len=120
4,5,2.048007,192.167.7.43,192.167.7.45,ICMP,98,"Echo (ping) request id=0x35a7, seq=121/30976,..."


In [3]:
# Drop the "No" column
df = df.drop(columns=["No."], errors="ignore")
df.head()

Unnamed: 0,Time,Source,Destination,Protocol,Length,Info
0,0.0,192.167.7.43,192.167.7.45,ICMP,98,"Echo (ping) request id=0x35a7, seq=119/30464,..."
1,0.160909,192.167.7.43,192.167.7.45,UDP,162,8130 > 12345 Len=120
2,1.024022,192.167.7.43,192.167.7.45,ICMP,98,"Echo (ping) request id=0x35a7, seq=120/30720,..."
3,1.161161,192.167.7.43,192.167.7.45,UDP,162,8131 > 12345 Len=120
4,2.048007,192.167.7.43,192.167.7.45,ICMP,98,"Echo (ping) request id=0x35a7, seq=121/30976,..."


In [4]:
# Encode categorical features
protocol_encoder = LabelEncoder()
df['Protocol_Encoded'] = protocol_encoder.fit_transform(df['Protocol'])

# Normalize numerical features
scaler = MinMaxScaler()
df[['Time_Normalized', 'Length_Normalized']] = scaler.fit_transform(df[['Time', 'Length']])

df.head()

Unnamed: 0,Time,Source,Destination,Protocol,Length,Info,Protocol_Encoded,Time_Normalized,Length_Normalized
0,0.0,192.167.7.43,192.167.7.45,ICMP,98,"Echo (ping) request id=0x35a7, seq=119/30464,...",4,0.0,0.019746
1,0.160909,192.167.7.43,192.167.7.45,UDP,162,8130 > 12345 Len=120,9,0.000555,0.042313
2,1.024022,192.167.7.43,192.167.7.45,ICMP,98,"Echo (ping) request id=0x35a7, seq=120/30720,...",4,0.003534,0.019746
3,1.161161,192.167.7.43,192.167.7.45,UDP,162,8131 > 12345 Len=120,9,0.004007,0.042313
4,2.048007,192.167.7.43,192.167.7.45,ICMP,98,"Echo (ping) request id=0x35a7, seq=121/30976,...",4,0.007067,0.019746


In [5]:
import joblib
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(protocol_encoder, 'protocol_encoder.pkl')

['protocol_encoder.pkl']

In [6]:
# Define: heuristic rules for labeling the traffic data
def label_traffic(row):
    """
    Assign '1' (malicious)
    Assign '0' (normal)
    """
    malicious_protocols = {'ICMP', 'SSH', 'MDNS', 'Telenet', 'TLSv1.2'} 
    malicious_keywords = {'cipher', 'syn', 'fin', 'psh', }

    # Check: protocol is suspicious
    if row['Protocol'] in malicious_protocols:
        return 1

    # Check: malicious keywords
    if any(keyword in str(row['Info']).lower() for keyword in malicious_keywords):
        return 1

    # otherwise, normal
    return 0

# Apply labeling function to dataset
df['Label'] = df.apply(label_traffic, axis=1)
df.head()

Unnamed: 0,Time,Source,Destination,Protocol,Length,Info,Protocol_Encoded,Time_Normalized,Length_Normalized,Label
0,0.0,192.167.7.43,192.167.7.45,ICMP,98,"Echo (ping) request id=0x35a7, seq=119/30464,...",4,0.0,0.019746,1
1,0.160909,192.167.7.43,192.167.7.45,UDP,162,8130 > 12345 Len=120,9,0.000555,0.042313,0
2,1.024022,192.167.7.43,192.167.7.45,ICMP,98,"Echo (ping) request id=0x35a7, seq=120/30720,...",4,0.003534,0.019746,1
3,1.161161,192.167.7.43,192.167.7.45,UDP,162,8131 > 12345 Len=120,9,0.004007,0.042313,0
4,2.048007,192.167.7.43,192.167.7.45,ICMP,98,"Echo (ping) request id=0x35a7, seq=121/30976,...",4,0.007067,0.019746,1


In [7]:
print(df['Label'].value_counts())

Label
0    16575
1     2805
Name: count, dtype: int64


In [8]:
# Select features and labels
features = ['Protocol_Encoded', 'Time_Normalized', 'Length_Normalized']
X = df[features]
y = df['Label']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
models = {
    "logistic regression": LogisticRegression(),
    "CatBoost": CatBoostClassifier(verbose=0),
    "TabNet": TabNetClassifier(),
    "LightGBM": LGBMClassifier()
}



In [11]:
# Train and evaluate each model
for name, model in models.items():
    # Convert DataFrames to NumPy arrays for models that require it
    model.fit(X_train.values, y_train.values)
    
    y_pred = model.predict(X_test.values)
    
    # Print model performance with better readability
    print("="*40)  # Adds a separator line
    print(f"{name} Model Performance:\n")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1-score:", f1_score(y_test, y_pred))
    print("ROC AUC:", roc_auc_score(y_test, y_pred))
    print(" "*40)  # Adds another separator for clarity

logistic regression Model Performance:

Accuracy: 0.8412452700378397
Precision: 0.0
Recall: 0.0
F1-score: 0.0
ROC AUC: 0.4908671216378964
                                        
CatBoost Model Performance:

Accuracy: 0.9950120399036808
Precision: 1.0
Recall: 0.9651442307692307
F1-score: 0.982262996941896
ROC AUC: 0.9825721153846154
                                        




epoch 0  | loss: 0.29061 |  0:00:21s
epoch 1  | loss: 0.0778  |  0:00:35s
epoch 2  | loss: 0.05238 |  0:00:42s
epoch 3  | loss: 0.04602 |  0:00:49s
epoch 4  | loss: 0.04507 |  0:00:56s
epoch 5  | loss: 0.04611 |  0:01:14s
epoch 6  | loss: 0.03921 |  0:01:33s
epoch 7  | loss: 0.03957 |  0:01:39s
epoch 8  | loss: 0.03817 |  0:01:47s
epoch 9  | loss: 0.03616 |  0:01:54s
epoch 10 | loss: 0.03356 |  0:01:59s
epoch 11 | loss: 0.03402 |  0:02:07s
epoch 12 | loss: 0.03661 |  0:02:22s
epoch 13 | loss: 0.03531 |  0:02:39s
epoch 14 | loss: 0.03604 |  0:03:07s
epoch 15 | loss: 0.03446 |  0:03:21s
epoch 16 | loss: 0.03698 |  0:03:27s
epoch 17 | loss: 0.03467 |  0:03:51s
epoch 18 | loss: 0.03376 |  0:03:55s
epoch 19 | loss: 0.03158 |  0:03:59s
epoch 20 | loss: 0.03172 |  0:04:04s
epoch 21 | loss: 0.03015 |  0:04:15s
epoch 22 | loss: 0.0298  |  0:04:31s
epoch 23 | loss: 0.03097 |  0:04:38s
epoch 24 | loss: 0.034   |  0:04:45s
epoch 25 | loss: 0.03421 |  0:04:54s
epoch 26 | loss: 0.03116 |  0:04:59s
e

In [12]:
import joblib

# Save CatBoost
joblib.dump(models["CatBoost"], "catboost_model.pkl")

# Save LightGBM
joblib.dump(models["LightGBM"], "lightgbm_model.pkl")

['lightgbm_model.pkl']