In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

In [18]:
# Read the synthetic dataset from a CSV file in the working directory
dataset_path = 'shuffled_synthetic_dataset.csv'
df = pd.read_csv(dataset_path)

In [19]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,src_ip,dst_ip,src_port,dst_port,length,label,flags_A,flags_ACK,flags_Echo Request,flags_FA,flags_FIN,flags_PA,flags_S,flags_SA,flags_SYN,protocol_ICMP,protocol_TCP,protocol_UDP
0,119.16.25.228,246.9.118.118,59268,140,208,normal,False,False,False,False,False,False,False,False,False,False,False,True
1,185.41.49.226,124.172.89.72,39983,348,532,normal,False,False,False,False,False,False,False,False,False,False,False,True
2,133.73.27.79,115.252.170.34,0,0,1436,normal,False,False,True,False,False,False,False,False,False,True,False,False
3,20.40.28.47,133.194.113.228,10459,644,1092,normal,False,False,False,False,False,False,False,False,False,False,False,True
4,103.61.224.190,9.40.10.41,0,0,637,malicious,False,False,True,False,False,False,False,False,False,True,False,False


In [20]:
# Preview the last five rows of the loaded frame
df.tail(n=5)

Unnamed: 0,src_ip,dst_ip,src_port,dst_port,length,label,flags_A,flags_ACK,flags_Echo Request,flags_FA,flags_FIN,flags_PA,flags_S,flags_SA,flags_SYN,protocol_ICMP,protocol_TCP,protocol_UDP
3995,58.244.254.90,187.118.170.24,21022,44889,1438,normal,False,False,False,False,False,False,False,False,True,True,False,False
3996,103.61.224.190,212.192.3.82,0,0,810,malicious,False,False,True,False,False,False,False,False,False,True,False,False
3997,97.4.7.70,9.68.82.50,43128,80,42,normal,False,False,False,False,False,False,False,False,True,False,True,False
3998,157.191.116.226,186.146.192.179,31917,28343,260,normal,False,False,False,False,False,False,False,False,False,False,True,False
3999,121.178.164.179,132.209.133.150,53419,18085,49,normal,True,False,False,False,False,False,False,False,False,False,False,True


In [21]:
# Dimensions of the loaded frame as a (rows, columns) tuple
df.shape

(4000, 18)

In [22]:
# Count the missing values in each column
df.isna().sum()

src_ip                0
dst_ip                0
src_port              0
dst_port              0
length                0
label                 0
flags_A               0
flags_ACK             0
flags_Echo Request    0
flags_FA              0
flags_FIN             0
flags_PA              0
flags_S               0
flags_SA              0
flags_SYN             0
protocol_ICMP         0
protocol_TCP          0
protocol_UDP          0
dtype: int64

In [23]:
# Data preprocessing
# Remove the raw IP address columns; they are not part of the feature list
# used for modelling. errors='ignore' keeps this cell safe to re-run: without
# it, a second execution raises KeyError because the columns are already gone.
df = df.drop(columns=['src_ip', 'dst_ip'], errors='ignore')

In [24]:
# Encode the target as integers: malicious -> 1, normal -> 0.
# The dtype guard makes the cell idempotent: calling .map() again on the
# already-encoded column would silently turn every label into NaN.
label_codes = {'malicious': 1, 'normal': 0}
if not pd.api.types.is_numeric_dtype(df['label']):
    encoded_label = df['label'].map(label_codes)
    unmapped = encoded_label.isna()
    if unmapped.any():
        # Fail here with a clear message instead of passing NaN targets on
        # to the model-fitting cells.
        raise ValueError(
            f"Unexpected label values: {df.loc[unmapped, 'label'].unique().tolist()}"
        )
    df['label'] = encoded_label

In [25]:
# Model inputs, grouped by the kind of column they come from, and the target.
port_and_size_columns = ['src_port', 'dst_port', 'length']
flag_columns = [
    'flags_A',
    'flags_ACK',
    'flags_Echo Request',
    'flags_FA',
    'flags_FIN',
    'flags_PA',
    'flags_S',
    'flags_SA',
    'flags_SYN',
]
protocol_columns = ['protocol_ICMP', 'protocol_TCP', 'protocol_UDP']

# Column order is preserved: ports/length, then flags, then protocols.
features = [*port_and_size_columns, *flag_columns, *protocol_columns]
target = 'label'

In [26]:
# Separate the feature matrix from the target column
X, y = df[features], df[target]

In [27]:
# One-hot encode any categorical feature columns.
# NOTE(review): the selected features look numeric/boolean already in the
# df.head() preview, so this is presumably a no-op here — confirm.
X = pd.get_dummies(data=X)

In [28]:
# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [29]:
# Train both classifiers on the same training split.

# Random Forest: a fixed seed makes the fitted forest (and therefore the
# reported metrics) reproducible, consistent with the seeded train/test split.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Logistic Regression: the features are not scaled (ports and lengths span
# very different ranges from the 0/1 indicators), so the default iteration
# budget may be too small for the solver to converge; allow more iterations.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

In [30]:
# Generate test-set predictions from each fitted model
# (Random Forest first, then Logistic Regression)
y_pred, y_pred_lr = (model.predict(X_test) for model in (clf, lr))

In [31]:
# Evaluate both classifiers on the held-out test set.
# Each report is preceded by a header naming the model, so the two otherwise
# identical "Accuracy" / "Classification Report" sections can be told apart.
for model_name, predictions in (
    ("Random Forest", y_pred),
    ("Logistic Regression", y_pred_lr),
):
    print(f"=== {model_name} ===")
    print("Accuracy:", accuracy_score(y_test, predictions))
    print("\nClassification Report:\n", classification_report(y_test, predictions))

Accuracy: 0.98125

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99       623
           1       0.93      0.99      0.96       177

    accuracy                           0.98       800
   macro avg       0.96      0.99      0.97       800
weighted avg       0.98      0.98      0.98       800

Accuracy: 0.98

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.99       623
           1       0.92      1.00      0.96       177

    accuracy                           0.98       800
   macro avg       0.96      0.99      0.97       800
weighted avg       0.98      0.98      0.98       800



In [32]:
# EDA
# The plotting libraries must be installed in the kernel's environment
# (e.g. `%pip install matplotlib seaborn`) for this cell to run.

import matplotlib.pyplot as plt
import seaborn as sns


# Count plot: how many packets are ICMP vs. not
fig, ax = plt.subplots()
sns.countplot(data=df, x='protocol_ICMP', ax=ax)
ax.set_title('Packet count by protocol_ICMP')
plt.show()

# Pie chart: class balance.
# Slice labels are derived from the value_counts() index rather than
# hard-coded, so they stay correct whichever class is more frequent and
# whether or not the label column has been integer-encoded yet.
label_counts = df['label'].value_counts()
label_names = {0: 'normal', 1: 'malicious'}
fig, ax = plt.subplots()
ax.pie(
    label_counts,
    autopct="%0.2f%%",
    labels=[label_names.get(value, value) for value in label_counts.index],
)
ax.set_title('Label distribution')
plt.show()

# Heatmap: pairwise correlations.
# Restrict to numeric/boolean columns so any remaining text column cannot
# break the correlation computation.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    df.select_dtypes(include=['number', 'bool']).corr(),
    annot=True,
    cmap='coolwarm',
    ax=ax,
)
ax.set_title('Feature correlation')
plt.show()

# Histogram: packet length distribution with a KDE overlay
fig, ax = plt.subplots()
sns.histplot(data=df, x='length', kde=True, ax=ax)
ax.set_title('Packet length distribution')
plt.show()

# Pair plot: limited to the continuous columns and coloured by label.
# Plotting every column would produce a 16x16 grid dominated by one-hot
# indicators, which is slow to render and hard to read.
sns.pairplot(
    df[['src_port', 'dst_port', 'length', 'label']],
    hue='label',
    diag_kind='hist',
)
plt.show()


ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# Persist the fitted Random Forest to disk with joblib
model_path = 'random_forest_model.pkl'
joblib.dump(clf, model_path)

['random_forest_model.pkl']