In [1]:
# Import required libraries
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Global plot defaults: seaborn's white-grid theme first, then a 10x5 inch
# default figure size for any figure created without an explicit size.
sns.set_theme(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 5)})


In [2]:

# Load dataset
# The CSV location can be overridden through the FYP_DATASET_PATH environment
# variable so the notebook is not tied to one machine's drive layout; when the
# variable is unset the original location is used unchanged.
DATASET_PATH = Path(
    os.environ.get("FYP_DATASET_PATH", "D:/FYP/data/datasets/final_dataset_02.csv")
)
df = pd.read_csv(DATASET_PATH)


In [3]:

# 1️⃣ Basic Info
# Print the (rows, columns) shape and the first rows of the frame, then let
# pandas write its own per-column summary via DataFrame.info().
frame_shape = df.shape
first_rows = df.head()

print(" Dataset Shape:", frame_shape)
print("\n Dataset Preview:\n", first_rows)
print("\n Dataset Info:")
df.info()


 Dataset Shape: (133854, 12)

 Dataset Preview:
    frame.number  _ws.col.protocol  ip.ttl  ip.src  ip.dst  tcp.srcport  \
0             1                 1      48      22      13      52813.0   
1             2                 1      48      22      13      52813.0   
2             3                 2      18      27      15      52813.0   
3             4                 2      18      27      15      52813.0   
4             5                 2      18      27      15      52813.0   

   tcp.dstport  tcp.flags  _ws.col.info  frame.len  \
0        443.0          2          5445        330   
1        443.0          2          5445        330   
2        443.0          2          8142        118   
3        443.0          2          8142        118   
4        443.0          2          8023        117   

   frame.time_delta_displayed  label  
0                    0.000000      0  
1                    0.000013      0  
2                    0.313026      1  
3                    0.00

In [None]:

# 2️⃣ Check for Missing Values
# Count the null entries in every column and print the per-column totals.
null_counts = df.isnull().sum()
print("Missing Values:\n", null_counts)


In [None]:

# 2️⃣ Check for Missing Values
# NOTE(review): this cell performs the same missing-value check as the cell
# directly above it; one of the two can probably be dropped.
missing_by_column = df.isna().sum()
print("Missing Values:\n", missing_by_column)


In [None]:

# 3️⃣ Descriptive Statistics
# Print the summary table produced by DataFrame.describe() with its defaults.
summary_table = df.describe()
print("Statistical Summary:\n", summary_table)


In [None]:

# 4️⃣ Label Distribution
# Bar chart of how many rows carry each value of the 'label' column.
# NOTE(review): seaborn 0.13+ emits a FutureWarning when `palette` is passed
# without `hue` — confirm against the installed seaborn version.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='label', palette='Set2', ax=ax)
ax.set_title("Class Distribution (Normal vs Attack)")
ax.set_xlabel("Label")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()


In [None]:

# 5️⃣ Identify column types
# Split the column names by dtype: object/category columns are treated as
# categorical, int64/float64 columns as numerical.
categorical_cols = list(df.select_dtypes(include=['object', 'category']).columns)
numerical_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)

# The target is plotted separately, so drop it from the categorical features
# when it is present there (numerical_cols is left untouched).
try:
    categorical_cols.remove('label')
except ValueError:
    pass

print(f"\n Categorical Columns: {categorical_cols}")
print(f" Numerical Columns: {numerical_cols}")


In [None]:

# 6️⃣ Categorical Feature Distributions (Pre-Encoding)
# One count plot per categorical column, each in its own 8x4 figure with the
# x tick labels rotated 45 degrees.
for col in categorical_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.countplot(data=df, x=col, palette='Set3', ax=ax)
    ax.set_title(f"Distribution of {col}")
    plt.xticks(rotation=45)
    fig.tight_layout()
    plt.show()


In [None]:

# 7️⃣ Numerical Feature Distributions + Outlier Detection
# Every numerical column gets two figures, shown one after the other:
#   1. a 30-bin histogram with a KDE curve overlaid, and
#   2. a box plot of the same values for spotting outliers.
for col in numerical_cols:
    column_values = df[col]

    hist_fig, hist_ax = plt.subplots(figsize=(10, 4))
    sns.histplot(column_values, bins=30, kde=True, color="steelblue", ax=hist_ax)
    hist_ax.set_title(f"{col} - Distribution")
    hist_ax.set_xlabel(col)
    hist_ax.set_ylabel("Frequency")
    hist_fig.tight_layout()
    plt.show()

    box_fig, box_ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(x=column_values, color="tomato", ax=box_ax)
    box_ax.set_title(f"{col} - Outlier Detection (Boxplot)")
    box_fig.tight_layout()
    plt.show()


In [None]:

# 8️⃣ Correlation Heatmap
# Pairwise correlations between the numerical columns, drawn as an annotated
# heatmap with square cells and two-decimal labels.
corr_matrix = df[numerical_cols].corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True, ax=ax)
ax.set_title("📈 Feature Correlation Heatmap (Numerical Only)")
fig.tight_layout()
plt.show()


In [None]:

# 9️⃣ Pairplot for Visual Patterns (Optional - heavy for large datasets)
# sns.pairplot(df, hue='label', vars=numerical_cols, corner=True, palette='husl')
# plt.suptitle("Pairplot of Numerical Features by Label", y=1.02)
# plt.show()


In [None]:

# 🔟 Encoding Categorical Columns for Post-Encoding EDA (Optional)
# LabelEncoder comes from the notebook's import cell rather than being
# imported mid-notebook.
# Work on a copy so `df` keeps its original values for the other cells.
encoded_df = df.copy()
label_encoders = {}  # column name -> fitted LabelEncoder

# Encode every categorical feature plus the target. Columns that are absent
# from the frame (e.g. a dataset without 'label') are skipped instead of
# raising KeyError, in line with the 'label' presence check performed before
# the final class-distribution plot.
columns_to_encode = [
    name for name in categorical_cols + ['label'] if name in encoded_df.columns
]
for col in columns_to_encode:
    le = LabelEncoder()
    encoded_df[col] = le.fit_transform(encoded_df[col])
    label_encoders[col] = le

print("\n✅ Categorical columns encoded for post-EDA visualizations.")

# 🔁 Post-Encoding Correlation Heatmap
# Correlations are computed over the encoded copy, so the encoded columns
# take part alongside the numerical ones.
plt.figure(figsize=(12, 8))
sns.heatmap(encoded_df.corr(), annot=True, cmap='YlGnBu', fmt='.2f', square=True)
plt.title("🔁 Post-Encoding Feature Correlation Heatmap (All Columns)")
plt.tight_layout()
plt.show()


In [None]:
# NOTE(review): this cell repeats the encoding + heatmap steps of the
# preceding cell; the encoders are refitted on a fresh copy of `df`.
encoded_df = df.copy()
label_encoders = {}

# Fit one encoder per categorical column and for the target, storing each
# fitted encoder under its column name.
for col in [*categorical_cols, 'label']:
    label_encoders[col] = LabelEncoder()
    encoded_df[col] = label_encoders[col].fit_transform(encoded_df[col])

print("\n✅ Categorical columns encoded for post-EDA visualizations.")

# 🔁 Post-Encoding Correlation Heatmap
encoded_corr = encoded_df.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(encoded_corr, annot=True, fmt='.2f', cmap='YlGnBu', square=True, ax=ax)
ax.set_title("🔁 Post-Encoding Feature Correlation Heatmap (All Columns)")
fig.tight_layout()
plt.show()


### 6️⃣ Label Distribution (Class Imbalance Check) ###
# Plot the per-class row counts when the target column exists; otherwise
# print a warning and carry on.
if 'label' not in df.columns:
    print("⚠️ Warning: 'label' column not found in dataset!")
else:
    fig, ax = plt.subplots(figsize=(7, 5))
    sns.countplot(x=df['label'], palette="viridis", ax=ax)
    ax.set_title("Class Distribution - Normal vs. Malicious Traffic")
    ax.set_xlabel("Traffic Type")
    ax.set_ylabel("Count")
    plt.show()

print("✅ Exploratory Data Analysis Completed!")

Saved the notebook on 09/02/2025