This notebook (data_preparation.ipynb) reads the CSV from data/processed, cleans and encodes it, and saves the final dataset to data/datasets.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Input / output locations
RAW_CSV_PATH = "D:/FYP/data/processed/traffic_data_03.csv"
PROCESSED_CSV_PATH = "D:/FYP/data/datasets/final_dataset.csv"

# Plot styling used by any seaborn figures drawn later in the notebook
sns.set(style="whitegrid")

# Read the raw CSV and report its size
df = pd.read_csv(RAW_CSV_PATH)
n_rows, n_cols = df.shape
print(f"Dataset contains {n_rows} rows and {n_cols} columns before processing.\n")

# Preview the first five rows (`display` is provided by the notebook runtime)
display(df.head(5))

In [None]:
# Step 1: Handle Missing Values
print("Checking missing values before processing:")
print(df.isnull().sum())

# Fill missing values.
# The filled Series is assigned back to the frame rather than calling
# `df[col].fillna(..., inplace=True)`: that form operates on a column
# selection, which raises a FutureWarning in pandas 2.x and does not modify
# `df` at all once Copy-on-Write is enabled.
for col in df.select_dtypes(include=['object']).columns:
    modes = df[col].mode()  # NaN is excluded, so an all-NaN column yields no mode
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])  # Fill categorical with mode
for col in df.select_dtypes(include=['number']).columns:
    df[col] = df[col].fillna(df[col].median())  # Fill numeric with median

print("\nMissing values after handling:")
print(df.isnull().sum())

In [None]:
# Step 2: Remove Duplicates
# Drop fully identical rows in place, keeping the first occurrence of each.
df.drop_duplicates(keep='first', inplace=True)
remaining_rows = len(df)
print(f"\nDataset now contains {remaining_rows} rows after removing duplicates.\n")

In [None]:
# Step 3: Convert Timestamp Column
# Parse the timestamps once, then expand them into time-of-day features.
# Only hour, minute and second are kept; the date part is discarded.
parsed_ts = pd.to_datetime(df['timestamp'])
for part in ('hour', 'minute', 'second'):
    df[part] = getattr(parsed_ts.dt, part)
del df['timestamp']  # Remove the original timestamp column

In [None]:
# Step 4: Encode Categorical Features
# Fit a separate LabelEncoder per column and keep each one. Refitting a single
# encoder overwrites its `classes_`, so the mappings for earlier columns would
# be lost and could not be inverse-transformed or reapplied to new data.
categorical_columns = ['protocol', 'source_ip', 'destination_ip']
label_encoders = {}
for col in categorical_columns:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

# Backward compatibility: `label_encoder` previously ended up fitted on
# 'destination_ip'; keep that name bound to the same encoder.
label_encoder = label_encoders['destination_ip']

In [None]:
# Step 5: Normalize Numerical Features
# Rescale the listed columns with MinMaxScaler, fitted on this dataset.
numeric_columns = ['packet_size', 'time_interval']  # Extend if more numeric columns exist
scaler = MinMaxScaler()
scaler.fit(df[numeric_columns])
df[numeric_columns] = scaler.transform(df[numeric_columns])

In [None]:
# Step 6: Convert Attack Type to Binary Classification (0 = Normal, 1 = Malicious)
attack_types = {
    'normal': 0,
    'attack': 1  # will Modify if there are multiple attack categories
}
labels = df['attack_category'].map(attack_types)

# `Series.map` yields NaN for any category missing from `attack_types`.
# Fail loudly instead of silently writing unlabeled rows to the final dataset.
unmapped_rows = labels.isna()
if unmapped_rows.any():
    unknown_categories = sorted(
        df.loc[unmapped_rows, 'attack_category'].astype(str).unique()
    )
    raise ValueError(
        "attack_category contains values with no entry in attack_types: "
        f"{unknown_categories}"
    )

df['label'] = labels.astype('int64')  # every row is mapped, so labels are integral
df.drop(columns=['attack_category'], inplace=True)

In [None]:
# Step 7: Save Processed Data
from pathlib import Path

# `DataFrame.to_csv` does not create missing parent directories, so make sure
# the output folder exists before writing.
Path(PROCESSED_CSV_PATH).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(PROCESSED_CSV_PATH, index=False)
print(f"✅ Processed dataset saved to {PROCESSED_CSV_PATH}")