# In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

from scipy import stats

# Columns a record must have (non-null) to be kept at all.
CRITICAL_COLUMNS = ('timestamp', 'attack_type', 'source_ip', 'destination_ip')


def preprocess(df, critical_columns=CRITICAL_COLUMNS, timestamp_col='timestamp',
               placeholder="unknown", z_threshold=3):
    """Return a cleaned copy of *df*; the input frame is not modified.

    Steps, in order:
      1. Drop exact duplicate rows.
      2. Drop rows with a missing value in any of *critical_columns*, fill
         remaining numeric gaps with the column median and remaining
         non-numeric gaps with *placeholder*.
      3. Parse *timestamp_col* to datetime, drop rows that fail to parse,
         and sort by time.
      4. Drop rows where any numeric column has an absolute z-score of
         *z_threshold* or more.

    Args:
        df: Raw records as a pandas DataFrame.
        critical_columns: Column labels that must be non-null.
        timestamp_col: Label of the column holding the event time.
        placeholder: Fill value for missing non-numeric cells.
        z_threshold: Absolute z-score at or above which a row is an outlier.

    Returns:
        A new DataFrame with the cleaned rows, ordered by timestamp.

    Raises:
        KeyError: If a critical column or *timestamp_col* is absent.
    """
    # 1. Deduplication (returns a new frame, so the caller's data is safe).
    df = df.drop_duplicates()

    # 2. Missing value handling: drop if critical, fill otherwise.
    df = df.dropna(subset=list(critical_columns))

    numeric_cols = df.select_dtypes(include=[np.number]).columns
    # An all-missing column has a NaN median; there is nothing to fill with.
    medians = df[numeric_cols].median().dropna()
    if not medians.empty:
        df = df.fillna(medians)

    # Only non-numeric columns get the text placeholder. Writing a string
    # into a numeric column would turn it into an object column and break
    # the z-score computation below.
    numeric_names = set(numeric_cols)
    text_cols = [col for col in df.columns if col not in numeric_names]
    if text_cols:
        df = df.fillna(dict.fromkeys(text_cols, placeholder))

    # 3. Timestamp alignment: unparseable values become NaT and are removed.
    df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce')
    df = df.dropna(subset=[timestamp_col])
    df = df.sort_values(by=timestamp_col)

    # 4. Anomaly filtering. Numeric columns are re-selected here because the
    # timestamp conversion above may have changed a column's dtype.
    numeric = df.select_dtypes(include=[np.number])
    if not df.empty and numeric.shape[1] > 0:
        values = numeric.to_numpy(dtype=float, na_value=np.nan)
        with np.errstate(invalid='ignore', divide='ignore'):
            z_scores = np.abs(stats.zscore(values))
        # Zero-variance and all-NaN columns yield NaN z-scores. NaN compares
        # False here, so such columns never mark a row as an outlier
        # (testing `z < threshold` instead would discard every row).
        is_outlier = (z_scores >= z_threshold).any(axis=1)
        df = df.loc[~is_outlier]

    return df


if __name__ == "__main__":
    # Load your dataset
    df = pd.read_csv("your_dataset.csv")  # Replace with your dataset path

    df = preprocess(df)

    # Save the cleaned dataset
    df.to_csv("cleaned_dataset.csv", index=False)
    print("Data preprocessing complete. Saved to cleaned_dataset.csv")
