# Network Anomaly Detection - Data Exploration

This notebook helps you explore the CICIDS2017 dataset and understand the data before training models.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Visual theme, held in named constants so the look can be changed in one place
PLOT_STYLE = 'dark_background'   # matplotlib style sheet name
COLOR_PALETTE = 'husl'           # seaborn palette used as the default colour cycle

plt.style.use(PLOT_STYLE)
sns.set_palette(COLOR_PALETTE)

# Printed last, so seeing this message means the imports and style calls above all ran
print('Libraries imported successfully!')

## 1. Load Dataset

First, download CICIDS2017 from: https://www.unb.ca/cic/datasets/ids-2017.html

Place the downloaded CSV files in the `data/raw/` folder.

In [None]:
# Load the dataset through the project's loader module
import sys

# Make the parent directory importable so the `src` package can be found
# (assumes the notebook is run from a sub-folder of the repository -- confirm).
# The membership check keeps sys.path from gaining a duplicate '..' entry
# every time this cell is re-run.
if '..' not in sys.path:
    sys.path.insert(0, '..')

from src.data.loader import load_dataset, get_dataset_info

# load_dataset() is called without arguments, so the loader itself decides
# which file(s) to read -- presumably from data/raw/; confirm in src/data/loader.py.
df = load_dataset()

# Summarise what was loaded. 'attack_types' is read with .get() because it is
# treated as optional here; the other two keys are required.
info = get_dataset_info(df)
print(f"Total records: {info['total_records']:,}")
print(f"Features: {info['features']}")
print(f"Attack types: {info.get('attack_types', 'N/A')}")

In [None]:
# Show the first five rows as a rich table to check column names and sample values
df.head(n=5)

## 2. Label Distribution

In [None]:
# Count how many records carry each label (most frequent first)
label_counts = df['Label'].value_counts()

fig, (bar_ax, pie_ax) = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Left panel: one bar per label, coloured from the 'husl' palette
bar_colors = sns.color_palette('husl', len(label_counts))
label_counts.plot.bar(ax=bar_ax, color=bar_colors)
bar_ax.set_title('Attack Type Distribution', fontsize=14, fontweight='bold')
bar_ax.set_xlabel('Attack Type')
bar_ax.set_ylabel('Count')
bar_ax.tick_params(axis='x', rotation=45)

# Right panel: the five most frequent labels.
# Note: the percentages are shares of these five labels only, not of the full dataset.
pie_colors = ['#00d4ff', '#ef4444', '#f59e0b', '#10b981', '#8b5cf6']
most_common = label_counts.head(5)
pie_ax.pie(
    most_common.values,
    labels=most_common.index,
    autopct='%1.1f%%',
    colors=pie_colors,
)
pie_ax.set_title('Top 5 Categories', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

# Exact counts as text, for labels too small to read off the charts
print("\nLabel counts:")
print(label_counts)

## 3. Feature Statistics

In [None]:
# Pull the project's feature list from the shared config module
from src.utils.config import SELECTED_FEATURES

# Numbered listing: 1-based position, right-aligned to width 2
print("Selected features for detection:")
for position, feature_name in enumerate(SELECTED_FEATURES, start=1):
    print(f"{position:2}. {feature_name}")

In [None]:
# Keep only the selected features that exist as columns in df
# (features missing from the loaded data are skipped silently)
available_features = [name for name in SELECTED_FEATURES if name in df.columns]

# Summary statistics for those columns, shown as the cell's output
df.loc[:, available_features].describe()

## 4. Feature Correlation

In [None]:
# Correlation heatmap
# Treat ±inf as missing, then drop every row with a missing value in any
# selected feature so the correlation is computed on complete rows only.
finite_rows = df[available_features].replace([np.inf, -np.inf], np.nan).dropna()
correlation = finite_rows.corr()

# Explicit figure/axes pair; diverging colour map centred on zero correlation
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation, annot=False, cmap='coolwarm', center=0, ax=heatmap_ax)
heatmap_ax.set_title('Feature Correlation Matrix', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

## 5. Normal vs Attack Traffic Comparison

In [None]:
# Compare normal vs attack traffic
# Work on a reproducible sample (at most 10,000 rows) to keep plotting fast.
df_sample = df.sample(min(10000, len(df)), random_state=42)
# Binary flag: 0 for rows labelled 'BENIGN', 1 for every other label.
# .assign() builds a new frame instead of mutating the sampled one in place.
df_sample = df_sample.assign(is_attack=(df_sample['Label'] != 'BENIGN').astype(int))

# Prefer 'Flow Bytes/s'; otherwise fall back to the first available selected feature
feature_to_plot = 'Flow Bytes/s' if 'Flow Bytes/s' in df_sample.columns else available_features[0]

# Clean once: rows whose feature value is ±inf or NaN are removed here, so the
# histograms, the axis limit and the box plot are all based on the same rows.
# (Previously only the histograms were cleaned; the box plot saw the raw values.)
df_plot = (
    df_sample[[feature_to_plot, 'is_attack']]
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=[feature_to_plot])
)
if df_plot.empty:
    raise ValueError(f"No finite values for '{feature_to_plot}' in the sample; nothing to plot.")

df_normal = df_plot.loc[df_plot['is_attack'] == 0, feature_to_plot]
df_attack = df_plot.loc[df_plot['is_attack'] == 1, feature_to_plot]

# Display window for the histogram: 0 up to the 95th percentile of the cleaned values
upper_limit = np.percentile(df_plot[feature_to_plot], 95)
# Shared bin edges inside that window make the two histograms directly comparable
# and keep all 50 bins visible. If the percentile is not positive there is no
# usable window, so fall back to 50 automatic bins and matplotlib's own limits.
hist_bins = np.linspace(0, upper_limit, 51) if upper_limit > 0 else 50

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Distribution comparison
ax1 = axes[0]
ax1.hist(df_normal, bins=hist_bins, alpha=0.7, label='Normal', color='#00d4ff')
ax1.hist(df_attack, bins=hist_bins, alpha=0.7, label='Attack', color='#ef4444')
ax1.set_title(f'{feature_to_plot} Distribution', fontsize=14, fontweight='bold')
ax1.set_xlabel(feature_to_plot)
ax1.set_ylabel('Frequency')
ax1.legend()
if upper_limit > 0:
    ax1.set_xlim(0, upper_limit)

# Box plot
ax2 = axes[1]
df_plot.boxplot(column=feature_to_plot, by='is_attack', ax=ax2)
ax2.set_title(f'{feature_to_plot} by Traffic Type', fontsize=14, fontweight='bold')
ax2.set_xlabel('Is Attack (0=Normal, 1=Attack)')
# pandas adds a figure-level "grouped by" title for by= box plots; clear it
plt.suptitle('')

plt.tight_layout()
plt.show()

## 6. Train Models

Once you're satisfied with the data exploration, run the cell below to train the models (it calls `src.train.train_all_models` directly from the notebook):

In [None]:
# Train all models
from src.train import train_all_models

# Use sample_size for quick testing, remove for full training
results = train_all_models(sample_size=50000)

# Report the three accuracy values read from the results dict, formatted as
# percentages. The check-mark below replaces a mis-decoded (mojibake) sequence.
print("\n✅ Training complete!")
print(f"Statistical Detector: {results['statistical_accuracy']:.2%}")
print(f"Isolation Forest: {results['isolation_forest_accuracy']:.2%}")
print(f"Attack Classifier: {results['classifier_accuracy']:.2%}")