# 📊 Network Traffic - Exploratory Data Analysis

This notebook explores the CICIDS2017 dataset for network anomaly detection.

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the dark-grid plotting style to every figure created below.
# NOTE(review): the 'seaborn-v0_8-' style prefix presumably requires
# matplotlib >= 3.6 — confirm against the project's pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
# Show up to 50 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)

## 1. Load Data

In [None]:
# Load the raw traffic CSV: prefer the small sample file, then fall back to the
# CICIDS2017 Monday capture. `df` is left as None when neither file exists so
# that every downstream cell can skip itself with `if df is not None`.
data_path = Path('../data/raw')

# (file name, message printed on success), in priority order.
candidate_files = [
    ('sample_data.csv', 'Loaded sample data'),
    ('Monday-WorkingHours.pcap_ISCX.csv', 'Loaded CICIDS2017 Monday file'),
]

df = None
for file_name, loaded_message in candidate_files:
    csv_path = data_path / file_name
    if csv_path.exists():
        df = pd.read_csv(csv_path)
        # The published CICIDS2017 CSVs have leading spaces in their header
        # names (e.g. ' Label', ' Flow Duration'). Without trimming them, the
        # later `'Label' in df.columns` checks and feature lookups never match.
        df.columns = df.columns.str.strip()
        print(loaded_message)
        break
else:
    # Loop finished without `break`: no candidate file was found.
    print('No data found! Run: python scripts/download_data.py')

In [None]:
# First look at the loaded frame: dimensions, column count, and leading rows.
if df is not None:
    print(f'Shape: {df.shape}')
    print(f'\nColumns: {len(df.columns)}')
    # Jupyter only auto-renders a bare expression when it is the last
    # top-level statement of a cell. Nested inside `if`, the value is silently
    # discarded, so display() (provided by the IPython kernel) is called
    # explicitly.
    display(df.head())

## 2. Basic Statistics

In [None]:
# Summary statistics as produced by DataFrame.describe().
if df is not None:
    # A bare `df.describe()` nested inside `if` is never rendered by Jupyter
    # (only a cell's last top-level expression is), so the table is shown via
    # display(), which the IPython kernel provides.
    display(df.describe())

In [None]:
# Count null entries per column and list only the columns that have any.
if df is not None:
    missing = df.isna().sum()
    print('Missing values per column:')
    print(missing.loc[missing.gt(0)])

## 3. Label Distribution

In [None]:
# Clean the class labels and tabulate how often each one occurs.
if df is not None and 'Label' in df.columns:
    # Trim stray leading/trailing whitespace so equal labels are counted together.
    df['Label'] = df['Label'].str.strip()

    print('Label distribution:')
    label_counts = df['Label'].value_counts()
    # One call emits the counts followed by the blank-line-separated total.
    print(label_counts, f'\nTotal labels: {len(label_counts)}', sep='\n')

In [None]:
# Visualise the label counts twice: absolute counts (bar) and shares (pie).
if df is not None and 'Label' in df.columns:
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
    bar_ax, pie_ax = axes

    # Left panel: absolute count per label.
    label_counts.plot.bar(ax=bar_ax, color='steelblue')
    bar_ax.set(title='Attack Type Distribution', xlabel='Attack Type', ylabel='Count')
    bar_ax.tick_params(axis='x', rotation=45)

    # Right panel: share of each label; the default y-label is blanked out.
    label_counts.plot.pie(ax=pie_ax, autopct='%1.1f%%')
    pie_ax.set(title='Attack Type Proportion', ylabel='')

    plt.tight_layout()
    plt.show()

## 4. Feature Distributions

In [None]:
# Histogram grid for the first (up to) ten numeric columns.
if df is not None:
    # First ten numeric columns in positional order.
    numeric_cols = df.select_dtypes(include=[np.number]).columns[:10]

    fig, axes = plt.subplots(2, 5, figsize=(16, 8))
    axes = axes.flatten()

    for ax, col in zip(axes, numeric_cols):
        # Histogram binning needs a finite value range: +/-inf entries would
        # make hist() raise, so they are dropped together with NaN.
        finite_values = df[col].replace([np.inf, -np.inf], np.nan).dropna()
        ax.hist(finite_values, bins=50, color='steelblue', alpha=0.7)
        # str() keeps the title working for non-string column names.
        ax.set_title(str(col)[:20])
        ax.set_xlabel('')

    # With fewer than ten numeric columns, hide the leftover empty panels.
    for unused_ax in axes[len(numeric_cols):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## 5. Correlation Analysis

In [None]:
# Pairwise correlation heatmap over a subset of the numeric features.
if df is not None:
    # First 15 numeric columns in positional order (not a ranking).
    numeric_df = df.select_dtypes(include=[np.number]).iloc[:, :15]

    if numeric_df.empty:
        # No numeric columns (or no rows): nothing to correlate or draw.
        print('No numeric data available for correlation analysis')
    else:
        # +/-inf values turn the affected correlations into NaN. Treating them
        # as missing lets corr() use the remaining finite observations.
        corr_matrix = numeric_df.replace([np.inf, -np.inf], np.nan).corr()

        fig, ax = plt.subplots(figsize=(12, 10))
        sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0, ax=ax)
        ax.set_title('Feature Correlation Matrix')
        plt.tight_layout()
        plt.show()

## 6. Attack vs Normal Comparison

In [None]:
# Derive a 0/1 attack indicator from the textual label and report class sizes.
if df is not None and 'Label' in df.columns:
    # A row is benign when its label contains 'BENIGN' (case-insensitive).
    # Missing labels match nothing (na=False) and so fall into the attack class.
    benign_mask = df['Label'].str.contains('BENIGN', case=False, na=False)
    df['is_attack'] = (~benign_mask).astype(int)

    # is_attack holds only 0/1, so its sum is the attack count.
    attack_total = int(df['is_attack'].sum())
    normal_total = len(df) - attack_total
    print(f"Normal traffic: {normal_total:,}")
    print(f"Attack traffic: {attack_total:,}")

In [None]:
# Side-by-side boxplots of selected flow features, split by the attack flag.
if df is not None and 'is_attack' in df.columns:
    candidate_features = ('Flow Duration', 'Total Fwd Packets', 'Total Backward Packets')
    # Keep only the candidates that actually exist in this dataset.
    features_to_compare = [name for name in candidate_features if name in df.columns]

    panel_count = len(features_to_compare)
    if panel_count > 0:
        # squeeze=False always returns a 2-D axes array, so the single-panel
        # case needs no special handling; row 0 is the one row of panels.
        fig, axes = plt.subplots(1, panel_count, figsize=(5 * panel_count, 4), squeeze=False)
        axes = axes[0]

        for ax, feat in zip(axes, features_to_compare):
            df.boxplot(column=feat, by='is_attack', ax=ax)
            ax.set_title(feat)
            ax.set_xlabel('Is Attack (0=No, 1=Yes)')

        # Clear the figure-level title that the grouped boxplot adds.
        plt.suptitle('')
        plt.tight_layout()
        plt.show()

## 7. Summary

In [None]:
# Final recap: collect the headline numbers, then print them in one go.
if df is not None:
    summary_lines = [
        '=== Dataset Summary ===',
        f'Total samples: {len(df):,}',
        f'Total features: {len(df.columns)}',
    ]
    # Label-derived lines appear only when the matching columns exist.
    if 'Label' in df.columns:
        summary_lines.append(f'Unique labels: {df["Label"].nunique()}')
    if 'is_attack' in df.columns:
        summary_lines.append(f'Attack ratio: {df["is_attack"].mean()*100:.2f}%')
    summary_lines.append(f'Missing values: {df.isnull().sum().sum()}')
    print('\n'.join(summary_lines))