# Interactive Cybersecurity Data Visualization

This notebook provides comprehensive interactive visualizations of our network attack detection dataset. We'll explore:
1. Attack pattern analysis and temporal trends
2. Network traffic distributions and anomaly patterns  
3. Protocol and service analysis
4. Feature correlation and importance
5. Attack severity and risk assessment
6. Network flow characteristics
7. Attack type transitions and patterns
8. Port and IP analysis
9. Multi-dimensional attack profiling
10. Security metrics and KPI dashboards

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Load and prepare the cybersecurity data
print("Loading cybersecurity dataset...")
df = pd.read_csv('../Data/Train_data.csv')

# Data preparation for visualizations
print("Preparing data for visualization...")

# Identify categorical and numerical columns
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
numerical_columns = df.select_dtypes(include=[np.number]).columns.tolist()

# Remove target from features if present
if 'class' in categorical_columns:
    categorical_columns.remove('class')
if 'class' in numerical_columns:
    numerical_columns.remove('class')

# Create attack severity score for visualization
numerical_features = [col for col in numerical_columns if col != 'class'][:5]
if len(numerical_features) >= 3:
    df['attack_severity'] = df[numerical_features[:3]].sum(axis=1)
    df['severity_level'] = pd.cut(df['attack_severity'], bins=3, labels=['Low', 'Medium', 'High'])

print("Data loaded and prepared successfully!")
print(f"\nDataset shape: {df.shape}")
print(f"Attack classes: {df['class'].value_counts().to_dict()}")
print(f"Categorical features: {len(categorical_columns)}")
print(f"Numerical features: {len(numerical_columns)}")
print(f"Date range: Full dataset ready for analysis")

Loading cybersecurity dataset...
Preparing data for visualization...
Data loaded and prepared successfully!

Dataset shape: (25192, 44)
Attack classes: {'normal': 13449, 'anomaly': 11743}
Categorical features: 3
Numerical features: 38
Date range: Full dataset ready for analysis
Preparing data for visualization...
Data loaded and prepared successfully!

Dataset shape: (25192, 44)
Attack classes: {'normal': 13449, 'anomaly': 11743}
Categorical features: 3
Numerical features: 38
Date range: Full dataset ready for analysis


## 1. Attack Pattern Analysis and Distribution

In [2]:
# 1.1 Attack Class Distribution with Interactive Pie Chart
attack_counts = df['class'].value_counts()

fig = go.Figure(data=[go.Pie(
    labels=attack_counts.index,
    values=attack_counts.values,
    hole=0.4,
    textinfo='label+percent+value',
    textfont_size=12,
    marker=dict(colors=['#FF6B6B', '#4ECDC4'], line=dict(color='#FFFFFF', width=2))
)])

fig.update_layout(
    title={
        'text': 'Network Attack Distribution',
        'x': 0.5,
        'xanchor': 'center',
        'font': {'size': 20}
    },
    annotations=[dict(text='Attack<br>Classes', x=0.5, y=0.5, font_size=16, showarrow=False)],
    width=800,
    height=600
)
fig.show()

# 1.2 Attack Severity Analysis
if 'severity_level' in df.columns:
    # Severity distribution by attack class
    severity_cross = pd.crosstab(df['class'], df['severity_level'], normalize='index') * 100
    
    fig = go.Figure()
    
    for severity in severity_cross.columns:
        fig.add_trace(go.Bar(
            name=severity,
            x=severity_cross.index,
            y=severity_cross[severity],
            text=np.round(severity_cross[severity], 1),
            textposition='auto',
        ))
    
    fig.update_layout(
        barmode='stack',
        title='Attack Severity Distribution by Class',
        xaxis_title='Attack Class',
        yaxis_title='Percentage (%)',
        legend_title='Severity Level'
    )
    fig.show()

# 1.3 Feature Distribution Comparison
key_features = numerical_columns[:6] if len(numerical_columns) >= 6 else numerical_columns

if key_features:
    fig = make_subplots(
        rows=2, cols=3,
        subplot_titles=key_features,
        vertical_spacing=0.08,
        horizontal_spacing=0.05
    )
    
    for i, feature in enumerate(key_features):
        row = (i // 3) + 1
        col = (i % 3) + 1
        
        for attack_class in df['class'].unique():
            attack_data = df[df['class'] == attack_class][feature]
            
            fig.add_trace(
                go.Histogram(
                    x=attack_data,
                    name=f'{attack_class}',
                    opacity=0.7,
                    legendgroup=attack_class,
                    showlegend=(i == 0)  # Only show legend for first subplot
                ),
                row=row, col=col
            )
    
    fig.update_layout(
        height=800,
        title_text="Feature Distribution by Attack Class",
        barmode='overlay'
    )
    fig.show()

# 1.4 Attack Pattern Heatmap
if len(numerical_columns) >= 10:
    # Select top 10 features for heatmap
    features_for_heatmap = numerical_columns[:10]
    
    # Calculate mean values for each attack class
    heatmap_data = df.groupby('class')[features_for_heatmap].mean()
    
    fig = go.Figure(data=go.Heatmap(
        z=heatmap_data.values,
        x=heatmap_data.columns,
        y=heatmap_data.index,
        colorscale='Viridis',
        text=np.round(heatmap_data.values, 2),
        texttemplate='%{text}',
        textfont={"size": 10},
        colorbar=dict(title='Average Value')
    ))
    
    fig.update_layout(
        title='Attack Pattern Feature Heatmap',
        xaxis_title='Features',
        yaxis_title='Attack Class',
        height=600
    )
    fig.show()

## 2. Network Traffic Analysis

In [3]:
# 2.1 Network Traffic Volume Analysis
traffic_cols = [col for col in df.columns if any(keyword in col.lower() 
                                                for keyword in ['bytes', 'packets', 'duration', 'count'])]

if traffic_cols:
    # Traffic volume comparison
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=traffic_cols[:4] if len(traffic_cols) >= 4 else traffic_cols,
        vertical_spacing=0.1
    )
    
    for i, col in enumerate(traffic_cols[:4]):
        row = (i // 2) + 1
        col_pos = (i % 2) + 1
        
        for attack_class in df['class'].unique():
            attack_data = df[df['class'] == attack_class][col]
            
            fig.add_trace(
                go.Box(
                    y=attack_data,
                    name=f'{attack_class}',
                    legendgroup=attack_class,
                    showlegend=(i == 0)
                ),
                row=row, col=col_pos
            )
    
    fig.update_layout(
        height=800,
        title_text="Network Traffic Analysis by Attack Class"
    )
    fig.show()

# 2.2 Protocol Analysis
protocol_cols = [col for col in categorical_columns if any(keyword in col.lower() 
                                                          for keyword in ['protocol', 'service', 'flag'])]

if protocol_cols:
    protocol_col = protocol_cols[0]  # Use first protocol column
    
    # Protocol distribution
    protocol_attack = pd.crosstab(df[protocol_col], df['class'])
    
    fig = go.Figure()
    
    for attack_class in protocol_attack.columns:
        fig.add_trace(go.Bar(
            name=attack_class,
            x=protocol_attack.index,
            y=protocol_attack[attack_class],
            text=protocol_attack[attack_class],
            textposition='auto'
        ))
    
    fig.update_layout(
        barmode='group',
        title=f'{protocol_col.title()} Distribution by Attack Class',
        xaxis_title=protocol_col.title(),
        yaxis_title='Count',
        xaxis_tickangle=-45
    )
    fig.show()

# 2.3 Network Flow Scatter Analysis
if len(numerical_columns) >= 3:
    # Create 3D scatter plot of network characteristics
    x_col, y_col, z_col = numerical_columns[:3]
    
    fig = px.scatter_3d(
        df.sample(n=min(5000, len(df))),  # Sample for performance
        x=x_col, y=y_col, z=z_col,
        color='class',
        title='3D Network Flow Characteristics',
        labels={
            x_col: x_col.replace('_', ' ').title(),
            y_col: y_col.replace('_', ' ').title(),
            z_col: z_col.replace('_', ' ').title()
        },
        opacity=0.7,
        size_max=10
    )
    
    fig.update_layout(
        scene=dict(
            xaxis_title=x_col.replace('_', ' ').title(),
            yaxis_title=y_col.replace('_', ' ').title(),
            zaxis_title=z_col.replace('_', ' ').title()
        )
    )
    fig.show()

# 2.4 Connection Pattern Analysis
if len(numerical_columns) >= 2:
    # Analyze connection patterns
    fig = px.scatter(
        df.sample(n=min(3000, len(df))),  # Sample for performance
        x=numerical_columns[0], y=numerical_columns[1],
        color='class',
        size=numerical_columns[2] if len(numerical_columns) >= 3 else None,
        title='Connection Pattern Analysis',
        labels={
            numerical_columns[0]: numerical_columns[0].replace('_', ' ').title(),
            numerical_columns[1]: numerical_columns[1].replace('_', ' ').title()
        },
        hover_data=['class'] + numerical_columns[:3],
        opacity=0.7
    )
    
    fig.update_layout(
        xaxis_title=numerical_columns[0].replace('_', ' ').title(),
        yaxis_title=numerical_columns[1].replace('_', ' ').title()
    )
    fig.show()

## 3. Feature Correlation and Importance Analysis

In [4]:
# 3.1 Interactive Correlation Matrix
if len(numerical_columns) >= 5:
    # Select top features for correlation analysis
    features_for_corr = numerical_columns[:15] if len(numerical_columns) >= 15 else numerical_columns
    corr_matrix = df[features_for_corr].corr()
    
    fig = go.Figure(data=go.Heatmap(
        z=corr_matrix,
        x=corr_matrix.columns,
        y=corr_matrix.columns,
        text=np.round(corr_matrix, 2),
        texttemplate='%{text}',
        textfont={"size": 8},
        colorscale='RdBu',
        zmid=0,
        colorbar=dict(title='Correlation')
    ))
    
    fig.update_layout(
        title='Feature Correlation Matrix',
        width=800,
        height=800,
        xaxis_tickangle=-45
    )
    fig.show()

# 3.2 Feature Importance Simulation (using Random Forest)
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Prepare data for feature importance
X = df[numerical_columns].fillna(0)  # Handle any missing values
y = df['class']

# Encode target if categorical
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Train Random Forest for feature importance
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y_encoded)

# Create feature importance visualization
feature_importance = pd.DataFrame({
    'feature': numerical_columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=True).tail(20)  # Top 20 features

fig = go.Figure(go.Bar(
    x=feature_importance['importance'],
    y=feature_importance['feature'],
    orientation='h',
    marker=dict(color=feature_importance['importance'], 
                colorscale='Viridis',
                showscale=True,
                colorbar=dict(title='Importance'))
))

fig.update_layout(
    title='Top 20 Feature Importance for Attack Detection',
    xaxis_title='Importance Score',
    yaxis_title='Features',
    height=800
)
fig.show()

# 3.3 Parallel Coordinates Plot
if len(numerical_columns) >= 4:
    # Select key features for parallel coordinates
    key_features = numerical_columns[:8] if len(numerical_columns) >= 8 else numerical_columns
    
    # Create a sample for better performance
    sample_df = df.sample(n=min(2000, len(df)), random_state=42)
    
    # Encode attack class for coloring
    attack_encoding = {attack: i for i, attack in enumerate(sample_df['class'].unique())}
    sample_df['class_code'] = sample_df['class'].map(attack_encoding)
    
    fig = px.parallel_coordinates(
        sample_df,
        dimensions=key_features,
        color='class_code',
        color_continuous_scale=px.colors.qualitative.Set3,
        title='Parallel Coordinates Plot of Network Features'
    )
    
    # Update color axis to show attack types
    fig.update_layout(
        coloraxis_colorbar=dict(
            title='Attack Class',
            ticktext=list(attack_encoding.keys()),
            tickvals=list(attack_encoding.values()),
        )
    )
    fig.show()

# 3.4 Attack Signature Analysis
if len(numerical_columns) >= 6:
    # Create radar chart for attack signatures
    top_features = feature_importance['feature'].tail(6).tolist()  # Top 6 features
    
    # Calculate mean values for each attack class
    attack_signatures = df.groupby('class')[top_features].mean()
    
    # Normalize values for radar chart (0-1 scale)
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    attack_signatures_normalized = pd.DataFrame(
        scaler.fit_transform(attack_signatures.T).T,
        index=attack_signatures.index,
        columns=attack_signatures.columns
    )
    
    fig = go.Figure()
    
    for attack_class in attack_signatures_normalized.index:
        fig.add_trace(go.Scatterpolar(
            r=attack_signatures_normalized.loc[attack_class].values.tolist() + 
              [attack_signatures_normalized.loc[attack_class].values[0]],  # Close the polygon
            theta=top_features + [top_features[0]],  # Close the polygon
            fill='toself',
            name=attack_class,
            opacity=0.6
        ))
    
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 1]
            )),
        showlegend=True,
        title="Attack Signature Radar Chart"
    )
    fig.show()

## 4. Advanced Security Analytics and Dashboards

In [5]:
# 4.1 Security Metrics Dashboard
attack_stats = df['class'].value_counts()
total_connections = len(df)
anomaly_rate = (attack_stats.get('anomaly', 0) / total_connections) * 100

# Create a comprehensive security dashboard
fig = make_subplots(
    rows=2, cols=3,
    subplot_titles=[
        'Attack Distribution', 
        'Severity Levels',
        'Top Risk Features',
        'Connection Volume Trend',
        'Protocol Security',
        'Risk Score Distribution'
    ],
    specs=[[{"type": "pie"}, {"type": "bar"}, {"type": "bar"}],
           [{"type": "scatter"}, {"type": "bar"}, {"type": "histogram"}]],
    vertical_spacing=0.12,
    horizontal_spacing=0.08
)

# 1. Attack Distribution (Pie Chart)
fig.add_trace(
    go.Pie(
        labels=attack_stats.index,
        values=attack_stats.values,
        name="Attack Distribution"
    ),
    row=1, col=1
)

# 2. Severity Levels (if available)
if 'severity_level' in df.columns:
    severity_counts = df['severity_level'].value_counts()
    fig.add_trace(
        go.Bar(
            x=severity_counts.index,
            y=severity_counts.values,
            name="Severity Levels",
            marker_color=['green', 'orange', 'red']
        ),
        row=1, col=2
    )

# 3. Top Risk Features (Feature Importance)
if 'feature_importance' in locals():
    top_5_features = feature_importance.tail(5)
    fig.add_trace(
        go.Bar(
            x=top_5_features['importance'],
            y=top_5_features['feature'],
            orientation='h',
            name="Risk Features",
            marker_color='red'
        ),
        row=1, col=3
    )

# 4. Connection Volume (if numerical data available)
if numerical_columns:
    sample_indices = np.arange(0, len(df), max(1, len(df)//100))  # Sample points
    fig.add_trace(
        go.Scatter(
            x=sample_indices,
            y=df.iloc[sample_indices][numerical_columns[0]],
            mode='lines',
            name="Volume Trend",
            line=dict(color='blue')
        ),
        row=2, col=1
    )

# 5. Protocol Security (if protocol data available)
if protocol_cols and len(protocol_cols) > 0:
    protocol_col = protocol_cols[0]
    protocol_risk = df.groupby(protocol_col)['class'].apply(
        lambda x: (x == 'anomaly').mean() * 100 if 'anomaly' in df['class'].values else 
                  (x == df['class'].value_counts().index[1]).mean() * 100
    ).head(5)
    
    fig.add_trace(
        go.Bar(
            x=protocol_risk.index,
            y=protocol_risk.values,
            name="Protocol Risk %",
            marker_color='orange'
        ),
        row=2, col=2
    )

# 6. Risk Score Distribution (if severity available)
if 'attack_severity' in df.columns:
    fig.add_trace(
        go.Histogram(
            x=df['attack_severity'],
            name="Risk Scores",
            marker_color='purple',
            opacity=0.7
        ),
        row=2, col=3
    )

fig.update_layout(
    height=1000,
    title_text="Cybersecurity Analytics Dashboard",
    showlegend=False
)
fig.show()

# 4.2 Network Anomaly Heatmap
if len(numerical_columns) >= 10:
    # Create anomaly heatmap based on feature values
    features_for_heatmap = numerical_columns[:10]
    
    # Calculate z-scores to identify anomalies
    from scipy import stats
    z_scores = np.abs(stats.zscore(df[features_for_heatmap].fillna(0)))
    
    # Mark anomalies (z-score > 3)
    anomaly_mask = (z_scores > 3).astype(int)
    
    # Sample for visualization
    sample_size = min(200, len(df))
    sample_indices = np.random.choice(len(df), sample_size, replace=False)
    
    fig = go.Figure(data=go.Heatmap(
        z=anomaly_mask[sample_indices],
        x=features_for_heatmap,
        y=[f"Sample {i}" for i in range(sample_size)],
        colorscale=[[0, 'lightblue'], [1, 'red']],
        colorbar=dict(title='Anomaly Detected')
    ))
    
    fig.update_layout(
        title='Network Anomaly Detection Heatmap',
        xaxis_title='Features',
        yaxis_title='Network Connections',
        height=800
    )
    fig.show()

# 4.3 Risk Assessment Summary
print("="*60)
print("CYBERSECURITY RISK ASSESSMENT SUMMARY")
print("="*60)
print(f"Total Network Connections Analyzed: {total_connections:,}")
print(f"Normal Traffic: {attack_stats.get('normal', 0):,} ({((attack_stats.get('normal', 0)/total_connections)*100):.1f}%)")
print(f"Anomalous Traffic: {attack_stats.get('anomaly', 0):,} ({anomaly_rate:.1f}%)")

if 'severity_level' in df.columns:
    print(f"\nSeverity Breakdown:")
    for level in ['Low', 'Medium', 'High']:
        count = df['severity_level'].value_counts().get(level, 0)
        percentage = (count / total_connections) * 100
        print(f"  {level} Risk: {count:,} ({percentage:.1f}%)")

print(f"\nKey Risk Indicators:")
if 'feature_importance' in locals():
    print(f"  Top Risk Feature: {feature_importance.iloc[-1]['feature']}")
    print(f"  Risk Score: {feature_importance.iloc[-1]['importance']:.3f}")

print(f"\nRecommendations:")
print(f"  • Monitor {feature_importance.iloc[-1]['feature'] if 'feature_importance' in locals() else 'key features'} closely")
print(f"  • Implement real-time alerting for anomaly rate > {anomaly_rate:.0f}%")
print(f"  • Focus on high-severity incidents for immediate response")

# 4.4 Interactive Security Treemap
if len(categorical_columns) > 0 and len(numerical_columns) > 0:
    # Create hierarchical data for treemap
    cat_col = categorical_columns[0]  # First categorical column
    num_col = numerical_columns[0]    # First numerical column
    
    # Group data for treemap
    treemap_data = df.groupby([cat_col, 'class']).agg({
        num_col: 'sum'
    }).reset_index()
    
    fig = px.treemap(
        treemap_data,
        path=[cat_col, 'class'],
        values=num_col,
        title=f'Security Hierarchy: {cat_col.title()} by Attack Class',
        color=num_col,
        color_continuous_scale='Reds'
    )
    
    fig.update_layout(height=600)
    fig.show()

print("\n" + "="*60)
print("VISUALIZATION ANALYSIS COMPLETE")
print("="*60)
print("📊 All cybersecurity visualizations have been generated")
print("🔍 Interactive charts ready for security analysis")
print("🛡️ Dashboard components prepared for deployment")
print("📈 Advanced analytics completed successfully")

CYBERSECURITY RISK ASSESSMENT SUMMARY
Total Network Connections Analyzed: 25,192
Normal Traffic: 13,449 (53.4%)
Anomalous Traffic: 11,743 (46.6%)

Severity Breakdown:
  Low Risk: 25,191 (100.0%)
  Medium Risk: 0 (0.0%)
  High Risk: 1 (0.0%)

Key Risk Indicators:
  Top Risk Feature: src_bytes
  Risk Score: 0.180

Recommendations:
  • Monitor src_bytes closely
  • Implement real-time alerting for anomaly rate > 47%
  • Focus on high-severity incidents for immediate response



VISUALIZATION ANALYSIS COMPLETE
📊 All cybersecurity visualizations have been generated
🔍 Interactive charts ready for security analysis
🛡️ Dashboard components prepared for deployment
📈 Advanced analytics completed successfully
