In [1]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# NSL-KDD Dataset Exploration\n",
    "\n",
    "Comprehensive analysis of the NSL-KDD intrusion detection dataset.\n",
    "\n",
    "## Objectives\n",
    "1. Load and explore the NSL-KDD dataset\n",
    "2. Understand feature distributions and characteristics\n",
    "3. Analyze attack patterns and class distributions\n",
    "4. Identify data quality issues and preprocessing needs\n",
    "5. Generate insights for model development"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import libraries\n",
    "import sys\n",
    "import os\n",
    "sys.path.append('../src')\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from nsl_kdd_analyzer import NSLKDDAnalyzer\n",
    "\n",
    "# Set up plotting\n",
    "plt.style.use('default')\n",
    "sns.set_palette(\"husl\")\n",
    "%matplotlib inline\n",
    "\n",
    "# Suppress warnings\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "print(\"📊 NSL-KDD Analysis Environment Ready!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Initialize Analyzer and Load Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize the analyzer\n",
    "analyzer = NSLKDDAnalyzer(data_dir=\"../data/raw\", output_dir=\"../data/results\")\n",
    "\n",
    "print(\"Available data files:\")\n",
    "for file in analyzer.data_dir.glob(\"*.txt\"):\n",
    "    size_mb = file.stat().st_size / (1024 * 1024)\n",
    "    print(f\"  📄 {file.name:<25} ({size_mb:.1f} MB)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Quick Analysis with 20% Subset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start with 20% subset for faster analysis\n",
    "print(\"🔍 Analyzing 20% Training Subset...\")\n",
    "train_20_data = analyzer.load_data('KDDTrain+_20Percent.txt')\n",
    "\n",
    "if train_20_data is not None:\n",
    "    print(f\"\\n📊 Dataset Overview:\")\n",
    "    print(f\"   Shape: {train_20_data.shape}\")\n",
    "    print(f\"   Records: {len(train_20_data):,}\")\n",
    "    print(f\"   Features: {train_20_data.shape[1] - 2}\")\n",
    "    \n",
    "    # Display first few rows\n",
    "    print(f\"\\n📋 Sample Data:\")\n",
    "    display(train_20_data.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Attack Distribution Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attack category analysis\n",
    "if train_20_data is not None:\n",
    "    print(\"🎯 Attack Category Analysis:\")\n",
    "    attack_summary = train_20_data['attack_category'].value_counts()\n",
    "    print(attack_summary)\n",
    "    \n",
    "    # Calculate percentages\n",
    "    attack_percentages = (attack_summary / len(train_20_data) * 100).round(2)\n",
    "    print(\"\\nPercentages:\")\n",
    "    for category, percentage in attack_percentages.items():\n",
    "        print(f\"  {category}: {percentage}%\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Detailed attack type analysis\n",
    "if train_20_data is not None:\n",
    "    print(\"🔍 Detailed Attack Types:\")\n",
    "    attack_details = train_20_data.groupby(['attack_category', 'attack_type']).size().reset_index(name='count')\n",
    "    attack_details['percentage'] = (attack_details['count'] / len(train_20_data) * 100).round(3)\n",
    "    display(attack_details.sort_values('count', ascending=False))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create visualizations\n",
    "if train_20_data is not None:\n",
    "    fig, axes = plt.subplots(2, 2, figsize=(15, 12))\n",
    "    \n",
    "    # Attack categories pie chart\n",
    "    attack_cat_counts = train_20_data['attack_category'].value_counts()\n",
    "    axes[0, 0].pie(attack_cat_counts.values, labels=attack_cat_counts.index, autopct='%1.1f%%', startangle=90)\n",
    "    axes[0, 0].set_title('Attack Categories Distribution')\n",
    "    \n",
    "    # Attack categories bar chart (log scale)\n",
    "    attack_cat_counts.plot(kind='bar', ax=axes[0, 1], alpha=0.7, logy=True)\n",
    "    axes[0, 1].set_title('Attack Categories (Log Scale)')\n",
    "    axes[0, 1].set_ylabel('Count (log scale)')\n",
    "    axes[0, 1].tick_params(axis='x', rotation=45)\n",
    "    \n",
    "    # Top 20 attack types\n",
    "    top_attacks = train_20_data['attack_type'].value_counts().head(20)\n",
    "    top_attacks.plot(kind='bar', ax=axes[1, 0], alpha=0.7)\n",
    "    axes[1, 0].set_title('Top 20 Attack Types')\n",
    "    axes[1, 0].set_ylabel('Count')\n",
    "    axes[1, 0].tick_params(axis='x', rotation=45)\n",
    "    \n",
    "    # Binary classification (Normal vs Attack)\n",
    "    binary_dist = train_20_data['attack_type'].apply(lambda x: 'Normal' if x == 'normal' else 'Attack').value_counts()\n",
    "    binary_dist.plot(kind='bar', ax=axes[1, 1], alpha=0.7, color=['green', 'red'])\n",
    "    axes[1, 1].set_title('Binary Classification Distribution')\n",
    "    axes[1, 1].set_ylabel('Count')\n",
    "    axes[1, 1].tick_params(axis='x', rotation=45)\n",
    "    \n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Feature Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze numerical features\n",
    "if train_20_data is not None:\n",
    "    numerical_cols = train_20_data.select_dtypes(include=[np.number]).columns.tolist()\n",
    "    # Remove labels\n",
    "    numerical_cols = [col for col in numerical_cols if col not in ['difficulty_level']]\n",
    "    \n",
    "    print(f\"📊 Numerical Features: {len(numerical_cols)}\")\n",
    "    print(f\"First 10: {numerical_cols[:10]}\")\n",
    "    \n",
    "    # Statistical summary for key features\n",
    "    key_features = ['duration', 'src_bytes', 'dst_bytes', 'count', 'srv_count']\n",
    "    print(f\"\\n📈 Key Features Statistics:\")\n",
    "    display(train_20_data[key_features].describe())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Next Steps and Insights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summary and next steps\n",
    "if train_20_data is not None:\n",
    "    print(\"🎯 KEY INSIGHTS:\")\n",
    "    print(\"=\"*50)\n",
    "    \n",
    "    normal_pct = (train_20_data['attack_type'] == 'normal').mean() * 100\n",
    "    attack_pct = 100 - normal_pct\n",
    "    \n",
    "    print(f\"1. Dataset Characteristics:\")\n",
    "    print(f\"   • Records: {len(train_20_data):,}\")\n",
    "    print(f\"   • Features: {train_20_data.shape[1] - 2}\")\n",
    "    print(f\"   • Attack types: {train_20_data['attack_type'].nunique()}\")\n",
    "    \n",
    "    print(f\"\\n2. Class Distribution:\")\n",
    "    print(f\"   • Normal traffic: {normal_pct:.1f}%\")\n",
    "    print(f\"   • Attack traffic: {attack_pct:.1f}%\")\n",
    "    \n",
    "    print(f\"\\n3. Key Challenges:\")\n",
    "    print(f\"   • Severe class imbalance\")\n",
    "    print(f\"   • Mixed data types (numerical + categorical)\")\n",
    "    print(f\"   • High dimensionality (41 features)\")\n",
    "    \n",
    "    print(f\"\\n4. Next Steps:\")\n",
    "    print(f\"   • Create preprocessing pipeline\")\n",
    "    print(f\"   • Handle class imbalance\")\n",
    "    print(f\"   • Implement baseline models\")\n",
    "    print(f\"   • Set up evaluation framework\")\n",
    "    \n",
    "    print(f\"\\n✅ Exploration complete! Ready for preprocessing phase.\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

{'cells': [{'cell_type': 'markdown',
   'metadata': {},
   'source': ['# NSL-KDD Dataset Exploration\n',
    '\n',
    'Comprehensive analysis of the NSL-KDD intrusion detection dataset.\n',
    '\n',
    '## Objectives\n',
    '1. Load and explore the NSL-KDD dataset\n',
    '2. Understand feature distributions and characteristics\n',
    '3. Analyze attack patterns and class distributions\n',
    '4. Identify data quality issues and preprocessing needs\n',
    '5. Generate insights for model development']},
  {'cell_type': 'code',
   'execution_count': None,
   'metadata': {},
   'outputs': [],
   'source': ['# Import libraries\n',
    'import sys\n',
    'import os\n',
    "sys.path.append('../src')\n",
    '\n',
    'import pandas as pd\n',
    'import numpy as np\n',
    'import matplotlib.pyplot as plt\n',
    'import seaborn as sns\n',
    'from nsl_kdd_analyzer import NSLKDDAnalyzer\n',
    '\n',
    '# Set up plotting\n',
    "plt.style.use('default')\n",
    'sns.set_palette(