In [8]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# NSL-KDD Dataset Initial Analysis\n",
    "\n",
    "This notebook provides a comprehensive analysis of the NSL-KDD intrusion detection dataset.\n",
    "\n",
    "## Objectives\n",
    "1. Load and explore the NSL-KDD dataset\n",
    "2. Understand feature distributions and characteristics\n",
    "3. Analyze attack patterns and class distributions\n",
    "4. Identify data quality issues and preprocessing needs\n",
    "5. Generate visualizations for better understanding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import necessary libraries\n",
    "import sys\n",
    "import os\n",
    "sys.path.append('../src')\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from nsl_kdd_analyzer import NSLKDDAnalyzer\n",
    "\n",
    "# Set up plotting\n",
    "plt.style.use('default')\n",
    "sns.set_palette(\"husl\")\n",
    "%matplotlib inline\n",
    "\n",
    "# Suppress warnings\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "print(\"📊 NSL-KDD Analysis Environment Ready!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Initialize the Analyzer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize the analyzer\n",
    "analyzer = NSLKDDAnalyzer(data_dir=\"../data/raw\", output_dir=\"../data/results\")\n",
    "\n",
    "print(\"Available data files:\")\n",
    "for file in analyzer.data_dir.glob(\"*.txt\"):\n",
    "    size_mb = file.stat().st_size / (1024 * 1024)\n",
    "    print(f\"  📄 {file.name:<25} ({size_mb:.1f} MB)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Load and Analyze Training Data (20% subset first)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start with 20% subset for faster initial analysis\n",
    "print(\"🔍 Analyzing 20% Training Subset...\")\n",
    "train_20_data = analyzer.comprehensive_analysis('KDDTrain+_20Percent.txt')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Detailed Feature Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Examine the first few rows\n",
    "print(\"📋 Sample Data:\")\n",
    "display(train_20_data.head())\n",
    "\n",
    "print(\"\\n📊 Data Info:\")\n",
    "print(train_20_data.info())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attack category breakdown\n",
    "print(\"🎯 Attack Category Analysis:\")\n",
    "attack_summary = train_20_data['attack_category'].value_counts()\n",
    "print(attack_summary)\n",
    "\n",
    "# Calculate percentages\n",
    "attack_percentages = (attack_summary / len(train_20_data) * 100).round(2)\n",
    "print(\"\\nPercentages:\")\n",
    "for category, percentage in attack_percentages.items():\n",
    "    print(f\"  {category}: {percentage}%\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Detailed attack type analysis\n",
    "print(\"🔍 Detailed Attack Types:\")\n",
    "attack_details = train_20_data.groupby(['attack_category', 'attack_type']).size().reset_index(name='count')\n",
    "attack_details['percentage'] = (attack_details['count'] / len(train_20_data) * 100).round(3)\n",
    "display(attack_details.sort_values('count', ascending=False))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Feature Distribution Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze numerical features\n",
    "numerical_cols = train_20_data.select_dtypes(include=[np.number]).columns.tolist()\n",
    "# Remove labels\n",
    "numerical_cols = [col for col in numerical_cols if col not in ['difficulty_level']]\n",
    "\n",
    "print(f\"📊 Numerical Features: {len(numerical_cols)}\")\n",
    "print(f\"First 10: {numerical_cols[:10]}\")\n",
    "\n",
    "# Statistical summary for key features\n",
    "key_features = ['duration', 'src_bytes', 'dst_bytes', 'count', 'srv_count']\n",
    "print(\"\\n📈 Key Features Statistics:\")\n",
    "display(train_20_data[key_features].describe())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Categorical features analysis\n",
    "categorical_cols = ['protocol_type', 'service', 'flag']\n",
    "\n",
    "fig, axes = plt.subplots(1, 3, figsize=(18, 5))\n",
    "\n",
    "for i, col in enumerate(categorical_cols):\n",
    "    value_counts = train_20_data[col].value_counts()\n",
    "    if len(value_counts) > 10:\n",
    "        value_counts = value_counts.head(10)\n",
    "    \n",
    "    value_counts.plot(kind='bar', ax=axes[i], alpha=0.7)\n",
    "    axes[i].set_title(f'{col.title()} Distribution')\n",
    "    axes[i].set_xlabel(col)\n",
    "    axes[i].set_ylabel('Count')\n",
    "    axes[i].tick_params(axis='x', rotation=45)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Class Imbalance Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a detailed class imbalance visualization\n",
    "fig, axes = plt.subplots(2, 2, figsize=(15, 12))\n",
    "\n",
    "# Attack categories pie chart\n",
    "attack_cat_counts = train_20_data['attack_category'].value_counts()\n",
    "axes[0, 0].pie(attack_cat_counts.values, labels=attack_cat_counts.index, autopct='%1.1f%%', startangle=90)\n",
    "axes[0, 0].set_title('Attack Categories Distribution')\n",
    "\n",
    "# Attack categories bar chart (log scale)\n",
    "attack_cat_counts.plot(kind='bar', ax=axes[0, 1], alpha=0.7, logy=True)\n",
    "axes[0, 1].set_title('Attack Categories (Log Scale)')\n",
    "axes[0, 1].set_ylabel('Count (log scale)')\n",
    "axes[0, 1].tick_params(axis='x', rotation=45)\n",
    "\n",
    "# Top 20 attack types\n",
    "top_attacks = train_20_data['attack_type'].value_counts().head(20)\n",
    "top_attacks.plot(kind='bar', ax=axes[1, 0], alpha=0.7)\n",
    "axes[1, 0].set_title('Top 20 Attack Types')\n",
    "axes[1, 0].set_ylabel('Count')\n",
    "axes[1, 0].tick_params(axis='x', rotation=45)\n",
    "\n",
    "# Binary classification (Normal vs Attack)\n",
    "binary_dist = train_20_data['attack_type'].apply(lambda x: 'Normal' if x == 'normal' else 'Attack').value_counts()\n",
    "binary_dist.plot(kind='bar', ax=axes[1, 1], alpha=0.7, color=['green', 'red'])\n",
    "axes[1, 1].set_title('Binary Classification Distribution')\n",
    "axes[1, 1].set_ylabel('Count')\n",
    "axes[1, 1].tick_params(axis='x', rotation=45)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "print(\"⚠️ Class Imbalance Observations:\")\n",
    "for category, count in attack_cat_counts.items():\n",
    "    percentage = (count / len(train_20_data)) * 100\n",
    "    print(f\"  {category}: {percentage:.2f}% ({count:,} records)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Load and Compare Full Training Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load full training data\n",
    "print(\"🔍 Analyzing Full Training Data...\")\n",
    "train_full_data = analyzer.comprehensive_analysis('KDDTrain+.txt')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Load and Analyze Test Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load test data\n",
    "print(\"🔍 Analyzing Test Data...\")\n",
    "test_data = analyzer.comprehensive_analysis('KDDTest+.txt')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Dataset Comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare datasets\n",
    "novel_attacks = analyzer.compare_datasets(train_full_data, test_data)\n",
    "\n",
    "print(f\"\\n📊 Dataset Summary:\")\n",
    "print(f\"Training (Full): {len(train_full_data):,} records, {train_full_data['attack_type'].nunique()} attack types\")\n",
    "print(f\"Training (20%):  {len(train_20_data):,} records, {train_20_data['attack_type'].nunique()} attack types\")\n",
    "print(f\"Test:            {len(test_data):,} records, {test_data['attack_type'].nunique()} attack types\")\n",
    "print(f\"Novel attacks in test: {len(novel_attacks)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9. Data Quality Assessment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check for data quality issues\n",
    "print(\"🔍 Data Quality Assessment:\")\n",
    "\n",
    "# Missing values\n",
    "missing_train = train_full_data.isnull().sum().sum()\n",
    "missing_test = test_data.isnull().sum().sum()\n",
    "print(f\"Missing values - Train: {missing_train}, Test: {missing_test}\")\n",
    "\n",
    "# Duplicate records\n",
    "duplicates_train = train_full_data.duplicated().sum()\n",
    "duplicates_test = test_data.duplicated().sum()\n",
    "print(f\"Duplicate records - Train: {duplicates_train}, Test: {duplicates_test}\")\n",
    "\n",
    "# Zero variance features\n",
    "numeric_cols = train_full_data.select_dtypes(include=[np.number]).columns\n",
    "zero_var_features = [col for col in numeric_cols if train_full_data[col].var() == 0]\n",
    "print(f\"Zero variance features: {len(zero_var_features)}\")\n",
    "if zero_var_features:\n",
    "    print(f\"  {zero_var_features}\")\n",
    "\n",
    "# Constant features\n",
    "constant_features = [col for col in train_full_data.columns if train_full_data[col].nunique() == 1]\n",
    "print(f\"Constant features: {len(constant_features)}\")\n",
    "if constant_features:\n",
    "    print(f\"  {constant_features}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 10. Key Insights and Next Steps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"🎯 KEY INSIGHTS:\")\n",
    "print(\"=\"*50)\n",
    "\n",
    "# Dataset characteristics\n",
    "print(f\"1. Dataset Size:\")\n",
    "print(f\"   • Training: {len(train_full_data):,} records\")\n",
    "print(f\"   • Test: {len(test_data):,} records\")\n",
    "print(f\"   • Features: 41 + 2 labels\")\n",
    "\n",
    "# Class distribution\n",
    "normal_pct = (train_full_data['attack_type'] == 'normal').mean() * 100\n",
    "print(f\"\\n2. Class Distribution:\")\n",
    "print(f\"   • Normal traffic: {normal_pct:.1f}%\")\n",
    "print(f\"   • Attack traffic: {100-normal_pct:.1f}%\")\n",
    "print(f\"   • Attack categories: {train_full_data['attack_category'].nunique()}\")\n",
    "\n",
    "# Challenges identified\n",
    "print(f\"\\n3. Key Challenges:\")\n",
    "print(f\"   • Class imbalance (especially U2R and R2L)\")\n",
    "print(f\"   • Novel attacks in test set: {len(novel_attacks)}\")\n",
    "print(f\"   • Feature selection needed (41 features)\")\n",
    "print(f\"   • Mixed data types (numerical + categorical)\")\n",
    "\n",
    "print(f\"\\n4. Recommended Next Steps:\")\n",
    "print(f\"   • Feature preprocessing and encoding\")\n",
    "print(f\"   • Handle class imbalance (SMOTE, undersampling)\")\n",
    "print(f\"   • Feature selection/importance analysis\")\n",
    "print(f\"   • Baseline model development\")\n",
    "print(f\"   • Cross-validation strategy\")\n",
    "\n",
    "print(\"\\n✅ Initial analysis complete! Check data/results/ for saved outputs.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Summary\n",
    "\n",
    "This notebook has provided a comprehensive initial analysis of the NSL-KDD dataset. Key findings:\n",
    "\n",
    "1. **Dataset Structure**: 41 features across 4 categories (basic, content, time-based, host-based)\n",
    "2. **Class Distribution**: Highly imbalanced with DoS attacks dominating\n",
    "3. **Novel Attacks**: Test set contains attacks not seen in training\n",
    "4. **Data Quality**: Clean dataset with no missing values\n",
    "5. **Challenges**: Class imbalance, feature selection, novel attack detection\n",
    "\n",
    "The processed data has been saved to `data/processed/` for use in subsequent analysis and modeling steps."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

{'cells': [{'cell_type': 'markdown',
   'metadata': {},
   'source': ['# NSL-KDD Dataset Initial Analysis\n',
    '\n',
    'This notebook provides a comprehensive analysis of the NSL-KDD intrusion detection dataset.\n',
    '\n',
    '## Objectives\n',
    '1. Load and explore the NSL-KDD dataset\n',
    '2. Understand feature distributions and characteristics\n',
    '3. Analyze attack patterns and class distributions\n',
    '4. Identify data quality issues and preprocessing needs\n',
    '5. Generate visualizations for better understanding']},
  {'cell_type': 'code',
   'execution_count': None,
   'metadata': {},
   'outputs': [],
   'source': ['# Import necessary libraries\n',
    'import sys\n',
    'import os\n',
    "sys.path.append('../src')\n",
    '\n',
    'import pandas as pd\n',
    'import numpy as np\n',
    'import matplotlib.pyplot as plt\n',
    'import seaborn as sns\n',
    'from nsl_kdd_analyzer import NSLKDDAnalyzer\n',
    '\n',
    '# Set up plotting\n',
    "p