# IoT-23 Dataset Preprocessing for Kaggle

**Purpose:** Download and preprocess IoT-23 dataset into HDF5 format

**Input:** Raw IoT-23 PCAP files (from internet or uploaded)

**Output:** Preprocessed HDF5 file ready for MAPPO training

**Time:** ~2-3 hours (CPU only, no GPU needed)

---

## Instructions:
1. **Choose Session Type:** CPU (no GPU needed for preprocessing)
2. **Enable Internet:** ON (to download IoT-23)
3. **Run All Cells**
4. **Download Output:** Save as Kaggle dataset for training notebook

## 📦 Cell 1: Install Dependencies

In [None]:
%%time
# `%%time` is an IPython cell magic: it reports how long this cell took to run.
# Each `!pip` line shells out to pip; `-q` keeps pip's output quiet.
print("Installing preprocessing dependencies...")
# scapy: PCAP parsing (rdpcap and the IP/TCP/UDP layers used in Cell 3)
!pip install -q scapy
# pandas: packet and flow tables
!pip install -q pandas
# h5py: writing the final HDF5 file in Cell 6
!pip install -q h5py
# numpy: array handling
!pip install -q numpy
# scikit-learn: StandardScaler and train_test_split
!pip install -q scikit-learn
# tqdm: progress bars
!pip install -q tqdm

print("‚úÖ Dependencies installed!")

## 📥 Cell 2: Download IoT-23 Dataset

In [None]:
%%time
import os
import urllib.request
from tqdm import tqdm

# IoT-23 dataset URLs
# Note: Using smaller captures for Kaggle (full dataset is ~25GB)
IOT23_URLS = [
    # Scenario 1: Mirai botnet (C&C + Attack)
    "https://mcfp.felk.cvut.cz/publicDatasets/IoT-23-Dataset/IndividualScenarios/CTU-IoT-Malware-Capture-1-1/2018-12-21-capture.pcap",
    # Scenario 9: Philips HUE IoT (benign + attack)
    "https://mcfp.felk.cvut.cz/publicDatasets/IoT-23-Dataset/IndividualScenarios/CTU-IoT-Malware-Capture-9-1/2019-01-09-capture.pcap",
]

# Create download directory
os.makedirs('/kaggle/working/raw_pcap', exist_ok=True)

print("Downloading IoT-23 dataset...")
print("This may take 30-60 minutes depending on connection speed\n")

for i, url in enumerate(IOT23_URLS, 1):
    filename = f"scenario_{i}.pcap"
    filepath = f"/kaggle/working/raw_pcap/{filename}"
    
    if os.path.exists(filepath):
        print(f"‚úÖ {filename} already downloaded")
        continue
    
    print(f"Downloading {filename}...")
    try:
        urllib.request.urlretrieve(url, filepath)
        size_mb = os.path.getsize(filepath) / 1e6
        print(f"‚úÖ Downloaded {filename} ({size_mb:.1f} MB)")
    except Exception as e:
        print(f"‚ùå Error downloading {filename}: {e}")
        print("   Continuing with available files...")

# List downloaded files
print("\nDownloaded files:")
!ls -lh /kaggle/working/raw_pcap/

## 🔧 Cell 3: Parse PCAP Files

In [None]:
%%time
from scapy.all import rdpcap, IP, TCP, UDP
import pandas as pd
from tqdm import tqdm
import numpy as np

def parse_pcap(pcap_file, max_packets=100000):
    """
    Parse a PCAP file and extract one feature record per IP packet.

    Args:
        pcap_file: Path to PCAP file
        max_packets: Maximum number of packets to read from the start of
            the capture (bounds memory use). ``None`` reads the whole file.

    Returns:
        pandas.DataFrame with the columns timestamp, src_ip, dst_ip,
        protocol, packet_size, src_port, dst_port and flags, one row per
        packet that carries an IP layer. Ports and flags are 0 when the
        packet is neither TCP nor UDP. An empty DataFrame is returned when
        the file cannot be parsed (the error is printed, not raised).
    """
    print(f"\nParsing {pcap_file}...")

    try:
        # Ask scapy to stop reading after `max_packets` packets. Loading the
        # whole capture first and slicing afterwards would defeat the memory
        # limit, because the full file would already be in RAM.
        count = -1 if max_packets is None else max_packets
        packets = rdpcap(pcap_file, count=count)
        print(f"  Loaded packets: {len(packets)}")

        if count >= 0 and len(packets) >= count:
            print(f"  Reached max_packets cap ({max_packets}); "
                  f"any remaining packets were not read")

        # Extract features
        features = []
        for pkt in tqdm(packets, desc="Extracting features"):
            if IP not in pkt:
                continue

            ip_layer = pkt[IP]
            if TCP in pkt:
                transport = pkt[TCP]
                src_port, dst_port = transport.sport, transport.dport
                # Store the flags as a plain integer bitmask so the column
                # is numeric instead of holding scapy flag objects.
                flags = int(transport.flags)
            elif UDP in pkt:
                transport = pkt[UDP]
                src_port, dst_port = transport.sport, transport.dport
                flags = 0
            else:
                src_port = dst_port = flags = 0

            features.append({
                'timestamp': float(pkt.time),
                'src_ip': ip_layer.src,
                'dst_ip': ip_layer.dst,
                'protocol': ip_layer.proto,
                'packet_size': len(pkt),
                'src_port': src_port,
                'dst_port': dst_port,
                'flags': flags,
            })

        df = pd.DataFrame(features)
        print(f"  Extracted {len(df)} flow records")
        return df

    except Exception as e:
        # Best effort: a broken capture is reported and skipped so the
        # remaining files can still be processed by the caller.
        print(f"  ‚ùå Error parsing {pcap_file}: {e}")
        return pd.DataFrame()

# Parse all PCAP files
import glob

all_dataframes = []

# glob returns an empty list when nothing was downloaded. Shelling out to
# `ls` instead can put its error text into the list as if it were a file
# name. Sorting keeps the processing order deterministic.
pcap_files = sorted(glob.glob('/kaggle/working/raw_pcap/*.pcap'))
for pcap_file in pcap_files:
    df = parse_pcap(pcap_file)
    if not df.empty:
        all_dataframes.append(df)

# Combine all dataframes
if all_dataframes:
    combined_df = pd.concat(all_dataframes, ignore_index=True)
    print(f"\n‚úÖ Total flow records: {len(combined_df)}")
else:
    print("\n‚ùå No data extracted")
    # Every later cell needs `combined_df`. Stop here with a clear message
    # instead of failing further down with a NameError.
    raise RuntimeError(
        "No packets could be extracted from /kaggle/working/raw_pcap/*.pcap; "
        "check the download step before continuing."
    )

## 🏷️ Cell 4: Label Data (Simplified)

In [None]:
%%time
# Simple heuristic labeling for IoT-23
# In production, use actual labels from IoT-23 metadata

def label_traffic(df, high_traffic_threshold=1000, malware_ports=None,
                  min_packet_size=40, max_packet_size=1500):
    """
    Add a heuristic binary ``label`` column (0 = benign, 1 = attack).

    Note: IoT-23 provides actual labels - this is simplified for demo.
    A packet is marked as attack when ANY of these holds:
      1. its destination IP receives more than ``high_traffic_threshold``
         packets in ``df``;
      2. its destination port is listed in ``malware_ports``;
      3. its size is below ``min_packet_size`` or above ``max_packet_size``.

    Args:
        df: DataFrame with ``dst_ip``, ``dst_port`` and ``packet_size``
            columns. It is modified in place.
        high_traffic_threshold: Packet count above which a destination IP
            counts as high-traffic.
        malware_ports: Iterable of suspicious destination ports. ``None``
            selects the built-in list (23, 2323, 5555, 7547, 37215, 52869).
        min_packet_size: Packets smaller than this are flagged.
        max_packet_size: Packets larger than this are flagged.

    Returns:
        The same DataFrame object, with the ``label`` column set.
    """
    if malware_ports is None:
        malware_ports = (23, 2323, 5555, 7547, 37215, 52869)

    # Heuristic 1: many packets towards a single destination IP
    # (a packet count over the whole table, not a rate over time)
    ip_counts = df['dst_ip'].value_counts()
    high_traffic_ips = ip_counts[ip_counts > high_traffic_threshold].index
    to_busy_ip = df['dst_ip'].isin(high_traffic_ips)

    # Heuristic 2: Unusual ports (known malware ports)
    to_malware_port = df['dst_port'].isin(list(malware_ports))

    # Heuristic 3: Very large or very small packets
    odd_size = ((df['packet_size'] < min_packet_size) |
                (df['packet_size'] > max_packet_size))

    df['label'] = (to_busy_ip | to_malware_port | odd_size).astype('int64')

    total = len(df)
    attack_count = int(df['label'].sum())
    # Guard the ratio so an empty table reports 0% instead of dividing 0/0.
    attack_ratio = attack_count / total if total else 0.0

    print("Labeling results:")
    print(f"  Benign: {total - attack_count} ({100*(1-attack_ratio):.1f}%)")
    print(f"  Attack: {attack_count} ({100*attack_ratio:.1f}%)")

    return df

# Attach the heuristic 0/1 `label` column to the merged packet table.
combined_df = label_traffic(combined_df)
print("\n‚úÖ Data labeled")

## 🔢 Cell 5: Feature Engineering

In [None]:
%%time
from sklearn.preprocessing import StandardScaler
import numpy as np

def engineer_features(df):
    """
    Aggregate labeled packet records into per-flow statistical features.

    A flow is identified by the 5-tuple (src_ip, dst_ip, protocol,
    src_port, dst_port).

    Args:
        df: Packet-level DataFrame with the columns timestamp, src_ip,
            dst_ip, protocol, src_port, dst_port, packet_size and label.
            The caller's DataFrame is not modified.

    Returns:
        DataFrame with one row per flow and the columns flow_id,
        packet_size_{mean,std,min,max,sum}, inter_arrival_time_{mean,std},
        timestamp_{count,min,max}, label_max, flow_duration and
        packets_per_sec. Statistics that are undefined for very short flows
        (e.g. the std of a single packet) are reported as 0 rather than NaN.
    """
    print("Engineering features...")

    # sort_values returns a new frame, so the columns added below never
    # touch the DataFrame passed in by the caller.
    df = df.sort_values('timestamp')

    # Flow-based features (5-tuple aggregation)
    df['flow_id'] = (df['src_ip'] + '_' + df['dst_ip'] + '_' +
                     df['protocol'].astype(str) + '_' +
                     df['src_port'].astype(str) + '_' +
                     df['dst_port'].astype(str))

    # Time-based features: gap to the previous packet OF THE SAME FLOW.
    # A diff over the whole table would measure the gap to whatever
    # unrelated packet arrived before. The first packet of each flow has no
    # predecessor and stays NaN, so it is ignored by mean/std below.
    df['inter_arrival_time'] = df.groupby('flow_id')['timestamp'].diff()

    # Aggregate by flows
    flow_features = df.groupby('flow_id').agg({
        'packet_size': ['mean', 'std', 'min', 'max', 'sum'],
        'inter_arrival_time': ['mean', 'std'],
        'timestamp': ['count', 'min', 'max'],
        'label': 'max'  # If any packet in flow is attack, flow is attack
    }).reset_index()

    # Flatten column names: ('packet_size', 'mean') -> 'packet_size_mean'
    flow_features.columns = ['_'.join(col).strip('_') for col in flow_features.columns.values]

    # Flows with one or two packets have undefined std / inter-arrival
    # statistics. Use 0 so no NaN reaches scaling or the saved dataset.
    undefined_for_short_flows = ['packet_size_std',
                                 'inter_arrival_time_mean',
                                 'inter_arrival_time_std']
    flow_features[undefined_for_short_flows] = (
        flow_features[undefined_for_short_flows].fillna(0.0))

    # Calculate flow duration
    flow_features['flow_duration'] = (flow_features['timestamp_max'] -
                                       flow_features['timestamp_min'])

    # Calculate packets per second (the epsilon avoids division by zero for
    # flows whose first and last packet share a timestamp)
    flow_features['packets_per_sec'] = (flow_features['timestamp_count'] /
                                         (flow_features['flow_duration'] + 1e-6))

    print(f"  Created {len(flow_features)} flow records with {len(flow_features.columns)} features")

    return flow_features

# Collapse the labeled packet table into one row per flow.
flow_data = engineer_features(combined_df)

# Show sample
print("\nSample features:")
print(flow_data.head())
print("\n‚úÖ Feature engineering complete")

## 💾 Cell 6: Save to HDF5 Format

In [None]:
%%time
import h5py
import numpy as np
from sklearn.model_selection import train_test_split

# Prepare data for HDF5
print("Preparing data for HDF5...")

# Select numeric features only (flow_id is an identifier, label_max the target)
feature_cols = [col for col in flow_data.columns 
                if col not in ['flow_id', 'label_max'] 
                and flow_data[col].dtype in [np.float64, np.int64, np.float32, np.int32]]

X = flow_data[feature_cols].values.astype(np.float64)
y = flow_data['label_max'].values

# Statistics such as the std of a single-packet flow are NaN. Replace them
# so no NaN is written to the training file.
X = np.nan_to_num(X, nan=0.0)

# Train/val/test split (60/20/20)
# Split BEFORE scaling: the scaler must only see training data, otherwise
# validation/test statistics leak into the normalization.
X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val_raw, X_test_raw, y_val, y_test = train_test_split(X_temp_raw, y_temp, test_size=0.5, random_state=42)

# Normalize features: fit on train only, then apply the same transform
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
X_test = scaler.transform(X_test_raw)

print(f"\nDataset splits:")
print(f"  Train: {len(X_train)} samples")
print(f"  Val:   {len(X_val)} samples")
print(f"  Test:  {len(X_test)} samples")

# Save to HDF5
output_path = '/kaggle/working/iot23_processed.h5'

with h5py.File(output_path, 'w') as f:
    # Training data
    f.create_dataset('train/features', data=X_train, compression='gzip')
    f.create_dataset('train/labels', data=y_train, compression='gzip')
    
    # Validation data
    f.create_dataset('val/features', data=X_val, compression='gzip')
    f.create_dataset('val/labels', data=y_val, compression='gzip')
    
    # Test data
    f.create_dataset('test/features', data=X_test, compression='gzip')
    f.create_dataset('test/labels', data=y_test, compression='gzip')
    
    # Scaler parameters, so the identical normalization can be applied to
    # new data later without access to this notebook's `scaler` object
    f.create_dataset('scaler/mean', data=scaler.mean_)
    f.create_dataset('scaler/scale', data=scaler.scale_)
    
    # Metadata
    f.attrs['num_features'] = X.shape[1]
    f.attrs['feature_names'] = ','.join(feature_cols)
    f.attrs['attack_ratio'] = float(y.sum() / len(y))

file_size_mb = os.path.getsize(output_path) / 1e6
print(f"\n‚úÖ Saved to {output_path} ({file_size_mb:.1f} MB)")

## ✅ Cell 7: Verify and Summarize

In [None]:
# Re-open the file that was just written and report what it contains
print("Verifying HDF5 file...\n")


def print_structure(name, obj):
    """Print path, shape and dtype of a visited HDF5 dataset (groups are skipped)."""
    if not isinstance(obj, h5py.Dataset):
        return
    print(f"  {name}: {obj.shape} ({obj.dtype})")


with h5py.File(output_path, 'r') as f:
    print("Dataset structure:")
    # Walk every object in the file; only datasets produce output
    f.visititems(print_structure)

    print("\nMetadata:")
    for key in f.attrs:
        value = f.attrs[key]
        print(f"  {key}: {value}")

# Closing banner and hand-over instructions
separator = "=" * 60
print("\n" + separator)
print("PREPROCESSING COMPLETE!")
print(separator)
print(f"\nüìÅ Output file: {output_path}")
print(f"üìä Size: {file_size_mb:.1f} MB")
print(f"\nüì• Next steps:")
for step in (
    "   1. Download this file from Output tab",
    "   2. Upload as Kaggle dataset: 'iot23-processed'",
    "   3. Use in training notebook!",
):
    print(step)
print("\nüí∞ Cost: $0 (Kaggle free CPU)")
print("‚è±Ô∏è  Time: ~2-3 hours")