In [4]:
import os
import pandas as pd
import numpy as np
from sklearn.utils import resample
from joblib import dump
import time
import psutil

# =============================================
# 1. Configuration (ADJUST AS NEEDED)
# =============================================
DATA_DIR = "."  # Directory with 23 files
# Add this right after DATA_DIR definition
print("\nInspecting file structure...")
sample_file = [f for f in os.listdir(DATA_DIR) if f.endswith(".labeled")][0]
with open(os.path.join(DATA_DIR, sample_file), 'r') as f:
    for _ in range(5):  # Print first 5 lines
        print(f.readline().strip())
OUTPUT_FILE = "iot23_balanced_2gb_noSMOTE.joblib"  # Output filename
TARGET_SIZE_GB = 1.8  # Target dataset size (1.5-2GB)
MAX_RAM_USAGE = 9  # GB (2GB buffer for system)

# Essential columns to keep (reduces memory)
ESSENTIAL_COLS = [
    "ts", "uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p",
    "proto", "service", "duration", "orig_bytes", "resp_bytes",
    "conn_state", "local_orig", "local_resp", "missed_bytes",
    "history", "orig_pkts", "orig_ip_bytes", "resp_pkts", "resp_ip_bytes",
    "label"
]

# =============================================
# 2. Memory-Optimized File Sampling
# =============================================
def sample_with_memory_control(filepath, target_rows):
    """Samples large files without full memory load"""
    # Get approximate row count
    with open(filepath, 'r') as f:
        total_lines = sum(1 for line in f if not line.startswith('#'))
    
    # Dynamic chunksize (1% of file or 100k max)
    chunksize = min(100000, max(50000, int(total_lines * 0.01)))
    samples = []
    
    for chunk in pd.read_csv(
        filepath,
        sep="\t",
        comment="#",
        chunksize=chunksize,
        usecols=ESSENTIAL_COLS,
        low_memory=True
    ):
        # Sample 5-15% from each chunk
        sample_frac = min(0.15, max(0.05, target_rows/(total_lines * 2)))
        samples.append(chunk.sample(frac=sample_frac, random_state=42))
        
        # Early termination
        if sum(len(s) for s in samples) >= target_rows * 0.9:
            break
    
    # Combine and return exact target size
    combined = pd.concat(samples)
    return combined.sample(min(target_rows, len(combined)), random_state=42)

# =============================================
# 3. Balanced Dataset Creation
# =============================================
def create_balanced_dataset():
    print("Starting balanced dataset creation...")
    start_time = time.time()
    mem = psutil.virtual_memory()
    print(f"Available RAM: {mem.available/(1024**3):.1f}GB")
    
    # --------------------------------------------------
    # Phase 1: Sample from all files with memory control
    # --------------------------------------------------
    all_samples = []
    target_per_file = 120000  # Initial target rows per file
    
    for i, filename in enumerate(sorted(os.listdir(DATA_DIR))):
        if not filename.endswith(".labeled"):
            continue
            
        filepath = os.path.join(DATA_DIR, filename)
        file_size_gb = os.path.getsize(filepath)/(1024**3)
        
        # Adjust sample size based on file size
        file_target = int(target_per_file * min(2, 1 + file_size_gb/4))
        
        print(f"[{i+1}/23] Sampling {filename} ({file_size_gb:.1f}GB) -> {file_target} rows...")
        sample = sample_with_memory_control(filepath, file_target)
        all_samples.append(sample)
        
        # Memory safeguard
        current_ram = psutil.virtual_memory().used/(1024**3)
        if current_ram > MAX_RAM_USAGE:
            print(f"Memory alert ({current_ram:.1f}GB). Reducing future samples.")
            target_per_file = int(target_per_file * 0.7)
    
    # --------------------------------------------------
    # Phase 2: Combine and preprocess
    # --------------------------------------------------
    print("\nCombining samples...")
    combined = pd.concat(all_samples, ignore_index=True)
    del all_samples  # Free memory
    
    # Memory optimization
    for col in ["duration", "orig_bytes", "resp_bytes"]:
        combined[col] = pd.to_numeric(combined[col], downcast="float")
    
    # Categorical encoding
    for col in ["proto", "service", "conn_state"]:
        combined[col] = combined[col].astype("category").cat.codes
    
    # --------------------------------------------------
    # Phase 3: Manual class balancing
    # --------------------------------------------------
    print("\nBalancing classes without SMOTE...")
    
    # Get class distribution
    class_counts = combined["label"].value_counts()
    min_class = class_counts.idxmin()
    min_count = class_counts.min()
    
    balanced_dfs = []
    
    # Downsample majority classes
    for class_name in class_counts.index:
        class_df = combined[combined["label"] == class_name]
        if len(class_df) > min_count * 1.2:  # Only downsample if significantly larger
            class_df = class_df.sample(min_count, random_state=42)
        balanced_dfs.append(class_df)
    
    final_df = pd.concat(balanced_dfs, ignore_index=True)
    
    # --------------------------------------------------
    # Phase 4: Final size adjustment
    # --------------------------------------------------
    # Calculate memory usage
    final_size = final_df.memory_usage(deep=True).sum() / (1024**3)
    
    if final_size > TARGET_SIZE_GB * 1.1:
        reduction = TARGET_SIZE_GB / final_size
        final_df = final_df.sample(frac=reduction, random_state=42)
        final_size = final_df.memory_usage(deep=True).sum() / (1024**3)
    
    print(f"\nFinal Dataset Stats:")
    print(f"- Size: {final_size:.2f}GB")
    print(f"- Samples: {len(final_df):,}")
    print(f"- Class Distribution:\n{final_df['label'].value_counts()}")
    
    # Prepare for saving
    X = final_df.drop("label", axis=1)
    y = final_df["label"]
    
    # Save
    dump({
        "features": X,
        "labels": y,
        "feature_names": list(X.columns),
        "metadata": {
            "source_files": os.listdir(DATA_DIR),
            "creation_time": pd.Timestamp.now(),
            "balance_method": "manual_downsampling"
        }
    }, OUTPUT_FILE)
    
    # Performance report
    total_time = (time.time() - start_time)/60
    print(f"\nProcessing completed in {total_time:.1f} minutes")
    print(f"Dataset saved to {OUTPUT_FILE}")

# =============================================
# 4. Execute with error handling
# =============================================
if __name__ == "__main__":
    try:
        create_balanced_dataset()
    except MemoryError:
        print("\nMemory Error! Try reducing target_per_file in config")
    except Exception as e:
        print(f"\nError: {str(e)}")


Inspecting file structure...
#separator \x09
#set_separator	,
#empty_field	(empty)
#unset_field	-
#path	conn
Starting balanced dataset creation...
Available RAM: 7.0GB
[2/23] Sampling 1.labeled (0.1GB) -> 124143 rows...

Error: Usecols do not match columns, columns expected but not found: ['history', 'ts', 'resp_pkts', 'id.orig_p', 'conn_state', 'local_orig', 'duration', 'label', 'id.resp_h', 'orig_bytes', 'resp_ip_bytes', 'id.resp_p', 'service', 'orig_pkts', 'orig_ip_bytes', 'uid', 'local_resp', 'proto', 'missed_bytes', 'id.orig_h', 'resp_bytes']


In [5]:
import os
import pandas as pd
import numpy as np
from sklearn.utils import resample
from joblib import dump
import time
import psutil

# =============================================
# 1. Configuration (ADJUST AS NEEDED)
# =============================================
DATA_DIR = "."  # Directory with 23 files
OUTPUT_FILE = "iot23_balanced_2gb_noSMOTE.joblib"  # Output filename
TARGET_SIZE_GB = 1.8  # Target dataset size (1.5-2GB)
MAX_RAM_USAGE = 9  # GB (2GB buffer for system)

# Essential columns to keep (reduces memory)
ESSENTIAL_COLS = [
    "ts", "uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p",
    "proto", "service", "duration", "orig_bytes", "resp_bytes",
    "conn_state", "local_orig", "local_resp", "missed_bytes",
    "history", "orig_pkts", "orig_ip_bytes", "resp_pkts", "resp_ip_bytes",
    "label"
]

# =============================================
# 2. Memory-Optimized File Sampling
# =============================================
def sample_with_memory_control(filepath, target_rows):
    """Samples large files without full memory load"""
    # First read the header to get column names
    with open(filepath, 'r') as f:
        for line in f:
            if line.startswith('#fields'):
                columns = line.strip().split('\t')[1:]  # Get column names after #fields
                break
    
    # Get approximate row count
    with open(filepath, 'r') as f:
        total_lines = sum(1 for line in f if not line.startswith('#'))
    
    # Dynamic chunksize (1% of file or 100k max)
    chunksize = min(100000, max(50000, int(total_lines * 0.01)))
    samples = []
    
    for chunk in pd.read_csv(
        filepath,
        sep="\t",
        comment="#",
        chunksize=chunksize,
        names=columns,  # Use the columns we extracted
        usecols=[col for col in ESSENTIAL_COLS if col in columns],  # Only use existing columns
        low_memory=True
    ):
        # Sample 5-15% from each chunk
        sample_frac = min(0.15, max(0.05, target_rows/(total_lines * 2)))
        samples.append(chunk.sample(frac=sample_frac, random_state=42))
        
        # Early termination
        if sum(len(s) for s in samples) >= target_rows * 0.9:
            break
    
    # Combine and return exact target size
    combined = pd.concat(samples)
    return combined.sample(min(target_rows, len(combined)), random_state=42)

# =============================================
# 3. Balanced Dataset Creation
# =============================================
def create_balanced_dataset():
    print("Starting balanced dataset creation...")
    start_time = time.time()
    mem = psutil.virtual_memory()
    print(f"Available RAM: {mem.available/(1024**3):.1f}GB")
    
    # --------------------------------------------------
    # Phase 1: Sample from all files with memory control
    # --------------------------------------------------
    all_samples = []
    target_per_file = 120000  # Initial target rows per file
    
    labeled_files = [f for f in sorted(os.listdir(DATA_DIR)) if f.endswith(".labeled")]
    if not labeled_files:
        raise FileNotFoundError("No .labeled files found in directory")
    
    for i, filename in enumerate(labeled_files):
        filepath = os.path.join(DATA_DIR, filename)
        file_size_gb = os.path.getsize(filepath)/(1024**3)
        
        # Adjust sample size based on file size
        file_target = int(target_per_file * min(2, 1 + file_size_gb/4))
        
        print(f"[{i+1}/{len(labeled_files)}] Sampling {filename} ({file_size_gb:.1f}GB) -> {file_target} rows...")
        sample = sample_with_memory_control(filepath, file_target)
        all_samples.append(sample)
        
        # Memory safeguard
        current_ram = psutil.virtual_memory().used/(1024**3)
        if current_ram > MAX_RAM_USAGE:
            print(f"Memory alert ({current_ram:.1f}GB). Reducing future samples.")
            target_per_file = int(target_per_file * 0.7)
    
    # --------------------------------------------------
    # Phase 2: Combine and preprocess
    # --------------------------------------------------
    print("\nCombining samples...")
    combined = pd.concat(all_samples, ignore_index=True)
    del all_samples  # Free memory
    
    # Memory optimization
    for col in ["duration", "orig_bytes", "resp_bytes"]:
        if col in combined.columns:
            combined[col] = pd.to_numeric(combined[col], downcast="float")
    
    # Categorical encoding
    for col in ["proto", "service", "conn_state"]:
        if col in combined.columns:
            combined[col] = combined[col].astype("category").cat.codes
    
    # --------------------------------------------------
    # Phase 3: Manual class balancing
    # --------------------------------------------------
    print("\nBalancing classes without SMOTE...")
    
    if "label" not in combined.columns:
        raise KeyError("'label' column not found in the data")
    
    # Get class distribution
    class_counts = combined["label"].value_counts()
    min_class = class_counts.idxmin()
    min_count = class_counts.min()
    
    balanced_dfs = []
    
    # Downsample majority classes
    for class_name in class_counts.index:
        class_df = combined[combined["label"] == class_name]
        if len(class_df) > min_count * 1.2:  # Only downsample if significantly larger
            class_df = class_df.sample(min_count, random_state=42)
        balanced_dfs.append(class_df)
    
    final_df = pd.concat(balanced_dfs, ignore_index=True)
    
    # --------------------------------------------------
    # Phase 4: Final size adjustment
    # --------------------------------------------------
    # Calculate memory usage
    final_size = final_df.memory_usage(deep=True).sum() / (1024**3)
    
    if final_size > TARGET_SIZE_GB * 1.1:
        reduction = TARGET_SIZE_GB / final_size
        final_df = final_df.sample(frac=reduction, random_state=42)
        final_size = final_df.memory_usage(deep=True).sum() / (1024**3)
    
    print(f"\nFinal Dataset Stats:")
    print(f"- Size: {final_size:.2f}GB")
    print(f"- Samples: {len(final_df):,}")
    print(f"- Class Distribution:\n{final_df['label'].value_counts()}")
    
    # Prepare for saving
    X = final_df.drop("label", axis=1)
    y = final_df["label"]
    
    # Save
    dump({
        "features": X,
        "labels": y,
        "feature_names": list(X.columns),
        "metadata": {
            "source_files": labeled_files,
            "creation_time": pd.Timestamp.now(),
            "balance_method": "manual_downsampling"
        }
    }, OUTPUT_FILE)
    
    # Performance report
    total_time = (time.time() - start_time)/60
    print(f"\nProcessing completed in {total_time:.1f} minutes")
    print(f"Dataset saved to {OUTPUT_FILE}")

# =============================================
# 4. Execute with error handling
# =============================================
if __name__ == "__main__":
    try:
        create_balanced_dataset()
    except MemoryError:
        print("\nMemory Error! Try reducing target_per_file in config")
    except Exception as e:
        print(f"\nError: {str(e)}")

Starting balanced dataset creation...
Available RAM: 7.3GB
[1/22] Sampling 1.labeled (0.1GB) -> 124143 rows...
[2/22] Sampling 17conn.log.labeled (7.6GB) -> 240000 rows...
[3/22] Sampling 20conn.log.labeled (0.0GB) -> 120011 rows...
[4/22] Sampling 21.labeled (0.0GB) -> 120011 rows...
[5/22] Sampling 3.labeled (0.0GB) -> 120681 rows...
[6/22] Sampling 33conn.log.labeled (7.3GB) -> 240000 rows...
[7/22] Sampling 34conn.log.labeled (0.0GB) -> 120084 rows...
[8/22] Sampling 35conn.log.labeled (1.2GB) -> 157444 rows...
[9/22] Sampling 36conn.log.labeled (1.7GB) -> 169952 rows...
[10/22] Sampling 39conn.log.labeled (10.1GB) -> 240000 rows...
[11/22] Sampling 42conn.log.labeled (0.0GB) -> 120016 rows...
[12/22] Sampling 43conn.log.labeled (8.7GB) -> 240000 rows...
[13/22] Sampling 44conn.log.labeled (0.0GB) -> 120000 rows...
[14/22] Sampling 48conn.log.labeled (0.5GB) -> 134856 rows...
[15/22] Sampling 49conn.log.labeled (0.8GB) -> 143850 rows...
[16/22] Sampling 4conn.log.labeled (0.0GB) ->

  for chunk in pd.read_csv(


[18/22] Sampling 5conn.log.labeled (0.0GB) -> 120005 rows...
[19/22] Sampling 60conn.log.labeled (0.4GB) -> 133063 rows...
[20/22] Sampling 7conn.log.labeled (0.0GB) -> 120000 rows...
[21/22] Sampling 8conn.log.labeled (0.0GB) -> 120039 rows...
[22/22] Sampling 9conn.log.labeled (0.9GB) -> 147756 rows...

Combining samples...

Error: Unable to parse string "-" at position 0


In [6]:
import os
import pandas as pd
import numpy as np
import time
import psutil

# =============================================
# 1. Configuration
# =============================================
DATA_DIR = "."  # Directory with files
OUTPUT_FILE = "iot23_unbalanced.csv"  # Output CSV filename
TARGET_SIZE_GB = 1.8  # Target dataset size (1.5-2GB)
MAX_RAM_USAGE = 9  # GB (2GB buffer for system)

# Essential columns to keep (reduces memory)
ESSENTIAL_COLS = [
    "ts", "uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p",
    "proto", "service", "duration", "orig_bytes", "resp_bytes",
    "conn_state", "local_orig", "local_resp", "missed_bytes",
    "history", "orig_pkts", "orig_ip_bytes", "resp_pkts", "resp_ip_bytes",
    "label"
]

# =============================================
# 2. Memory-Optimized File Sampling
# =============================================
def sample_with_memory_control(filepath, target_rows):
    """Samples large files with proper handling of missing values"""
    # First read the header to get column names
    with open(filepath, 'r') as f:
        for line in f:
            if line.startswith('#fields'):
                columns = line.strip().split('\t')[1:]  # Get column names after #fields
                break
    
    # Get approximate row count
    with open(filepath, 'r') as f:
        total_lines = sum(1 for line in f if not line.startswith('#'))
    
    # Dynamic chunksize (1% of file or 100k max)
    chunksize = min(100000, max(50000, int(total_lines * 0.01)))
    samples = []
    
    # Define NA values specific to Zeek logs
    na_values = ['(empty)', '-', 'NA', 'N/A', 'nan', 'NaN', '']
    
    for chunk in pd.read_csv(
        filepath,
        sep="\t",
        comment="#",
        chunksize=chunksize,
        names=columns,
        usecols=[col for col in ESSENTIAL_COLS if col in columns],
        low_memory=False,  # Better for mixed types
        na_values=na_values
    ):
        # Sample 5-15% from each chunk
        sample_frac = min(0.15, max(0.05, target_rows/(total_lines * 2)))
        samples.append(chunk.sample(frac=sample_frac, random_state=42))
        
        # Early termination
        if sum(len(s) for s in samples) >= target_rows * 0.9:
            break
    
    # Combine and return exact target size
    combined = pd.concat(samples)
    return combined.sample(min(target_rows, len(combined)), random_state=42)

# =============================================
# 3. Dataset Creation (Without Balancing)
# =============================================
def create_dataset():
    print("Starting dataset creation without class balancing...")
    start_time = time.time()
    mem = psutil.virtual_memory()
    print(f"Available RAM: {mem.available/(1024**3):.1f}GB")
    
    # --------------------------------------------------
    # Phase 1: Sample from all files with memory control
    # --------------------------------------------------
    all_samples = []
    target_per_file = 120000  # Initial target rows per file
    
    labeled_files = [f for f in sorted(os.listdir(DATA_DIR)) if f.endswith(".labeled")]
    if not labeled_files:
        raise FileNotFoundError("No .labeled files found in directory")
    
    for i, filename in enumerate(labeled_files):
        filepath = os.path.join(DATA_DIR, filename)
        file_size_gb = os.path.getsize(filepath)/(1024**3)
        
        # Adjust sample size based on file size
        file_target = int(target_per_file * min(2, 1 + file_size_gb/4))
        
        print(f"[{i+1}/{len(labeled_files)}] Sampling {filename} ({file_size_gb:.1f}GB) -> {file_target} rows...")
        sample = sample_with_memory_control(filepath, file_target)
        all_samples.append(sample)
        
        # Memory safeguard
        current_ram = psutil.virtual_memory().used/(1024**3)
        if current_ram > MAX_RAM_USAGE:
            print(f"Memory alert ({current_ram:.1f}GB). Reducing future samples.")
            target_per_file = int(target_per_file * 0.7)
    
    # --------------------------------------------------
    # Phase 2: Combine and preprocess
    # --------------------------------------------------
    print("\nCombining samples...")
    combined = pd.concat(all_samples, ignore_index=True)
    del all_samples  # Free memory
    
    # Handle missing values in numeric columns
    numeric_cols = ["duration", "orig_bytes", "resp_bytes", "orig_pkts", "resp_pkts", 
                   "orig_ip_bytes", "resp_ip_bytes", "missed_bytes"]
    for col in numeric_cols:
        if col in combined.columns:
            combined[col] = pd.to_numeric(combined[col], errors='coerce').fillna(0)
    
    # Categorical encoding
    for col in ["proto", "service", "conn_state"]:
        if col in combined.columns:
            combined[col] = combined[col].astype("category").cat.codes
    
    # --------------------------------------------------
    # Phase 3: Final size adjustment
    # --------------------------------------------------
    # Calculate memory usage
    final_size = combined.memory_usage(deep=True).sum() / (1024**3)
    
    if final_size > TARGET_SIZE_GB * 1.1:
        reduction = TARGET_SIZE_GB / final_size
        combined = combined.sample(frac=reduction, random_state=42)
        final_size = combined.memory_usage(deep=True).sum() / (1024**3)
    
    print(f"\nFinal Dataset Stats:")
    print(f"- Size: {final_size:.2f}GB")
    print(f"- Samples: {len(combined):,}")
    print(f"- Class Distribution:\n{combined['label'].value_counts()}")
    
    # Save as CSV in chunks to avoid memory issues
    print("\nSaving as CSV...")
    chunksize = 100000  # Rows per chunk
    for i in range(0, len(combined), chunksize):
        chunk = combined.iloc[i:i+chunksize]
        mode = 'w' if i == 0 else 'a'
        header = i == 0
        chunk.to_csv(OUTPUT_FILE, mode=mode, header=header, index=False)
        print(f"Saved rows {i} to {min(i+chunksize, len(combined))}")
    
    # Performance report
    total_time = (time.time() - start_time)/60
    print(f"\nProcessing completed in {total_time:.1f} minutes")
    print(f"Dataset saved to {OUTPUT_FILE}")

# =============================================
# 4. Execute with error handling
# =============================================
if __name__ == "__main__":
    try:
        create_dataset()
    except MemoryError:
        print("\nMemory Error! Try reducing target_per_file in config")
    except Exception as e:
        print(f"\nError: {str(e)}")

Starting dataset creation without class balancing...
Available RAM: 6.6GB
[1/22] Sampling 1.labeled (0.1GB) -> 124143 rows...
[2/22] Sampling 17conn.log.labeled (7.6GB) -> 240000 rows...
[3/22] Sampling 20conn.log.labeled (0.0GB) -> 120011 rows...
[4/22] Sampling 21.labeled (0.0GB) -> 120011 rows...
[5/22] Sampling 3.labeled (0.0GB) -> 120681 rows...
[6/22] Sampling 33conn.log.labeled (7.3GB) -> 240000 rows...
[7/22] Sampling 34conn.log.labeled (0.0GB) -> 120084 rows...
[8/22] Sampling 35conn.log.labeled (1.2GB) -> 157444 rows...
[9/22] Sampling 36conn.log.labeled (1.7GB) -> 169952 rows...
[10/22] Sampling 39conn.log.labeled (10.1GB) -> 240000 rows...
[11/22] Sampling 42conn.log.labeled (0.0GB) -> 120016 rows...
[12/22] Sampling 43conn.log.labeled (8.7GB) -> 240000 rows...
[13/22] Sampling 44conn.log.labeled (0.0GB) -> 120000 rows...
[14/22] Sampling 48conn.log.labeled (0.5GB) -> 134856 rows...
[15/22] Sampling 49conn.log.labeled (0.8GB) -> 143850 rows...
[16/22] Sampling 4conn.log.lab

In [8]:
import os
import pandas as pd
import numpy as np
import time
import psutil

# =============================================
# 1. Configuration
# =============================================
DATA_DIR = "."  # Directory with files
OUTPUT_FILE = "iot23_unbalanced.csv"  # Output CSV filename
TARGET_SIZE_GB = 1.8  # Target dataset size (1.5-2GB)
MAX_RAM_USAGE = 9  # GB (2GB buffer for system)

# Essential columns to keep (reduces memory)
ESSENTIAL_COLS = [
    "ts", "uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p",
    "proto", "service", "duration", "orig_bytes", "resp_bytes",
    "conn_state", "missed_bytes", "history", "orig_pkts",
    "orig_ip_bytes", "resp_pkts", "resp_ip_bytes", "ip_proto"
    # Removed 'label' since it's not found in your data
]

# =============================================
# 2. Memory-Optimized File Sampling
# =============================================
def sample_with_memory_control(filepath, target_rows):
    """Samples large files with proper handling of missing values"""
    # First read the header to get column names
    with open(filepath, 'r') as f:
        for line in f:
            if line.startswith('#fields'):
                columns = line.strip().split('\t')[1:]  # Get column names after #fields
                break
    
    # Get approximate row count
    with open(filepath, 'r') as f:
        total_lines = sum(1 for line in f if not line.startswith('#'))
    
    # Dynamic chunksize (1% of file or 100k max)
    chunksize = min(100000, max(50000, int(total_lines * 0.01)))
    samples = []
    
    # Define NA values specific to Zeek logs
    na_values = ['(empty)', '-', 'NA', 'N/A', 'nan', 'NaN', '']
    
    # Only include columns that exist in the file
    usecols = [col for col in ESSENTIAL_COLS if col in columns]
    
    for chunk in pd.read_csv(
        filepath,
        sep="\t",
        comment="#",
        chunksize=chunksize,
        names=columns,
        usecols=usecols,
        low_memory=False,  # Better for mixed types
        na_values=na_values
    ):
        # Sample 5-15% from each chunk
        sample_frac = min(0.15, max(0.05, target_rows/(total_lines * 2)))
        samples.append(chunk.sample(frac=sample_frac, random_state=42))
        
        # Early termination
        if sum(len(s) for s in samples) >= target_rows * 0.9:
            break
    
    # Combine and return exact target size
    combined = pd.concat(samples)
    return combined.sample(min(target_rows, len(combined)), random_state=42)

# =============================================
# 3. Dataset Creation (Without Balancing)
# =============================================
def create_dataset():
    print("Starting dataset creation without class balancing...")
    start_time = time.time()
    mem = psutil.virtual_memory()
    print(f"Available RAM: {mem.available/(1024**3):.1f}GB")
    
    # --------------------------------------------------
    # Phase 1: Sample from all files with memory control
    # --------------------------------------------------
    all_samples = []
    target_per_file = 120000  # Initial target rows per file
    
    labeled_files = [f for f in sorted(os.listdir(DATA_DIR)) if f.endswith(".labeled")]
    if not labeled_files:
        raise FileNotFoundError("No .labeled files found in directory")
    
    for i, filename in enumerate(labeled_files):
        filepath = os.path.join(DATA_DIR, filename)
        file_size_gb = os.path.getsize(filepath)/(1024**3)
        
        # Adjust sample size based on file size
        file_target = int(target_per_file * min(2, 1 + file_size_gb/4))
        
        print(f"[{i+1}/{len(labeled_files)}] Sampling {filename} ({file_size_gb:.1f}GB) -> {file_target} rows...")
        sample = sample_with_memory_control(filepath, file_target)
        all_samples.append(sample)
        
        # Memory safeguard
        current_ram = psutil.virtual_memory().used/(1024**3)
        if current_ram > MAX_RAM_USAGE:
            print(f"Memory alert ({current_ram:.1f}GB). Reducing future samples.")
            target_per_file = int(target_per_file * 0.7)
    
    # --------------------------------------------------
    # Phase 2: Combine and preprocess
    # --------------------------------------------------
    print("\nCombining samples...")
    combined = pd.concat(all_samples, ignore_index=True)
    del all_samples  # Free memory
    
    # Handle missing values in numeric columns
    numeric_cols = ["duration", "orig_bytes", "resp_bytes", "orig_pkts", "resp_pkts", 
                   "orig_ip_bytes", "resp_ip_bytes", "missed_bytes"]
    for col in numeric_cols:
        if col in combined.columns:
            combined[col] = pd.to_numeric(combined[col], errors='coerce').fillna(0)
    
    # Categorical encoding
    for col in ["proto", "service", "conn_state"]:
        if col in combined.columns:
            combined[col] = combined[col].astype("category").cat.codes
    
    # --------------------------------------------------
    # Phase 3: Final size adjustment
    # --------------------------------------------------
    # Calculate memory usage
    final_size = combined.memory_usage(deep=True).sum() / (1024**3)
    
    if final_size > TARGET_SIZE_GB * 1.1:
        reduction = TARGET_SIZE_GB / final_size
        combined = combined.sample(frac=reduction, random_state=42)
        final_size = combined.memory_usage(deep=True).sum() / (1024**3)
    
    print(f"\nFinal Dataset Stats:")
    print(f"- Size: {final_size:.2f}GB")
    print(f"- Samples: {len(combined):,}")
    
    # Check if label column exists before trying to print distribution
    if 'label' in combined.columns:
        print(f"- Class Distribution:\n{combined['label'].value_counts()}")
    else:
        print("- No label column found in the data")
    
    # Save as CSV in chunks to avoid memory issues
    print("\nSaving as CSV...")
    chunksize = 100000  # Rows per chunk
    for i in range(0, len(combined), chunksize):
        chunk = combined.iloc[i:i+chunksize]
        mode = 'w' if i == 0 else 'a'
        header = i == 0
        chunk.to_csv(OUTPUT_FILE, mode=mode, header=header, index=False)
        print(f"Saved rows {i} to {min(i+chunksize, len(combined))}")
    
    # Performance report
    total_time = (time.time() - start_time)/60
    print(f"\nProcessing completed in {total_time:.1f} minutes")
    print(f"Dataset saved to {OUTPUT_FILE}")

# =============================================
# 4. Execute with error handling
# =============================================
if __name__ == "__main__":
    try:
        create_dataset()
    except MemoryError:
        print("\nMemory Error! Try reducing target_per_file in config")
    except Exception as e:
        print(f"\nError: {str(e)}")

Starting dataset creation without class balancing...
Available RAM: 6.4GB
[1/22] Sampling 1.labeled (0.1GB) -> 124143 rows...
[2/22] Sampling 17conn.log.labeled (7.6GB) -> 240000 rows...
[3/22] Sampling 20conn.log.labeled (0.0GB) -> 120011 rows...
[4/22] Sampling 21.labeled (0.0GB) -> 120011 rows...
[5/22] Sampling 3.labeled (0.0GB) -> 120681 rows...
[6/22] Sampling 33conn.log.labeled (7.3GB) -> 240000 rows...
[7/22] Sampling 34conn.log.labeled (0.0GB) -> 120084 rows...
[8/22] Sampling 35conn.log.labeled (1.2GB) -> 157444 rows...
[9/22] Sampling 36conn.log.labeled (1.7GB) -> 169952 rows...
[10/22] Sampling 39conn.log.labeled (10.1GB) -> 240000 rows...
[11/22] Sampling 42conn.log.labeled (0.0GB) -> 120016 rows...
[12/22] Sampling 43conn.log.labeled (8.7GB) -> 240000 rows...
[13/22] Sampling 44conn.log.labeled (0.0GB) -> 120000 rows...
[14/22] Sampling 48conn.log.labeled (0.5GB) -> 134856 rows...
[15/22] Sampling 49conn.log.labeled (0.8GB) -> 143850 rows...
[16/22] Sampling 4conn.log.lab

In [10]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Configuration
DATA_DIR = "."  # Directory containing your conn.log.labeled files
OUTPUT_CSV = "network_training_data.csv"  # Output file name

# =============================================
# 1. Load and Parse Zeek Logs
# =============================================
def parse_zeek_log(filepath):
    """Parse a single Zeek conn.log.labeled file"""
    # Read the header to get column names
    with open(filepath, 'r') as f:
        for line in f:
            if line.startswith('#fields'):
                columns = line.strip().split('\t')[1:]
                break
    
    # Load data with proper handling of Zeek-specific values
    df = pd.read_csv(
        filepath,
        sep='\t',
        comment='#',
        names=columns,
        na_values=['(empty)', '-', 'NA'],
        low_memory=False
    )
    
    return df

# =============================================
# 2. Feature Engineering
# =============================================
def engineer_features(df):
    """Create ML-ready features from raw Zeek data"""
    
    # Basic cleaning
    df = df.dropna(subset=['id.orig_h', 'id.resp_h'])  # Remove rows missing IPs
    
    # Protocol handling
    proto_map = {0: 'unknown', 1: 'tcp', 2: 'udp', 3: 'icmp'}
    df['proto'] = df['proto'].map(proto_map).fillna('other')
    
    # Connection state encoding
    state_map = {
        'S0': 'attempt', 'S1': 'established', 'S2': 'attempt',
        'S3': 'attempt', 'SF': 'normal', 'REJ': 'rejected',
        'RSTO': 'reset', 'RSTR': 'reset', 'RSTOS0': 'reset',
        'RSTRH': 'reset', 'SH': 'scan', 'SHR': 'scan',
        'OTH': 'other'
    }
    df['conn_state'] = df['conn_state'].map(state_map).fillna('other')
    
    # Service inference from ports
    def infer_service(row):
        port = row['id.resp_p']
        if row['proto'] == 'tcp':
            if port == 80 or port == 443: return 'http'
            elif port == 22: return 'ssh'
            elif port == 21: return 'ftp'
        elif row['proto'] == 'udp' and port == 53: return 'dns'
        return 'other'
    
    df['service'] = df.apply(infer_service, axis=1)
    
    # Numeric features
    df['duration'] = df['duration'].fillna(0)
    df['bytes_ratio'] = (df['orig_bytes'] + 1) / (df['resp_bytes'] + 1)
    df['pkts_ratio'] = (df['orig_pkts'] + 1) / (df['resp_pkts'] + 1)
    
    return df

# =============================================
# 3. Label Processing
# =============================================
def process_labels(df):
    """Extract and encode labels from Zeek's annotations"""
    
    # Zeek typically adds labels like:
    # "label": "Malware", "Botnet", "Scan", etc.
    if 'label' not in df.columns:
        df['label'] = 'unknown'  # Default if no labels exist
    else:
        # Clean label strings
        df['label'] = df['label'].str.extract(r'\"(.*?)\"')[0].fillna('benign')
    
    # Encode labels
    le = LabelEncoder()
    df['label_encoded'] = le.fit_transform(df['label'])
    
    print("Label distribution:")
    print(df['label'].value_counts())
    
    return df, le

# =============================================
# 4. Main Processing Function
# =============================================
def process_all_files():
    all_data = []
    
    # Process each file
    for filename in os.listdir(DATA_DIR):
        if filename.endswith('.labeled'):
            print(f"Processing {filename}...")
            filepath = os.path.join(DATA_DIR, filename)
            df = parse_zeek_log(filepath)
            df = engineer_features(df)
            all_data.append(df)
    
    # Combine all data
    combined = pd.concat(all_data, ignore_index=True)
    
    # Process labels
    combined, label_encoder = process_labels(combined)
    
    # Select final features
    features = [
        'duration', 'orig_bytes', 'resp_bytes', 'bytes_ratio', 'pkts_ratio',
        'orig_pkts', 'resp_pkts', 'proto', 'service', 'conn_state',
        'label', 'label_encoded'
    ]
    features = [f for f in features if f in combined.columns]
    
    final_df = combined[features]
    
    # Save to CSV
    final_df.to_csv(OUTPUT_CSV, index=False)
    print(f"\nSaved processed data to {OUTPUT_CSV}")
    print(f"Final shape: {final_df.shape}")
    
    return final_df, label_encoder

# Run the processing
if __name__ == "__main__":
    df, le = process_all_files()

Processing 1.labeled...
Processing 20conn.log.labeled...
Processing 21.labeled...
Processing 3.labeled...
Processing 4conn.log.labeled...
Processing 5conn.log.labeled...
Processing 7conn.log.labeled...
Processing 8conn.log.labeled...
Processing 9conn.log.labeled...
Label distribution:
label
unknown    7561998
Name: count, dtype: int64

Saved processed data to network_training_data.csv
Final shape: (7561998, 12)


In [11]:
import numpy as np
import pandas as pd
# Assign the filename
fpath = r"9conn.log.labeled"
# Import file using np.recfromcsv: d
narray = np.recfromcsv(fpath, encoding=None)

# Print out first n records of d 
print(narray)
# string to search in file
word = 'fields'
with open(fpath, 'r') as fp:
    # read all lines in a list
    lines = fp.readlines()
    for line in lines:
        # check if string present on a current line
        if line.find(word) != -1:
            print(word, 'string exists in file')
            print('Line Number:', lines.index(line))
            print('Line:', line)
            linefields = line
x = linefields.split("\t")
df = pd.concat([pd.Series(l.astype(str).split('\t')) for l in narray], axis=1).T
print(df)


[('1532510451.648888\tCnR6zKxrWbFw26ua7\t192.168.100.111\t40008\t46.28.110.244\t123\tudp\t-\t0.004751\t48\t48\tSF\t-\t-\t0\tDd\t1\t76\t1\t76\t(empty)   Benign   -',)
 ('1532511003.118878\tCcfURS3zgEMzX0RqTc\t192.168.100.102\t57849\t192.168.100.111\t22\ttcp\t-\t108.421563\t2376\t96\tOTH\t-\t-\t0\tDAd\t5\t332\t2\t200\t(empty)   Benign   -',)
 ('1532511739.837566\tCPhAqD1EhNOoPVA6ha\t192.168.100.102\t59670\t192.168.100.111\t22\ttcp\t-\t0.001244\t21\t0\tS0\t-\t-\t0\tSAD\t3\t189\t0\t0\t(empty)   Benign   -',)
 ...
 ('1532591674.249972\tCxHWyM2vp5x1mCZmp3\t192.168.100.111\t39234\t192.121.45.63\t23\ttcp\t-\t165.760290\t0\t0\tS1\t-\t-\t0\tShA\t5\t220\t1\t44\t(empty)   Benign   -',)
 ('1532595145.171589\tCtmaRjXW7FOus0sPa\t168.102.14.4\t11\t192.168.100.111\t0\ticmp\t-\t-\t-\t-\tOTH\t-\t-\t0\t-\t1\t68\t0\t0\t(empty)   Benign   -',)
 ('1532595149.229773\tCUVfp02K1YEXmtDf2h\t192.94.118.248\t11\t192.168.100.111\t0\ticmp\t-\t-\t-\t-\tOTH\t-\t-\t0\t-\t1\t56\t0\t0\t(empty)   Benign   -',)]
fields stri

In [12]:
display(df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,1532510451.648888,CnR6zKxrWbFw26ua7,192.168.100.111,40008,46.28.110.244,123,udp,-,0.004751,48,...,SF,-,-,0,Dd,1,76,1,76,(empty) Benign -
1,1532511003.118878,CcfURS3zgEMzX0RqTc,192.168.100.102,57849,192.168.100.111,22,tcp,-,108.421563,2376,...,OTH,-,-,0,DAd,5,332,2,200,(empty) Benign -
2,1532511739.837566,CPhAqD1EhNOoPVA6ha,192.168.100.102,59670,192.168.100.111,22,tcp,-,0.001244,21,...,S0,-,-,0,SAD,3,189,0,0,(empty) Benign -
3,1532512077.774888,CFr56iYeYk4YZXrPf,192.168.100.102,59701,192.168.100.111,22,tcp,-,0.000738,21,...,S0,-,-,0,SAD,3,189,0,0,(empty) Benign -
4,1532512335.605565,C8a9xW23hF8nDgeNtj,192.168.100.111,36199,192.168.100.1,53,udp,dns,-,-,...,S0,-,-,0,D,1,58,0,0,(empty) Benign -
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6378288,1532595195.007148,CIHonx2ymqlX5DqnU2,192.168.100.111,28057,173.94.58.125,23,tcp,-,-,-,...,S0,-,-,0,S,1,40,0,0,(empty) Malicious PartOfAHorizontalPortScan
6378289,1532594748.215022,CwhtUD3BIdOP5L6Y8g,192.168.100.111,52876,69.196.96.231,23,tcp,-,209.874367,0,...,S1,-,-,0,ShA,8,360,1,44,(empty) Benign -
6378290,1532591674.249972,CxHWyM2vp5x1mCZmp3,192.168.100.111,39234,192.121.45.63,23,tcp,-,165.760290,0,...,S1,-,-,0,ShA,5,220,1,44,(empty) Benign -
6378291,1532595145.171589,CtmaRjXW7FOus0sPa,168.102.14.4,11,192.168.100.111,0,icmp,-,-,-,...,OTH,-,-,0,-,1,68,0,0,(empty) Benign -


In [23]:
df[20].values[1:100]

array(['(empty)   Benign   -', '(empty)   Benign   -',
       '(empty)   Benign   -', '(empty)   Benign   -',
       '(empty)   Benign   -', '(empty)   Benign   -',
       '(empty)   Benign   -', '(empty)   Benign   -',
       '(empty)   Benign   -', '(empty)   Benign   -',
       '(empty)   Benign   -', '(empty)   Benign   -',
       '(empty)   Benign   -', '(empty)   Benign   -',
       '(empty)   Benign   -', '(empty)   Benign   -',
       '(empty)   Benign   -', '(empty)   Benign   -',
       '(empty)   Benign   -', '(empty)   Benign   -',
       '(empty)   Benign   -', '(empty)   Benign   -',
       '(empty)   Benign   -', '(empty)   Benign   -',
       '(empty)   Benign   -', '(empty)   Benign   -',
       '(empty)   Benign   -', '(empty)   Benign   -',
       '(empty)   Benign   -', '(empty)   Benign   -',
       '(empty)   Benign   -', '(empty)   Benign   -',
       '(empty)   Benign   -', '(empty)   Benign   -',
       '(empty)   Benign   -', '(empty)   Benign   -',
       '(e

In [20]:
x

['#fields',
 'ts',
 'uid',
 'id.orig_h',
 'id.orig_p',
 'id.resp_h',
 'id.resp_p',
 'proto',
 'service',
 'duration',
 'orig_bytes',
 'resp_bytes',
 'conn_state',
 'local_orig',
 'local_resp',
 'missed_bytes',
 'history',
 'orig_pkts',
 'orig_ip_bytes',
 'resp_pkts',
 'resp_ip_bytes',
 'tunnel_parents   label   detailed-label\n']

In [21]:
x.pop(0)

'#fields'

In [24]:
x[20]='tunnel_parents   label   detailed-label'

In [25]:
x

['ts',
 'uid',
 'id.orig_h',
 'id.orig_p',
 'id.resp_h',
 'id.resp_p',
 'proto',
 'service',
 'duration',
 'orig_bytes',
 'resp_bytes',
 'conn_state',
 'local_orig',
 'local_resp',
 'missed_bytes',
 'history',
 'orig_pkts',
 'orig_ip_bytes',
 'resp_pkts',
 'resp_ip_bytes',
 'tunnel_parents   label   detailed-label']

In [26]:
df.columns=x

In [28]:
df

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,conn_state,local_orig,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents label detailed-label
0,1532510451.648888,CnR6zKxrWbFw26ua7,192.168.100.111,40008,46.28.110.244,123,udp,-,0.004751,48,...,SF,-,-,0,Dd,1,76,1,76,(empty) Benign -
1,1532511003.118878,CcfURS3zgEMzX0RqTc,192.168.100.102,57849,192.168.100.111,22,tcp,-,108.421563,2376,...,OTH,-,-,0,DAd,5,332,2,200,(empty) Benign -
2,1532511739.837566,CPhAqD1EhNOoPVA6ha,192.168.100.102,59670,192.168.100.111,22,tcp,-,0.001244,21,...,S0,-,-,0,SAD,3,189,0,0,(empty) Benign -
3,1532512077.774888,CFr56iYeYk4YZXrPf,192.168.100.102,59701,192.168.100.111,22,tcp,-,0.000738,21,...,S0,-,-,0,SAD,3,189,0,0,(empty) Benign -
4,1532512335.605565,C8a9xW23hF8nDgeNtj,192.168.100.111,36199,192.168.100.1,53,udp,dns,-,-,...,S0,-,-,0,D,1,58,0,0,(empty) Benign -
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6378288,1532595195.007148,CIHonx2ymqlX5DqnU2,192.168.100.111,28057,173.94.58.125,23,tcp,-,-,-,...,S0,-,-,0,S,1,40,0,0,(empty) Malicious PartOfAHorizontalPortScan
6378289,1532594748.215022,CwhtUD3BIdOP5L6Y8g,192.168.100.111,52876,69.196.96.231,23,tcp,-,209.874367,0,...,S1,-,-,0,ShA,8,360,1,44,(empty) Benign -
6378290,1532591674.249972,CxHWyM2vp5x1mCZmp3,192.168.100.111,39234,192.121.45.63,23,tcp,-,165.760290,0,...,S1,-,-,0,ShA,5,220,1,44,(empty) Benign -
6378291,1532595145.171589,CtmaRjXW7FOus0sPa,168.102.14.4,11,192.168.100.111,0,icmp,-,-,-,...,OTH,-,-,0,-,1,68,0,0,(empty) Benign -


In [30]:
df.to_csv("9.csv")