In [1]:
import os
import random
import shutil
import sys

from tqdm import tqdm

def create_sampled_zeek_log(input_file, output_file, target_size_mb=200, seed=None):
    """
    Creates a randomly sampled version of a Zeek conn.log.labeled file that is
    approximately target_size_mb MB.

    Lines starting with '#' are treated as metadata and are always kept, at the
    same position relative to the data as in the input (so metadata that follows
    the data, e.g. a trailing '#close' line, stays at the end instead of being
    moved into the header). All other lines are data lines; a uniform random
    subset of them is kept, in their original order.

    The byte budget is target_size_mb * 1.1 (a 10% buffer), and the number of
    sampled lines is derived from the ratio budget / input size, so the output
    size is only approximate.

    Args:
        input_file (str): Path to the large conn.log.labeled file
        output_file (str): Path for the output sampled file
        target_size_mb (int | float): Target size in MB (default: 200)
        seed (int | None): Optional seed for a private random generator so the
            same input yields the same sample. When None (default), the global
            ``random`` module state is used, exactly as before.

    Returns:
        None (writes sampled file to disk)

    Raises:
        SystemExit: If input_file does not exist (exit status 1).
    """
    # Validate inputs
    if not os.path.exists(input_file):
        print(f"Error: Input file '{input_file}' not found!")
        sys.exit(1)

    if os.path.exists(output_file):
        print(f"Warning: Output file '{output_file}' already exists and will be overwritten!")

    # Convert target size to bytes (with 10% buffer)
    target_bytes = int(target_size_mb * 1024 * 1024 * 1.1)
    total_bytes = os.path.getsize(input_file)

    # Check if sampling is needed
    if total_bytes <= target_bytes:
        print("Input file is already smaller than target size. Copying as-is.")
        try:
            # Byte-for-byte copy: no decoding, no newline translation.
            shutil.copyfile(input_file, output_file)
        except shutil.SameFileError:
            # Input and output are the same file: it is already "copied".
            # Opening it for writing here would truncate the input.
            pass
        return

    # Calculate approximate sampling ratio
    sample_ratio = target_bytes / total_bytes

    print(f"Creating sampled version of {input_file}")
    print(f"Original size: {total_bytes/(1024*1024):.2f} MB")
    print(f"Target size: ~{target_size_mb} MB")
    print(f"Sampling ratio: {sample_ratio:.4f}")

    # First pass: count metadata ('#') lines and data lines
    header_line_count = 0
    data_line_count = 0

    print("\nAnalyzing input file...")
    with open(input_file, 'r') as infile:
        for line in tqdm(infile, desc="Counting lines"):
            if line.startswith('#'):
                header_line_count += 1
            else:
                data_line_count += 1

    # Calculate number of lines to sample
    sample_lines = int(data_line_count * sample_ratio)
    print(f"\nWill sample {sample_lines:,} of {data_line_count:,} data lines")

    # Pick which data-line indices to keep. A private generator is used only
    # when a seed is given, so callers relying on random.seed() are unaffected.
    rng = random if seed is None else random.Random(seed)
    selected_indices = set(rng.sample(range(data_line_count), sample_lines))  # O(1) lookups

    # Second pass: collect output lines in file order. The input is read fully
    # before the output is opened, so sampling a file onto itself stays safe.
    output_lines = []
    lines_written = 0

    print("\nSampling data lines...")
    current_line = 0
    with open(input_file, 'r') as infile:
        for line in tqdm(infile, desc="Sampling", total=data_line_count + header_line_count):
            if line.startswith('#'):
                # Metadata is kept where it occurs (header first, footer last).
                output_lines.append(line)
            else:
                if current_line in selected_indices:
                    output_lines.append(line)
                    lines_written += 1
                current_line += 1

    # Write output file
    print("\nWriting output file...")
    with open(output_file, 'w') as outfile:
        outfile.writelines(output_lines)

    # Verify output
    output_size = os.path.getsize(output_file)
    print("\nSampling complete!")
    print(f"Output file: {output_file}")
    print(f"Output size: {output_size/(1024*1024):.2f} MB")
    print(f"Lines written: {lines_written:,}")
    print(f"Headers preserved: {header_line_count}")

if __name__ == "__main__":
    # Demo invocation; point both paths at your own files before running.
    input_file = "36conn.log.labeled"        # large labeled conn log to sample from
    output_file = "sampl36conn.log.labeled"  # where the sampled subset is written

    # Default target size is used.
    create_sampled_zeek_log(input_file=input_file, output_file=output_file)

Creating sampled version of 36conn.log.labeled
Original size: 1705.03 MB
Target size: ~200 MB
Sampling ratio: 0.1290

Analyzing input file...


Counting lines: 13645107it [00:12, 1089460.47it/s]



Will sample 1,760,627 of 13,645,098 data lines

Sampling data lines...


Sampling: 100%|██████████| 13645107/13645107 [00:13<00:00, 1004336.63it/s]



Writing output file...

Sampling complete!
Output file: sampl36conn.log.labeled
Output size: 221.68 MB
Lines written: 1,760,627
Headers preserved: 9
