In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm  # for progress bar

# Configuration
fpath = "36conn.log.labeled"
output_csv = "36.csv"
target_size_mb = 200
chunksize = 100000  # Number of rows to process at a time
size_safety_factor = 1.5  # Shrinks the byte-ratio estimate so the CSV stays under the target
random_seed = 42  # Seeds the sampler once, so reruns select the same rows

# =============================================
# 1. Get column headers and count data rows (one pass over the file)
# =============================================
# Lines starting with '#' are metadata; the first '#fields' line carries the
# tab-separated column names. Every other line is counted as one data row.
# NOTE(review): the recorded run shows 21 columns, with the last one named
# "tunnel_parents label detailed-label" -- those three names are apparently
# not tab-separated in this file, so they (and their values) arrive as a
# single column. The schema is left untouched here; split that column
# downstream if separate label columns are needed.
columns = None
total_rows = 0
with open(fpath, 'r') as fp:
    for line in fp:
        if line.startswith('#'):
            if columns is None and line.startswith('#fields'):
                columns = line.strip().split('\t')[1:]  # Drop the '#fields' token itself
        else:
            total_rows += 1

if columns is None:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError(f"No '#fields' header line found in {fpath!r}")

# =============================================
# 2. Calculate how many rows we need for ~200MB
# =============================================
# Ratio of target bytes to input bytes, reduced by the safety factor.
# Clamped to 1.0: DataFrame.sample(frac > 1) raises unless replace=True, which
# would happen for any input smaller than target_size_mb / size_safety_factor.
input_bytes = os.path.getsize(fpath)
sample_fraction = min(
    1.0,
    (target_size_mb * 1024 * 1024) / (input_bytes * size_safety_factor),
)
rows_needed = int(total_rows * sample_fraction)

print(f"Sampling ~{rows_needed:,} rows to achieve ~{target_size_mb}MB output")

# =============================================
# 3. Process the file in chunks and sample
# =============================================
sampled_chunks = []
rows_processed = 0

# One RandomState shared by all chunks: passing the integer seed to every
# chunk.sample() call would restart the generator each time and pick the same
# row offsets inside every equally sized chunk.
sampler_state = np.random.RandomState(random_seed)

# Read the file in chunks
for chunk in tqdm(pd.read_csv(
    fpath,
    sep='\t',
    comment='#',
    names=columns,
    chunksize=chunksize,
    low_memory=False
), desc="Processing chunks"):

    # Take the same fraction from every chunk. The fraction alone bounds the
    # total, so the whole file is read and the tail is never skipped.
    chunk_sample = chunk.sample(frac=sample_fraction, random_state=sampler_state)
    sampled_chunks.append(chunk_sample)

    rows_processed += len(chunk_sample)

# Combine all sampled chunks (pd.concat raises on an empty list)
if sampled_chunks:
    df = pd.concat(sampled_chunks, ignore_index=True)
else:
    df = pd.DataFrame(columns=columns)

# =============================================
# 4. Save the sampled data
# =============================================
df.to_csv(output_csv, index=False)
print(f"\nSaved sampled data to {output_csv}")
print(f"Final size: {os.path.getsize(output_csv)/(1024*1024):.2f} MB")
print(f"Rows: {len(df):,}")
print(f"Columns: {len(df.columns)}")

# Display the first few rows
display(df.head())

Sampling ~1,067,047 rows to achieve ~200MB output


Processing chunks: 136it [00:52,  2.60it/s]



Saved sampled data to 36.csv
Final size: 134.24 MB
Rows: 1,067,047
Columns: 21


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,conn_state,local_orig,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents label detailed-label
0,1545396000.0,CpGY7n1zgaCbdhIlOb,192.168.1.198,36097,121.146.80.205,37215,tcp,-,-,-,...,S0,-,-,0,S,1,40,0,0,- Malicious Okiru
1,1545396000.0,CQJt4v9fksL87VYG2,192.168.1.198,36097,155.26.72.36,37215,tcp,-,-,-,...,S0,-,-,0,S,1,40,0,0,- Malicious Okiru
2,1545396000.0,CObs9T3Uwc6NvPenC3,192.168.1.198,36097,189.94.119.137,37215,tcp,-,-,-,...,S0,-,-,0,S,1,40,0,0,- Malicious Okiru
3,1545396000.0,CKJNI92YP8Vmbf9dn9,192.168.1.198,36097,184.214.205.132,37215,tcp,-,-,-,...,S0,-,-,0,S,1,40,0,0,- Malicious Okiru
4,1545396000.0,C7ATyU3vSLjm5Ktwli,192.168.1.198,36097,197.22.8.56,37215,tcp,-,-,-,...,S0,-,-,0,S,1,40,0,0,- Malicious Okiru


In [3]:
# Writes `df` to `output_csv` without the index. The first cell already ends
# with this same call, so rerunning it only matters if `df` changed in between.
df.to_csv(output_csv, index=False)

In [None]:
import os
import random
import sys
from tqdm import tqdm

def create_sampled_zeek_log(input_file, output_file, target_size_mb=200, seed=None):
    """
    Creates a sampled version of a Zeek conn.log.labeled file that's approximately target_size_mb MB.

    Lines starting with '#' are treated as metadata and are always kept.
    Metadata that appears after the last data line (for example a closing
    footer) is written after the sampled data; all other metadata is written
    before it. Data lines are drawn uniformly at random without replacement
    and keep their original relative order.

    The file is processed as raw bytes, so the output is byte-exact for every
    kept line regardless of the file's text encoding or newline style.

    Args:
        input_file (str): Path to the large conn.log.labeled file
        output_file (str): Path for the output sampled file
        target_size_mb (int | float): Target size in MB (default: 200)
        seed (int | None): Seed for the line selection. None (default) uses
            the global `random` module state, as before; an integer makes the
            selection repeatable.

    Returns:
        None (writes sampled file to disk)

    Exits:
        Calls sys.exit(1) after printing an error if input_file does not exist.
    """
    import shutil  # local import: only the copy-as-is branch needs it

    # Validate inputs
    if not os.path.exists(input_file):
        print(f"Error: Input file '{input_file}' not found!")
        sys.exit(1)

    if os.path.exists(output_file):
        print(f"Warning: Output file '{output_file}' already exists and will be overwritten!")

    # Convert target size to bytes (with 10% buffer)
    target_bytes = int(target_size_mb * 1024 * 1024 * 1.1)
    total_bytes = os.path.getsize(input_file)

    # Check if sampling is needed
    if total_bytes <= target_bytes:
        print("Input file is already smaller than target size. Copying as-is.")
        # If both paths are the same file there is nothing to copy, and opening
        # it for writing would truncate the input.
        same_file = os.path.exists(output_file) and os.path.samefile(input_file, output_file)
        if not same_file:
            shutil.copyfile(input_file, output_file)
        return

    # Calculate approximate sampling ratio
    sample_ratio = target_bytes / total_bytes

    print(f"Creating sampled version of {input_file}")
    print(f"Original size: {total_bytes/(1024*1024):.2f} MB")
    print(f"Target size: ~{target_size_mb} MB")
    print(f"Sampling ratio: {sample_ratio:.4f}")

    # First pass: count data lines and collect metadata lines.
    header_lines = []    # '#' lines written before the data
    pending_meta = []    # '#' lines seen since the most recent data line
    data_line_count = 0

    print("\nAnalyzing input file...")
    with open(input_file, 'rb') as infile:
        for line in tqdm(infile, desc="Counting lines"):
            if line.startswith(b'#'):
                if data_line_count:
                    pending_meta.append(line)
                else:
                    header_lines.append(line)
            else:
                # More data follows, so the pending '#' lines were not a
                # footer; keep them with the header block, in file order.
                if pending_meta:
                    header_lines.extend(pending_meta)
                    pending_meta = []
                data_line_count += 1

    # Whatever is still pending came after the last data line: the footer.
    footer_lines = pending_meta
    meta_line_count = len(header_lines) + len(footer_lines)

    # Calculate number of lines to sample
    sample_lines = int(data_line_count * sample_ratio)
    print(f"\nWill sample {sample_lines:,} of {data_line_count:,} data lines")

    # Choose which data-line indices to keep; a set gives O(1) membership tests.
    rng = random if seed is None else random.Random(seed)
    selected_indices_set = set(rng.sample(range(data_line_count), sample_lines))

    # Second pass: collect the selected data lines in file order
    sampled_lines = []

    print("\nSampling data lines...")
    current_line = 0
    with open(input_file, 'rb') as infile:
        for line in tqdm(infile, desc="Sampling", total=data_line_count + meta_line_count):
            if not line.startswith(b'#'):
                if current_line in selected_indices_set:
                    sampled_lines.append(line)
                current_line += 1

    # Write output file: header, sampled data, then footer
    print("\nWriting output file...")
    with open(output_file, 'wb') as outfile:
        outfile.writelines(header_lines)
        outfile.writelines(sampled_lines)
        outfile.writelines(footer_lines)

    # Verify output
    output_size = os.path.getsize(output_file)
    print(f"\nSampling complete!")
    print(f"Output file: {output_file}")
    print(f"Output size: {output_size/(1024*1024):.2f} MB")
    print(f"Lines written: {len(sampled_lines):,}")
    print(f"Headers preserved: {meta_line_count}")

if __name__ == "__main__":
    # Paths for the example run -- edit these to point at your own files.
    input_file = "36conn.log.labeled"        # large source log
    output_file = "sampl36conn.log.labeled"  # where the sampled copy is written

    # target_size_mb is left at its default.
    create_sampled_zeek_log(input_file=input_file, output_file=output_file)