# Iberian outage analysis

We have access to daily data for 2025.

In [None]:
%pip install boto3

import os
import re
import time
import gzip
import ipaddress

import pandas as pd
import numpy as np

import boto3
from botocore.utils import fix_s3_host
from botocore.config import Config

In [None]:
# Copy KEY=VALUE pairs from the local .env file into the process environment
with open(".env") as env_file:
    for raw_line in env_file:
        entry = raw_line.strip()
        # Blank lines and '#' comment lines carry no setting
        if not entry or entry.startswith('#'):
            continue
        # Split on the first '=' only, so values may themselves contain '='
        name, setting = entry.split('=', 1)
        os.environ[name] = setting

# Directory used for locally stored measurement files
DATA_DIR = "data/"

# Measurement selection: protocol, IP version and anycast/unicast
PROTOCOL = "ICMP"  # Currently only ICMP is supported
IP_VERSION = 4     # 4 or 6
ANYCAST = True     # Anycast or Unicast

In [None]:
# Build the S3 resource from the credentials and endpoint held in the environment
S3 = boto3.resource(
    's3',
    aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
    aws_secret_access_key=os.environ['AWS_ACCESS_KEY_SECRET'],
    endpoint_url=os.environ['AWS_ENDPOINT_URL'],
    # Short connect timeout, long read timeout (large transfers), no automatic retries
    config=Config(connect_timeout=3, read_timeout=900, retries=dict(max_attempts=0)),
)

# Remove the fix_s3_host handler so that requests are not redirected to AWS
S3.meta.client.meta.events.unregister('before-sign.s3', fix_s3_host)  # type: ignore[attr-defined]

# Bucket handle; the bucket name also comes from the environment
HOME_BUCKET = S3.Bucket(os.environ['AWS_BUCKET_NAME'])  # type: ignore[attr-defined]

In [None]:
def download_minio_file(
    year: int, month: int, day: int,
    anycast: bool, protocol: str, ip_version: int
) -> str:
    """
    Fetch one day's measurement CSV from the MinIO bucket, reusing a local copy when present.

    Objects under `manycast/<year>/<month>/<day>/` are scanned and the first one whose
    sanitized name starts with `MAnycast_<protocol>v<ip_version>` (anycast) or
    `GCD_<protocol>v<ip_version>` (unicast) and ends with `.csv.gz` is selected.
    Progress messages are printed along the way.

    Returns the local path of the file inside DATA_DIR.
    Raises FileNotFoundError when no object under the prefix matches.
    """
    day_prefix = f"manycast/{year}/{month:02}/{day:02}/"
    flavour = f"{protocol}v{ip_version}"
    wanted_start = ("MAnycast_" if anycast else "GCD_") + flavour

    for remote in HOME_BUCKET.objects.filter(Prefix=day_prefix):
        # Drop the day prefix and replace characters that Windows forbids in file names
        local_name = re.sub(r'[:<>"/\\|?*]', '_', remote.key[len(day_prefix):])

        # Ignore everything that is not the requested gzipped CSV
        if not (local_name.startswith(wanted_start) and local_name.endswith('.csv.gz')):
            continue

        local_path = os.path.join(DATA_DIR, local_name)
        print(f"Found file: {local_name} (bucket key: {remote.key})")

        # A local copy makes the download unnecessary
        if os.path.exists(local_path):
            print(f"File {local_name} already exists locally. Skipping download.")
            return local_path

        print(f"Downloading {local_name} from bucket...")
        os.makedirs(DATA_DIR, exist_ok=True)
        started = time.time()
        HOME_BUCKET.download_file(remote.key, local_path)
        elapsed = time.time() - started
        print(f"Download complete in {elapsed:.2f}s")
        return local_path

    # The loop finished without returning: nothing under the prefix matched
    print("No matching file found in bucket for provided criteria.")
    raise FileNotFoundError(
        f"No file found for {wanted_start} on {year}-{month:02}-{day:02} in bucket {HOME_BUCKET.name}"
    )

In [None]:
def extract_comments(filepath: str) -> list[str]:
    """
    Return the leading comment lines (lines starting with '#') of a gzip-compressed CSV file.

    Reading stops at the first line that does not start with '#'. Trailing whitespace
    (including the newline) is stripped from every returned line. A file that contains
    only comment lines yields all of them; an empty file yields an empty list.

    Raises:
        FileNotFoundError: If `filepath` does not exist.
        gzip.BadGzipFile: If the file is not valid gzip data.
        RuntimeError: For any other error while reading the file.
    """
    # Checked up front so a missing file is not wrapped into RuntimeError below
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"File not found: '{filepath}'")

    comment_lines: list[str] = []
    try:
        # Open the gzip file in text mode for reading
        with gzip.open(filepath, 'rt', encoding='utf-8') as f:
            for line in f:
                # Stop at the first data (non-comment) line
                if not line.startswith('#'):
                    break
                comment_lines.append(line.rstrip())

    except gzip.BadGzipFile as e:
        raise gzip.BadGzipFile(f"Invalid gzip file: '{filepath}'") from e

    except Exception as e:
        raise RuntimeError(f"Error reading file '{filepath}': {e}") from e

    # Returned here (not inside the loop) so comment-only files keep their lines
    return comment_lines

In [None]:
def extract_hostname_map(comment_lines: list[str]) -> dict[int, str]:
    """
    Build a {client ID: hostname} dictionary from `ID: <n>, hostname: <name>` comment lines.

    Only the first match on each line is used, lines without a match are ignored, and
    a later line with the same ID overrides an earlier one. Keys are in ascending order.
    """
    entry_re = re.compile(r"ID:\s*(\d+)\s*,\s*hostname:\s*([\w-]+)")

    # Collect ID -> hostname in input order (later duplicates win)
    found = (entry_re.search(text) for text in comment_lines)
    by_id = {int(hit.group(1)): hit.group(2) for hit in found if hit is not None}

    # Rebuild the dictionary with the IDs in ascending order
    return {client_id: by_id[client_id] for client_id in sorted(by_id)}

In [None]:
def process_chunk(
    chunk: pd.DataFrame,
    hostname_map: dict[int, str],
    tx_worker_id: int | None = None
) -> pd.DataFrame:
    """
    Filter by tx_worker_id and process a DataFrame chunk to add hostname.
    """
    # Avoid SettingWithCopyWarning
    chunk = chunk.copy()

    # Filter rows on tx_worker_id if supplied
    if tx_worker_id is not None:
        chunk = chunk[chunk['tx_worker_id'] == tx_worker_id]

    # Map hostnames for sender and receiver
    chunk['receiver'] = chunk['rx_worker_id'].map(hostname_map)
    chunk['sender'] = chunk['tx_worker_id'].map(hostname_map)

    # Convert IP-number to ip network /24
    chunk['target'] = chunk['reply_src_addr'].apply(
        lambda x: ipaddress.ip_network(f"{ipaddress.ip_address(int(x))}/24", strict=False) # type: ignore
    )

    # Calculate RTT in seconds
    chunk['rtt'] = (chunk['rx_time'] - chunk['tx_time']) / 1e6

    # Return only the needed columns
    return chunk[['receiver', 'sender', 'target', 'rtt', 'ttl']]

def csv_to_df(
    filepath: str,
    hostname_map: dict[int, str],
    chunksize: int = 1_000_000,
    tx_worker: str | None = None
) -> pd.DataFrame:
    """
    Load a large, gzipped CSV file in chunks, optionally keeping only one sender's rows.
    Prints progress and a timing summary upon completion.

    Args:
        filepath: Path to the gzip-compressed CSV ('#' comments are skipped by the parser).
        hostname_map: Mapping of worker ID to hostname.
        chunksize: Number of rows read per chunk.
        tx_worker: Hostname prefix of the sending worker to keep. The first entry of
            hostname_map whose hostname starts with it is used; None keeps all senders.

    Returns:
        A single DataFrame with the processed rows of all chunks
        (columns receiver, sender, target, rtt, ttl).

    Raises:
        ValueError: If tx_worker is given but matches no hostname in hostname_map.
    """
    # Get tx_worker_id if tx_worker name is supplied
    tx_worker_id = None
    if tx_worker is not None:
        tx_worker_id = next(
            (worker_id for worker_id, hostname in hostname_map.items() if hostname.startswith(tx_worker)),
            None,
        )
        if tx_worker_id is None:
            raise ValueError(f"No tx_worker_id found for tx_worker name starting with: {tx_worker}")

    filtered_chunks = []
    read_time = 0.0
    process_time = 0.0

    # Read the CSV file in chunks
    print(f"Reading file: {filepath} in chunks of {chunksize:,} rows...")
    mark = time.perf_counter()
    # The context manager closes the underlying file even if processing a chunk fails
    with pd.read_csv(
        filepath,
        compression='gzip',
        comment='#',
        usecols=['rx_worker_id', 'tx_worker_id', 'reply_src_addr', 'rx_time', 'tx_time', 'ttl'],
        dtype={
            'rx_worker_id': 'uint8',
            'tx_worker_id': 'uint8',
            'reply_src_addr': 'uint32' if IP_VERSION == 4 else 'str', # For IPv4 use uint32, IPv6 uses str
            'rx_time': 'float64',
            'tx_time': 'float64',
            'ttl': 'uint8'
        },
        chunksize=chunksize
    ) as reader:
        # With chunksize set, read_csv is lazy: each chunk is read and parsed when the
        # iterator advances, so read time has to be accumulated per chunk
        for chunk_num, chunk in enumerate(reader, start=1):
            chunk_ready = time.perf_counter()
            read_time += chunk_ready - mark

            filtered_chunks.append(process_chunk(chunk, hostname_map, tx_worker_id))

            # Print progress
            print(f"Read {chunk_num} chunks", end='\r')

            mark = time.perf_counter()
            process_time += mark - chunk_ready

    # Processing complete
    print(f"\nProcessing complete! Time taken: {read_time:.2f}s (reading) + {process_time:.2f}s (processing)")
    print(f"Processed {sum(len(c) for c in filtered_chunks):,} entries!")

    return pd.concat(filtered_chunks, ignore_index=True)

def load(
    year: int, month: int, day: int,
    anycast: bool, protocol: str, ip_version: int,
    tx_worker: str | None = 'de-fra'
) -> pd.DataFrame:
    """
    Download (if needed), parse and de-duplicate one day's measurement file.

    Args:
        year, month, day: Date of the measurement file.
        anycast: True for the anycast (MAnycast) file, False for the unicast (GCD) file.
        protocol: Protocol name used in the file name, e.g. "ICMP".
        ip_version: IP version used in the file name (4 or 6).
        tx_worker: Hostname prefix of the sending worker whose rows are kept
            (default 'de-fra'); None keeps the rows of all senders.

    Returns:
        DataFrame with the columns receiver, sender, target, rtt and ttl, holding the
        first row seen for every (receiver, sender, target) combination.

    Raises:
        FileNotFoundError: If no matching file exists in the bucket.
        ValueError: If tx_worker matches no hostname listed in the file's comments.
    """
    # Download the file from MinIO unless a local copy already exists
    filepath = download_minio_file(
        year, month, day,
        anycast, protocol, ip_version
    )

    # The worker ID -> hostname mapping is stored in the file's leading comment lines
    hostname_map = extract_hostname_map(extract_comments(filepath))

    # Load the CSV file into a DataFrame, restricted to the requested sender
    df = csv_to_df(filepath, hostname_map, tx_worker=tx_worker)

    # Keep only the first row per (receiver, sender, target)
    return df.drop_duplicates(subset=['receiver', 'sender', 'target'], keep='first')


**Some ideas:**
- what is the impact on average hop count (TTL)?
- what is the impact on average RTT?
- which prefixes became unreachable?
- which prefixes shifted catchment?
- where did the prefixes that switched catchment go?

Also consider looking at GCD and filtering on `sender == madrid` -> how many prefixes are reachable from there?

**Tasks:**
- the 'normal' situation before the outage, showing which hosts (in what parts of the world) routed to Madrid
- the number, and locations, of hosts that went unresponsive during the outage
- hosts that shifted routing from Madrid to a different anycast site (shown with, e.g., a Sankey diagram): how many hosts shifted to Paris, how many to Frankfurt, etc.?
- the situation after the outage, did routing return to normal? or are changes still visible?
- overall reachability of the Madrid site (using the unicast data) towards the entire IPv4 Internet

In [None]:
%%script false --no-raise-error

# Measurement day to analyse
YEAR, MONTH, DAY = 2025, 4, 28

# Fetch (if necessary) and parse that day's file using the global protocol settings
df = load(YEAR, MONTH, DAY, ANYCAST, PROTOCOL, IP_VERSION)

# Show the first 10 rows as the cell output
df.head(10)