# Iberian outage analysis

We have access to daily data for 2025; this notebook loads the data for a single day.

In [None]:
import os
import re
import gzip

import pandas as pd
import numpy as np

import ipaddress

import boto3
from botocore.utils import fix_s3_host
from botocore.config import Config

In [None]:
# Load secret environment variables from the local ``.env`` file.
# Every meaningful line has the form ``KEY=value``; blank lines and lines that
# start with ``#`` are ignored.
with open(".env", encoding="utf-8") as f:
    for lineno, line in enumerate(f, start=1):
        entry = line.strip()
        if not entry or entry.startswith('#'):
            continue
        # Split on the first '=' only, so a value may itself contain '='.
        key, sep, value = entry.partition('=')
        if not sep:
            # Report the line number only; the content may be a secret.
            raise ValueError(f".env line {lineno}: expected KEY=value")
        # Tolerate ``KEY = value`` by trimming whitespace around both parts.
        os.environ[key.strip()] = value.strip()

# Set up global constants
DATA_DIR = "/data"   # local directory that downloaded files are stored in
ROWS = 1_000_000     # maximum number of data rows loaded into the dataframe

# The date of the data we are using
YEAR = 2025
MONTH = 5
DAY = 1

In [None]:
# Client settings for the S3 connection: a short connect timeout, a long read
# timeout in case large files are transferred, and retries configured with
# max_attempts=0.
_S3_CLIENT_CONFIG = Config(
    connect_timeout=3,
    read_timeout=900,
    retries={"max_attempts": 0},
)

# Build the boto3 resource; credentials and endpoint come from the environment
S3 = boto3.resource(
    's3',
    aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
    aws_secret_access_key=os.environ['AWS_ACCESS_KEY_SECRET'],
    endpoint_url=os.environ['AWS_ENDPOINT_URL'],
    config=_S3_CLIENT_CONFIG,
)

# Unregister the host-fixing handler to ensure requests don't go to AWS
S3.meta.client.meta.events.unregister('before-sign.s3', fix_s3_host)

# The bucket we work with is also named in the environment
HOME_BUCKET = S3.Bucket(os.environ['AWS_BUCKET_NAME'])

In [None]:
# MAnycastR files have metadata that give information about the measurement
def read_gzipped_comment_lines(filepath, comment_char='#'):
    """Read the leading comment lines from a gzipped text file.

    Reading stops at the first line that does not start with
    ``comment_char``, so comment lines that appear later in the file are not
    returned.

    Args:
        filepath: Path to the gzip-compressed file; it is decoded as UTF-8.
        comment_char: Prefix that marks a comment line.

    Returns:
        The leading comment lines in file order, with trailing whitespace
        (including the newline) removed.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        gzip.BadGzipFile: If the file is not valid gzip data.
        RuntimeError: If any other error occurs while reading.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"File not found: '{filepath}'")

    comment_lines = []
    try:
        with gzip.open(filepath, 'rt', encoding='utf-8') as f:
            for line in f:
                # Stop at the first non-comment line
                if not line.startswith(comment_char):
                    break
                comment_lines.append(line.rstrip())
    except gzip.BadGzipFile as e:
        # Chain explicitly so the underlying gzip error stays attached
        raise gzip.BadGzipFile(f"Invalid gzip file: '{filepath}'") from e
    except Exception as e:
        raise RuntimeError(f"Error reading file '{filepath}': {e}") from e

    return comment_lines

In [None]:
# function to create a hostname mapping (Client ID -> hostname)
def create_hostname_mapping(comment_lines):
    """Build a ``{client_id: hostname}`` dict from metadata comment lines.

    Each line is searched for ``ID: <digits>, hostname: <name>``; lines that
    do not contain it are ignored. The ID is converted to ``int``. When an ID
    occurs more than once, the last occurrence wins.
    """
    id_host_re = re.compile(r"ID:\s*(\d+)\s*,\s*hostname:\s*([\w-]+)")

    # One search result (a match or None) per input line
    hits = (id_host_re.search(text) for text in comment_lines)
    return {int(hit.group(1)): hit.group(2) for hit in hits if hit}

In [None]:
def get_manycast_file(year, month, day, anycast=True, ipv6=False):
    """Fetch one day's (Manycast/Unicast, IPv4/IPv6) file into ``DATA_DIR``.

    Objects under ``manycast/<year>/<month>/<day>/`` in ``HOME_BUCKET`` are
    scanned, and the first one whose name starts with ``MAnycast_<protocol>``
    (``anycast=True``) or ``GCD_<protocol>`` (``anycast=False``) and ends with
    ``.csv.gz`` is used. It is downloaded unless a file with the same local
    name already exists.

    Returns:
        The local path of the file, or ``None`` when no object matches.
    """
    day_prefix = f"manycast/{year}/{month:02}/{day:02}/"

    icmp_version = "ICMPv6" if ipv6 else "ICMPv4"
    if anycast:
        wanted_start = f"MAnycast_{icmp_version}"
    else:
        wanted_start = f"GCD_{icmp_version}"

    for obj in HOME_BUCKET.objects.filter(Prefix=day_prefix):
        # Characters that are invalid in Windows filenames become '_'
        local_name = re.sub(r'[:<>"/\\|?*]', '_', obj.key[len(day_prefix):])

        is_wanted = local_name.startswith(wanted_start) and local_name.endswith('.csv.gz')
        if not is_wanted:
            continue

        print(f"Found file: {local_name} (bucket key: {obj.key})")
        local_path = os.path.join(DATA_DIR, local_name)

        if not os.path.exists(local_path):
            print(f"Downloading {local_name} from bucket...")
            os.makedirs(DATA_DIR, exist_ok=True)
            HOME_BUCKET.download_file(obj.key, local_path)
        else:
            print(f"File {local_name} already exists. Skipping download.")

        # Only the first matching object is used
        return local_path

    print("No matching file found.")
    return None

In [None]:
# Download the file for the specified date
filepath = get_manycast_file(YEAR, MONTH, DAY)

# get_manycast_file returns None when the bucket holds no matching object;
# stop here with a clear message instead of failing later on a None path.
if filepath is None:
    raise FileNotFoundError(
        f"No MAnycast file found for {YEAR}-{MONTH:02}-{DAY:02}"
    )

In [None]:
# Read the metadata header of the downloaded file and show it, one entry per line
comment_lines = read_gzipped_comment_lines(filepath)
print(*comment_lines, sep="\n")

In [None]:
# Build the Client ID -> hostname lookup from the metadata lines; the bare
# name on the last line displays the resulting dict as the cell output.
hostname_map = create_hostname_mapping(comment_lines)
hostname_map

In [None]:
# Read the measurement rows into a pandas dataframe: the metadata lines at the
# top of the file are skipped and at most ROWS records are loaded.
result_df = pd.read_csv(
    filepath,
    compression='gzip',
    nrows=ROWS,
    skiprows=len(comment_lines),
)
result_df.head(10)

In [None]:
# data cleaning as we are only using the following columns.
# The explicit .copy() makes result_df an independent frame, so adding new
# columns to it afterwards does not trigger pandas' SettingWithCopyWarning.
result_df = result_df[
    ['rx_worker_id', 'tx_worker_id', 'reply_src_addr', 'rx_time', 'tx_time', 'ttl']
].copy()
result_df.head(10)

In [None]:
# One address is scanned in each /24 prefix and is representative of that prefix.
# Each target is scanned by all 32 anycast sites, so the same prefix shows up
# multiple times; take that into consideration in the analysis (e.g., remove
# duplicate targets).


def _to_slash24(addr):
    """Return the /24 network that contains ``addr``."""
    return ipaddress.ip_network(f"{ipaddress.ip_address(addr)}/24", strict=False)


# target: the /24 network of the replying address (can be sped up with swifter)
result_df['target'] = result_df['reply_src_addr'].apply(_to_slash24)

# receiver / sender: worker IDs translated to anycast site hostnames
result_df['receiver'] = result_df['rx_worker_id'].map(hostname_map)
result_df['sender'] = result_df['tx_worker_id'].map(hostname_map)

# rtt: receive time minus transmit time, divided by 1e6
result_df['rtt'] = (result_df['rx_time'] - result_df['tx_time']) / 1e6

# keep only the derived columns plus ttl
result_df = result_df[
    [
        'receiver',
        'sender',
        'target',
        'rtt',
        'ttl',
    ]
]

result_df.head(10)

**Some ideas:**
- what is the impact on average hop count (TTL)?
- what is the impact on average RTT?
- which prefixes became unreachable?
- which prefixes shifted catchment?
- where did the prefixes that switched catchment go?

Also consider looking at the GCD (unicast) data and filtering on `sender == madrid`: how many prefixes are reachable from there?