# Iberian outage analysis

We have access to daily data for 2025

In [None]:
import os
import re
import gzip

import pandas as pd
import numpy as np

import ipaddress

import boto3
from botocore.utils import fix_s3_host
from botocore.config import Config

In [None]:
# Load secret environment variables from the local .env file
with open(".env") as env_file:
    for raw_line in env_file:
        entry = raw_line.strip()
        # Blank lines and '#' comments carry no setting
        if not entry or entry.startswith('#'):
            continue
        # Split on the first '=' only, so a value may itself contain '='
        key, value = entry.split('=', 1)
        os.environ[key] = value

# Set up global constants
DATA_DIR = "data/"   # local directory where downloaded files are stored
ROWS = 1_000_000     # maximum number of CSV rows read per file

# The date of the data we are using
YEAR, MONTH, DAY = 2025, 5, 1

In [None]:
# Client settings: short connect timeout, long read timeout (changed in case
# we are transferring large files) and no automatic retries
_s3_client_config = Config(
    connect_timeout=3,
    read_timeout=900,
    retries={"max_attempts": 0},
)

# Create the boto3 resource using the credentials and endpoint from the environment
S3 = boto3.resource(
    's3',
    aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
    aws_secret_access_key=os.environ['AWS_ACCESS_KEY_SECRET'],
    endpoint_url=os.environ['AWS_ENDPOINT_URL'],
    config=_s3_client_config,
)

# Detach botocore's fix_s3_host handler from the 'before-sign.s3' event
# (original note: ensures requests don't go to AWS)
_s3_events = S3.meta.client.meta.events
_s3_events.unregister('before-sign.s3', fix_s3_host)

# Bucket handle; the bucket name also comes from the environment
HOME_BUCKET = S3.Bucket(os.environ['AWS_BUCKET_NAME'])

In [None]:
# MAnycastR files have metadata that give information about the measurement
def read_gzipped_comment_lines(filepath, comment_char='#'):
    """Collect the leading comment lines of a gzip-compressed text file.

    Reading stops at the first line that does not start with ``comment_char``;
    comment lines appearing later in the file are not returned.

    Args:
        filepath: Path of the gzip file to read (decoded as UTF-8).
        comment_char: Prefix that marks a comment line.

    Returns:
        list[str]: The leading comment lines with trailing whitespace removed.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        gzip.BadGzipFile: If the file is not valid gzip data.
        RuntimeError: For any other error raised while reading.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"File not found: '{filepath}'")

    header = []
    try:
        with gzip.open(filepath, 'rt', encoding='utf-8') as handle:
            for text_line in handle:
                # The header ends at the first non-comment line
                if not text_line.startswith(comment_char):
                    break
                header.append(text_line.rstrip())
    except gzip.BadGzipFile:
        raise gzip.BadGzipFile(f"Invalid gzip file: '{filepath}'")
    except Exception as e:
        raise RuntimeError(f"Error reading file '{filepath}': {e}")

    return header

In [None]:
# function to create a hostname mapping (Client ID -> hostname)
def create_hostname_mapping(comment_lines):
    """Build a {client ID: hostname} dict from metadata comment lines.

    Each line is searched for ``ID: <digits>, hostname: <name>``; lines without
    that pattern are ignored. The hostname capture covers word characters and
    '-' only. When an ID occurs on several lines, the last one wins.

    Args:
        comment_lines: Iterable of metadata strings.

    Returns:
        dict[int, str]: Client ID mapped to hostname.
    """
    id_host_re = re.compile(r"ID:\s*(\d+)\s*,\s*hostname:\s*([\w-]+)")
    # One search per line: only the first occurrence on a line is used
    hits = (id_host_re.search(text) for text in comment_lines)
    return {int(hit.group(1)): hit.group(2) for hit in hits if hit}

In [None]:
def get_manycast_file(year, month, day, anycast=True, ipv6=False):
    """Fetch the first matching measurement file for a date, reusing a local copy.

    Lists HOME_BUCKET under ``manycast/<year>/<month>/<day>/`` and picks the first
    object whose name starts with ``MAnycast_<protocol>`` (``anycast=True``) or
    ``GCD_<protocol>`` (``anycast=False``) and ends with ``.csv.gz``. The file is
    downloaded into DATA_DIR unless it already exists there.

    Args:
        year, month, day: Measurement date used to build the bucket prefix.
        anycast: Select the MAnycast file when True, the GCD file otherwise.
        ipv6: Select the ICMPv6 file when True, the ICMPv4 file otherwise.

    Returns:
        The local file path as a string, or None (after printing a message)
        when no object matches.
    """
    day_prefix = f"manycast/{year}/{month:02}/{day:02}/"
    icmp_version = "ICMPv6" if ipv6 else "ICMPv4"
    wanted_start = f"MAnycast_{icmp_version}" if anycast else f"GCD_{icmp_version}"

    for entry in HOME_BUCKET.objects.filter(Prefix=day_prefix):
        # Replace characters that are invalid in Windows filenames
        local_name = re.sub(r'[:<>"/\\|?*]', '_', entry.key[len(day_prefix):])

        is_wanted = local_name.startswith(wanted_start) and local_name.endswith('.csv.gz')
        if not is_wanted:
            continue

        print(f"Found file: {local_name} (bucket key: {entry.key})")
        local_path = os.path.join(DATA_DIR, local_name)

        if not os.path.exists(local_path):
            print(f"Downloading {local_name} from bucket...")
            os.makedirs(DATA_DIR, exist_ok=True)
            HOME_BUCKET.download_file(entry.key, local_path)
        else:
            print(f"File {local_name} already exists. Skipping download.")

        # Only the first match is used
        return local_path

    print("No matching file found.")

In [None]:
# Download the file for the specified date (skipped if it is already in DATA_DIR)
filepath = get_manycast_file(YEAR, MONTH, DAY)

# get_manycast_file returns None when nothing matched; stop here with a clear
# message instead of failing later when the path is opened.
if filepath is None:
    raise FileNotFoundError(
        f"No MAnycast file found for {YEAR}-{MONTH:02}-{DAY:02}"
    )

In [None]:
# Get metadata: the leading '#' comment lines at the top of the downloaded file
comment_lines = read_gzipped_comment_lines(filepath)
# Print them one per line for inspection
print("\n".join(comment_lines))

In [None]:
# Create the hostname mapping (client ID -> hostname) from the metadata lines
hostname_map = create_hostname_mapping(comment_lines)
# Bare name as last expression: the dict is shown as the cell output
hostname_map

In [None]:
# Load the data as a pandas DataFrame: skip the metadata lines read above and
# read at most ROWS rows from the gzip-compressed CSV
result_df = pd.read_csv(filepath, skiprows=len(comment_lines), nrows=ROWS, compression='gzip')
result_df.head(10)

In [None]:
# Data cleaning: keep only the columns used in the rest of the analysis.
# .copy() makes the selection an independent frame, so adding columns to it in
# the following cell does not trigger pandas' SettingWithCopyWarning.
result_df = result_df[
    ['rx_worker_id', 'tx_worker_id', 'reply_src_addr', 'rx_time', 'tx_time', 'ttl']
].copy()
result_df.head(10)

In [None]:
# We scan one address in each /24 prefix, which is representative of that prefix.
# Each target is scanned by all 32 anycast sites, so the same prefix can appear
# multiple times in the data. Take this redundancy into account in your analysis
# (e.g., remove duplicate targets).

# Build the derived columns with .assign so a new frame is produced instead of
# writing into a slice of the loaded data (avoids SettingWithCopyWarning).
result_df = result_df.assign(
    # convert the IP number to the /24 network that contains it
    # (can be sped up with swifter)
    target=result_df['reply_src_addr'].apply(
        lambda x: ipaddress.ip_network(f"{ipaddress.ip_address(x)}/24", strict=False)
    ),
    # receiving anycast site
    receiver=result_df['rx_worker_id'].map(hostname_map),
    # sending anycast site
    sender=result_df['tx_worker_id'].map(hostname_map),
    # round-trip time: receive time minus transmit time, scaled by 1e6
    # NOTE(review): presumably converts the timestamp unit to a coarser one
    # (e.g. ns -> ms) - confirm against the measurement format
    rtt=(result_df['rx_time'] - result_df['tx_time']) / 1e6,
)

# drop the columns that are no longer needed
result_df = result_df[['receiver', 'sender', 'target', 'rtt', 'ttl']]

result_df.head(10)

**Some ideas:**
- what is the impact on average hop count (TTL)?
- what is the impact on average RTT?
- which prefixes became unreachable?
- which prefixes shifted catchment?
- where did the prefixes that switched catchment go?

Also consider looking at the GCD data and filtering on `sender == madrid`: how many prefixes are reachable from there?

# Work for project
## geolocate ip addresses
We may not be able to map every address, because some IPs are missing from the geolocation database.
An IP may also be missing or wrongly geolocated because address assignments change over time; however, prefixes should generally stay within a certain region (TODO: find a source for this claim).

### initial set up

In [None]:
# imports
from typing import Generator
from pathlib import Path
import os
import gzip
import re
import ipaddress
import zipfile

import pandas as pd
import numpy as np

import boto3
from botocore.utils import fix_s3_host
from botocore.config import Config
from botocore.exceptions import EndpointConnectionError

import requests

In [None]:
# Set to True to print diagnostic output in the helper functions and cells below
DEBUG = False

# Load secret environment variables from the local .env file
with open(".env") as f:
    for line in f:
        # Skip blank lines and '#' comments
        if line.strip() == '' or line.strip().startswith('#'):
            continue
        # Split on the first '=' only, so a value may itself contain '='
        key, value = line.strip().split('=', 1)
        os.environ[key] = value

# Set up global constants for paths
DATA_DIR = Path("data/")
IP_GEOLOCATION_DB_PATH = DATA_DIR / "IP2LOCATION-LITE-DB5.CSV"
IP_GEOLOCATION_DOWNLOAD_PATH = DATA_DIR / "ip2location-lite-db5.zip"
# Single quotes inside the f-string: reusing the outer double quotes is a
# SyntaxError on Python versions before 3.12.
IP_GEOLOCATION_URL = (
    "https://www.ip2location.com/download/"
    f"?token={os.environ['IP2LOCATION_LITE_TOKEN']}&file=DB5LITECSV"
)
# MANYCAST_DATA_PATH = DATA_DIR / "MAnycast_ICMPv42025-05-01T01_37_56.csv.gz"

# Maximum number of CSV rows read per measurement file
ROWS = 1_000_000

# The date of the data we are using
YEAR = 2025
MONTH = 5
DAY = 1

### helper functions

In [None]:
def get_bucket():
    """
    Build the S3 bucket handle described by the environment.

    Reads AWS_ACCESS_KEY_ID, AWS_ACCESS_KEY_SECRET, AWS_ENDPOINT_URL and
    AWS_BUCKET_NAME from os.environ, so all four must be set before calling.

    Returns:
        The boto3 Bucket resource for AWS_BUCKET_NAME.

    Raises:
        KeyError: If one of the environment variables is missing.
    """
    # Short connect timeout, long read timeout (changed in case large files
    # are transferred) and no automatic retries
    client_config = Config(
        connect_timeout=3,
        read_timeout=900,
        retries={"max_attempts": 0},
    )

    s3_resource = boto3.resource(
        's3',
        aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
        aws_secret_access_key=os.environ['AWS_ACCESS_KEY_SECRET'],
        endpoint_url=os.environ['AWS_ENDPOINT_URL'],
        config=client_config,
    )

    # Detach botocore's fix_s3_host handler from the 'before-sign.s3' event
    # (original note: ensures requests don't go to AWS)
    event_system = s3_resource.meta.client.meta.events
    event_system.unregister('before-sign.s3', fix_s3_host)

    bucket_name = os.environ['AWS_BUCKET_NAME']
    return s3_resource.Bucket(bucket_name)

In [None]:
def dl_manycast_files(bucket, data_dir: Path, year=2025, month=5, day=1, anycast=True, ipv6=False) -> Generator:
    """Yield local paths of the measurement files for a date, downloading them if needed.

    Lists ``bucket`` (MinIO, per the original note) under
    ``manycast/<year>/<month>/<day>/`` and yields every object whose name starts
    with ``MAnycast_<protocol>`` (``anycast=True``) or ``GCD_<protocol>``
    (``anycast=False``) and ends with ``.csv.gz``. A file is downloaded into
    ``data_dir`` only when it is not already present there.

    If the bucket endpoint cannot be reached, files already in ``data_dir`` whose
    names start with ``<pattern><year>-<month>-<day>`` are yielded instead.

    Args:
        bucket: Object exposing ``objects.filter(Prefix=...)`` and
            ``download_file(key, filename)`` (a boto3 Bucket in this notebook).
        data_dir: Local directory used to store and look up files.
        year, month, day: Measurement date.
        anycast: Select MAnycast files when True, GCD files otherwise.
        ipv6: Select ICMPv6 files when True, ICMPv4 files otherwise.

    Yields:
        The local path of each matching file: a ``str`` on the bucket path, a
        ``Path`` on the local fallback path.
    """
    # Build object prefix based on date
    prefix = f"manycast/{year}/{month:02}/{day:02}/"

    # Choose file pattern based on anycast and IP version
    protocol = "ICMPv6" if ipv6 else "ICMPv4"
    base_pattern = f"MAnycast_{protocol}" if anycast else f"GCD_{protocol}"

    # Tracks whether anything was yielded, so the "not found" message is only
    # printed when it is true.
    found_any = False

    try:
        # Search for matching files in the bucket
        for obj in bucket.objects.filter(Prefix=prefix):
            # Replace invalid Windows characters in filenames
            filename = re.sub(r'[:<>"/\\|?*]', '_', obj.key[len(prefix):])
            if not (filename.startswith(base_pattern) and filename.endswith('.csv.gz')):
                continue

            filepath = os.path.join(data_dir, filename)
            if DEBUG:
                print(f"Found file: {filename} (bucket key: {obj.key})")

            # Check if file already exists locally
            if os.path.exists(filepath):
                if DEBUG:
                    print(f"File {filename} already exists. Skipping download.")
            else:
                if DEBUG:
                    print(f"Downloading {filename} from bucket...")
                os.makedirs(data_dir, exist_ok=True)
                bucket.download_file(obj.key, filepath)

            found_any = True
            yield filepath
    except EndpointConnectionError:
        print("Could not access the Bucket, falling back to local data.")
        local_prefix = f"{base_pattern}{year}-{month:02}-{day:02}"
        # A missing data directory simply means there is no local data
        local_names = sorted(os.listdir(data_dir)) if os.path.isdir(data_dir) else []
        for file in local_names:
            # Same suffix rule as the bucket path, so stray files are skipped
            if file.startswith(local_prefix) and file.endswith('.csv.gz'):
                found_any = True
                yield Path(data_dir) / file

    if not found_any:
        print("No matching file found.")

In [None]:
def download_ip2location_db(ip_geoloc_db: Path, ip_geoloc_dl_path: Path, ip_geoloc_dl_url: str):
    """Download and unpack the IP2Location database unless it already exists.

    Does nothing when ``ip_geoloc_db`` is already on disk. Otherwise the zip at
    ``ip_geoloc_dl_url`` is saved to ``ip_geoloc_dl_path``, extracted into the
    directory that should contain ``ip_geoloc_db``, and the zip is removed again.
    Failures of the download itself are reported with a printed message rather
    than an exception.

    Args:
        ip_geoloc_db: Expected location of the extracted database file.
        ip_geoloc_dl_path: Temporary location for the downloaded zip archive.
        ip_geoloc_dl_url: Download URL (contains the access token, so it is
            deliberately never printed).
    """
    if ip_geoloc_db.exists():
        return

    # download; the timeout keeps an unresponsive server from hanging the notebook
    response = requests.get(ip_geoloc_dl_url, allow_redirects=True, timeout=60)
    if not response.ok:
        print("Download of IP2Location database failed!")
        return

    # make sure both target directories exist before writing
    ip_geoloc_dl_path.parent.mkdir(parents=True, exist_ok=True)
    ip_geoloc_db.parent.mkdir(parents=True, exist_ok=True)
    ip_geoloc_dl_path.write_bytes(response.content)

    try:
        # extract next to the expected database path instead of a hard-coded "data"
        with zipfile.ZipFile(ip_geoloc_dl_path, "r") as zipf:
            zipf.extractall(ip_geoloc_db.parent)
    except zipfile.BadZipFile:
        # NOTE(review): a successful HTTP status can still carry a non-zip body
        # (presumably an error message from the server) - confirm.
        print("Download of IP2Location database failed: response is not a zip archive!")
    finally:
        # remove the downloaded zip file whether or not extraction worked
        os.remove(ip_geoloc_dl_path)

In [None]:
# function to create a hostname mapping (Client ID -> hostname)
def create_hostname_mapping(comment_lines):
    """Translate metadata comment lines into a client-ID -> hostname lookup.

    A line contributes an entry when it contains ``ID: <digits>, hostname: <name>``
    (the name is captured up to the first character that is neither a word
    character nor '-'). Other lines are skipped; a repeated ID keeps the
    hostname of the last line that mentions it.

    Args:
        comment_lines: Iterable of metadata strings.

    Returns:
        dict[int, str]: Hostname keyed by integer client ID.
    """
    find_entry = re.compile(r"ID:\s*(\d+)\s*,\s*hostname:\s*([\w-]+)").search
    id_to_host = {}

    # filter(None, ...) drops the lines where the search found nothing
    for found in filter(None, map(find_entry, comment_lines)):
        worker_id, host = found.groups()
        id_to_host[int(worker_id)] = host

    return id_to_host

In [None]:
# MAnycastR files have metadata that give information about the measurement
def read_gzipped_comment_lines(filepath, comment_char='#'):
    """Read the initial comment lines from a gzipped text file.

    Lines are collected from the top of the file until the first line that does
    not start with ``comment_char``.

    Args:
        filepath: Path of the gzip file to read (decoded as UTF-8).
        comment_char: Prefix that marks a comment line.

    Returns:
        list[str]: The leading comment lines with trailing whitespace removed.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        gzip.BadGzipFile: If the file is not valid gzip data.
        RuntimeError: For any other error while reading; the original
            exception is attached as ``__cause__``.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"File not found: '{filepath}'")

    try:
        with gzip.open(filepath, 'rt', encoding='utf-8') as f:
            comment_lines = []
            # Read lines until the first non-comment line
            for line in f:
                if line.startswith(comment_char):
                    comment_lines.append(line.rstrip())
                else:
                    # Stop at first non-comment line
                    break
            return comment_lines

    # Re-raise with the file path in the message, chaining explicitly so the
    # underlying error stays visible as the direct cause in the traceback.
    except gzip.BadGzipFile as e:
        raise gzip.BadGzipFile(f"Invalid gzip file: '{filepath}'") from e
    except Exception as e:
        raise RuntimeError(f"Error reading file '{filepath}': {e}") from e

In [None]:
def preproc_manycast_data(manycast_data_path: Path) -> tuple[pd.DataFrame, list[str]]:
    """
    Process a MAnycast csv.gz file.

    Reads the leading metadata comment lines, loads at most ROWS data rows and
    derives the analysis columns from them.

    Args:
        manycast_data_path: Path of the gzip-compressed CSV file.

    Returns:
        tuple[pd.DataFrame, list[str]]: The processed frame and the metadata
        comment lines (one string per line). The frame has the columns
        ``receiver``, ``sender``, ``target``, ``encoded_target_addr``, ``rtt``
        and ``ttl``.
    """
    # Metadata: comment lines at the top of the file and the ID -> hostname map
    comment_lines = read_gzipped_comment_lines(manycast_data_path)
    hostname_map = create_hostname_mapping(comment_lines)

    # load in data as a pandas dataframe, skipping the metadata lines
    raw_df = pd.read_csv(manycast_data_path, skiprows=len(comment_lines), nrows=ROWS, compression='gzip')

    # Derive the new columns with .assign: this returns a new frame rather than
    # writing into a column slice, which avoids pandas' SettingWithCopyWarning.
    enriched_df = raw_df.assign(
        # convert IP-number to the /24 network that contains it
        # can be sped up with swifter => uv pip install swifter "swifter[groupby] swifter[notebook]"
        target=raw_df['reply_src_addr'].apply(
            lambda x: ipaddress.ip_network(f"{ipaddress.ip_address(x)}/24", strict=False)
        ),
        # receiving anycast site
        receiver=raw_df['rx_worker_id'].map(hostname_map),
        # sending anycast site
        sender=raw_df['tx_worker_id'].map(hostname_map),
        # rtt: receive time minus transmit time, scaled by 1e6
        # NOTE(review): timestamp unit is not visible here - confirm the
        # resulting rtt unit against the measurement format
        rtt=(raw_df['rx_time'] - raw_df['tx_time']) / 1e6,
    )

    # keep only the needed columns; the numeric address is kept (renamed) because
    # it is the join key for the geolocation range lookup
    manycast_df = enriched_df[
        ['receiver', 'sender', 'target', 'reply_src_addr', 'rtt', 'ttl']
    ].rename(columns={"reply_src_addr": "encoded_target_addr"})

    return manycast_df, comment_lines

In [None]:
def preproc_ip2location(ip_geoloc_db: Path) -> pd.DataFrame:
    """Load the IP2Location CSV file into a DataFrame with named columns.

    The file is read without a header row; its eight columns are labelled
    ``ip_from``, ``ip_to``, ``country_code``, ``country_name``, ``region``,
    ``city``, ``lat`` and ``lon`` in that order.

    Args:
        ip_geoloc_db: Path of the CSV database file.

    Returns:
        pd.DataFrame: One row per line of the file.
    """
    column_names = [
        "ip_from", "ip_to",
        "country_code", "country_name",
        "region", "city",
        "lat", "lon",
    ]
    geoloc_df = pd.read_csv(ip_geoloc_db, names=column_names)
    return geoloc_df


In [None]:
def get_manycast_geolocated(manycast_df: pd.DataFrame, ip_geoloc_df: pd.DataFrame) -> pd.DataFrame:
    """
    Add the latitude and longitude of each target address to the measurement data.

    Every ``encoded_target_addr`` is matched to the geolocation range with the
    largest ``ip_from`` that is not greater than the address. The match only
    counts when the address is also ``<= ip_to``; addresses that fall outside
    every range get NaN coordinates instead of being dropped.

    Args:
        manycast_df: Measurement rows with a numeric ``encoded_target_addr`` column.
        ip_geoloc_df: Geolocation ranges with ``ip_from``, ``ip_to``, ``lat`` and
            ``lon`` columns.

    Returns:
        A copy of manycast_df sorted by ``encoded_target_addr`` (original index
        labels kept) with ``lat`` and ``lon`` as the last two columns. The input
        frames are not modified.
    """
    # Only the range bounds and coordinates are needed from the database; this
    # also avoids name clashes with other measurement columns.
    sorted_ip_geoloc_df = ip_geoloc_df[["ip_from", "ip_to", "lat", "lon"]].sort_values("ip_from")

    # Drop coordinates from an earlier run so that calling this again on its own
    # output works instead of colliding on the column names.
    sorted_manycast_df = (
        manycast_df
        .drop(columns=["lat", "lon"], errors="ignore")
        .sort_values("encoded_target_addr")
    )

    # merge_asof for efficient range join: one output row per measurement row,
    # in the same (sorted) order
    merged = pd.merge_asof(
        sorted_manycast_df,
        sorted_ip_geoloc_df,
        left_on="encoded_target_addr",
        right_on="ip_from",
        direction="backward"
    )

    # A backward match is only valid when the address is inside [ip_from, ip_to].
    # Rows without any match have NaN ip_to, which also compares False here.
    covered = merged["encoded_target_addr"] <= merged["ip_to"]

    # Mask instead of filtering: the row count must stay equal to the measurement
    # frame, otherwise the positional assignment below fails with a length mismatch.
    # to_numpy() assigns by position because merged has a fresh RangeIndex.
    sorted_manycast_df["lat"] = merged["lat"].where(covered).to_numpy()
    sorted_manycast_df["lon"] = merged["lon"].where(covered).to_numpy()

    return sorted_manycast_df

### Preprocessing data

In [None]:
# retrieve manycast data for the configured date
bucket = get_bucket()
# Pass the date constants explicitly so changing YEAR/MONTH/DAY in the config
# cell takes effect; only the first matching file is used.
manycast_file = next(
    dl_manycast_files(bucket, DATA_DIR, year=YEAR, month=MONTH, day=DAY),
    None,
)
# Fail with a readable message instead of a bare StopIteration
if manycast_file is None:
    raise FileNotFoundError(
        f"No MAnycast file available for {YEAR}-{MONTH:02}-{DAY:02}"
    )

In [None]:
# Parse the downloaded file into the processed DataFrame plus its metadata
# comment lines
manycast_df, meta_inf = preproc_manycast_data(manycast_file)
if DEBUG:
    # Diagnostic output only: the metadata lines and the processed frame
    print(meta_inf)
    print(manycast_df)

In [None]:
# Fetch the IP2Location database first; the helper returns immediately when
# the file is already present, so re-running this cell is cheap.
download_ip2location_db(
    IP_GEOLOCATION_DB_PATH,
    IP_GEOLOCATION_DOWNLOAD_PATH,
    IP_GEOLOCATION_URL,
)
ip2geoloc_df = preproc_ip2location(IP_GEOLOCATION_DB_PATH)

In [None]:
if DEBUG:
    # Count the ip2location rows whose lat and lon are both exactly 0.0, which
    # this notebook treats as invalid coordinates.
    # NOTE(review): the original comment adds "(first + 1, last - 1)" - the
    # intended meaning is unclear, confirm with the author.
    lat0_lon0_amount = ((ip2geoloc_df["lon"] == 0.0) & (ip2geoloc_df["lat"] == 0.0)).sum()
    print("invalid lat long values found in ip2location database: ", lat0_lon0_amount)

### add latitude and longitude to the manycast data

In [None]:
# Rebind manycast_df to the geolocated frame: the same rows sorted by
# encoded_target_addr, with lat and lon columns added
manycast_df = get_manycast_geolocated(manycast_df, ip2geoloc_df)

In [None]:
# Show the first rows with the notebook's rich table display instead of
# printing the frame as plain text
manycast_df.head(10)