# Atlas data aggregation

In [45]:
%reset

### Setup

In [46]:
# Load environment variables
    # Dependencies:
    # - python-dotenv

from dotenv import load_dotenv, dotenv_values
import os

# Load env vars (manual)
    # load_dotenv("../.env/master.env")
    # maps_key = os.getenv("maps_key")

# Load all variables from the .env file into a dictionary
env_vars = dotenv_values("../.env/master.env")

# Set them as environment variables
for key, value in env_vars.items():
    os.environ[key] = value
    print(f"Loaded: {key}")

# Print
print(f"\nSuccessfully loaded {len(env_vars)} environment variables from master.env")

# print(env_vars.get("VAR1")) # call variable
# VAR1 = env_vars.get("VAR1")

Loaded: VAR1
Loaded: maps_key
Loaded: atlas_key_name
Loaded: atlas_key

Successfully loaded 4 environment variables from master.env


### Results: Fetch and clean

In [49]:
# Fetch measurement data from RIPE Atlas
    # Dependencies:
    # - RIPE Atlas API

import pandas as pd
import requests
import json, pprint
from ripe.atlas.sagan import PingResult, TracerouteResult, DnsResult, SslResult

api_key = env_vars.get("atlas_key")

# atlasregion = "aws-fra"  # "AWS Frankfurt"
# measurement_id = 155973633 

atlasregion = "aws-sto"  # "AWS Stockholm"
measurement_id = 155991104

In [50]:
# TXT response

import json, pprint
from ripe.atlas.sagan import PingResult, TracerouteResult, DnsResult, SslResult

url = f"https://atlas.ripe.net/api/v2/measurements/{measurement_id}/results/"
params = {
    "key": api_key,
    # optional filters:
    # "start": 1700000000,  # Unix timestamp
    # "stop":  1700003600,
    # "probe_ids": "1,2,3",
    "page_size": 100,
    "format": "txt"  # no pagination necessary
}

response = requests.get(url, params=params)

results = []
for line in response.text.strip().splitlines():
    if line:  # skip empty lines
        results.append(json.loads(line))

# Confirm fetch
print("Length:", len(results))

Length: 32


In [51]:
# Parse & Print raw TXT data per dict element

import json, pprint
from ripe.atlas.sagan import PingResult

parsed_results = [PingResult(r) for r in results]

# pprint.pprint(vars(parsed_results[1]))  # 0,1,2,3,4,5 results in dict

In [52]:
# Preview parsed results

parsed_results = [PingResult(r) for r in results]

for result in parsed_results[:2]:  # preview first 2 results only
    msm_id = result.raw_data.get('msm_id')
    type = result.raw_data.get('type')
    step = result.raw_data.get('step')
    proto = result.raw_data.get('proto')
    prb_id = result.raw_data['prb_id']
    dst_addr = result.raw_data.get('dst_addr')
    dst_name = result.raw_data.get('dst_name')
    af = result.raw_data.get('af')
    size = result.raw_data.get('size')
    avg_rtt = result.raw_data['avg']  # custom name
    min_rtt = result.raw_data.get('min')
    med_rtt = result.rtt_median  # custom name
    max_rtt = result.raw_data.get('max')
    sent = result.raw_data['sent']
    rcvd = result.raw_data['rcvd']
    lost = sent - rcvd  # custom name
    ttl = result.raw_data.get('ttl')

    # print(f"Probe {result.raw_data['prb_id']}\nAvg RTT: {result.raw_data['avg']} ms\nPackets sent: {result.raw_data['sent']}\nLoss: {lost}\n") # directly access dict
    print(f"Probe {prb_id}\nAvg RTT: {avg_rtt} ms\nSent: {sent}\nLost: {lost}\n") # use custom variables, preferred

Probe 1000661
Avg RTT: 41.0179852 ms
Sent: 5
Lost: 0

Probe 1000792
Avg RTT: 32.5383126 ms
Sent: 5
Lost: 0



In [53]:
# Create df_results
# for each result in parsed_results, extract relevant fields and create a DataFrame

import pandas as pd

df_results = pd.DataFrame([
    {
        'msm_id': result.raw_data.get('msm_id'),
        'type': result.raw_data.get('type'),
        'step': result.raw_data.get('step'),
        'proto': result.raw_data.get('proto'),
        'prb_id': result.raw_data['prb_id'],
        'dst_addr': result.raw_data.get('dst_addr'),
        'dst_name': result.raw_data.get('dst_name'),
        'af': result.raw_data.get('af'),
        'size': result.raw_data.get('size'),
        'avg_rtt': result.raw_data['avg'],
        'min_rtt': result.raw_data.get('min'),
        'med_rtt': result.rtt_median,
        'max_rtt': result.raw_data.get('max'),
        'sent': result.raw_data['sent'],
        'rcvd': result.raw_data['rcvd'],
        'lost': result.raw_data['sent'] - result.raw_data['rcvd'],
        'ttl': result.raw_data.get('ttl')
    }
    for result in parsed_results
])

df_results.head()

Unnamed: 0,msm_id,type,step,proto,prb_id,dst_addr,dst_name,af,size,avg_rtt,min_rtt,med_rtt,max_rtt,sent,rcvd,lost,ttl
0,155991104,ping,,ICMP,1000661,16.16.206.178,aws-sto.ncc.dock.ee,4,48,41.017985,40.368241,40.909,41.550243,5,5,0,52.0
1,155991104,ping,,ICMP,1000792,16.16.206.178,aws-sto.ncc.dock.ee,4,48,32.538313,32.14,32.266,33.314375,5,5,0,56.0
2,155991104,ping,,ICMP,1001538,16.16.206.178,aws-sto.ncc.dock.ee,4,48,24.617804,24.555184,24.612,24.693352,5,5,0,55.0
3,155991104,ping,,ICMP,1002071,16.16.206.178,aws-sto.ncc.dock.ee,4,48,22.804704,22.316064,22.395,24.529114,5,5,0,51.0
4,155991104,ping,,ICMP,1007433,16.16.206.178,aws-sto.ncc.dock.ee,4,48,26.2232,26.099694,26.178,26.424955,5,5,0,54.0


In [None]:
# Further: Augmet with geo from ../data/processed/probes_de_20260218-061315.csv

### Preprocess other datasets

In [54]:
# Load probes sample

import pandas as pd

df_probes_sample = pd.read_csv(
    '../data/processed/probes_sample.csv',
    dtype={
        'mun_key': str
        }
    )
print(f"Cols: {df_probes_sample.columns.tolist()}\n")
print(f"Types:\n{df_probes_sample.dtypes}\n")
df_probes_sample.head()

Cols: ['mun_key', 'probe-1', 'probe-2', 'probe-3', 'probe-4', 'probe-5', 'distance-1', 'distance-2', 'distance-3', 'distance-4', 'distance-5']

Types:
mun_key           str
probe-1         int64
probe-2         int64
probe-3         int64
probe-4         int64
probe-5         int64
distance-1    float64
distance-2    float64
distance-3    float64
distance-4    float64
distance-5    float64
dtype: object



Unnamed: 0,mun_key,probe-1,probe-2,probe-3,probe-4,probe-5,distance-1,distance-2,distance-3,distance-4,distance-5
0,81110000000,19869,22109,21574,19836,19837,0.219192,0.366082,0.396241,0.403501,0.403501
1,82120000000,11194,52831,21421,21441,896,0.350609,0.350778,0.399705,0.422199,0.450911
2,83110000000,1013945,11729,1004771,1010559,29215,0.660241,1.966321,3.476046,4.190831,6.377792
3,84375001047,52839,19884,51094,52995,11296,11.799138,15.042199,24.252101,24.252101,26.778651
4,81255007103,29222,18811,25329,81,29951,16.59288,16.720524,20.260771,20.842327,21.648458


In [55]:
# Load municipality sample

import pandas as pd

df_mun_sample = pd.read_csv(
    '../data/processed/municipalities_full_sample.csv',
    dtype={
        'mun_key': str
        }
    )
print(f"Cols: {df_mun_sample.columns.tolist()}\n")
print(f"Types:\n{df_mun_sample.dtypes}\n")
df_mun_sample.head()

Cols: ['mun_key', 'sample_type', 'mun_name_short', 'mun_pop_cen22', 'state', 'lat', 'lon']

Types:
mun_key               str
sample_type           str
mun_name_short        str
mun_pop_cen22       int64
state                 str
lat               float64
lon               float64
dtype: object



Unnamed: 0,mun_key,sample_type,mun_name_short,mun_pop_cen22,state,lat,lon
0,81110000000,top,Stuttgart,612663,Baden-Württemberg,48.782703,9.182863
1,82120000000,top,Karlsruhe,309050,Baden-Württemberg,49.00675,8.393843
2,83110000000,top,Freiburg im Breisgau,237460,Baden-Württemberg,47.993854,7.846758
3,84375001047,bottom,Hettingen,1861,Baden-Württemberg,48.218306,9.233478
4,81255007103,bottom,Widdern,1782,Baden-Württemberg,49.316383,9.415555


In [56]:
# Reshape probe sample to long format

import pandas as pd

df_probes_sample_long = df_probes_sample.melt(id_vars='mun_key', var_name='temp', value_name='value')

df_probes_sample_long[['col', 'num']] = df_probes_sample_long['temp'].str.rsplit('-', n=1, expand=True)

df_probes_sample_long = df_probes_sample_long.drop(columns='temp').pivot_table(
    index=['mun_key', 'num'], columns='col', values='value', aggfunc='first'
).reset_index().drop(columns='num').rename_axis(None, axis=1)

df_probes_sample_long = df_probes_sample_long.rename(columns={'probe': 'probe_id'})
df_probes_sample_long['probe_id'] = df_probes_sample_long['probe_id'].astype('int64')

print(f"Cols: {df_probes_sample_long.columns.tolist()}\n")
print(f"Types:\n{df_probes_sample_long.dtypes}\n")
df_probes_sample_long.head()

Cols: ['mun_key', 'distance', 'probe_id']

Types:
mun_key         str
distance    float64
probe_id      int64
dtype: object



Unnamed: 0,mun_key,distance,probe_id
0,10010000000,0.746746,28729
1,10010000000,9.620536,19632
2,10010000000,16.84858,1013717
3,10010000000,31.31868,50680
4,10010000000,35.4117,4879


In [57]:
# Write to csv

df_probes_sample_long.to_csv(
    '../data/processed/probes_sample_long.csv',
    index=False
)

print(f"Saved df_probes_sample_long to file.")
print(f"Shape: {df_probes_sample_long.shape}")

Saved df_probes_sample_long to file.
Shape: (600, 3)


### Join

In [58]:
# Join df_probes_sample_long with df_results on probe_id
df_merged = df_probes_sample_long.merge(
    df_results, 
    left_on='probe_id', 
    right_on='prb_id', 
    how='left'
)

# Join with df_mun_sample on mun_key
df_merged = df_merged.merge(
    df_mun_sample, 
    on='mun_key', 
    how='left'
)

# Drop all without probe_id (no match in results)
df_merged = df_merged.dropna(subset=['prb_id'])  # use col name from df "results"

# Rename lat and lon for clarity
df_merged = df_merged.rename(columns={
    'lat': 'mun_lat',
    'lon': 'mun_lon'
})

# Fix dtypes
df_merged = df_merged.astype({  # nullable integers (Int64) for graceful handling of NaN
    'mun_key': 'str',
    'distance': 'float64',
    'probe_id': 'int64',
    'msm_id': 'Int64',
    'type': 'str',
    'step': 'str',
    'proto': 'str',
    'dst_addr': 'str',
    'dst_name': 'str',
    'af': 'Int64',
    'size': 'Int64',
    'avg_rtt': 'float64',
    'min_rtt': 'float64',
    'med_rtt': 'float64',
    'max_rtt': 'float64',
    'sent': 'Int64',
    'rcvd': 'Int64',
    'lost': 'Int64',
    'ttl': 'Int64',
    'sample_type': 'str',
    'mun_name_short': 'str',
    'mun_pop_cen22': 'int64',
    'state': 'str',
    'mun_lat': 'float64',
    'mun_lon': 'float64'
})

print(f"Cols: {df_merged.columns.tolist()}\n")
print(f"Shape: {df_merged.shape}\n")
df_merged.head()


Cols: ['mun_key', 'distance', 'probe_id', 'msm_id', 'type', 'step', 'proto', 'prb_id', 'dst_addr', 'dst_name', 'af', 'size', 'avg_rtt', 'min_rtt', 'med_rtt', 'max_rtt', 'sent', 'rcvd', 'lost', 'ttl', 'sample_type', 'mun_name_short', 'mun_pop_cen22', 'state', 'mun_lat', 'mun_lon']

Shape: (38, 26)



Unnamed: 0,mun_key,distance,probe_id,msm_id,type,step,proto,prb_id,dst_addr,dst_name,...,sent,rcvd,lost,ttl,sample_type,mun_name_short,mun_pop_cen22,state,mun_lat,mun_lon
245,81350020020,22.270402,1506,155991104,ping,,ICMP,1506.0,16.16.206.178,aws-sto.ncc.dock.ee,...,5,5,0,52.0,median,Herbrechtingen,13238,Baden-Württemberg,48.621403,10.176871
248,81350020020,25.714412,53353,155991104,ping,,ICMP,53353.0,16.16.206.178,aws-sto.ncc.dock.ee,...,5,5,0,53.0,median,Herbrechtingen,13238,Baden-Württemberg,48.621403,10.176871
280,91620000000,0.369909,1010220,155991104,ping,,ICMP,1010220.0,16.16.206.178,aws-sto.ncc.dock.ee,...,5,0,5,,top,München,1505005,Bayern,48.135125,11.581981
281,91620000000,0.369909,1010221,155991104,ping,,ICMP,1010221.0,16.16.206.178,aws-sto.ncc.dock.ee,...,5,5,0,52.0,top,München,1505005,Bayern,48.135125,11.581981
282,91620000000,0.376944,1000792,155991104,ping,,ICMP,1000792.0,16.16.206.178,aws-sto.ncc.dock.ee,...,5,5,0,56.0,top,München,1505005,Bayern,48.135125,11.581981


In [60]:
# Write to csv

df_merged.to_csv(
    f'../data/processed/df_merged_{atlasregion}.csv',
    index=False
)

print(f"Saved to file.")
print(f"Shape: {df_merged.shape}")

Saved to file.
Shape: (38, 26)


In [None]:
# Combine ../data/processed/df_merged_aws-sto.csv and df_merged_aws-fra.csv into one df_merged_all