In [1]:
import re
import pandas as pd

# 1. Matching flight plan callsigns with ADS-B data

In [2]:
flight_plan = pd.read_csv("data/extract_edyy.csv")

# The three time columns share the same raw layout: the value may end with a
# space followed by three digits (presumably milliseconds -- confirm against
# the export format). That trailing group is rewritten as ".ddd" so that
# pd.to_datetime can parse it; values without the group pass through unchanged.
_FLIGHT_PLAN_TIME_COLUMNS = [
    "timestamp",  # last message received by the NM
    "flightData.flightId.keys.estimatedOffBlockTime",  # estimated departure time
    "flightData.estimatedTimeOfArrival",  # estimated arrival time
]

for _time_col in _FLIGHT_PLAN_TIME_COLUMNS:
    flight_plan[_time_col] = pd.to_datetime(
        flight_plan[_time_col].str.replace(r" (\d{3})$", r".\1", regex=True)
    )

flight_plan.columns

Index(['timestamp', 'flightData.flightId.keys.aircraftId',
       'flightData.flightId.keys.aerodromeOfDestination',
       'flightData.flightId.keys.estimatedOffBlockTime',
       'flightData.flightId.keys.aerodromeOfDeparture',
       'flightData.flightId.id', 'flightData.estimatedTimeOfArrival',
       'flightData.aircraftAddress', 'flightData.icaoRoute'],
      dtype='object')

In [3]:
# Positional row indices of the earliest estimated off-block time and of the
# latest estimated time of arrival in the flight-plan extract.
earliest_eobt_pos = flight_plan["flightData.flightId.keys.estimatedOffBlockTime"].argmin()
latest_eta_pos = flight_plan["flightData.estimatedTimeOfArrival"].argmax()
print(earliest_eobt_pos, latest_eta_pos, sep="\n")

77
416


In [4]:
# Inspect the flight plan with the latest estimated time of arrival. The row
# is looked up by position via argmax() instead of a hard-coded index, so the
# cell keeps pointing at the right row if the extract changes.
flight_plan.iloc[flight_plan["flightData.estimatedTimeOfArrival"].argmax()]

timestamp                                                                 2025-08-08 23:55:44.031000
flightData.flightId.keys.aircraftId                                                           CCA865
flightData.flightId.keys.aerodromeOfDestination                                                 LEMD
flightData.flightId.keys.estimatedOffBlockTime                                   2025-08-08 23:00:00
flightData.flightId.keys.aerodromeOfDeparture                                                   ZBAA
flightData.flightId.id                                                                    AA73820119
flightData.estimatedTimeOfArrival                                                2025-08-09 10:02:00
flightData.aircraftAddress                                                                    7811A2
flightData.icaoRoute                               K0956S0920 IDKEX B339 ASILA A575 INTIK/K0932S0...
Name: 416, dtype: object

In [5]:
# Accepted callsign shape: three letters, at least one digit, then optional
# trailing letters (e.g. "KLM1568", "DLH5AK"). Compiled once, not per call.
_ICAO_CALLSIGN_RE = re.compile(r"[A-Z]{3}[0-9]+[A-Z]*")


def delete_bad_cs(flight):
    """Return ``flight`` if its callsign looks like an ICAO callsign, else None.

    Parameters
    ----------
    flight : object
        Expected to be a ``traffic.core.Flight``; anything else yields None.

    Returns
    -------
    Flight or None
        The same ``flight`` object when its callsign, stripped of surrounding
        whitespace and upper-cased, is three letters followed by one or more
        digits and optional trailing letters. None otherwise.

    Notes
    -----
    A callsign that is not a plain string is rejected instead of raising
    ``AttributeError`` on ``.strip()``. NOTE(review): traffic's
    ``Flight.callsign`` can presumably be None or a set of callsigns --
    confirm against the library documentation.
    """
    # Local import, as in the original cell: Flight is not imported at the top.
    from traffic.core import Flight

    if not isinstance(flight, Flight):
        return None

    cs = flight.callsign
    if not isinstance(cs, str):
        return None

    return flight if _ICAO_CALLSIGN_RE.fullmatch(cs.strip().upper()) else None

In [6]:
from datetime import datetime, timezone, timedelta
from traffic.data import eurofirs
from traffic.data import opensky
from pyopensky.schema import StateVectorsData4
from shapely.geometry import Polygon as ShapelyPolygon
from traffic.core import Traffic

# Load the cached ADS-B extract from disk. NOTE: despite the `_df` suffix this
# is expected to be a traffic `Traffic` object, not a pandas DataFrame.
# The commented-out code below is the OpenSky download that wrote this
# parquet file; it is kept for reference only.
adsb_df = Traffic.from_file("data/extract_edyy.parquet")

# start_adsb = datetime(2025, 8, 8, 0, 0, 0, tzinfo=timezone.utc)
# stop_adsb  = datetime(2025, 8, 8, 23, 59, 59, tzinfo=timezone.utc)

# delta_coords = [
#     (53.454167, 3.606111),
#     (52.733333, 5.583333),
#     (52.663333, 7.168889),
#     (51.193611, 5.521389),
#     (51.607778, 3.171944),
#     (51.480556, 3.171944),
#     (51.636944, 2.500000),
#     (51.455556, 2.500000),
#     (51.500000, 2.000000),
#     (51.950556, 2.356389)
# ]
# delta_geom = ShapelyPolygon([(lon, lat) for lat, lon in delta_coords])
# bbox = delta_geom.bounds

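# # Server-side filters:
# # keep only rows at or above 10 000 m (just below FL330 ≈ 10 058 m).
# # NOTE: alt_pred is defined here but never passed to opensky.history() below.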
# alt_pred = (StateVectorsData4.baroaltitude >= 10000)

# # Only fetch columns you need
# cols = [
#     StateVectorsData4.time,         # epoch seconds
#     StateVectorsData4.lat,
#     StateVectorsData4.lon,
#     StateVectorsData4.baroaltitude,
#     StateVectorsData4.velocity,
#     StateVectorsData4.heading,
#     StateVectorsData4.icao24,
#     StateVectorsData4.callsign,
# ]

# # Chunk the day into smaller windows
# chunk_minutes = 60
# chunks = pd.date_range(start_adsb, stop_adsb, freq=f"{chunk_minutes}min", inclusive="left")

# collected = []

# for t0 in chunks:
#     t1 = min(t0 + timedelta(minutes=chunk_minutes), stop_adsb)

#     trf = opensky.history(
#         start=t0,
#         stop=t1,
#         bounds=bbox,                       # tuple bbox -> cheaper for the backend
#         selected_columns=cols,             # fewer columns to ship
#     )

#     if trf is None:
#         continue

#     collected.append(trf)

# # Concatenate all chunks into one Traffic
# if collected:
#     adsb_df = collected[0]
#     for tr in collected[1:]:
#         adsb_df = adsb_df + tr
# else:
#     adsb_df = None

# if adsb_df is not None:
#     df = adsb_df.data
#     df.to_parquet("data/extract_edyy.parquet")
# else:
#     print("No data returned for the requested period/filters.")

In [7]:
# Flight-plan identifiers as sets: each membership test is a hash lookup
# instead of a linear scan of the column for every aircraft / callsign.
fp_addresses = set(flight_plan["flightData.aircraftAddress"].dropna())
fp_callsigns = set(flight_plan["flightData.flightId.keys.aircraftId"].dropna())

# ADS-B transponder addresses are upper-cased before being compared with the
# flight-plan aircraft addresses.
icao24_with_fp = [
    icao24 for icao24 in adsb_df.icao24 if icao24.upper() in fp_addresses
]

# Callsigns seen in ADS-B that also appear in a flight plan.
cs_with_fp = [cs for cs in adsb_df.callsigns if cs in fp_callsigns]

filtered_adsb = adsb_df[cs_with_fp]

In [8]:
# Time span covered by the ADS-B samples of flights that have a flight plan.
start = filtered_adsb.data.timestamp.min()
stop = filtered_adsb.data.timestamp.max()

print(start, stop, sep="\n")

2025-08-08 01:24:01+00:00
2025-08-08 23:59:59+00:00


# Detecting the matched callsigns within ATC communications

In [9]:
import json

def save_transcripts_json(transcription, path):
    """Write nested transcripts to ``path`` as indented JSON.

    Parameters
    ----------
    transcription : iterable of iterables of (timestamp, text) pairs
        Each timestamp must provide ``isoformat()``; it is stored as that
        ISO string so the structure becomes JSON-serialisable.
    path : str or path-like
        Destination file, opened in text write mode (overwritten if present).
    """
    payload = []
    for group in transcription:
        rows = []
        for stamp, text in group:
            # datetime -> ISO-8601 string
            rows.append((stamp.isoformat(), text))
        payload.append(rows)

    with open(path, "w") as handle:
        json.dump(payload, handle, indent=2)
        
def load_transcripts_json(path):
    """Read transcripts from a JSON file of nested [iso_timestamp, text] pairs.

    Parameters
    ----------
    path : str or path-like
        JSON file to read.

    Returns
    -------
    list of list of (datetime, str)
        Same nesting as the file, with every ISO timestamp string parsed by
        ``datetime.fromisoformat`` and every pair returned as a tuple.
    """
    with open(path) as handle:
        raw_groups = json.load(handle)

    restored = []
    for group in raw_groups:
        # ISO-8601 string -> datetime
        restored.append(
            [(datetime.fromisoformat(stamp), text) for stamp, text in group]
        )
    return restored
        
def save_callsign_comms_json(callsign_communications, path):
    """Write the per-callsign communication records to ``path`` as indented JSON.

    Parameters
    ----------
    callsign_communications : mapping
        Maps a callsign to an iterable of entries. Each entry must have the
        keys ``"timestamp"`` (an object with ``isoformat()``), ``"sentence"``,
        ``"detected"`` and ``"score"``; only these four keys are written.
    path : str or path-like
        Destination file, opened in text write mode (overwritten if present).
    """

    def _to_record(entry):
        # datetime -> ISO-8601 string; the other fields are copied as-is.
        return {
            "timestamp": entry["timestamp"].isoformat(),
            "sentence": entry["sentence"],
            "detected": entry["detected"],
            "score": entry["score"],
        }

    payload = {}
    for callsign, entries in callsign_communications.items():
        payload[callsign] = [_to_record(entry) for entry in entries]

    with open(path, "w") as handle:
        json.dump(payload, handle, indent=2)
    
def load_callsign_comms_json(path):
    """Read per-callsign communication records from a JSON file.

    Parameters
    ----------
    path : str or path-like
        JSON file holding an object that maps a callsign to a list of entries.

    Returns
    -------
    dict
        Same mapping as the file. Every entry is copied with its
        ``"timestamp"`` ISO string replaced by the ``datetime`` returned by
        ``datetime.fromisoformat``; all other keys are kept unchanged.
    """
    with open(path) as handle:
        raw = json.load(handle)

    restored = {}
    for callsign, entries in raw.items():
        parsed_entries = []
        for entry in entries:
            # Copy the entry, overriding only the timestamp (ISO -> datetime).
            parsed_entries.append(
                dict(entry, timestamp=datetime.fromisoformat(entry["timestamp"]))
            )
        restored[callsign] = parsed_entries
    return restored

In [None]:
from utils.cs_matching import build_timestamp_range, closest_callsign_at_time
from utils_old.process_muac_audio import build_audio_catalog, transcribe_catalog

# base_dir = "/store/kruu/atc_muac/"

# adsb_ranges = build_timestamp_range(adsb_df)
# catalog = build_audio_catalog(
#     base_dir,
#     ("2025","08","08"),
#     sectors=["muac_delta_middle", "muac_delta_high"],
#     start=start, 
#     stop=stop,
#     max_workers=128,         
#     show_progress=True,  
# )
# print(len(catalog), "files downloaded")

# transcripts = transcribe_catalog(catalog)
# save_transcripts_json(transcripts, "data/transcripts_edyy.json")

In [12]:
from collections import defaultdict
from utils.cs_matching import merge_callsign_entities
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

# tokenizer_ner = AutoTokenizer.from_pretrained("Jzuluaga/bert-base-ner-atc-en-atco2-1h")
# model_ner = AutoModelForTokenClassification.from_pretrained("Jzuluaga/bert-base-ner-atc-en-atco2-1h")
# nlp = pipeline('ner', model=model_ner, tokenizer=tokenizer_ner, aggregation_strategy="first")

# # Nested lists can appear if VAD split
# transcript_flat = [item for sublist in transcripts for item in sublist]

# callsign_communications = defaultdict(list)

# for i, line in enumerate(transcript_flat):
#     timestamp = line[0]
#     sentence = line[1]
    
#     atco2_res = nlp(sentence)
#     atco2_res = merge_callsign_entities(atco2_res)
#     callsigns = [e for e in atco2_res if e['entity_group'] == 'callsign']
    
#     result = closest_callsign_at_time(
#         ner_callsigns = callsigns,               
#         traffic = adsb_df, # all adsb_data fetched to not have wrongly associated cs
#         adsb_ranges = adsb_ranges,
#         comm_time = timestamp,
#         match_threshold = 0.7,
#         # sector_geom = delta_geom,              
#         time_tolerance_s = 60,      
#     )
    
#     if result:
#         callsign_communications[result["best_context_match"]].append({
#             "timestamp": timestamp,
#             "sentence": sentence,
#             "detected": result["ner_detected_callsign"],
#             "score": result["match_score"]
#         })

# save_callsign_comms_json(callsign_communications, "data/callsign_communications_match.json")

In [13]:
# Reload the cached results from disk. These two files are the ones written by
# the commented-out transcription and callsign-matching cells above
# (save_transcripts_json / save_callsign_comms_json).
transcripts = load_transcripts_json("data/transcripts_edyy.json")
callsign_communications = load_callsign_comms_json("data/callsign_communications_match.json")

In [21]:
# Keep the matched communications only for callsigns that also have a flight
# plan, then persist that subset.
com_with_fp = {
    callsign: callsign_communications[callsign]
    for callsign in cs_with_fp
    if callsign in callsign_communications
}

save_callsign_comms_json(com_with_fp, "data/callsign_communications_flight_plan.json")

In [25]:
# Select the ADS-B flights whose callsign is a key of com_with_fp, i.e. has
# both a flight plan and an entry in the matched ATC communications.
adsb_match_with_fp = adsb_df[com_with_fp.keys()]

In [24]:
# Display the selected flights (the notebook renders a per-flight summary table).
adsb_match_with_fp

Unnamed: 0_level_0,Unnamed: 1_level_0,count
icao24,callsign,Unnamed: 2_level_1
43f6c4,2JEFF,3677
48548d,KLM1568,2190
485123,TRA8R,2162
3c6450,DLH5AK,1789
40643c,EZY87PC,1783
440c38,EJU76MB,1699
47c0d9,NSZ5085,1600
4ca73f,RYR4M,1579
39856d,AFR15HA,1547
4d2212,RYR9166,1521


In [26]:
# Display the matched communications per callsign.
# NOTE(review): this renders the entire dict, including every transcript
# sentence; consider showing a summary (e.g. entry counts per callsign).
com_with_fp

{'AFR96QH': [{'timestamp': datetime.datetime(2025, 8, 8, 20, 1, 44, tzinfo=datetime.timezone.utc),
   'sentence': 'standing three control easy eight six four romeo yes just keep still and fly easy eight six five klm one two four kilo easy five six six five hotel going to liqal air france nine six quebec hotel good evening reaching three six zero down climbing to the right hand side three seven yankee yankee on course climbing flight level two three zero',
   'detected': 'eight six five one two four kilo easy five six six five france nine six quebec hotel',
   'score': 0.75},
  {'timestamp': datetime.datetime(2025, 8, 8, 20, 4, 58, tzinfo=datetime.timezone.utc),
   'sentence': 'hotel bravo ryan air nine six nine thank you request ahead one two two decimal two papa romeo papa romeo papa romeo now use affirmative air france nine six quebec hotel we re looking for flight level three eight zero three nine zero ok that s copied for now maintain flight level three six zero high altitude and d