Normalize transcript lines:
- Lowercase
- Remove punctuation
- Tokenize (split into words)

Normalize callsigns:
- Convert all to lowercase
- Optionally split alphanumerics (e.g., LOT3YM → lot 3 y m)

Match lines to callsigns:
- If a line contains all parts of a normalized callsign, assign it
- If multiple matches, assign all (or pick the best using priority rules)

In [None]:
NATO = {
    'a': 'alpha', 'b': 'bravo', 'c': 'charlie', 'd': 'delta', 'e': 'echo',
    'f': 'foxtrot', 'g': 'golf', 'h': 'hotel', 'i': 'india', 'j': 'juliett',
    'k': 'kilo', 'l': 'lima', 'm': 'mike', 'n': 'november', 'o': 'oscar',
    'p': 'papa', 'q': 'quebec', 'r': 'romeo', 's': 'sierra', 't': 'tango',
    'u': 'uniform', 'v': 'victor', 'w': 'whiskey', 'x': 'xray', 'y': 'yankee', 'z': 'zulu'
}

# To be completed
CALLSIGN_PREFIXES = {
    "AF": "air france",
    "LOT": "lot",
    "WZZ": "wizz air",
    "UAE": "emirates",
    "DLH": "lufthansa",
    "QTR": "qatar",
    "TAP": "tap air",
    "TVP": "jet travel",  
    "ENT": "enter",
    "MGH": "mavi",
    "FIN": "finnair",
    "MOC": "moc"
}

In [11]:
import re

def normalize(text):
    return re.sub(r"[^a-z0-9 ]", "", text.lower())

def split_callsign(cs):
    # "LOT3YM" -> ['lot', '3', 'y', 'm']
    return re.findall(r'[a-z]+|\d+', cs.lower())

def callsign_to_words(callsign):
    prefix = CALLSIGN_PREFIXES.get(callsign[:3], callsign[:3].lower())
    numbers_letters = callsign[3:]
    parts = []

    for char in numbers_letters:
        if char.isdigit():
            parts.append(char)
        elif char.isalpha():
            parts.append(NATO[char.lower()])

    return prefix + " " + " ".join(parts)

def identify_callsigns_transcript(transcript_lines, known_callsigns):
    callsign_variants = {
        cs: callsign_to_words(cs) for cs in known_callsigns
    }
    
    print(callsign_variant)

    matches = []
    for line in transcript_lines:
        norm_line = re.sub(r"[^a-z0-9 ]", "", line.lower())
        matched = [cs for cs, variant in callsign_variants.items() if variant in norm_line]
        matches.append({"line": line, "callsigns": matched})

    return matches


In [12]:
with open("transcripts/EPWA-epwa_app-Jun-23-2025-1000Z.txt") as f:
    lines = [line.strip() for line in f if line.strip()]

callsigns = ['UAE98X',  'WZZ495',  'LOT6KG',  'QTR51V',  'LOT8WZ',   'LOT5N',  'LOT266',
 'WZZ8241',  'LOT346',  'MGH871', 'WZZ27MF',  'LOT2LP',  'LOT3YM', 'ENT4088',
 'ENT76WP', 'FIN1143',  'LOT252',   'LOT6E',  'LOT2KT',    'LOT4', 'WZZ38KV',
  'LOT2XZ', 'WZZ6442',  'LOT318',  'LOT4LV',  'LOT6AT', 'TVP7465',  'LOT454',
 'LOT6358', 'WZZ17JZ',  'LOT3MH', 'ENT71PC',  'LOT2CH',  'LOT48F', 'TAP120Y',
  'LOT3EJ',  'LOT192',  'LOT282',  'LOT6MJ', 'LOT6126',  'DLH4LK',  'LOT3VC',
  'LOT3LN',   'MOCNY', 'LOT3908', 'TVP7405']
matches = identify_callsigns_in_transcript(lines, callsigns)

for match in matches:
    print(f"{match['line']} -> {match['callsigns']}")

thousand looking for traffic lot five tango alfa -> []
ruzyne three seven delta bravo contact radar one two five zero five five -> []
one two five zero five five wizz air three seven delta bravo -> []
approach czech air force zero eight eight descending seven thousand feet -> []
tango four zero eight eight qality descend seven thousand qnh one zero one zero traffic below -> []
descending seven thousand qnh one zero one zero copy that turkish four zero eight eight -> []
and air force zero eight eight descend altitude five thousand feet -> []
descending five thousand and turn four zero eight eight -> []
six five tango alfa for contact approach one two five decimal zero five -> []
one two five zero five five lot five tango -> []
approach vietnam lot three mike hotel heading three three zero -> []
lot three mike hotel approach descend altitude four thousand feet four thousand lot three mike hotel -> []
and jet air four zero eight eight descend altitude three thousand feet -> []
descending 