In [None]:
import os
import warnings

# Hide warnings whose message begins with this text (the `message` argument
# is a regular expression matched against the start of the warning message).
warnings.filterwarnings("ignore", message="FP16 is not supported on CPU*")

# The data paths used further down are relative, so show where we are running.
print(os.getcwd())

/home/kruu/git_folder/atc_clearences


# IDEAS
The fine-tuned model works quite well for transcribing ATC data.
- It can be hard to distinguish the pilot from the ATCO. Can we train an additional classifier to recognise pilot vs. ATCO?
- Can we train a model to detect specific clearances in the communication? (e.g. cleared ILS?)
- EPWA App1 (128.805) is of good quality ! What is the difference with App2 (125.055) ?
- Maybe add an indicator about the hour of each message (start recording + message timestamp)
- Associate the communications with the aircraft that are in the airspace at the moment
- Then keywords for specific clearances

****
# Voice Activity Detection
****

In [2]:
import wave
import webrtcvad
from pydub import AudioSegment

audio_path = "data/EPWA-App-May-15-2025-1330Z.mp3"
output_dir = "data/speech_segments_1330"
temp_wav_path = "notebooks/temp_audio.wav"
os.makedirs(output_dir, exist_ok=True)

# Voiced runs shorter than this are discarded.
# Minimum duration = 1 second = 16000 samples for 16kHz
min_duration_samples = 16000  # 1 second


def _iter_voiced_runs(pcm, frame_bytes, is_speech):
    """Yield every maximal run of consecutive speech frames as ``bytes``.

    ``pcm`` is cut into fixed-size frames of ``frame_bytes`` bytes; a trailing
    partial frame is ignored. ``is_speech`` is called with each frame and must
    return a truthy value for speech. A run that is still open when the audio
    ends is yielded as well, so the last transmission is not lost.
    """
    run = bytearray()
    for offset in range(0, len(pcm) - frame_bytes + 1, frame_bytes):
        frame = pcm[offset:offset + frame_bytes]
        if is_speech(frame):
            run += frame
        elif run:
            yield bytes(run)
            run = bytearray()
    if run:
        # Audio ended in the middle of speech: flush the open run.
        yield bytes(run)


try:
    # Convert MP3 to a 16 kHz mono WAV that the VAD can consume
    # (the code below assumes 16-bit samples, i.e. 2 bytes per sample).
    audio = AudioSegment.from_file(audio_path).set_channels(1).set_frame_rate(16000)
    audio.export(temp_wav_path, format="wav")

    # Read WAV file as bytes
    with wave.open(temp_wav_path, 'rb') as wf:
        sample_rate = wf.getframerate()
        frames = wf.readframes(wf.getnframes())

    vad = webrtcvad.Vad(2)  # Aggressiveness mode, 0 (least) to 3 (most aggressive filtering)
    frame_duration = 30  # ms
    frame_bytes = int(sample_rate * frame_duration / 1000) * 2  # 16-bit samples
    min_chunk_bytes = min_duration_samples * 2  # 2 bytes per sample (16-bit audio)

    i = 0
    voiced_runs = _iter_voiced_runs(
        frames, frame_bytes, lambda frame: vad.is_speech(frame, sample_rate)
    )
    for chunk in voiced_runs:
        if len(chunk) < min_chunk_bytes:
            continue
        fname = os.path.join(output_dir, f"speech_{i:03d}.wav")
        segment = AudioSegment(
            data=chunk,
            sample_width=2,      # 16-bit
            frame_rate=sample_rate,
            channels=1
        )
        segment.export(fname, format="wav")
        i += 1
        print(f"Saved chunk {i} to {fname} ({len(chunk) / 2 / sample_rate:.2f}s)")
finally:
    # Always clean up the intermediate WAV, even if conversion or VAD failed.
    if os.path.exists(temp_wav_path):
        os.remove(temp_wav_path)
        print(f"Deleted: {temp_wav_path}")
    else:
        print(f"File not found: {temp_wav_path}")

Saved chunk 1 to data/speech_segments_1330/speech_000.wav (5.01s)
Saved chunk 2 to data/speech_segments_1330/speech_001.wav (5.94s)
Saved chunk 3 to data/speech_segments_1330/speech_002.wav (1.59s)
Saved chunk 4 to data/speech_segments_1330/speech_003.wav (5.37s)
Saved chunk 5 to data/speech_segments_1330/speech_004.wav (3.90s)
Saved chunk 6 to data/speech_segments_1330/speech_005.wav (3.69s)
Saved chunk 7 to data/speech_segments_1330/speech_006.wav (3.03s)
Saved chunk 8 to data/speech_segments_1330/speech_007.wav (5.19s)
Saved chunk 9 to data/speech_segments_1330/speech_008.wav (2.55s)
Saved chunk 10 to data/speech_segments_1330/speech_009.wav (2.61s)
Saved chunk 11 to data/speech_segments_1330/speech_010.wav (4.47s)
Saved chunk 12 to data/speech_segments_1330/speech_011.wav (5.79s)
Saved chunk 13 to data/speech_segments_1330/speech_012.wav (3.18s)
Saved chunk 14 to data/speech_segments_1330/speech_013.wav (2.22s)
Saved chunk 15 to data/speech_segments_1330/speech_014.wav (6.42s)
Save

**** 
# Whisper
****

In [4]:
import whisper

# Transcribe every extracted speech segment with the stock Whisper "turbo" model.
model = whisper.load_model("turbo")

# Sorted so that the transcripts follow the speech_NNN numbering.
wav_names = [name for name in sorted(os.listdir(output_dir)) if name.endswith(".wav")]

results = []
for wav_name in wav_names:
    transcript = model.transcribe(os.path.join(output_dir, wav_name), language="en")
    results.append(transcript)
    print(f"{wav_name}: {transcript['text']}")

speech_000.wav:  Flot 6 Yankee Mike continue left on heading 360 to intercept NLS 3, 3, Cliff Approach, Port Establish.
speech_001.wav:  Left 360 intercept, left 360, Kalina, South, 3, Yankee, 6, Yankee, Mike.
speech_002.wav:  Good morning, Rossblood.
speech_003.wav:  K9C3NC continue descent altitude 5000 feet 5000NC
speech_004.wav:  Drop 1 km, continue, present heading, descent altitude 3000 feet.
speech_005.wav:  Continue present heading descent 3000 feet, lot 1 km.
speech_006.wav:  Establish local island 06 Yankee Mike.
speech_007.wav:  Contact Tower 1180305, thank you, Dominic. Tower, hello, flight, thank you.
speech_008.wav:  Lot 1 km, turn left hearing 060
speech_009.wav:  Left heading 060, left 1 km.
speech_010.wav:  Flot 1 km left and heading 360 to intercept IS33, please approach for established.
speech_011.wav:  Left heading 360, cleared intercept, ILS 31 established, land, Granite, VT, lot 1, KW.
speech_012.wav:  Block 3, November Charlie, descent altitude 3000 feet.
speech_

****
# Whisper ATC
****

Typical phrases used by controllers for approach clearances:
- “cleared ILS runway 14”
- “cleared GLS approach runway 28”
- “cleared RNP approach runway 32”
- “expect ILS 14”
- “via [STAR] and [approach type]”

In [11]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch
import torchaudio
from pydub import AudioSegment

In [6]:
# Both the weights and the processor are loaded from the same checkpoint id.
ATC_MODEL_ID = "jacktol/whisper-medium.en-fine-tuned-for-ATC"

model = WhisperForConditionalGeneration.from_pretrained(ATC_MODEL_ID)
processor = WhisperProcessor.from_pretrained(ATC_MODEL_ID)

In [9]:
# Transcribe every extracted speech segment with the ATC fine-tuned model.
results_atc = []

for f in sorted(os.listdir(output_dir)):
    if not f.endswith(".wav"):
        continue

    # Dedicated names: do not overwrite the notebook-level `audio_path`
    # (source MP3) and `sample_rate` set by the VAD cell.
    segment_path = os.path.join(output_dir, f)
    waveform, segment_rate = torchaudio.load(segment_path)

    # torchaudio returns (channels, time). Average the channels so that the
    # processor always receives one 1-D waveform; a plain squeeze() would
    # leave a 2-D tensor for multi-channel files.
    waveform = waveform.mean(dim=0)
    if segment_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, orig_freq=segment_rate, new_freq=16000)

    input_features = processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt").input_features
    generated_ids = model.generate(input_features)
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    results_atc.append(transcription)
    print(f"{f}: {transcription}")

lot six yankee mike continue left turn heading three six zero to intercept ils three three cleared for ils approach report established
left three six zero intercept ils three six kilo established with three yankee six yankee mike
approaching number three november charlie
ryan air three november charlie continue descend altitude five thousand feet five thousand three november charlie
lot one kilo mike continue present heading descend altitude three thousand feet
continue present heading descend three thousand feet lot one kilo mike
established localizer zero lux six yankee mike
contact tower one one eight decimal three zero five yankee lima
lot one kilo mike turn left heading zero six zero
left heading zero six zero lot one kilo mike
lot one kilo mike left turn heading three six zero to intercept ils three three cleared for approach report established
left heading three six zero cleared intercept ils established runway three three lot one kilo mike
lot three november charlie descend alt

In [None]:
def is_potential_approach_clearance(line):
    """Return True if ``line`` mentions an approach type as a whole word.

    The approach types looked for are ``gls``, ``vor``, ``rnp`` and ``ils``,
    case-insensitively. Matching is done on word boundaries so that ordinary
    words that merely contain one of them (e.g. "details", "favor") are not
    reported.

    Args:
        line: One transcribed message.

    Returns:
        bool: True when at least one approach-type keyword is present.
    """
    # Imported here so the function works even before the notebook's later
    # `import re` cell has been executed.
    import re

    return re.search(r"\b(?:gls|vor|rnp|ils)\b", line.lower()) is not None

# Keep only the ATC transcripts that mention an approach type, then show them.
filtered_lines = list(filter(is_potential_approach_clearance, results_atc))

if filtered_lines:
    print("\n".join(filtered_lines))

lot six yankee mike continue left turn heading three six zero to intercept ils three three cleared for ils approach report established
left three six zero intercept ils three six kilo established with three yankee six yankee mike
lot one kilo mike left turn heading three six zero to intercept ils three three cleared for approach report established
left heading three six zero cleared intercept ils established runway three three lot one kilo mike
lot three november charlie left turn heading three six zero cleared ils three three cleared for approach report established left three six zero cleared ils zulu three three call you lot three november charlie
lot five zero two left turn heading zero one zero to intercept ils three three cleared for approach report established
praha euro trans four kilo hotel continue descend altitude three thousand feet this heading is to intercept ils three three cleared ils approach for established
qality eight alfa uniform continue descend altitude three thou

In [None]:
import re

# Specific phraseology: an approach type right after "cleared"/"expect",
# "cleared approach", or an approach type following "via ...".
_STRICT_CLEARANCE_PATTERNS = (
    r"\bcleared\s+(ils|gls|rnp|rnav|v o r|n d b)\b",        # e.g. "cleared ILS"
    r"\bexpect\s+(ils|gls|rnp|rnav|v o r|n d b)\b",         # e.g. "expect ILS"
    r"\bcleared\s+approach\b",                              # general fallback
    r"\bvia.+(ils|gls|rnp|rnav)\b",                         # e.g. "via VEBIT 1A and ILS 14"
)

# Broad rule: any "cleared ..." or "expect ..." counts.
_BROAD_CLEARANCE_PATTERNS = (
    r"\bcleared\s",
    r"\bexpect\s",
)


def is_approach_clearance(line, strict=False):
    """Return True if ``line`` looks like an approach clearance.

    Args:
        line: One transcribed message; matching is case-insensitive.
        strict: When False (default), any message containing the word
            "cleared" or "expect" followed by whitespace matches. When True,
            only the specific phrasings in ``_STRICT_CLEARANCE_PATTERNS``
            match (e.g. "cleared ils", "expect rnp", "cleared approach",
            "via ... ils").

    Returns:
        bool: True when at least one pattern of the selected set matches.
    """
    patterns = _STRICT_CLEARANCE_PATTERNS if strict else _BROAD_CLEARANCE_PATTERNS
    text = line.lower()
    return any(re.search(pattern, text) for pattern in patterns)

# Select the ATC transcripts flagged as approach clearances.
approach_clearances = list(filter(is_approach_clearance, results_atc))

# Print each one followed by an empty line for readability.
for clearance in approach_clearances:
    print(f"{clearance}\n")

lot three november charlie left turn heading three six zero cleared ils three three cleared for approach report established left three six zero cleared ils zulu three three call you lot three november charlie

praha euro trans four kilo hotel continue descend altitude three thousand feet this heading is to intercept ils three three cleared ils approach for established

