In [12]:
import torch
import torchaudio
from transformers import AutoModelForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM

from datetime import datetime, timedelta
from pathlib import Path
import torchaudio.transforms as T

In [14]:

# Speech-recognition setup: which checkpoint to load and whether to decode
# through the `...WithLM` processor variant.
MODEL_ID = "Jzuluaga/wav2vec2-xls-r-300m-en-atc-atcosim"
USE_LM = False

# Acoustic model that produces the CTC logits.
model = AutoModelForCTC.from_pretrained(MODEL_ID)

# Both processor classes are loaded from the same checkpoint id; USE_LM only
# selects which class does the loading.
processor = (Wav2Vec2ProcessorWithLM if USE_LM else Wav2Vec2Processor).from_pretrained(MODEL_ID)

In [20]:
# Transcribe every .wav chunk in `chunk_dir` with the CTC model and collect
# one "<index> <transcription>" line per chunk in `results`.
chunk_dir = Path("chunks/EHAA-eham_rdr_124230-Jul-29-2025-1000Z")
chunk_files = sorted(chunk_dir.glob("*.wav"))
if not chunk_files:
    # glob() yields nothing for an empty or missing directory, which would
    # otherwise turn the loop below into a silent no-op.
    print(f"[transcribe] No .wav files found in {chunk_dir}")

# Sampling rate every chunk is converted to and that is declared to the processor.
TARGET_SAMPLE_RATE = 16000

# One Resample transform per distinct source rate, built on first use and
# reused for later chunks instead of being rebuilt on every iteration.
resamplers = {}

results = []
for i, path in enumerate(chunk_files):
    # torchaudio.load returns the audio as (channels, samples) by default.
    waveform, sample_rate = torchaudio.load(path)

    # Downmix to a single 1-D signal. A 2-D array handed to the processor is
    # interpreted as a batch, so a stereo file would be decoded as two
    # separate utterances. For mono input the mean over one channel leaves
    # the samples unchanged.
    waveform = waveform.mean(dim=0)

    # Resample if needed
    if sample_rate != TARGET_SAMPLE_RATE:
        if sample_rate not in resamplers:
            resamplers[sample_rate] = T.Resample(orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE)
        waveform = resamplers[sample_rate](waveform)
        sample_rate = TARGET_SAMPLE_RATE

    input_values = processor(
        waveform.numpy(), return_tensors="pt", sampling_rate=TARGET_SAMPLE_RATE
    ).input_values

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(input_values).logits

    if USE_LM:
        # The LM processor decodes directly from the logits.
        transcription = processor.batch_decode(logits.numpy()).text
    else:
        # Greedy decoding: pick the highest-scoring token id at each frame.
        pred_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(pred_ids)

    # `transcription` is the list returned by batch_decode (one entry for the
    # single utterance); it is formatted as-is, matching the recorded output.
    results.append(f"{i+1} {transcription}")
    print(f"[transcribe] Chunk {i+1}/{len(chunk_files)}: {transcription}")

[transcribe] Chunk 1/20: ['ahooimo you']
[transcribe] Chunk 2/20: ['opoyoubruo']
[transcribe] Chunk 3/20: ['']
[transcribe] Chunk 4/20: ['']
[transcribe] Chunk 5/20: ['o']
[transcribe] Chunk 6/20: ['oe']
[transcribe] Chunk 7/20: ['hauo']
[transcribe] Chunk 8/20: ['oo']
[transcribe] Chunk 9/20: ['']
[transcribe] Chunk 10/20: ['oo']
[transcribe] Chunk 11/20: ['o']
[transcribe] Chunk 12/20: ['']
[transcribe] Chunk 13/20: ['oo']
[transcribe] Chunk 14/20: ['eoyo']
[transcribe] Chunk 15/20: ['om']
[transcribe] Chunk 16/20: ['eoo']
[transcribe] Chunk 17/20: ['p']
[transcribe] Chunk 18/20: ['hagoook']
[transcribe] Chunk 19/20: ['haoboyoo']
[transcribe] Chunk 20/20: ['geieom oo']


# Named-entity recognition task

In [22]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

# Single source of truth for the NER checkpoint, in the same spirit as the
# MODEL_ID constant used for the speech model.
NER_MODEL_ID = "Jzuluaga/bert-base-ner-atc-en-atco2-1h"

tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_ID)
# NOTE(review): this rebinds `model`, which previously held the CTC speech
# model. Re-running the transcription cell after this one would pick up the
# token-classification model instead. Consider distinct names for the two.
model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_ID)

tokenizer_config.json:   0%|          | 0.00/406 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [24]:
from transformers import pipeline

# Token-classification pipeline built from the `model` and `tokenizer`
# currently bound in the notebook namespace.
nlp = pipeline(
    task='ner',
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="first",
)

# Demo call on one sample utterance; the returned entities are the cell's output.
nlp("yeah contact austrian information one one nine seven five good bye six three alfa")

Device set to use cpu


[{'entity_group': 'command',
  'score': np.float32(0.9996774),
  'word': 'contact',
  'start': 5,
  'end': 12},
 {'entity_group': 'callsign',
  'score': np.float32(0.9311979),
  'word': 'austrian information',
  'start': 13,
  'end': 33},
 {'entity_group': 'callsign',
  'score': np.float32(0.9998721),
  'word': 'one one nine seven five',
  'start': 34,
  'end': 57},
 {'entity_group': 'callsign',
  'score': np.float32(0.9138362),
  'word': 'six three alfa',
  'start': 67,
  'end': 81}]