In [4]:
# export_drone_wavs_bytes.py
# Write WAVs straight from Parquet without using torchcodec.
import os, glob
from datasets import load_dataset, Audio
from tqdm import tqdm

# ---- paths ----
# DATASET_PATH: folder that is searched for the Parquet shards.
# OUTPUT_ROOT:  folder that receives the exported audio files.
DATASET_PATH = r"D:/FYP/Datasets/Drone Datasets/drone-audio-detection-samples hf/data"
OUTPUT_ROOT  = r"D:/FYP/Datasets/Drone Datasets/drone-audio-detection-samples hf/extracted_data"
# ---------------

# One sub-folder per class below the output root.
out_drone, out_nodrone = (
    os.path.join(OUTPUT_ROOT, class_dir) for class_dir in ("drone", "nodrone")
)

# exist_ok=True keeps re-runs from failing when the folders are already there.
for _folder in (OUTPUT_ROOT, out_drone, out_nodrone):
    os.makedirs(_folder, exist_ok=True)

# Collect every *.parquet file anywhere below DATASET_PATH (recursive search).
_shard_pattern = os.path.join(DATASET_PATH, "**", "*.parquet")
parquet_files = glob.glob(_shard_pattern, recursive=True)
# Sort so the shard order (and therefore each shard's index) is deterministic.
parquet_files.sort()

# Nothing to export: stop the script with a message instead of continuing.
if len(parquet_files) == 0:
    raise SystemExit(f"No .parquet files found under: {DATASET_PATH}")
print(f"Found {len(parquet_files)} parquet shards.")

def _write_atomic(dest_path, payload):
    """Write ``payload`` (bytes) to ``dest_path`` without ever leaving a partial file there.

    The data is first written to a sibling ``<dest_path>.part`` file and only
    moved onto ``dest_path`` with ``os.replace`` once the write has finished.
    The resume check in the export loop tests ``os.path.exists(dest_path)``,
    so a file must only appear under its final name when it is complete;
    otherwise an interrupted write (disk full, Ctrl-C, ...) would leave a
    truncated file that every later run skips.

    Raises:
        Whatever ``open``/``write``/``os.replace`` raise; the temporary file is
        removed (best effort) before the exception is re-raised.
    """
    tmp_path = dest_path + ".part"
    try:
        with open(tmp_path, "wb") as fh:
            fh.write(payload)
        os.replace(tmp_path, dest_path)
    except BaseException:
        # Best-effort cleanup of the partial temp file; the original error is
        # what matters, so a failing cleanup is deliberately ignored.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise


# Number of files written during THIS run (files skipped by the resume check
# are not counted).
total_exported = 0

for shard_idx, parquet_path in enumerate(parquet_files):
    # Load one shard; IMPORTANT: decode=False to avoid audio backend entirely
    ds = load_dataset("parquet", data_files=parquet_path, split="train")
    ds = ds.cast_column("audio", Audio(decode=False))

    pbar = tqdm(ds, desc=f"Shard {shard_idx+1:02d}/{len(parquet_files)}", unit="row")
    for row_idx, row in enumerate(pbar):
        # Per-row best effort: a failing row is logged and the export goes on.
        try:
            audio = row["audio"]      # dict with possibly 'bytes' and/or 'path'
            label = row["label"]      # 0 = no-drone, 1 = drone

            # Where to save: label 1 -> drone, anything else -> nodrone
            if label == 1:
                out_dir = out_drone
                prefix = "drone"
            else:
                out_dir = out_nodrone
                prefix = "nodrone"

            # File name uses the 0-based shard index and the row index within
            # the shard, so it is stable across runs.
            fname = f"{prefix}_shard{shard_idx:02d}_idx{row_idx:06d}.wav"
            fpath = os.path.join(out_dir, fname)

            # Skip if already written (safe resume). This is only safe because
            # _write_atomic never leaves a partial file at fpath.
            if os.path.exists(fpath):
                continue

            # Prefer raw bytes (fast, no decoding backend needed); otherwise
            # fall back to reading the file that 'path' points to.
            # NOTE(review): the bytes are written verbatim, so the ".wav"
            # extension assumes the dataset stores WAV-encoded audio - confirm.
            payload = None
            if isinstance(audio, dict):
                payload = audio.get("bytes")
                if payload is None and audio.get("path"):
                    with open(audio["path"], "rb") as src:
                        payload = src.read()

            # If neither bytes nor path, log and continue
            if payload is None:
                pbar.write(f"Row {row_idx}: no 'bytes' or 'path' in audio; skipping.")
                continue

            _write_atomic(fpath, payload)
            total_exported += 1
        except Exception as e:
            pbar.write(f"Skipping row {row_idx} in shard {shard_idx}: {e}")

print(f"Done. Exported {total_exported} WAV files into:\n- {out_drone}\n- {out_nodrone}")


Found 39 parquet shards.


Shard 01/39: 100%|██████████| 4624/4624 [00:15<00:00, 292.99row/s] 


Generating train split: 0 examples [00:00, ? examples/s]

Shard 02/39: 100%|██████████| 4624/4624 [00:19<00:00, 238.35row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 03/39: 100%|██████████| 4624/4624 [00:08<00:00, 561.64row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 04/39: 100%|██████████| 4624/4624 [00:10<00:00, 456.03row/s] 


Generating train split: 0 examples [00:00, ? examples/s]

Shard 05/39: 100%|██████████| 4624/4624 [00:03<00:00, 1412.32row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 06/39: 100%|██████████| 4624/4624 [00:02<00:00, 1794.96row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 07/39: 100%|██████████| 4624/4624 [00:03<00:00, 1539.94row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 08/39: 100%|██████████| 4624/4624 [00:04<00:00, 925.68row/s] 


Generating train split: 0 examples [00:00, ? examples/s]

Shard 09/39: 100%|██████████| 4624/4624 [00:02<00:00, 2082.37row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 10/39: 100%|██████████| 4624/4624 [00:02<00:00, 1571.63row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 11/39: 100%|██████████| 4624/4624 [00:03<00:00, 1310.19row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 12/39: 100%|██████████| 4624/4624 [00:02<00:00, 1869.53row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 13/39: 100%|██████████| 4624/4624 [00:03<00:00, 1177.11row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 14/39: 100%|██████████| 4624/4624 [00:03<00:00, 1403.59row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 15/39: 100%|██████████| 4624/4624 [00:02<00:00, 1610.28row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 16/39: 100%|██████████| 4624/4624 [00:04<00:00, 1121.55row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 17/39: 100%|██████████| 4624/4624 [00:06<00:00, 690.83row/s] 


Generating train split: 0 examples [00:00, ? examples/s]

Shard 18/39: 100%|██████████| 4624/4624 [00:04<00:00, 948.23row/s] 


Generating train split: 0 examples [00:00, ? examples/s]

Shard 19/39: 100%|██████████| 4624/4624 [00:02<00:00, 1700.86row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 20/39: 100%|██████████| 4624/4624 [00:01<00:00, 2956.30row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 21/39: 100%|██████████| 4624/4624 [00:02<00:00, 1771.34row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 22/39: 100%|██████████| 4624/4624 [00:02<00:00, 2080.64row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 23/39: 100%|██████████| 4624/4624 [00:01<00:00, 2885.32row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 24/39: 100%|██████████| 4623/4623 [00:03<00:00, 1276.43row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 25/39: 100%|██████████| 4623/4623 [00:03<00:00, 1477.67row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 26/39: 100%|██████████| 4623/4623 [00:02<00:00, 2080.03row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 27/39: 100%|██████████| 4623/4623 [00:02<00:00, 1558.48row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 28/39: 100%|██████████| 4623/4623 [00:03<00:00, 1285.60row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 29/39: 100%|██████████| 4623/4623 [00:03<00:00, 1170.28row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 30/39: 100%|██████████| 4623/4623 [00:01<00:00, 2372.70row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 31/39: 100%|██████████| 4623/4623 [00:04<00:00, 986.63row/s] 


Generating train split: 0 examples [00:00, ? examples/s]

Shard 32/39: 100%|██████████| 4623/4623 [00:03<00:00, 1279.23row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 33/39: 100%|██████████| 4623/4623 [00:03<00:00, 1538.09row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 34/39: 100%|██████████| 4623/4623 [00:02<00:00, 2158.33row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 35/39: 100%|██████████| 4623/4623 [00:02<00:00, 1650.59row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 36/39: 100%|██████████| 4623/4623 [00:02<00:00, 1794.06row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 37/39: 100%|██████████| 4623/4623 [00:04<00:00, 1061.01row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 38/39: 100%|██████████| 4623/4623 [00:03<00:00, 1445.72row/s]


Generating train split: 0 examples [00:00, ? examples/s]

Shard 39/39: 100%|██████████| 4623/4623 [00:02<00:00, 1627.63row/s]

Done. Exported 180320 WAV files into:
- D:/FYP/Datasets/Drone Datasets/drone-audio-detection-samples hf/extracted_data\drone
- D:/FYP/Datasets/Drone Datasets/drone-audio-detection-samples hf/extracted_data\nodrone



