In [None]:

# Environment setup: install the ffmpeg system package and the Python packages
# (pydub, tqdm, pandas) that the cells below import.
# NOTE(review): ffmpeg is presumably installed as pydub's decoding backend for
# the mp3 clips read further down -- confirm against the pydub documentation.
!apt-get -y install ffmpeg
!pip install --quiet pydub tqdm pandas
# Mount Google Drive at /content/drive; ARCHIVE_PATH in the configuration cell
# points at a file under this mount point.
from google.colab import drive
drive.mount('/content/drive')
import csv
import tarfile
from pathlib import Path

import pandas as pd
from pydub import AudioSegment, silence
from tqdm import tqdm



Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# --- Configuration -----------------------------------------------------------
# Common Voice archive on the mounted Drive.
ARCHIVE_PATH = Path("/content/drive/MyDrive/cv-corpus-14.0-delta-2023-06-23-sw.tar.gz")
EXTRACT_DIR  = Path("common_voice_sw")    # directory the archive is unpacked into
OUTPUT_WAVS  = Path("cleaned_sw_audio")   # directory the processed wav files are written to
MANIFEST_CSV = Path("manifest_sw.csv")    # manifest written by the processing cell
TARGET_SR    = 16_000                     # frame rate handed to AudioSegment.set_frame_rate
SIL_THRESH   = -40                        # value handed to pydub's silence_thresh
MIN_MS       = 200                        # clips with len(audio) below this are dropped

EXTRACT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_WAVS.mkdir(parents=True, exist_ok=True)

# --- Extract the validated TSV and the mp3 clips -----------------------------
with tarfile.open(ARCHIVE_PATH, "r:gz") as tar:
    # Only the validated-sentences table and the audio clips are needed.
    members = [
        m for m in tar.getmembers()
        if m.name.endswith(("sw/validated.tsv", ".mp3"))
    ]
    if hasattr(tarfile, "data_filter"):
        # The "data" filter rejects absolute paths, ".." traversal and links
        # that escape EXTRACT_DIR; it also avoids the DeprecationWarning newer
        # Python versions emit when no filter is given.
        tar.extractall(path=EXTRACT_DIR, members=members, filter="data")
    else:
        # Older interpreters have no extraction filters; fall back to the
        # original call.
        tar.extractall(path=EXTRACT_DIR, members=members)

# --- Load the validated-sentences table --------------------------------------
tsv_path = next(EXTRACT_DIR.rglob("validated.tsv"), None)
if tsv_path is None:
    # Fail with a readable error instead of a bare StopIteration.
    raise FileNotFoundError(
        f"validated.tsv not found under {EXTRACT_DIR.resolve()} "
        f"after extracting {ARCHIVE_PATH}"
    )

# Fields in this TSV are not quoted, so a sentence containing a literal '"'
# must not be interpreted as the start of a quoted field (which could merge
# several rows into one).
df = pd.read_csv(tsv_path, sep="\t", quoting=csv.QUOTE_NONE)

In [None]:


# Convert every clip listed in `df` to a silence-stripped, mono wav at
# TARGET_SR and record it in a CSV manifest (wav_path, duration, transcript).

# The original code looked for clips under a hard-coded ".../sw/clips/" path
# that repeated the corpus release name. Deriving the directory from the TSV
# that was actually found keeps the two in sync (validated.tsv lives in "sw/").
clips_dir = tsv_path.parent / "clips"

MIN_SILENCE_MS   = 150   # min_silence_len handed to split_on_silence
MANIFEST_COLUMNS = ["wav_path", "duration", "transcript"]

rows = []
skipped = {"missing": 0, "unreadable": 0, "too_short": 0}

for _, row in tqdm(df.iterrows(), total=len(df), desc="processing audio"):
    mp3_abs = clips_dir / row["path"]
    if not mp3_abs.exists():
        skipped["missing"] += 1
        continue

    try:
        audio = AudioSegment.from_file(mp3_abs, format="mp3")
    except Exception as e:
        # Best effort: report the bad file and carry on with the rest.
        print(" invalid file:", mp3_abs.name, e)
        skipped["unreadable"] += 1
        continue

    # Strip silence: split wherever the signal stays below SIL_THRESH for at
    # least MIN_SILENCE_MS, then join the non-silent chunks back together.
    # Note this also shortens pauses inside the clip, not only leading and
    # trailing silence. If no chunk is found the clip is kept unchanged.
    chunks = silence.split_on_silence(
        audio, silence_thresh=SIL_THRESH, min_silence_len=MIN_SILENCE_MS
    )
    if chunks:
        audio = sum(chunks)

    if len(audio) < MIN_MS:
        skipped["too_short"] += 1
        continue

    # Resample to the target rate and downmix to a single channel.
    audio = audio.set_frame_rate(TARGET_SR).set_channels(1)

    # Export next to the other cleaned clips, swapping only the file suffix.
    wav_out = OUTPUT_WAVS / Path(row["path"]).with_suffix(".wav")
    wav_out.parent.mkdir(parents=True, exist_ok=True)
    audio.export(wav_out, format="wav")

    rows.append({
        "wav_path"  : str(wav_out.resolve()),
        "duration"  : len(audio) / 1000.0,   # len(audio) is in ms -> seconds
        "transcript": row["sentence"]
    })

# Manifest: explicit columns so the CSV still gets a header row when no clip
# survived the filters.
pd.DataFrame(rows, columns=MANIFEST_COLUMNS).to_csv(MANIFEST_CSV, index=False)
print(f"\n kept {len(rows):,} clips.")
print("   skipped: ", ", ".join(f"{reason}={count:,}" for reason, count in skipped.items()))
print("   wavs: ", OUTPUT_WAVS.resolve())
print("   csv: ", MANIFEST_CSV.resolve())


Processing audio:  86%|████████▋ | 233/270 [01:09<00:09,  3.75it/s]