If you are working locally:

In [None]:
from pathlib import Path
import tarfile, pandas as pd
from pydub import AudioSegment, silence
from tqdm import tqdm
# and download ffmpeg locally

If you are working on Colab:

In [1]:
!apt-get -y install ffmpeg
!pip install --quiet pydub tqdm pandas
from google.colab import drive
drive.mount('/content/drive')
from pathlib import Path
import tarfile, pandas as pd
from pydub import AudioSegment, silence
from tqdm import tqdm


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.
Mounted at /content/drive


In [2]:
ARCHIVE_PATH = '/content/drive/MyDrive/cv-corpus-17.0-delta-2024-03-15-sw.tar.gz'

data = []
with tarfile.open(ARCHIVE_PATH, "r:gz") as tar:
    for m in tar.getmembers():
        if m.name.endswith("sw/validated.tsv"):
            print(f"Processing {m.name}")
            f = tar.extractfile(m)
            if f:
                import io
                data = pd.read_csv(io.StringIO(f.read().decode('utf-8')), sep='\t', header=0)
                break
df = pd.DataFrame(data)
# just to print out the df
df


Processing cv-corpus-17.0-delta-2024-03-15/sw/validated.tsv


Unnamed: 0,client_id,path,sentence_id,sentence,sentence_domain,up_votes,down_votes,age,gender,accents,variant,locale,segment
0,f2952050338d1792cc01ca9802a3c81f3028cec22aa50d...,common_voice_sw_39648928.mp3,9ecbb60c112e8ef00feca35fbdaf88b880a6ba8acc10ea...,ambao waliishia Georgia wakati wa Dola ya Ottoman,,10,5,twenties,female_feminine,,,sw,
1,f2952050338d1792cc01ca9802a3c81f3028cec22aa50d...,common_voice_sw_39649022.mp3,9a5051defde101381550b93d8426f8cf8f0047a2291360...,kwa jumla ni suala litakalojadiliwa kwa miaka ...,,2,0,twenties,female_feminine,,,sw,
2,f2952050338d1792cc01ca9802a3c81f3028cec22aa50d...,common_voice_sw_39659550.mp3,9e2e57455d62423a303b8095acd63c157518cde56fe867...,lakini kampuni hizo zinasema wanadai zaidi ya ...,,8,3,twenties,female_feminine,,,sw,
3,f2952050338d1792cc01ca9802a3c81f3028cec22aa50d...,common_voice_sw_39670193.mp3,a133c38715c106fd3c42b04f9399620c1f65418ca7785f...,kiswahili Kutoa mafunzo ya lugha za kigeni kwa...,,6,0,twenties,female_feminine,,,sw,
4,f2952050338d1792cc01ca9802a3c81f3028cec22aa50d...,common_voice_sw_40001588.mp3,a004d0209a195ed39dd736b32a2332ae82f022bbe97542...,amedhibitisha wabunge mia nne ishirini na moja,,5,3,twenties,female_feminine,,,sw,
5,f2952050338d1792cc01ca9802a3c81f3028cec22aa50d...,common_voice_sw_40001598.mp3,a0feae113dca80e281ff8c4c5439f63eb3a55958f3dc8e...,kani katika kinyanganyiro hicho ugiriki nako w...,,3,0,twenties,female_feminine,,,sw,


In [3]:
ARCHIVE_PATH = Path("/content/drive/MyDrive/cv-corpus-17.0-delta-2024-03-15-sw.tar.gz")
EXTRACT_DIR  = Path("common_voice_sw")
OUTPUT_WAVS  = Path("cleaned_sw_audio")
MANIFEST_CSV = Path("manifest_sw.csv")
TARGET_SR    = 16_000
SIL_THRESH   = -40
MIN_MS       = 200

EXTRACT_DIR.mkdir(exist_ok=True)
OUTPUT_WAVS.mkdir(exist_ok=True)

# get tsv/clips
with tarfile.open(ARCHIVE_PATH, "r:gz") as tar:
    members = [m for m in tar.getmembers() if m.name.endswith("sw/validated.tsv") or m.name.endswith(".mp3")]
    print(" extracting:", members)
    tar.extractall(path=EXTRACT_DIR, members=members)

tsv_path = next(EXTRACT_DIR.rglob("validated.tsv"))
print("tsv:", tsv_path)
df = pd.read_csv(tsv_path, sep="\t")
print(df)

 extracting: [<TarInfo 'cv-corpus-17.0-delta-2024-03-15/sw/validated.tsv' at 0x7a9e780fa680>, <TarInfo 'cv-corpus-17.0-delta-2024-03-15/sw/clips/common_voice_sw_39594825.mp3' at 0x7a9e782f4940>, <TarInfo 'cv-corpus-17.0-delta-2024-03-15/sw/clips/common_voice_sw_39595971.mp3' at 0x7a9e782f4a00>, <TarInfo 'cv-corpus-17.0-delta-2024-03-15/sw/clips/common_voice_sw_39595975.mp3' at 0x7a9e782f4b80>, <TarInfo 'cv-corpus-17.0-delta-2024-03-15/sw/clips/common_voice_sw_39595978.mp3' at 0x7a9e782f4d00>, <TarInfo 'cv-corpus-17.0-delta-2024-03-15/sw/clips/common_voice_sw_39595979.mp3' at 0x7a9e782f4c40>, <TarInfo 'cv-corpus-17.0-delta-2024-03-15/sw/clips/common_voice_sw_39595981.mp3' at 0x7a9e782f4e80>, <TarInfo 'cv-corpus-17.0-delta-2024-03-15/sw/clips/common_voice_sw_39597372.mp3' at 0x7a9e782f5000>, <TarInfo 'cv-corpus-17.0-delta-2024-03-15/sw/clips/common_voice_sw_39597373.mp3' at 0x7a9e782f50c0>, <TarInfo 'cv-corpus-17.0-delta-2024-03-15/sw/clips/common_voice_sw_39597375.mp3' at 0x7a9e782f5180

In [4]:
rows = []
for _, row in tqdm(df.iterrows(), total=len(df), desc="processing audio"):
    mp3_abs = EXTRACT_DIR / f"cv-corpus-17.0-delta-2024-03-15/sw/clips/{row['path']}"
    if not mp3_abs.exists():
        print(" missing file:", mp3_abs.name)
        continue

    try:
        audio = AudioSegment.from_file(mp3_abs, format="mp3")
        print("hello")
    except Exception as e:
        print(" invalid file:", mp3_abs.name, e)
        continue

    #trim leading/silence
    chunks = silence.split_on_silence(audio, silence_thresh=SIL_THRESH, min_silence_len=150)
    if chunks:
        audio = sum(chunks)

    if len(audio) < MIN_MS:
        continue

    #resample
    audio = audio.set_frame_rate(TARGET_SR).set_channels(1)

    #export
    wav_name = row["path"].replace(".mp3", ".wav")
    wav_out  = OUTPUT_WAVS / wav_name
    wav_out.parent.mkdir(parents=True, exist_ok=True)
    audio.export(wav_out, format="wav")

    rows.append({
        "wav_path"  : str(wav_out.resolve()),
        "duration"  : len(audio) / 1000.0,
        "transcript": row["sentence"]
    })

#manifest
pd.DataFrame(rows).to_csv(MANIFEST_CSV, index=False)
print(f"\n kept {len(rows):,} clips.")
print("   wavs: ", OUTPUT_WAVS.resolve())
print("   csv: ", MANIFEST_CSV.resolve())


processing audio:   0%|          | 0/6 [00:00<?, ?it/s]

hello


processing audio:  17%|█▋        | 1/6 [00:01<00:07,  1.59s/it]

hello


processing audio:  50%|█████     | 3/6 [00:02<00:02,  1.23it/s]

hello


processing audio:  67%|██████▋   | 4/6 [00:03<00:01,  1.63it/s]

hello


processing audio:  83%|████████▎ | 5/6 [00:03<00:00,  1.98it/s]

hello


processing audio: 100%|██████████| 6/6 [00:03<00:00,  1.59it/s]

hello

 kept 6 clips.
   wavs:  /content/cleaned_sw_audio
   csv:  /content/manifest_sw.csv



