<a href="https://colab.research.google.com/github/kalenjin-ai/kalenjin_asr/blob/main/DataSet_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Objectives Today**
Prepare Kalenjin audio/text data into a machine‑friendly format(JSON).
- Save Clean manifets for whisper and NEMO

**Huggingface login** because we intend to save the dtaaset on github

In [1]:
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Step 1
Import the data saved on google drive and extract it

In [2]:
import tarfile
import os

# Path to your zip file
tar_path = '/content/drive/MyDrive/kln/cv-corpus-22.0-2025-06-20-kln.tar.gz'

# Directory where you want to extract
extract_dir = '/content/extracted_files'

# Create extraction directory if not exists
os.makedirs(extract_dir, exist_ok=True)

# Unzip
with tarfile.open(tar_path, "r:gz") as tar:
    tar.extractall(path=extract_dir)

print("✅ Extracted to:", extract_dir)


✅ Extracted to: /content/extracted_files




*   Lets Remove the extra subfolders




In [3]:
import shutil
import os

# Path to the directory with the extra subfolder
source_dir = '/content/extracted_files/cv-corpus-22.0-2025-06-20/kln'

# Destination directory (parent directory)
destination_dir = '/content/kln_data'

# Create the destination directory if it doesn't exist
os.makedirs(destination_dir, exist_ok=True)

# Get the list of files in the source directory
files_to_move = os.listdir(source_dir)

# Move each file from the source to the destination
for file_name in files_to_move:
    source_path = os.path.join(source_dir, file_name)
    destination_path = os.path.join(destination_dir, file_name)
    shutil.move(source_path, destination_path)

# Remove the now-empty subfolder
os.rmdir(source_dir)

print("✅ Files moved and subfolder removed.")

✅ Files moved and subfolder removed.


### Step 2: Data Format

In [4]:
import pandas as pd

# Path to the tsv file
tsv_file_path = '/content/kln_data/validated.tsv'

# Read the tsv file into a pandas DataFrame
df = pd.read_csv(tsv_file_path, sep='\t')

# Display the first 5 rows of the DataFrame
display(df.head())

Unnamed: 0,client_id,path,sentence_id,sentence,sentence_domain,up_votes,down_votes,age,gender,accents,variant,locale,segment
0,8fd6f32ed091f9531a70b0b570f4c4735858069b5e0917...,common_voice_kln_40613865.mp3,9a3c45f78fdb29dbd7c6705c655afa7a700fe052f98fe0...,Kamet tinyei lakwengung kenyisiek ata,general,2,0,fourties,,,,kln,
1,d6f8b49c5e06ac71e04ef1db8e089f3da7bffb8856ac82...,common_voice_kln_40761915.mp3,5d417c4c1446425ac919cdf273cdd4242c841198bb681a...,Komangen ale tos rikchi chi ko u no,general,2,0,,,,,kln,
2,d6f8b49c5e06ac71e04ef1db8e089f3da7bffb8856ac82...,common_voice_kln_40761936.mp3,5be3431b4255be454ce48ba0e9d138820146d081c0f37d...,uiy ke nem atepto ne yaa ne kakichop,general,2,0,,,,,kln,
3,20ee5df5a3afc462a3c8caf70feb2c9f4b904b5866acae...,common_voice_kln_40598923.mp3,66daeb01557c36b571740be3df9df4809f8fa2053d56f0...,Kiiyan kochengei logoiwek eng oldo age,general,3,0,,,,,kln,
4,20ee5df5a3afc462a3c8caf70feb2c9f4b904b5866acae...,common_voice_kln_40598924.mp3,681a4dd5aec6ad337a0fdccd08c5dba876569890c30120...,Tos iboe nguruonikuk?,general,2,0,,,,,kln,


In [5]:
!pip install soundfile



In [6]:
import soundfile as sf
import os

# Path to the directory containing the audio clips
audio_dir = '/content/kln_data/clips'

# Get the first audio file in the directory
first_audio_file = os.listdir(audio_dir)[0]
audio_path = os.path.join(audio_dir, first_audio_file)

# Read the audio file and get its properties
with sf.SoundFile(audio_path) as f:
    print(f"File: {first_audio_file}")
    print(f"Sample rate: {f.samplerate}")
    print(f"Channels: {f.channels}")
    print(f"Format: {f.format}")
    print(f"Subtype: {f.subtype}")

File: common_voice_kln_40656582.mp3
Sample rate: 32000
Channels: 1
Format: MP3
Subtype: MPEG_LAYER_III


### Data Format observations


*   The data is a TSV file
*   Each row contains the path to an audio file, the corresponding transcription, and some metadata.
*   Audio files are in MP3 format with a sample rate of 32000Hz.






## Step 3: Clean and create manifests
For NEMO and Whisper, you want JSON lines like this:


```
{"audio_filepath": "data/clips/abc123.wav", "duration": 3.4, "text": "kale en kit ne boiyot"}

```



In [7]:
import json
import os
import soundfile as sf
import pandas as pd

def create_manifest(tsv_path, manifest_path, audio_clip_dir):
    # Read only the necessary columns
    df = pd.read_csv(tsv_path, sep='\t', usecols=['path', 'sentence'])
    with open(manifest_path, 'w') as fout:
        for index, row in df.iterrows():
            audio_path = os.path.join(audio_clip_dir, row['path'])
            try:
                with sf.SoundFile(audio_path) as f:
                    duration = f.frames / f.samplerate

                metadata = {
                    "audio_filepath": audio_path,
                    "duration": duration,
                    "text": row['sentence']
                }
                json.dump(metadata, fout)
                fout.write('\n')
            except Exception as e:
                print(f"Error processing {audio_path}: {e}")

# Create train and test manifests
audio_clip_dir = '/content/kln_data/clips'
create_manifest('/content/kln_data/train.tsv', 'train_manifest.json', audio_clip_dir)
create_manifest('/content/kln_data/test.tsv', 'test_manifest.json', audio_clip_dir)

print("Manifest files created successfully: train_manifest.json, test_manifest.json")

Manifest files created successfully: train_manifest.json, test_manifest.json


In [8]:
import pandas as pd

# Load the tsv files
train_df = pd.read_csv('/content/kln_data/train.tsv', sep='\t')
test_df = pd.read_csv('/content/kln_data/test.tsv', sep='\t')

# Search for the problematic entry in the train_df
print("Searching in train.tsv:")
display(train_df[train_df['path'].str.contains('Kericho komiten rift valley en kenya')])

# Search for the problematic entry in the test_df
print("\nSearching in test.tsv:")
display(test_df[test_df['path'].str.contains('Kericho komiten rift valley en kenya')])

Searching in train.tsv:


Unnamed: 0,client_id,path,sentence_id,sentence,sentence_domain,up_votes,down_votes,age,gender,accents,variant,locale,segment



Searching in test.tsv:


Unnamed: 0,client_id,path,sentence_id,sentence,sentence_domain,up_votes,down_votes,age,gender,accents,variant,locale,segment


In [9]:
# Remove the problematic row from the test_df
test_df_cleaned = test_df[~test_df['path'].str.contains('Kericho komiten rift valley en kenya')]

# Save the cleaned test_df to a new tsv file
cleaned_test_tsv_path = '/content/kln_data/test_cleaned.tsv'
test_df_cleaned.to_csv(cleaned_test_tsv_path, sep='\t', index=False)

# Re-create the test manifest with the cleaned data
create_manifest(cleaned_test_tsv_path, 'test_manifest.json', audio_clip_dir)

print("✅ Cleaned test manifest file created successfully: test_manifest.json")

✅ Cleaned test manifest file created successfully: test_manifest.json


In [10]:
!head -n 5 train_manifest.json
!head -n 5 test_manifest.json

{"audio_filepath": "/content/kln_data/clips/common_voice_kln_40550981.mp3", "duration": 4.0755, "text": "Tomo itinye choruet ne chepto iman?"}
{"audio_filepath": "/content/kln_data/clips/common_voice_kln_40550983.mp3", "duration": 3.8235, "text": "Teret nigiteer ko nenyu"}
{"audio_filepath": "/content/kln_data/clips/common_voice_kln_40550984.mp3", "duration": 6.1275, "text": "kiyaat baruet Eng' kasarta ne kimito inegei"}
{"audio_filepath": "/content/kln_data/clips/common_voice_kln_40551036.mp3", "duration": 5.2275, "text": "Borto ne kararan ko maliik che echeen ochei."}
{"audio_filepath": "/content/kln_data/clips/common_voice_kln_40551038.mp3", "duration": 3.3555, "text": "Kipalei kepenonik chebokeswek"}
{"audio_filepath": "/content/kln_data/clips/common_voice_kln_40613865.mp3", "duration": 4.34365625, "text": "Kamet tinyei lakwengung kenyisiek ata"}
{"audio_filepath": "/content/kln_data/clips/common_voice_kln_40761915.mp3", "duration": 3.2475, "text": "Komangen ale tos rikchi chi ko u

In [11]:
create_manifest('/content/kln_data/validated.tsv', 'validated_manifest.json', audio_clip_dir)
print("✅ Manifest file created successfully: validated_manifest.json")

✅ Manifest file created successfully: validated_manifest.json


In [12]:
!head -n 5 validated_manifest.json

{"audio_filepath": "/content/kln_data/clips/common_voice_kln_40613865.mp3", "duration": 4.34365625, "text": "Kamet tinyei lakwengung kenyisiek ata"}
{"audio_filepath": "/content/kln_data/clips/common_voice_kln_40761915.mp3", "duration": 3.2475, "text": "Komangen ale tos rikchi chi ko u no"}
{"audio_filepath": "/content/kln_data/clips/common_voice_kln_40761936.mp3", "duration": 2.8875, "text": "uiy ke nem atepto ne yaa ne kakichop"}
{"audio_filepath": "/content/kln_data/clips/common_voice_kln_40598923.mp3", "duration": 5.5875, "text": "Kiiyan kochengei logoiwek eng oldo age"}
{"audio_filepath": "/content/kln_data/clips/common_voice_kln_40598924.mp3", "duration": 3.6435, "text": "Tos iboe nguruonikuk?"}


In [14]:
import os
from pydub import AudioSegment
from joblib import Parallel, delayed
from tqdm import tqdm

# Create a directory to store the WAV files
wav_dir = '/content/kln_data/wav_clips_16k'
os.makedirs(wav_dir, exist_ok=True)

# Path to the directory containing the MP3 clips
mp3_dir = '/content/kln_data/clips'

# Get a list of the MP3 files
mp3_files = [f for f in os.listdir(mp3_dir) if f.endswith(".mp3")]

# Function to convert a single file
def convert_file(filename):
    mp3_path = os.path.join(mp3_dir, filename)
    wav_path = os.path.join(wav_dir, filename.replace(".mp3", ".wav"))

    # Convert MP3 to WAV and set the frame rate to 16000
    audio = AudioSegment.from_mp3(mp3_path)
    audio = audio.set_frame_rate(16000)
    audio.export(wav_path, format="wav")

# Convert the files in parallel with a progress bar
Parallel(n_jobs=-1)(delayed(convert_file)(f) for f in tqdm(mp3_files))

print("✅ All MP3 files converted to WAV and resampled to 16kHz.")

100%|██████████| 70200/70200 [3:36:16<00:00,  5.41it/s]


✅ All MP3 files converted to WAV and resampled to 16kHz.


In [15]:
import json

def update_manifest(manifest_path, wav_dir):
    updated_manifest = []
    with open(manifest_path, 'r') as f:
        for line in f:
            data = json.loads(line)
            # Update the audio_filepath to point to the new WAV file
            filename = os.path.basename(data['audio_filepath'])
            new_filepath = os.path.join(wav_dir, filename.replace(".mp3", ".wav"))
            data['audio_filepath'] = new_filepath
            updated_manifest.append(data)

    # Write the updated manifest back to the file
    with open(manifest_path, 'w') as f:
        for data in updated_manifest:
            json.dump(data, f)
            f.write('\n')

# Update the train, test, and validated manifests
wav_dir = '/content/kln_data/wav_clips_16k'
update_manifest('train_manifest.json', wav_dir)
update_manifest('test_manifest.json', wav_dir)
update_manifest('validated_manifest.json', wav_dir)

print("✅ All manifest files updated successfully.")

✅ All manifest files updated successfully.


In [16]:
from datasets import Dataset, DatasetDict, Audio

# Create a dictionary with the train, test, and validated data
data = {
    "train": Dataset.from_json("train_manifest.json"),
    "test": Dataset.from_json("test_manifest.json"),
    "validated": Dataset.from_json("validated_manifest.json")
}

# Create a DatasetDict
dataset = DatasetDict(data)

# Cast the audio column to the Audio feature type
dataset = dataset.cast_column("audio_filepath", Audio())

print("✅ Hugging Face dataset created successfully.")
display(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

✅ Hugging Face dataset created successfully.


DatasetDict({
    train: Dataset({
        features: ['audio_filepath', 'duration', 'text'],
        num_rows: 11064
    })
    test: Dataset({
        features: ['audio_filepath', 'duration', 'text'],
        num_rows: 5684
    })
    validated: Dataset({
        features: ['audio_filepath', 'duration', 'text'],
        num_rows: 31528
    })
})

In [17]:
# Push the dataset to the Hugging Face Hub
dataset.push_to_hub("kalenjin-asr-data")

Uploading the dataset shards:   0%|          | 0/4 [00:00<?, ? shards/s]

Map:   0%|          | 0/2766 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/28 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/323M [00:00<?, ?B/s]

Map:   0%|          | 0/2766 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/28 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/331M [00:00<?, ?B/s]

Map:   0%|          | 0/2766 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/28 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/288M [00:00<?, ?B/s]

Map:   0%|          | 0/2766 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/28 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/256M [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ? shards/s]

Map:   0%|          | 0/2842 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/29 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/375M [00:00<?, ?B/s]

Map:   0%|          | 0/2842 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/29 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/434M [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/12 [00:00<?, ? shards/s]

Map:   0%|          | 0/2628 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/352M [00:00<?, ?B/s]

Map:   0%|          | 0/2628 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/407M [00:00<?, ?B/s]

Map:   0%|          | 0/2628 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/305M [00:00<?, ?B/s]

Map:   0%|          | 0/2628 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/284M [00:00<?, ?B/s]

Map:   0%|          | 0/2627 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/292M [00:00<?, ?B/s]

Map:   0%|          | 0/2627 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/350M [00:00<?, ?B/s]

Map:   0%|          | 0/2627 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/275M [00:00<?, ?B/s]

Map:   0%|          | 0/2627 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/330M [00:00<?, ?B/s]

Map:   0%|          | 0/2627 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/285M [00:00<?, ?B/s]

Map:   0%|          | 0/2627 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/250M [00:00<?, ?B/s]

Map:   0%|          | 0/2627 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/243M [00:00<?, ?B/s]

Map:   0%|          | 0/2627 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/237M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Sugutt/kalenjin-asr-data/commit/f2221a52726961e0223690cc72bd99955987a62e', commit_message='Upload dataset', commit_description='', oid='f2221a52726961e0223690cc72bd99955987a62e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Sugutt/kalenjin-asr-data', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Sugutt/kalenjin-asr-data'), pr_revision=None, pr_num=None)

In [18]:
import pandas as pd

# Load the Excel file into a pandas DataFrame
df = pd.read_excel("/content/engnivkal.xlsx")

# Display the first few rows of the DataFrame
display(df.head())

Unnamed: 0,English,Kalenjin
0,In the beginning God created the heavens and ...,"Eng' taunet, ko ki toi Kamuktaindet koyai kips..."
1,"Now the earth was formless and empty, darknes...",Ki ma komi kiy ng'wony ak ki ma kotinyei itoon...
2,"And God said, “Let there be light,” and there...","Komwa Kamuktaindet kole, “Ingolapkeiit” ak ki ..."
3,"God saw that the light was good, and he separ...","Kogeer anyun Kamuktaindet lapkeiyet kole myee,..."
4,"God called the light “day,” and the darkness ...",Ki kuure Kamuktaindet lapkeiyet Peet ak koguur...


In [20]:
import pandas as pd

# Extract the Kalenjin text from the Excel file
kalenjin_text_excel = df['Kalenjin'].tolist()

# Load the validated.tsv file and extract the sentence column
validated_df = pd.read_csv('/content/kln_data/validated_sentences.tsv', sep='\t')
kalenjin_text_tsv = validated_df['sentence'].tolist()

# Combine the text from both sources
all_kalenjin_text = kalenjin_text_excel + kalenjin_text_tsv

# Save the combined text to a file
with open('kalenjin_text.txt', 'w') as f:
    for line in all_kalenjin_text:
        f.write(str(line) + '\n')

print("✅ Combined Kalenjin text saved to kalenjin_text.txt")

✅ Combined Kalenjin text saved to kalenjin_text.txt


In [21]:
!pip install tokenizers



In [22]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Initialize a tokenizer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

# Customize pre-tokenization
tokenizer.pre_tokenizer = Whitespace()

# Initialize a trainer
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

# Train the tokenizer
tokenizer.train(files=["kalenjin_text.txt"], trainer=trainer)

# Save the tokenizer
tokenizer.save("kalenjin_tokenizer.json")

print("✅ Tokenizer trained and saved to kalenjin_tokenizer.json")

✅ Tokenizer trained and saved to kalenjin_tokenizer.json


In [24]:
!ls

drive		   kalenjin_tokenizer.json  train_manifest.json
engnivkal.xlsx	   kln_data		    validated_manifest.json
extracted_files    sample_data
kalenjin_text.txt  test_manifest.json


In [28]:
from huggingface_hub import HfApi, whoami, create_repo, upload_file

# Create a new repository on the Hub
repo_name = "kalenjin-tokenizer"
create_repo(repo_name, exist_ok=True)

# Get your Hugging Face username
username = whoami()["name"]

# Upload the tokenizer and text file to the repository
api = HfApi()
api.upload_file(
    path_or_fileobj="kalenjin_tokenizer.json",
    path_in_repo="kalenjin_tokenizer.json",
    repo_id=f"{username}/{repo_name}",
)
api.upload_file(
    path_or_fileobj="kalenjin_text.txt",
    path_in_repo="kalenjin_text.txt",
    repo_id=f"{username}/{repo_name}",
)

print(f"✅ Tokenizer and text file uploaded to https://huggingface.co/{username}/{repo_name}")

No files have been modified since last commit. Skipping to prevent empty commit.


✅ Tokenizer and text file uploaded to https://huggingface.co/Sugutt/kalenjin-tokenizer
