# Dataset Preparation
The Gujarati dataset, approximately 17 gigabytes in size, was collected from the provided link and subsequently processed. The dataset was organized into an **audio folder** containing the audio files. Alongside this folder, three additional directories—**train**, **eval**, and **dev**—were present. Each of these directories included files formatted in the **Kaldi** standard.

- **segments**: maps audio segments to start and end times accurately
- **text**: stores transcriptions of utterances corresponding to audio segments
- **wav.scp**: provides file paths or commands to access audio files
- **spk2utt**: lists utterance IDs for each speaker in the dataset
- **utt2spk**: links each utterance ID to its corresponding speaker ID
- **utt2dur**: specifies duration of each utterance in seconds for reference

In [None]:
# Mount Google Drive at /content/drive; every path used later in this
# notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Make sure the `.tar.gz` archive of the dataset is present in your Google Drive and has been extracted there.


In [None]:
import pandas as pd
from pathlib import Path
import librosa
import soundfile as sf
from pathlib import Path
import os
from tqdm import tqdm
from huggingface_hub import notebook_login

First, the audio is resampled to a consistent sampling rate of 16 kHz.

In [None]:
preprocessed_audio_dir = '/content/drive/MyDrive/preprocessed_audio'
os.makedirs(preprocessed_audio_dir, exist_ok=True)


def preprocess_audio(input_dir, output_dir):
    """Resample every ``.wav`` file found under *input_dir* to 16 kHz.

    Args:
        input_dir: Directory that is searched recursively for ``.wav`` files.
        output_dir: Directory that receives the resampled files. Files are
            written flat (``output_dir/<basename>``), so two inputs with the
            same basename in different sub-folders overwrite each other.
    """
    target_sr = 16000
    audio_dir = Path(input_dir)

    # Create the destination once instead of on every loop iteration.
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so that the processing order (and the log) is reproducible.
    audio_files = sorted(audio_dir.rglob("*.wav"))
    if not audio_files:
        print(f"No .wav files found under {audio_dir}")
        return

    for file in audio_files:
        print(f"Processing file: {file}")

        # librosa resamples to the requested rate while loading (and, with its
        # default mono=True, downmixes to a single channel).
        y, _ = librosa.load(file, sr=target_sr)

        output_path = os.path.join(output_dir, file.name)
        sf.write(output_path, y, target_sr)
        print(f"Saved to {output_path}")


# The recordings live in the dataset's Audio folder; the train/dev/eval
# folders only hold the Kaldi index files.
preprocess_audio('/content/drive/MyDrive/extracted_dataset/SPRING_INX_Gujarati_R1/Audio', preprocessed_audio_dir)

After executing this cell, the resampled audio files will be in the folder ```'/content/drive/MyDrive/preprocessed_audio'```

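Next, we create `metadata.csv`. Note that the cell below reads the Kaldi files (`wav.scp`, `segments`, `text`, `utt2dur`) from the `train`, `dev` and `eval` subfolders of `preprocessed_audio`, so those directories must be present there.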


In [None]:
base_path = "/content/drive/MyDrive/preprocessed_audio"
subsets = ["train", "dev", "eval"]


def _read_kaldi_fields(path, min_fields, maxsplit=-1):
    """Yield the whitespace-separated fields of each non-blank line of *path*.

    Blank lines are skipped. A non-blank line with fewer than *min_fields*
    fields raises ``ValueError`` that names the file and line number.
    *maxsplit* is forwarded to ``str.split`` so the last field can keep
    embedded spaces (paths in wav.scp, transcriptions in text).
    """
    with open(path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.strip().split(maxsplit=maxsplit)
            if not fields:
                continue
            if len(fields) < min_fields:
                raise ValueError(
                    f"{path}:{line_no}: expected at least {min_fields} fields, got {len(fields)}"
                )
            yield fields


metadata_entries = []

for subset in subsets:
    subset_path = os.path.join(base_path, subset)

    wav_scp_path = os.path.join(subset_path, "wav.scp")
    segments_path = os.path.join(subset_path, "segments")
    text_path = os.path.join(subset_path, "text")
    utt2dur_path = os.path.join(subset_path, "utt2dur")

    # Fail early, before any parsing, if one of the Kaldi files is missing.
    for file_path in (wav_scp_path, segments_path, text_path, utt2dur_path):
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Required file not found: {file_path}")

    # recording id -> audio path. maxsplit=1 keeps paths with spaces intact,
    # matching how the audio-splitting cell parses wav.scp.
    wav_scp = {
        fields[0]: fields[1]
        for fields in _read_kaldi_fields(wav_scp_path, min_fields=2, maxsplit=1)
    }

    # utterance id -> recording id and segment boundaries in seconds.
    segments = {
        fields[0]: {
            "file_name": fields[1],
            "start_time": float(fields[2]),
            "end_time": float(fields[3]),
        }
        for fields in _read_kaldi_fields(segments_path, min_fields=4)
    }

    # utterance id -> transcription. An id with nothing after it is a valid
    # (empty) transcription rather than a parse error.
    transcriptions = {
        fields[0]: (fields[1] if len(fields) > 1 else "")
        for fields in _read_kaldi_fields(text_path, min_fields=1, maxsplit=1)
    }

    # utterance id -> duration in seconds.
    durations = {
        fields[0]: float(fields[1])
        for fields in _read_kaldi_fields(utt2dur_path, min_fields=2)
    }

    for utt_id, segment_info in segments.items():
        file_name = segment_info["file_name"]
        start_time = segment_info["start_time"]
        end_time = segment_info["end_time"]

        try:
            audio_file_path = wav_scp[file_name]
        except KeyError:
            raise KeyError(
                f"Recording {file_name!r} (utterance {utt_id!r}) is missing from {wav_scp_path}"
            ) from None

        metadata_entries.append(
            {
                "audio_file_path": audio_file_path,
                "transcription": transcriptions.get(utt_id, ""),
                # Fall back to the segment length when utt2dur has no entry.
                "duration": durations.get(utt_id, end_time - start_time),
                "start_time": start_time,
                "end_time": end_time,
            }
        )

metadata_df = pd.DataFrame(metadata_entries)

output_csv_path = os.path.join(base_path, "metadata.csv")
metadata_df.to_csv(output_csv_path, index=False, encoding="utf-8")

print(f"Metadata CSV saved to: {output_csv_path}")

Finally the next cell will process the audio files so that they are saved into ```"/content/drive/MyDrive/processed_audio"```

This takes every audio file and, using the `segments` file (together with `wav.scp` to locate each recording), cuts the audio into individual utterances, saving each one as a new audio file named after its utterance ID. This is done because the Whisper model does not accept audio longer than 30 seconds.

In [None]:
def split_audio_and_generate_metadata(subset, base_dir, output_dir, processed_audio_dir):
    """Cut the recordings of one subset into one 16 kHz WAV file per utterance.

    The subset's ``segments`` file gives, for each utterance, the recording it
    belongs to and its start/end time in seconds; ``wav.scp`` maps recording
    ids to audio paths. Each utterance is written to
    ``processed_audio_dir/<utt_id>.wav``.

    Args:
        subset (str): subset folder ('train', 'eval', 'dev').
        base_dir (str): base directory of the dataset.
        output_dir (str): unused; accepted so existing callers keep working.
        processed_audio_dir (str): directory to save processed audio files.
    """
    subset_dir = os.path.join(base_dir, subset)
    wav_scp_path = os.path.join(subset_dir, "wav.scp")
    segments_path = os.path.join(subset_dir, "segments")

    os.makedirs(processed_audio_dir, exist_ok=True)

    # recording id -> audio path (maxsplit=1 keeps paths with spaces intact).
    wav_scp = {}
    with open(wav_scp_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank / trailing empty lines
            file_name, audio_path = line.split(maxsplit=1)
            wav_scp[file_name] = audio_path

    segments = []
    with open(segments_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            utt_id, file_name, start_time, end_time = line.split()
            segments.append({
                "utt_id": utt_id,
                "file_name": file_name,
                "start_time": float(start_time),
                "end_time": float(end_time),
            })

    for seg in tqdm(segments, desc=f"Processing {subset}"):
        utt_id = seg["utt_id"]
        file_name = seg["file_name"]
        start_time = seg["start_time"]
        duration = seg["end_time"] - start_time

        audio_path = wav_scp.get(file_name)
        if audio_path is None:
            # Report the gap instead of dropping the utterance silently.
            print(f"Skipping {utt_id}: recording {file_name} not found in {wav_scp_path}")
            continue

        if duration <= 0:
            # A non-positive span cannot produce a usable clip.
            print(f"Skipping {utt_id}: non-positive duration ({duration})")
            continue

        try:
            # offset/duration make librosa decode only the requested span.
            y, _ = librosa.load(audio_path, sr=16000, offset=start_time, duration=duration)
            output_audio_path = os.path.join(processed_audio_dir, f"{utt_id}.wav")
            sf.write(output_audio_path, y, 16000)
        except Exception as e:
            # Best effort: one unreadable recording must not abort the whole
            # (long-running) subset, so log the failure and carry on.
            print(f"Error processing {audio_path}: {e}")


base_dir = "/content/drive/MyDrive/preprocessed_audio"
# Only passed through to the splitter, which does not write anything here.
output_dir = "/content/drive/MyDrive"
processed_audio_dir = "/content/drive/MyDrive/processed_audio"
os.makedirs(processed_audio_dir, exist_ok=True)

# The upload step further down expects train, dev and eval under
# processed_audio_dir. Trim this list to re-run a single subset.
for subset in ["train", "dev", "eval"]:
    subset_audio_dir = os.path.join(processed_audio_dir, subset)
    os.makedirs(subset_audio_dir, exist_ok=True)
    split_audio_and_generate_metadata(subset, base_dir, output_dir, subset_audio_dir)

Following this, we can upload our dataset into ```huggingface_hub```

In [None]:
# Prompt for a Hugging Face access token; it is needed for the push_to_hub
# call at the end of this notebook.
notebook_login()

When prompted, paste a token with *write* access to your Hugging Face account.

First, we will create a ```DatasetDict``` object for our dataset.

In [None]:
import os
from datasets import Dataset, DatasetDict, Audio

# One list of audio file paths per Hugging Face split.
audio_dict = {
    "train": {"audio": []},
    "test": {"audio": []},
    "validation": {"audio": []},
}

base_path = "/content/drive/MyDrive/processed_audio"

# Folder name on disk -> split name in the uploaded dataset.
for split, key in zip(["train", "eval", "dev"], ["train", "test", "validation"]):
    split_path = os.path.join(base_path, split)

    if not os.path.isdir(split_path):
        # Make a missing folder visible; otherwise an empty split would be
        # built (and later pushed) without any hint that something is wrong.
        print(f"Warning: {split_path} not found; the '{key}' split will be empty")
        continue

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the dataset reproducible between runs.
    for file in sorted(os.listdir(split_path)):
        if file.endswith(".wav"):
            audio_dict[key]["audio"].append(os.path.join(split_path, file))

# cast_column turns the path strings into an Audio feature column.
train_dataset = Dataset.from_dict(audio_dict["train"]).cast_column("audio", Audio())
test_dataset = Dataset.from_dict(audio_dict["test"]).cast_column("audio", Audio())
validation_dataset = Dataset.from_dict(audio_dict["validation"]).cast_column("audio", Audio())

audio_dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset,
    "validation": validation_dataset
})

print(audio_dataset)

You should get an output like this

```python
DatasetDict({
    train: Dataset({
        features: ['audio'],
        num_rows: 71058
    })
    test: Dataset({
        features: ['audio'],
        num_rows: 1994
    })
    validation: Dataset({
        features: ['audio'],
        num_rows: 7983
    })
})
```

Now, push the dataset to the Hugging Face Hub using the ```push_to_hub()``` function.

In [None]:
# Upload all splits of audio_dataset to the Hub repository named below
# (replace the repository id with one under your own account).
audio_dataset.push_to_hub("haideraqeeb/gujrati-asr-16kHz")

This will be enough for creating a dataset and pushing it into hugging face.