In [1]:
import os
import pandas as pd
from datasets import Dataset, Audio, concatenate_datasets
from pathlib import Path
from pydub import AudioSegment

In [2]:
root_dir = Path("magichub_datasets")

# Tab-separated metadata files, matched by exact file name while walking a corpus.
SPKINFO = "SPKINFO.txt"
UTTRANSINFO = "UTTRANSINFO.txt"
AUDIOINFO = "AUDIOINFO.txt"
# Sub-directory (created inside every corpus) that receives all audio clips.
CLIPS_DIRNAME = "clips"

# Keep only the corpus directories this cell knows how to process, so stray
# entries (loose files, .ipynb_checkpoints, ...) never reach the later cells
# that index `dataframes` / `speaker_infos` with every name in `dataset_dirs`.
# Sorting makes `dataset_dirs[i]` independent of the OS listing order.
dataset_dirs = sorted(
    name
    for name in os.listdir(root_dir)
    if (root_dir / name).is_dir() and ("Scripted" in name or "Conversation" in name)
)

dataframes = {}     # corpus name -> utterance table (pandas DataFrame)
speaker_infos = {}  # corpus name -> SPKINFO table
audio_infos = {}    # corpus name -> AUDIOINFO table (only read for conversation corpora)

for dataset_dir in dataset_dirs:
    dataset_root = root_dir / dataset_dir
    clips_dir = dataset_root / CLIPS_DIRNAME
    os.makedirs(clips_dir, exist_ok=True)

    if "Scripted" in dataset_dir:
        # Short-form corpus: every .wav is already one utterance, so the files
        # are only moved into clips/ and the metadata tables are read as-is.
        for root, dirs, files in os.walk(dataset_root):
            # Do not descend into clips/: it is the move destination, and
            # visiting it would try to rename already-moved files onto themselves.
            dirs[:] = [d for d in dirs if Path(root, d) != clips_dir]

            for f in files:
                file_path = Path(root) / f
                if f.endswith(".wav"):
                    os.rename(file_path, clips_dir / f)
                elif f == UTTRANSINFO:
                    dataframes[dataset_dir] = pd.read_csv(file_path, sep="\t")
                elif f == SPKINFO:
                    speaker_infos[dataset_dir] = pd.read_csv(file_path, sep="\t")

    elif "Conversation" in dataset_dir:
        # Long-form corpus: each recording in WAV/ has a transcript in TXT/ with
        # one tab-separated line per utterance; cut one clip per utterance.
        utterance_rows = []

        for root, dirs, files in os.walk(dataset_root):
            # clips/ only holds the clips exported below; nothing to read there.
            dirs[:] = [d for d in dirs if Path(root, d) != clips_dir]

            for f in files:
                file_path = Path(root) / f
                if f.endswith(".txt") and "TXT" in root:
                    base_name = f.split(".")[0]

                    audio_path = Path(root).parent / "WAV" / f"{base_name}.wav"
                    audio = AudioSegment.from_wav(audio_path)

                    # A dedicated name for the handle keeps the file-name loop
                    # variable `f` intact.
                    with open(file_path, "r", encoding="utf-8") as transcript:
                        for i, line in enumerate(transcript):
                            line = line.strip()
                            if not line:
                                # Tolerate blank (e.g. trailing) lines; `i` still
                                # counts them, so clip numbering is unchanged.
                                continue

                            parts = line.split("\t")
                            assert len(parts) == 4, (
                                f"{file_path}: line {i + 1} has {len(parts)} "
                                "tab-separated fields, expected 4"
                            )
                            start, end = map(float, parts[0][1:-1].split(","))  # e.g. [1.000, 2.000] --> 1.000, 2.000

                            speaker_id = parts[1].strip()
                            transcription = parts[3].strip()

                            if speaker_id == "0":  # "0" is system prompt
                                continue

                            # pydub slices in milliseconds; timestamps are in seconds.
                            segment = audio[start * 1000 : end * 1000]

                            clip_name = f"{base_name}_U{i:04d}.wav"

                            # export() hands back the open output file; close it
                            # explicitly instead of relying on garbage collection.
                            segment.export(f"{clips_dir}/{clip_name}", format="wav").close()

                            utterance_rows.append({
                                "SPEAKER_ID": speaker_id,
                                "TRANSCRIPTION": transcription,
                                "UTTRANS_ID": clip_name,
                            })

                elif f == SPKINFO:
                    speaker_infos[dataset_dir] = pd.read_csv(file_path, sep="\t")

                elif f == AUDIOINFO:
                    audio_infos[dataset_dir] = pd.read_csv(file_path, sep="\t")

        # Build the table once from the collected rows (also well-defined when
        # no transcript was found, instead of raising KeyError).
        dataframes[dataset_dir] = pd.DataFrame(utterance_rows)

In [3]:
print(dataframes[dataset_dirs[0]].columns)
print(speaker_infos[dataset_dirs[0]].columns)

# corpus name -> Hugging Face Dataset (utterances joined with speaker metadata)
datasets = {}

for dataset_dir in dataset_dirs:
    utterances = dataframes[dataset_dir]
    speaker_info = speaker_infos[dataset_dir]

    # Join on every column the two tables share. Taking them in the utterance
    # table's column order keeps the key list deterministic (a set
    # intersection has no defined order).
    common_columns = [col for col in utterances.columns if col in speaker_info.columns]
    merged = pd.merge(utterances, speaker_info, on=common_columns)

    # AUDIOINFO (audio_infos) is deliberately NOT merged: its topic column has
    # duplicate rows, so merging would multiply the utterance rows. The tables
    # in `audio_infos` are left untouched so this cell can be re-run safely.

    # The region/city header is spelled differently across corpora; unify it.
    # Keys that are absent from the frame are ignored by rename().
    merged = merged.rename(
        columns={"REGION,CITY": "REGION_CITY", "REGIONCITY": "REGION_CITY"}
    )

    datasets[dataset_dir] = Dataset.from_pandas(merged)

Index(['CHANNEL', 'UTTRANS_ID', 'SPEAKER_ID', 'PROMPT', 'TRANSCRIPTION'], dtype='object')
Index(['CHANNEL', 'SPEAKER_ID', 'GENDER', 'AGE', 'REGION,CITY', 'DEVICE'], dtype='object')


In [4]:
# Show the schema and row count of every corpus, however many were found
# (no hard-coded indices that break when the number of corpora changes).
for dataset_dir in dataset_dirs:
    print(datasets[dataset_dir])

Dataset({
    features: ['CHANNEL', 'UTTRANS_ID', 'SPEAKER_ID', 'PROMPT', 'TRANSCRIPTION', 'GENDER', 'AGE', 'REGION_CITY', 'DEVICE'],
    num_rows: 4073
})
Dataset({
    features: ['SPEAKER_ID', 'TRANSCRIPTION', 'UTTRANS_ID', 'CHANNEL', 'GENDER', 'AGE', 'REGION_CITY', 'DEVICE'],
    num_rows: 3149
})
Dataset({
    features: ['CHANNEL', 'UTTRANS_ID', 'SPEAKER_ID', 'PROMPT', 'TRANSCRIPTION', 'GENDER', 'AGE', 'REGION_CITY', 'DEVICE'],
    num_rows: 2242
})


In [5]:
# Sanity check for every corpus: joining the speaker metadata must neither
# drop nor duplicate utterance rows.
for dataset_dir in dataset_dirs:
    assert len(datasets[dataset_dir]) == len(dataframes[dataset_dir]), (
        f"{dataset_dir}: speaker-info merge changed the row count "
        f"({len(dataframes[dataset_dir])} -> {len(datasets[dataset_dir])})"
    )

In [6]:
# standardize columns: lower-case every column name

for key in list(datasets):
    # Only rename columns whose name actually changes: renaming a column to a
    # name that is already present in the dataset is rejected, which made this
    # cell fail when re-run (or for any column that is already lower-case).
    to_lower = {
        column: column.lower()
        for column in datasets[key].column_names
        if column != column.lower()
    }
    if to_lower:
        datasets[key] = datasets[key].rename_columns(to_lower)

In [7]:
# Peek at one corpus: the dataset summary first, then its first row.
sample_dataset = datasets["Filipino_Scripted_Speech_Corpus_Daily_Use_Sentence"]
for view in (lambda: sample_dataset, lambda: sample_dataset[0]):
    print(view())

Dataset({
    features: ['channel', 'uttrans_id', 'speaker_id', 'prompt', 'transcription', 'gender', 'age', 'region_city', 'device'],
    num_rows: 4073
})
{'channel': 'C1', 'uttrans_id': 'G0004_1_S0001.wav', 'speaker_id': 'G0004', 'prompt': 'So nag iwan sila ng ilang CDs tapos sabi ipatch nlng daw.', 'transcription': 'so nag-iwan sila ng ilang C Ds tapos sabi i-patch na lang daw', 'gender': 'M', 'age': 24, 'region_city': 'Cordillera Administrative Region, Baguio', 'device': 'AKG'}


In [8]:
# Old column name -> new column name.
mappings = {
    "transcription": "sentence",
    "uttrans_id": "audio"
}


def _prefix_clip_path(example, clips_prefix):
    """Return the row's `audio` value with `clips_prefix` prepended.

    `example["audio"]` is expected to be the bare clip file name; the result
    is the clip's path relative to the datasets root directory.
    """
    return {"audio": f"{clips_prefix}{example['audio']}"}


for key in list(datasets):
    ds = datasets[key]

    renames = {
        old_name: new_name
        for old_name, new_name in mappings.items()
        if old_name in ds.column_names
    }
    if renames:
        ds = ds.rename_columns(renames)

    # Prefix the clip paths only when `audio` was created by the rename above.
    # Keying on the mere presence of an `audio` column would prepend the prefix
    # a second time whenever this cell is re-run.
    if renames.get("uttrans_id") == "audio":
        ds = ds.map(_prefix_clip_path, fn_kwargs={"clips_prefix": f"{key}/clips/"})

    datasets[key] = ds

Map:   0%|          | 0/4073 [00:00<?, ? examples/s]

Map:   0%|          | 0/3149 [00:00<?, ? examples/s]

Map:   0%|          | 0/2242 [00:00<?, ? examples/s]

In [9]:
# The `audio` paths are relative to `root_dir` ("<corpus>/clips/<file>.wav"),
# so the files only resolve from inside it. Use the notebook's own `root_dir`
# instead of a machine-specific absolute path, and skip the chdir when the
# working directory is already there so the cell can be re-run.
if Path.cwd().name != root_dir.name:
    os.chdir(root_dir)

for key in list(datasets):
    # Request 16 kHz explicitly: a bare Audio() keeps each file's native rate,
    # so it would not enforce the 16 kHz this cell is meant to guarantee.
    datasets[key] = datasets[key].cast_column("audio", Audio(sampling_rate=16000))

In [10]:
# Re-inspect the same corpus after the audio cast: summary, then first row.
FILIPINO_SCRIPTED = "Filipino_Scripted_Speech_Corpus_Daily_Use_Sentence"
print(datasets[FILIPINO_SCRIPTED])
first_row = datasets[FILIPINO_SCRIPTED][0]
print(first_row)

Dataset({
    features: ['channel', 'audio', 'speaker_id', 'prompt', 'sentence', 'gender', 'age', 'region_city', 'device'],
    num_rows: 4073
})
{'channel': 'C1', 'audio': {'path': 'Filipino_Scripted_Speech_Corpus_Daily_Use_Sentence/clips/G0004_1_S0001.wav', 'array': array([-0.00387573, -0.00564575, -0.01025391, ...,  0.        ,
        0.        ,  0.        ]), 'sampling_rate': 16000}, 'speaker_id': 'G0004', 'prompt': 'So nag iwan sila ng ilang CDs tapos sabi ipatch nlng daw.', 'sentence': 'so nag-iwan sila ng ilang C Ds tapos sabi i-patch na lang daw', 'gender': 'M', 'age': 24, 'region_city': 'Cordillera Administrative Region, Baguio', 'device': 'AKG'}


In [11]:
# Pool every Malay corpus into one dataset, then hold out 10% as the test split.
malay_datasets = [ds for name, ds in datasets.items() if "Malay" in name]
combined_ms_dataset = concatenate_datasets(malay_datasets)

# Fixed seed: without it every run publishes a different train/test partition.
combined_ms_dataset = combined_ms_dataset.train_test_split(test_size=0.1, seed=42)

combined_ms_dataset.push_to_hub("keeve101/magic-hub-ms-tl-datasets", "ms")

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/2426 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]

Map:   0%|          | 0/2425 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/540 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/keeve101/magic-hub-ms-tl-datasets/commit/8b43a10661b084506614620827b141b703d3c8c6', commit_message='Upload dataset', commit_description='', oid='8b43a10661b084506614620827b141b703d3c8c6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/keeve101/magic-hub-ms-tl-datasets', endpoint='https://huggingface.co', repo_type='dataset', repo_id='keeve101/magic-hub-ms-tl-datasets'), pr_revision=None, pr_num=None)

In [12]:
# Pool every Filipino corpus before splitting and pushing once. Pushing each
# corpus separately to the same "tl" config would let a later corpus overwrite
# an earlier one; pooling also mirrors how the Malay corpora are handled.
filipino_datasets = [ds for name, ds in datasets.items() if "Filipino" in name]

if filipino_datasets:  # nothing to push when no Filipino corpus was loaded
    combined_tl_dataset = concatenate_datasets(filipino_datasets)
    # Fixed seed so the published train/test partition is reproducible.
    combined_tl_dataset = combined_tl_dataset.train_test_split(test_size=0.1, seed=42)
    combined_tl_dataset.push_to_hub("keeve101/magic-hub-ms-tl-datasets", "tl")

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/1833 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Map:   0%|          | 0/1832 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/697 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
