In [1]:
from glob import glob
import os

# Restrict CUDA to devices 0 and 1; this is set before `torch` is imported in the next cell.
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

In [2]:
from datasets import Dataset, Audio
from datasets import load_dataset, load_from_disk
from dataspeech import rate_apply, pitch_apply, snr_apply, squim_apply
from datasets import DatasetDict
from multiprocess import set_start_method
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
import argparse
import numpy as np
import pandas as pd
import os
import json
import torch

INFO - Applied quirks (see `speechbrain.utils.quirks`): [allow_tf32, disable_jit_profiling]
INFO - Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []


In [3]:
# Collect every transcript JSON under the Malaysian and Singaporean podcast trees
# (Malaysian matches first, then Singaporean).
files = [
    path
    for pattern in (
        '/home/husein/ssd3/malaysian-podcast_processed/**/*/*.json',
        '/home/husein/ssd3/sg-podcast_processed/**/*/*.json',
    )
    for path in glob(pattern, recursive = True)
]

In [4]:
# Number of transcript JSON files found.
len(files)

22492

In [5]:
# Build one record per transcript segment that has both a gender label and a 44k audio file.
data = []
for file in tqdm(files):
    # Segment files sit next to the JSON and are named `<json path without extension>_<n>.mp3`.
    prefix = os.path.splitext(file)[0]

    try:
        with open(file) as fopen:
            d = json.load(fopen)
    except (OSError, ValueError):
        # Unreadable or malformed transcript (JSONDecodeError is a ValueError): skip the file.
        continue

    # The country depends only on which tree the JSON came from, so resolve it once per file.
    country = 'singaporean' if 'sg-podcast' in file else 'malaysian'

    for no, obj in enumerate(d):
        text = obj["text"].strip()
        audio_path = f'{prefix}_{no}.mp3'

        # Gender labels live in the parallel `*_processed_24k_gender/` tree.
        gender_path = audio_path.replace('processed/', 'processed_24k_gender/').replace('.mp3', '.gender')
        if not os.path.exists(gender_path):
            continue

        with open(gender_path) as fopen:
            # Strip so a trailing newline in the label file cannot leak into the label.
            gender = fopen.read().strip()

        # The audio actually used comes from the parallel `*_processed_44k/` tree.
        audio_path = audio_path.replace('processed/', 'processed_44k/')
        if not os.path.exists(audio_path):
            continue

        data.append({
            'audio': audio_path,
            'transcription': text,
            'gender': gender,
            'country': country,
        })

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22492/22492 [00:03<00:00, 6901.25it/s]


In [6]:
# Spot-check the first record (Malaysian tree).
data[0]

{'audio': '/home/husein/ssd3/malaysian-podcast_processed_44k/Cara Nak Apply Student Exchange [vFhLEniT9X8]/Cara Nak Apply Student Exchange [vFhLEniT9X8]_0.mp3',
 'transcription': 'Cara nak apply, macam Puteri kan time internship. So, Puteri punya keluar dekat group internship, aa, dia keluar satu form.',
 'gender': 'female',
 'country': 'malaysian'}

In [8]:
# Spot-check the last record (Singaporean tree).
data[-1]

{'audio': '/home/husein/ssd3/sg-podcast_processed_44k/Have you heard about the 🧠& 🍑👌？ #shorts [DiQFH3xhSoo]/Have you heard about the 🧠& 🍑👌？ #shorts [DiQFH3xhSoo]_0.mp3',
 'transcription': "You just know, wherever you go in the world, the asshole is always in control. It's true.",
 'gender': 'male',
 'country': 'singaporean'}

In [7]:
# Number of usable segments collected.
len(data)

274344

In [9]:
# Listen to the first segment to sanity-check the audio path.
import IPython.display as ipd
ipd.Audio(data[0]['audio'])

In [10]:
# Listen to the last segment as well.
ipd.Audio(data[-1]['audio'])

In [11]:
# Turn the list of record dicts into a `datasets.Dataset`.
dataset = Dataset.from_list(data)

In [12]:
# Cast the path column to an Audio feature with a 22,050 Hz sampling rate
# (the referenced files come from the `*_processed_44k` trees).
dataset = dataset.cast_column("audio", Audio(sampling_rate = 22050))

In [13]:
# Shared settings for the annotation cells below.
audio_column_name = 'audio'  # column holding the Audio feature
text_column_name = 'transcription'  # column holding the transcript text
num_workers_per_gpu_for_squim = 1  # map processes per visible GPU for squim_apply
cpu_num_workers = 5  # map processes used when no GPU is visible
penn_batch_size = 512  # forwarded to pitch_apply through fn_kwargs
num_workers_per_gpu_for_pitch = 1  # map processes per visible GPU for pitch_apply
num_workers_per_gpu_for_snr = 1  # map processes per visible GPU for snr_apply
cpu_writer_batch_size = 1000  # not referenced by any cell visible in this notebook

In [14]:
# Run squim_apply over the dataset; its "stoi", "sdr" and "pesq" columns are merged back later.
# The audio column is dropped from the result. One process per GPU worker slot is used when
# CUDA devices are visible, otherwise `cpu_num_workers` processes.
squim_dataset = dataset.map(
    squim_apply,
    batched=True,
    batch_size=2,
    with_rank=torch.cuda.device_count() > 0,
    num_proc=(
        cpu_num_workers
        if torch.cuda.device_count() == 0
        else torch.cuda.device_count() * num_workers_per_gpu_for_squim
    ),
    remove_columns=[audio_column_name],
    fn_kwargs={"audio_column_name": audio_column_name},
)

Map (num_proc=2):   0%|          | 0/274344 [00:00<?, ? examples/s]

In [15]:
# Run pitch_apply on a 16 kHz view of the audio; its "utterance_pitch_mean" and
# "utterance_pitch_std" columns are merged back later. The audio column is dropped from the result.
audio_16k = dataset.cast_column(audio_column_name, Audio(sampling_rate=16_000))
pitch_dataset = audio_16k.map(
    pitch_apply,
    batched=True,
    batch_size=12,
    with_rank=torch.cuda.device_count() > 0,
    num_proc=(
        cpu_num_workers
        if torch.cuda.device_count() == 0
        else torch.cuda.device_count() * num_workers_per_gpu_for_pitch
    ),
    remove_columns=[audio_column_name],
    fn_kwargs={"audio_column_name": audio_column_name, "penn_batch_size": penn_batch_size},
)

Map (num_proc=2):   0%|          | 0/274344 [00:00<?, ? examples/s]

In [105]:
# Run snr_apply over the dataset; its "snr", "c50" and "speech_duration" columns are merged
# back later. The audio column is dropped from the result.
snr_dataset = dataset.map(
    snr_apply,
    batched=True,
    batch_size=16,
    with_rank=torch.cuda.device_count() > 0,
    num_proc=(
        cpu_num_workers
        if torch.cuda.device_count() == 0
        else torch.cuda.device_count() * num_workers_per_gpu_for_snr
    ),
    remove_columns=[audio_column_name],
    fn_kwargs={"audio_column_name": audio_column_name},
)

In [17]:
# Merge the per-utterance metrics back onto the audio dataset, one column at a time.
# Note: re-running this cell on an already-merged `dataset` adds the same column names again.
dataset = dataset.add_column('utterance_pitch_mean', pitch_dataset['utterance_pitch_mean'])
dataset = dataset.add_column('utterance_pitch_std', pitch_dataset['utterance_pitch_std'])
dataset = dataset.add_column("snr", snr_dataset["snr"])
dataset = dataset.add_column("c50", snr_dataset["c50"])
dataset = dataset.add_column("speech_duration", snr_dataset["speech_duration"])
dataset = dataset.add_column("stoi", squim_dataset["stoi"])
# The squim output column is called "sdr"; it is stored here as "si-sdr".
dataset = dataset.add_column("si-sdr", squim_dataset["sdr"])
dataset = dataset.add_column("pesq", squim_dataset["pesq"])

In [18]:
# Drop rows whose SNR estimate is NaN. `input_columns` hands only the "snr" value to the
# predicate, so the filter does not have to materialise (and decode) the audio column per row.
dataset = dataset.filter(lambda snr: not np.isnan(snr), input_columns=["snr"])

Filter:   0%|          | 0/274344 [00:00<?, ? examples/s]

In [19]:
# Persist the annotated dataset to the local `dataset-combine` directory.
dataset.save_to_disk('dataset-combine')

Saving the dataset (0/68 shards):   0%|          | 0/274344 [00:00<?, ? examples/s]

In [20]:
# On-disk size of the saved dataset.
!du -hs dataset-combine

53G	dataset-combine


In [21]:
# Row count after the NaN-SNR filter.
len(dataset)

274344

In [22]:
# Inspect the first row, including the merged metric columns.
dataset[0]

{'audio': {'path': '/home/husein/ssd3/malaysian-podcast_processed_44k/Cara Nak Apply Student Exchange [vFhLEniT9X8]/Cara Nak Apply Student Exchange [vFhLEniT9X8]_0.mp3',
  'array': array([-0.00121594, -0.00186276, -0.0019816 , ...,  0.00016469,
          0.00020282,  0.00019048]),
  'sampling_rate': 22050},
 'transcription': 'Cara nak apply, macam Puteri kan time internship. So, Puteri punya keluar dekat group internship, aa, dia keluar satu form.',
 'gender': 'female',
 'country': 'malaysian',
 'utterance_pitch_mean': 259.931396484375,
 'utterance_pitch_std': 46.01287841796875,
 'snr': 41.81050491333008,
 'c50': 59.3415641784668,
 'speech_duration': 7.661250114440918,
 'stoi': 0.9756626486778259,
 'si-sdr': 20.618106842041016,
 'pesq': 3.326802968978882}

In [23]:
# Inspect the last row, including the merged metric columns.
dataset[-1]

{'audio': {'path': '/home/husein/ssd3/sg-podcast_processed_44k/Have you heard about the 🧠& 🍑👌？ #shorts [DiQFH3xhSoo]/Have you heard about the 🧠& 🍑👌？ #shorts [DiQFH3xhSoo]_0.mp3',
  'array': array([-0.00126127, -0.0018822 , -0.00187837, ...,  0.00034472,
          0.0004074 ,  0.00018495]),
  'sampling_rate': 22050},
 'transcription': "You just know, wherever you go in the world, the asshole is always in control. It's true.",
 'gender': 'male',
 'country': 'singaporean',
 'utterance_pitch_mean': 124.18851470947266,
 'utterance_pitch_std': 32.084354400634766,
 'snr': 69.38728332519531,
 'c50': 59.84521484375,
 'speech_duration': 4.910624980926514,
 'stoi': 0.9785327315330505,
 'si-sdr': 16.752330780029297,
 'pesq': 2.8572096824645996}

In [24]:
# Wrap the dataset as a single "train" split; the binning helpers iterate over splits.
dataset_dict = DatasetDict({'train': dataset})

In [25]:
# Ordered text labels for each numeric metric, lowest bin first. The number of labels in a
# list is the number of histogram bins used for that metric.
SPEAKER_RATE_BINS = ["very slowly", "quite slowly", "slightly slowly", "moderate speed", "slightly fast", "quite fast", "very fast"]
SNR_BINS = ["very noisy", "quite noisy", "slightly noisy", "moderate ambient sound", "slightly clear", "quite clear", "very clear"]
REVERBERATION_BINS = ["very roomy sounding", "quite roomy sounding", "slightly roomy sounding", "moderate reverberation", "slightly confined sounding", "quite confined sounding", "very confined sounding"]
UTTERANCE_LEVEL_STD = ["very monotone", "quite monotone", "slightly monotone", "moderate intonation", "slightly expressive", "quite expressive", "very expressive"]
SI_SDR_BINS = ["extremely noisy", "very noisy", "noisy", "slightly noisy", "almost no noise", "very clear"]
PESQ_BINS = ["very bad speech quality", "bad speech quality", "slightly bad speech quality", "moderate speech quality", "great speech quality", "wonderful speech quality"]

# These labels are meant to be applied to the speaker-level mean pitch, relative to gender.
SPEAKER_LEVEL_PITCH_BINS = ["very low pitch", "quite low pitch", "slightly low pitch", "moderate pitch", "slightly high pitch", "quite high pitch", "very high pitch"]

In [26]:
# Optional overrides: both dicts are empty here, so every lookup below falls back to the
# default label lists and every bin edge is computed from the data.
text_bins_dict = {}
bin_edges_dict = {}

speaker_level_pitch_bins = text_bins_dict.get("speaker_level_pitch_bins", SPEAKER_LEVEL_PITCH_BINS)
speaker_rate_bins = text_bins_dict.get("speaker_rate_bins", SPEAKER_RATE_BINS)
snr_bins = text_bins_dict.get("snr_bins", SNR_BINS)
reverberation_bins = text_bins_dict.get("reverberation_bins", REVERBERATION_BINS)
utterance_level_std = text_bins_dict.get("utterance_level_std", UTTERANCE_LEVEL_STD)
sdr_bins = text_bins_dict.get("sdr_bins", SI_SDR_BINS)
# Despite the `_std` suffix this holds the PESQ label list; it is not used by later visible cells.
pesq_std = text_bins_dict.get("pesq_bins", PESQ_BINS)

In [27]:
def bins_to_text(dataset, text_bins, column_name, output_column_name, leading_split_for_bins="train", batch_size = 4, num_workers = 1, std_tolerance=5, save_dir=None, only_save_plot=False, lower_range=None, bin_edges=None):
    '''
    Bucket the numeric column `column_name` into `len(text_bins)` equal-width bins and write
    the matching text label to `output_column_name` for every split.

    Args:
        dataset: list of DatasetDict-like objects (iterating one yields split names, indexing
            by split and then by column yields the values, and `.map` is available).
        text_bins: ordered labels, lowest bin first.
        column_name: numeric input column.
        output_column_name: name of the text column to create.
        leading_split_for_bins: substring, or list of substrings, that a split name must
            contain to contribute values to the bin computation. None uses every split.
        batch_size, num_workers: forwarded to `.map` as `batch_size` / `num_proc`.
        std_tolerance: when not None, values further than this many standard deviations from
            the mean are ignored while computing the edges.
        save_dir: when not None, plots are produced through `visualize_bins_to_text`, which
            must be in scope (it is not defined in this cell).
        only_save_plot: when True and the edges are computed here, return before labelling.
        lower_range: optional lower bound of the histogram range; 0 is a valid bound.
        bin_edges: precomputed edges; when given, nothing is recomputed.

    Returns:
        (dataset, bin_edges): the list of mapped datasets and the edges that were used.
    '''
    if bin_edges is None:
        def contributes(split):
            # Accept None (all splits), a single substring, or a list of substrings.
            if leading_split_for_bins is None:
                return True
            if isinstance(leading_split_for_bins, str):
                return leading_split_for_bins in split
            return any(marker in split for marker in leading_split_for_bins)

        values = []
        for df in dataset:
            for split in df:
                if contributes(split):
                    values.extend(df[split][column_name])

        # filter out outliers
        values = np.array(values)
        if std_tolerance is not None:
            filtered_values = values[np.abs(values - np.mean(values)) < std_tolerance * np.std(values)]
        else:
            filtered_values = values

        if save_dir is not None:
            visualize_bins_to_text(values, filtered_values, "Before filtering", "After filtering", text_bins, save_dir, output_column_name, lower_range=lower_range)
            # speaking_rate can easily have outliers, so it gets an extra post-filtering plot
            if output_column_name == "speaking_rate":
                visualize_bins_to_text(filtered_values, filtered_values, "After filtering", "After filtering", text_bins, save_dir, f"{output_column_name}_after_filtering", lower_range=lower_range)

        values = filtered_values
        # Compare against None so that an explicit lower bound of 0 is honoured.
        histogram_range = (lower_range, values.max()) if lower_range is not None else None
        _, bin_edges = np.histogram(values, bins = len(text_bins), range=histogram_range)

        if only_save_plot:
            return dataset, bin_edges
    else:
        print(f"Already computed bin edges have been passed for {output_column_name}. Will use: {bin_edges}.")

    def batch_association(batch):
        index_bins = np.searchsorted(bin_edges, batch, side="left")
        # Clamp to the valid label range: values equal to the minimum, above the maximum, or
        # excluded as outliers during the edge computation fall outside the interior bins.
        batch_bins = [text_bins[min(max(i-1, 0), len(text_bins)-1)] for i in index_bins]
        return {
            output_column_name: batch_bins
        }

    dataset = [df.map(batch_association, batched=True, batch_size=batch_size, input_columns=[column_name], num_proc=num_workers) for df in dataset]
    return dataset, bin_edges

def speaker_level_relative_to_gender(dataset, text_bins, gender_column_name, column_name, output_column_name, batch_size = 4, num_workers=1, std_tolerance=None, save_dir=None, only_save_plot=False, bin_edges=None):
    '''
    Bucket `column_name` into `len(text_bins)` equal-width bins computed separately for the
    "male" and "female" rows, then write the matching label to `output_column_name`.

    This signature has no speaker column, so every row is binned on its own value against the
    edges of its own gender. Bins are computed over all splits (no leading split).

    Args:
        dataset: list of DatasetDict-like objects.
        text_bins: ordered labels, lowest bin first.
        gender_column_name: column holding "male" / "female".
        column_name: numeric input column.
        output_column_name: name of the text column to create.
        batch_size, num_workers: forwarded to `.map` as `batch_size` / `num_proc`.
        std_tolerance: when not None, values further than this many standard deviations from
            the per-gender mean are ignored while computing the edges.
        save_dir: when not None, plots are produced through `visualize_bins_to_text`, which
            must be in scope (it is not defined in this cell).
        only_save_plot: when True and the edges are computed here, return before labelling.
        bin_edges: optional precomputed `{gender: edges}` mapping.

    Returns:
        (dataset, bin_edges): the list of mapped datasets and the per-gender edges used.

    Raises:
        KeyError: if a row's gender has no entry in `bin_edges`.
    '''
    if bin_edges is None:
        # Only the two relevant columns are pulled into pandas, and only when edges are needed.
        frames = []
        for df in dataset:
            for split in df:
                unused = [col for col in df[split].column_names if col not in {column_name, gender_column_name}]
                frames.append(df[split].remove_columns(unused).to_pandas())
        dataframe = pd.concat(frames, ignore_index=True)

        bin_edges = {}
        values_before = {}
        values_after = {}
        for category in ["male", "female"]:
            values = np.array(dataframe[dataframe[gender_column_name] == category][column_name])
            values_before[category] = values
            if std_tolerance is not None:
                # filter out outliers
                values = values[np.abs(values - np.mean(values)) < std_tolerance * np.std(values)]
                values_after[category] = values
            bin_edges[category] = np.histogram(values, len(text_bins))[1]

        if save_dir is not None:
            visualize_bins_to_text(values_before["male"], values_before["female"], "Male distribution", "Female distribution", text_bins, save_dir, output_column_name)
            if std_tolerance is not None:
                visualize_bins_to_text(values_after["male"], values_after["female"], "Male distribution", "Female distribution", text_bins, save_dir, f"{output_column_name}_after_filtering")

        if only_save_plot:
            return dataset, bin_edges

    def batch_association(genders, values):
        batch_bins = []
        for gender, value in zip(genders, values):
            index = np.searchsorted(bin_edges[gender], value)
            # Clamp to the valid label range: values equal to the minimum, above the maximum,
            # or excluded as outliers fall outside the interior bins.
            batch_bins.append(text_bins[min(max(index-1, 0), len(text_bins)-1)])
        return {
            output_column_name: batch_bins
        }

    dataset = [df.map(batch_association, batched=True, input_columns=[gender_column_name, column_name], batch_size=batch_size, num_proc=num_workers) for df in dataset]
    return dataset, bin_edges

In [28]:
# Label each utterance's mean pitch, with bins computed over all splits and no outlier filter.
# NOTE(review): SPEAKER_LEVEL_PITCH_BINS is described as speaker-level and gender-relative, but
# here it is applied to per-utterance means across both genders - confirm this is intended.
dataset_, pitch_bin_edges = bins_to_text(
    [dataset_dict],
    speaker_level_pitch_bins,
    "utterance_pitch_mean",
    "pitch",
    batch_size=100,
    num_workers=5,
    leading_split_for_bins=None,
    std_tolerance=None,
    save_dir=None,
    only_save_plot=False,
    bin_edges=bin_edges_dict.get("pitch"),
)

Map (num_proc=5):   0%|          | 0/274344 [00:00<?, ? examples/s]

In [29]:
# Check that the last row now carries the new "pitch" label.
dataset_[0]['train'][-1]

{'audio': {'path': '/home/husein/ssd3/sg-podcast_processed_44k/Have you heard about the 🧠& 🍑👌？ #shorts [DiQFH3xhSoo]/Have you heard about the 🧠& 🍑👌？ #shorts [DiQFH3xhSoo]_0.mp3',
  'array': array([-0.00126127, -0.0018822 , -0.00187837, ...,  0.00034472,
          0.0004074 ,  0.00018495]),
  'sampling_rate': 22050},
 'transcription': "You just know, wherever you go in the world, the asshole is always in control. It's true.",
 'gender': 'male',
 'country': 'singaporean',
 'utterance_pitch_mean': 124.18851470947266,
 'utterance_pitch_std': 32.084354400634766,
 'snr': 69.38728332519531,
 'c50': 59.84521484375,
 'speech_duration': 4.910624980926514,
 'stoi': 0.9785327315330505,
 'si-sdr': 16.752330780029297,
 'pesq': 2.8572096824645996,
 'pitch': 'very low pitch'}

In [30]:
# Label the "speaking_rate" column.
# NOTE(review): the bins are computed from `speech_duration`, not from a words- or
# phonemes-per-second rate, and `rate_apply` is imported but never called - confirm that
# duration is an acceptable stand-in for speaking rate.
dataset_, speaking_rate_bin_edges = bins_to_text(
    dataset_,
    speaker_rate_bins,
    "speech_duration",
    "speaking_rate",
    batch_size=100,
    num_workers=5,
    leading_split_for_bins=None,
    std_tolerance=None,
    save_dir=None,
    only_save_plot=False,
    bin_edges=bin_edges_dict.get("speaking_rate"),
)

Map (num_proc=5):   0%|          | 0/274344 [00:00<?, ? examples/s]

In [31]:
# Label the SNR estimate as "noise", with no lower bound on the histogram range.
dataset_, noise_bin_edges = bins_to_text(
    dataset_,
    snr_bins,
    "snr",
    "noise",
    batch_size=100,
    num_workers=5,
    leading_split_for_bins=None,
    std_tolerance=None,
    save_dir=None,
    only_save_plot=False,
    bin_edges=bin_edges_dict.get("noise"),
    lower_range=None,
)

Map (num_proc=5):   0%|          | 0/274344 [00:00<?, ? examples/s]

In [32]:
# Label the C50 estimate as "reverberation".
dataset_, reverberation_bin_edges = bins_to_text(
    dataset_,
    reverberation_bins,
    "c50",
    "reverberation",
    batch_size=100,
    num_workers=5,
    leading_split_for_bins=None,
    std_tolerance=None,
    save_dir=None,
    only_save_plot=False,
    bin_edges=bin_edges_dict.get("reverberation"),
)

Map (num_proc=5):   0%|          | 0/274344 [00:00<?, ? examples/s]

In [33]:
# Label the per-utterance pitch standard deviation as "speech_monotony".
dataset_, speech_monotony_bin_edges = bins_to_text(
    dataset_,
    utterance_level_std,
    "utterance_pitch_std",
    "speech_monotony",
    batch_size=100,
    num_workers=5,
    leading_split_for_bins=None,
    std_tolerance=None,
    save_dir=None,
    only_save_plot=False,
    bin_edges=bin_edges_dict.get("speech_monotony"),
)

Map (num_proc=5):   0%|          | 0/274344 [00:00<?, ? examples/s]

In [71]:
# Label the SI-SDR estimate as "sdr_noise".
# Unlike the other calls, the precomputed-edge lookup uses the input column name ("si-sdr")
# rather than the output name; `bin_edges_dict` is empty here, so the edges are computed.
dataset_, sdr_bin_edges = bins_to_text(
    dataset_,
    sdr_bins,
    "si-sdr",
    "sdr_noise",
    batch_size=100,
    num_workers=5,
    leading_split_for_bins=None,
    std_tolerance=None,
    save_dir=None,
    only_save_plot=False,
    bin_edges=bin_edges_dict.get("si-sdr"),
)

Map (num_proc=5):   0%|          | 0/274344 [00:00<?, ? examples/s]

In [72]:
# Inspect a row with all text labels attached.
dataset_[0]['train'][-3]

{'audio': {'path': '/home/husein/ssd3/sg-podcast_processed_44k/What To Do And Eat In Sydney, Australia [H4y5ydsnMuk]/What To Do And Eat In Sydney, Australia [H4y5ydsnMuk]_0.mp3',
  'array': array([-0.00163894, -0.00259711, -0.00240175, ...,  0.00072436,
          0.00168878,  0.00118529]),
  'sampling_rate': 22050},
 'transcription': "First of all, I'll be showing you a little bit of the landscape. It's completely pitch black, right? So it's a strange idea to show you the landscape. Then we'll take a look up and look into the sky. The sky is so clear.",
 'gender': 'male',
 'country': 'singaporean',
 'utterance_pitch_mean': 191.78109741210938,
 'utterance_pitch_std': 49.3779296875,
 'snr': 62.618247985839844,
 'c50': 58.38713455200195,
 'speech_duration': 10.951874732971191,
 'stoi': 0.9895485639572144,
 'si-sdr': 17.239852905273438,
 'pesq': 3.289207696914673,
 'pitch': 'quite low pitch',
 'speaking_rate': 'slightly slowly',
 'noise': 'quite clear',
 'reverberation': 'very confined sou

In [73]:
# Persist the labelled DatasetDict; the worker functions below reload it from this directory.
dataset_[0].save_to_disk('combine-metadata')

Saving the dataset (0/68 shards):   0%|          | 0/274344 [00:00<?, ? examples/s]

In [3]:
# Reload the labelled dataset from disk. The execution counter restarts at this cell, so
# everything below only needs the saved `combine-metadata` directory plus the imports.
dataset_ = [load_from_disk('combine-metadata')]

Loading dataset from disk:   0%|          | 0/68 [00:00<?, ?it/s]

In [13]:
# Instruction template sent to the LLM. The bracketed placeholders on its last line are
# replaced per row with that row's label values.
# NOTE(review): the keyword vocabularies listed in items 2, 4, 5 and 6 (e.g. "very distant-sounding",
# "monotone", "slowly", "low-pitch") differ from the labels the bin lists actually produce
# (e.g. "very roomy sounding", "quite monotone", "quite slowly", "quite low pitch"). As written,
# the rules keyed on 'very distant-sounding' / 'very close-sounding' cannot match a real
# reverberation label literally - confirm whether the prompt or the bin labels should change.
NEW_PROMPT_WITH_ACCENT = """You will be given 7 descriptive keywords related to an audio sample of a person's speech. These keywords include:
1. The gender (male, female)
2. The level of reverberation (very distant-sounding, distant-sounding, slightly distant-sounding, slightly close-sounding, very close-sounding)
3. The amount of noise in the sample (extremely noisy, very noisy, noisy, slightly noisy, almost no noise, very clear)
4. The tone of the speaker's voice (very monotone, monotone, slightly expressive and animated, expressive and animated, very expressive and animated)
5. The pace of the speaker's delivery (very slowly, slowly, slightly slowly, moderate speed, slightly fast, fast, very fast)
6. The pitch of the speaker's voice (very low-pitch, low-pitch, slightly low-pitch, moderate pitch, slightly high-pitch, high-pitch, very high-pitch)
7. The country of the speaker.

Your task is to create a text description using these keywords that accurately describes the speech sample.
If the amount of noise is 'very noisy' and the level of reverberation is 'very distant-sounding', you must include terms such as 'very poor recording' or `very bad recording` in the description. 
Likewise, if the amount of noise is 'very clear' and the level of reverberation is 'very close-sounding', you must include terms like 'very good recording' or `excellent recording` in the description. 
You can randomly omit the following terms, as they are default terms: 'moderate speed' and 'moderate pitch'.
Do not add extra details beyond what has been provided above. You can change the order of keywords, and replace synonymous terms.

For example, given the following keywords: 'female', 'slightly distant-sounding', 'noisy', 'very expressive and animated', 'very slowly', 'moderate pitch', 'singaporean', a valid description would be: 'A Singaporean woman speaks very slowly but has a very animated delivery. The recording is noisy and there is some roominess.'
Another valid description would be: 'In a noisy room, a female Singaporean speaker delivers a very animated and expressive speech, at a very slow pace.'
Another valid description would be: 'A Singaporean woman enunciates a very expressive speech. Her voice is slightly distant-sounding, with some background noise present. She speaks very slowly with a moderate pitch but a very expressive tone.'

Ensure that the generated description is grammatically correct, easy to understand, and concise. Only return one and only one description.

For the keywords: '[gender]', '[reverberation]', '[sdr_noise]', '[speech_monotony]', '[speaking_rate]', '[pitch]', '[country]', the corresponding description is:
"""

In [14]:
# Row fields substituted into the `[name]` placeholders of the prompt template.
EXPECTED_COLUMNS = {"gender", "pitch", "sdr_noise", "reverberation", "speech_monotony", "speaking_rate", 'country'}

In [15]:
# !wget https://gist.githubusercontent.com/huseinzol05/98974ae8c6c7a65d4bc0af9f5003786a/raw/2e06e71ef7349a57bc58cc9913ae6bae1f9f8447/mp.py
import mp

In [16]:
def loop(indices):
    """
    Worker for `mp.multiprocessing`: build the filled-in LLM prompt for each row index.

    `indices` is a 2-tuple whose first element is the iterable of row indices; the second
    element is ignored. Returns a list of `(row_index, prompt_text)` tuples.
    """
    row_ids, _ = indices
    # Every worker loads its own handle on the saved dataset.
    train_split = load_from_disk('combine-metadata')['train']

    def fill_template(row):
        # Substitute each `[column]` placeholder with the row's value for that column.
        text = NEW_PROMPT_WITH_ACCENT
        for column in EXPECTED_COLUMNS:
            text = text.replace(f"[{column}]", row[column])
        return text

    return [(row_id, fill_template(train_split[row_id])) for row_id in tqdm(row_ids)]

In [18]:
# Build all prompts in parallel; `loop` receives a chunk of row indices per worker.
prompts = mp.multiprocessing(range(len(dataset_[0]['train'])), loop, cores = 10)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27434/27434 [04:38<00:00, 98.49it/s]
 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████        | 25800/27434 [04:39<00:18, 89.06it/s]

Loading dataset from disk:   0%|          | 0/68 [00:00<?, ?it/s]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 161.58it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27434/27434 [04:41<00:00, 97.35it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27434/27434 [04:41<00:00, 97.32it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27434/27434 [04:45<00:00, 96.23it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27434/27434 [04:47<00:00, 95.47it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [19]:
# Inspect the first (index, prompt) pair.
prompts[0]

(0,
 "You will be given 7 descriptive keywords related to an audio sample of a person's speech. These keywords include:\n1. The gender (male, female)\n2. The level of reverberation (very distant-sounding, distant-sounding, slightly distant-sounding, slightly close-sounding, very close-sounding)\n3. The amount of noise in the sample (extremely noisy, very noisy, noisy, slightly noisy, almost no noise, very clear)\n4. The tone of the speaker's voice (very monotone, monotone, slightly expressive and animated, expressive and animated, very expressive and animated)\n5. The pace of the speaker's delivery (very slowly, slowly, slightly slowly, moderate speed, slightly fast, fast, very fast)\n6. The pitch of the speaker's voice (very low-pitch, low-pitch, slightly low-pitch, moderate pitch, slightly high-pitch, high-pitch, very high-pitch)\n7. The country of the speaker.\n\nYour task is to create a text description using these keywords that accurately describes the speech sample.\nIf the amoun

In [29]:
# !rm -rf malaysian-emilia-podcast-prompt
# !mkdir malaysian-emilia-podcast-prompt

In [21]:
import requests
import json

def answer(i, q, retry_delay = 1.0, timeout = 300):
    """
    Ask the local chat-completions endpoint for a description and cache it on disk.

    The result is stored as a JSON string in `malaysian-emilia-podcast-prompt/<i>.json`.
    If that file already holds valid JSON nothing is requested. Otherwise the request is
    repeated until the model returns text without `[` or `]`, i.e. with no unfilled
    placeholder left in it.

    Args:
        i: row index, used as the cache file name.
        q: full prompt text sent as the single user message.
        retry_delay: seconds to wait after a failed request or malformed response, so that a
            down or overloaded server is not hammered (and the notebook output not flooded).
        timeout: per-request timeout in seconds; a hung connection no longer blocks a worker
            forever.

    Returns:
        None.

    Raises:
        OSError: if the cache file cannot be written (for example the directory is missing).
    """
    import time

    filename = os.path.join('malaysian-emilia-podcast-prompt', f'{i}.json')
    try:
        with open(filename) as fopen:
            json.load(fopen)
        return
    except (OSError, ValueError):
        # Missing or corrupt cache file: (re)generate it below.
        pass

    sample_prompt = [{"role": "user", "content": q}]
    payload = {'messages': sample_prompt, 'model': 'mistral',
               'temperature': 0.6, 'max_tokens': 256}
    while True:
        try:
            r = requests.post('http://localhost:8000/v1/chat/completions',
                              json = payload, timeout = timeout).json()
        except (requests.RequestException, ValueError) as e:
            # Connection problem, timeout, or a non-JSON body: wait and retry rather than
            # letting the exception kill the worker thread.
            print(f'request for {i} failed: {e!r}')
            time.sleep(retry_delay)
            continue

        try:
            text = r['choices'][0]['message']['content'].strip()
        except (KeyError, IndexError, TypeError, AttributeError):
            # Error payload or unexpected schema: show it, wait, retry.
            print(r)
            time.sleep(retry_delay)
            continue

        # Reject generations that still contain a bracket and sample again.
        if '[' not in text and ']' not in text:
            with open(filename, 'w') as fopen:
                json.dump(text, fopen)
            return

In [22]:
# Re-inspect the first (index, prompt) pair before sending it.
prompts[0]

(0,
 "You will be given 7 descriptive keywords related to an audio sample of a person's speech. These keywords include:\n1. The gender (male, female)\n2. The level of reverberation (very distant-sounding, distant-sounding, slightly distant-sounding, slightly close-sounding, very close-sounding)\n3. The amount of noise in the sample (extremely noisy, very noisy, noisy, slightly noisy, almost no noise, very clear)\n4. The tone of the speaker's voice (very monotone, monotone, slightly expressive and animated, expressive and animated, very expressive and animated)\n5. The pace of the speaker's delivery (very slowly, slowly, slightly slowly, moderate speed, slightly fast, fast, very fast)\n6. The pitch of the speaker's voice (very low-pitch, low-pitch, slightly low-pitch, moderate pitch, slightly high-pitch, high-pitch, very high-pitch)\n7. The country of the speaker.\n\nYour task is to create a text description using these keywords that accurately describes the speech sample.\nIf the amoun

In [23]:
# Smoke-test a single request before starting the worker threads.
answer(*prompts[0])

In [25]:
# import IPython.display as ipd
# ipd.Audio(dataset_[0]['train'][0]['audio']['array'], rate = 22050)

In [26]:
def consumer(queue, name):
    """
    Worker-thread body: take `(index, prompt)` items off `queue` and pass each to `answer`
    until the queue is empty, then print a completion message tagged with `name`.

    Items are taken with a non-blocking get. Checking `qsize()` and then calling a blocking
    `get()` can hang a worker forever if another thread removes the last item in between.
    """
    from queue import Empty

    while True:
        try:
            item = queue.get_nowait()
        except Empty:
            break
        answer(*item)
    print(f'consumer {name} done')

In [27]:
from threading import Thread
from queue import Queue
from tqdm import tqdm

# Shared work queue holding every (index, prompt) pair for the consumer threads.
queue = Queue()
for u in prompts:
    queue.put(u)
    
# Remember the starting size so progress can be reported against it.
ori_size = queue.qsize()

In [28]:
import time

# Start the worker threads that drain the shared queue.
max_worker = 30
consumers = [Thread(target=consumer, args=(queue, i)) for i in range(max_worker)]
for worker in consumers:
    worker.start()

# Report progress as the number of items taken off the queue. Poll once a second instead of
# spinning, and keep going until the workers have finished rather than until the queue is
# merely empty, since the last requests are still in flight at that point.
pbar = tqdm(total=ori_size, position=0, leave=True)
last_size = 0
while any(worker.is_alive() for worker in consumers):
    done = ori_size - queue.qsize()
    if done > last_size:
        pbar.update(done - last_size)
        last_size = done
    time.sleep(1)

for worker in consumers:
    worker.join()

# Account for items dequeued since the last poll so the bar ends at the true count.
done = ori_size - queue.qsize()
if done > last_size:
    pbar.update(done - last_size)
    last_size = done

pbar.close()

 47%|██████████████████████████████████████████████████████████▍                                                                  | 128290/274344 [10:09:24<12:46:05,  3.18it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 80%|████████████████████████████████████████████████████████████████████████████████████████████████████▌                         | 219072/274344 [17:20:19<3:19:59,  4.61it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

In [66]:
from sklearn.feature_extraction.text import CountVectorizer

def loop(indices):
    """
    Worker for `mp.multiprocessing`: join each row with its generated description.

    Note that this redefines the earlier `loop` (the prompt builder) under the same name.

    `indices` is a 2-tuple whose first element is the iterable of row indices; the second
    element is ignored. A row is kept only if
      * its description file `malaysian-emilia-podcast-prompt/<i>.json` exists,
      * no word trigram occurs more than 3 times in its transcription, and
      * its audio file is found under the Malaysian or Singaporean 44k tree.

    Returns a list of row dicts without the `audio` field, with `audio_filename` (the path
    after `ssd3/`) and `prompt` added.
    """
    indices, _ = indices
    results = []
    dataset_ = [load_from_disk('combine-metadata')]
    audio_roots = (
        '/home/husein/ssd3/malaysian-podcast_processed_44k',
        '/home/husein/ssd3/sg-podcast_processed_44k',
    )
    for i in tqdm(indices):
        filename = os.path.join('malaysian-emilia-podcast-prompt', f'{i}.json')
        if not os.path.exists(filename):
            continue
        with open(filename) as fopen:
            p = json.load(fopen)
        row = dataset_[0]['train'][i].copy()

        try:
            dense = CountVectorizer(ngram_range = (3,3)).fit_transform([row['transcription']]).todense()
        except ValueError:
            # Raised when the text yields no trigram at all (too short); skip such rows.
            continue
        # Drop transcriptions in which some trigram repeats more than 3 times.
        if (dense > 3).sum() >= 1:
            continue

        # Presumably the stored path is a bare `<title>_<n>.mp3` file name and the segment lives
        # in a folder named `<title>` - verify against the saved dataset. Only the final
        # `_<n>.mp3` suffix is cut, so titles that contain underscores resolve correctly.
        path = row['audio']['path']
        folder = path.rsplit('_', 1)[0]
        # Try the Malaysian tree first, then fall back to the Singaporean one.
        candidate = os.path.join(audio_roots[0], folder, path)
        if not os.path.exists(candidate):
            candidate = os.path.join(audio_roots[1], folder, path)

        if os.path.exists(candidate):
            row['audio_filename'] = candidate.split('ssd3/')[1]
            row.pop('audio', None)
            row['prompt'] = p
            results.append(row)
    return results

In [67]:
# Smoke-test the collector on the first ten rows before the parallel run.
loop((range(10), 0))

Loading dataset from disk:   0%|          | 0/68 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 86.08it/s]


[{'transcription': 'Cara nak apply, macam Puteri kan time internship. So, Puteri punya keluar dekat group internship, aa, dia keluar satu form.',
  'gender': 'female',
  'country': 'malaysian',
  'utterance_pitch_mean': 259.931396484375,
  'utterance_pitch_std': 46.01287841796875,
  'snr': 41.81050491333008,
  'c50': 59.3415641784668,
  'speech_duration': 7.661250114440918,
  'stoi': 0.9756626486778259,
  'si-sdr': 20.618106842041016,
  'pesq': 3.326802968978882,
  'pitch': 'moderate pitch',
  'speaking_rate': 'quite slowly',
  'noise': 'moderate ambient sound',
  'reverberation': 'very confined sounding',
  'speech_monotony': 'very monotone',
  'sdr_noise': 'almost no noise',
  'audio_filename': 'malaysian-podcast_processed_44k/Cara Nak Apply Student Exchange [vFhLEniT9X8]/Cara Nak Apply Student Exchange [vFhLEniT9X8]_0.mp3',
  'prompt': 'A Malaysian woman delivers a very monotone speech with a moderate pitch, speaking quite slowly in a very confined and almost noise-free environment.

In [68]:
# Collect the joined rows in parallel; `loop` here is the collector defined just above.
results = mp.multiprocessing(range(len(dataset_[0]['train'])), loop, cores = 10)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27434/27434 [05:00<00:00, 91.26it/s]
 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████  | 27024/27434 [05:02<00:06, 61.93it/s]

Loading dataset from disk:   0%|          | 0/68 [00:00<?, ?it/s]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27434/27434 [05:02<00:00, 90.77it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 134.27it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27434/27434 [05:03<00:00, 90.52it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27434/27434 [05:03<00:00, 90.39it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27434/27434 [05:05<00:00, 89.70it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [69]:
# Rows kept after the description, repetition and audio-path checks.
len(results)

235405

In [73]:
# Inspect the first final record.
results[0]

{'transcription': 'Cara nak apply, macam Puteri kan time internship. So, Puteri punya keluar dekat group internship, aa, dia keluar satu form.',
 'gender': 'female',
 'country': 'malaysian',
 'utterance_pitch_mean': 259.931396484375,
 'utterance_pitch_std': 46.01287841796875,
 'snr': 41.81050491333008,
 'c50': 59.3415641784668,
 'speech_duration': 7.661250114440918,
 'stoi': 0.9756626486778259,
 'si-sdr': 20.618106842041016,
 'pesq': 3.326802968978882,
 'pitch': 'moderate pitch',
 'speaking_rate': 'quite slowly',
 'noise': 'moderate ambient sound',
 'reverberation': 'very confined sounding',
 'speech_monotony': 'very monotone',
 'sdr_noise': 'almost no noise',
 'audio_filename': 'malaysian-podcast_processed_44k/Cara Nak Apply Student Exchange [vFhLEniT9X8]/Cara Nak Apply Student Exchange [vFhLEniT9X8]_0.mp3',
 'prompt': 'A Malaysian woman delivers a very monotone speech with a moderate pitch, speaking quite slowly in a very confined and almost noise-free environment.'}

In [81]:
# Write the final records (metadata, labels, description; no audio) to a single parquet file.
pd.DataFrame(results).to_parquet('malaysian-emilia-podcast.parquet')

In [82]:
# Size of the exported parquet file.
!ls -lh malaysian-emilia-podcast.parquet

-rw-rw-r-- 1 husein husein 49M Nov  19 16:21 malaysian-emilia-podcast.parquet


In [84]:
from huggingface_hub import HfApi
# Upload the parquet file to the Hugging Face dataset repository under the same file name.
api = HfApi()
api.upload_file(
    path_or_fileobj="malaysian-emilia-podcast.parquet",
    path_in_repo="malaysian-emilia-podcast.parquet",
    repo_id="mesolitica/Malaysian-Emilia-annotated",
    repo_type="dataset",
)

malaysian-emilia-podcast.parquet:   0%|          | 0.00/50.9M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Malaysian-Emilia-annotated/commit/dc338f1fbd66d3eb57a670c6bccf12159f7f9b7a', commit_message='Upload malaysian-emilia-podcast.parquet with huggingface_hub', commit_description='', oid='dc338f1fbd66d3eb57a670c6bccf12159f7f9b7a', pr_url=None, pr_revision=None, pr_num=None)