In [2]:
from glob import glob
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
import json
import os
import mp
import re

In [51]:
# Glob patterns for the transcript JSON files of every processed corpus,
# listed in the order their matches are concatenated into `files`.
json_patterns = [
    'malaysian-podcast_processed/**/*/*.json',
    'parlimen-24k-chunk_processed/**/*/*.json',
    'filtered-24k_processed/**/*/*.json',
    'klasik_processed/**/*/*.json',
    '/home/husein/ssd3/dialects_processed/**/*/*.json',
    '/home/husein/ssd3/sg-podcast_processed/**/*/*.json',
]
files = [
    path
    for json_pattern in json_patterns
    for path in glob(json_pattern, recursive = True)
]

len(files)

184723

In [52]:
# Transcripts that `loop` discards: a segment is skipped when its text,
# lower-cased and stripped of everything except a-z and spaces, is exactly
# equal to one of these strings.
rejected = ['terima kasih kerana menonton', 'terima kasih', 'thank you for watching']

In [53]:
def new_path(f):
    """
    Map an audio chunk path to the path of its alignment file.

    Every '.mp3' in `f` becomes '.alignment' and every '_processed/' becomes
    '_processed_alignment/'. Both substitutions are plain `str.replace`
    calls, so they apply to all occurrences in the string, not only to the
    file extension or the first directory.
    """
    substitutions = (
        ('.mp3', '.alignment'),
        ('_processed/', '_processed_alignment/'),
    )
    for old, new in substitutions:
        f = f.replace(old, new)
    return f

def loop(files, score_threshold = -15, max_trigram_count = 3):
    """
    Collect (audio, transcription) pairs that pass the text and alignment filters.

    For every transcript JSON file, each entry's ``"text"`` is kept only if:

    1. its normalised form (lower-cased, only a-z and spaces kept) is not in
       the module-level `rejected` list;
    2. a word-trigram count can be built and no trigram occurs more than
       `max_trigram_count` times (texts from which scikit-learn cannot build
       any trigram are dropped as well);
    3. the alignment file derived via `new_path` exists and none of its
       entries has ``score <= score_threshold``.

    The audio path is built as ``<folder>/<folder name>_<entry index>.mp3``;
    its existence is not checked here.

    Parameters
    ----------
    files : tuple
        ``(paths, second)`` pair. Only ``paths`` (an iterable of transcript
        JSON paths) is used; the second element is ignored.
        NOTE(review): presumably the worker index passed by
        `mp.multiprocessing` -- confirm against that helper.
    score_threshold : number, default -15
        Segments with any alignment entry scoring at or below this are dropped.
    max_trigram_count : int, default 3
        Segments where any trigram occurs more often than this are dropped.

    Returns
    -------
    list of dict
        ``{'audio': <mp3 path>, 'transcription': <stripped text>}`` per kept
        segment.

    Raises
    ------
    Whatever `open` / `json.load` raise for an existing but unreadable or
    malformed alignment file; that load is intentionally not guarded.
    """
    files, _ = files
    # `fit_transform` re-learns the vocabulary on every call, so a single
    # vectorizer can be reused for all segments instead of rebuilding it.
    vectorizer = CountVectorizer(ngram_range = (3,3))
    data = []
    for file in tqdm(files):
        folder = os.path.split(file)[0]
        folder_folder = os.path.split(folder)[1]

        try:
            with open(file) as fopen:
                d = json.load(fopen)
        except (OSError, ValueError):
            # Unreadable file, bad encoding or malformed JSON
            # (JSONDecodeError and UnicodeDecodeError are ValueErrors):
            # skip this transcript. KeyboardInterrupt is no longer swallowed.
            continue

        for no, obj in enumerate(d):
            text = obj["text"].strip()

            rt_ = re.sub('[^a-z ]+', '', text.lower()).strip()
            if rt_ in rejected:
                continue

            try:
                counts = vectorizer.fit_transform([text])
            except ValueError:
                # scikit-learn raises ValueError on an empty vocabulary,
                # i.e. when the text yields no trigram at all; such
                # segments are skipped, as before.
                continue
            if counts.max() > max_trigram_count:
                continue

            audio_path = os.path.join(folder, f'{folder_folder}_{no}.mp3')
            align_path = new_path(audio_path)

            if not os.path.exists(align_path):
                continue

            with open(align_path) as fopen:
                align = json.load(fopen)

            if any(a['score'] <= score_threshold for a in align):
                continue

            data.append({
                'audio': audio_path,
                'transcription': text,
            })

    return data

In [54]:
# Smoke-test `loop` in-process on the last 100 transcript files before the
# full parallel run; the 0 only fills the second tuple slot that `loop` ignores.
d = loop((files[-100:], 0))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 217.00it/s]


In [55]:
# Run `loop` over every transcript file through the local `mp` helper.
# NOTE(review): `mp.multiprocessing` is not shown in this notebook; presumably
# it splits `files` into chunks, calls `loop((chunk, index))` in 15 worker
# processes and concatenates the returned lists -- confirm in mp.py.
data = mp.multiprocessing(files, loop, cores = 15)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 12314/12314 [03:04<00:00, 66.74it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 97.79it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 12314/12314 [03:05<00:00, 66.27it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 12314/12314 [03:10<00:00, 64.73it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 12314/12314 [03:19<00:00, 61.64it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 12314/12314 [03:22<00:00, 60.69it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 12314/12314 [03:23<00:00, 60.

In [56]:
from collections import defaultdict

# Count kept segments per corpus, keyed by the part of the audio path that
# precedes the first '_processed'.
# The loop variable stays named `d` because a later cell reads `d['audio']`.
uniques = defaultdict(int)
for d in tqdm(data):
    corpus = d['audio'].partition('_processed')[0]
    uniques[corpus] += 1

uniques

100%|███████████████████████████████████████████████████████████████████████████████████████████| 1518322/1518322 [00:00<00:00, 2266598.82it/s]


defaultdict(int,
            {'malaysian-podcast': 60736,
             'parlimen-24k-chunk': 445376,
             'filtered-24k': 549279,
             'klasik': 6100,
             '/home/husein/ssd3/dialects': 441602,
             '/home/husein/ssd3/sg-podcast': 15229})

In [48]:
# Drop the file name and then the last directory component from the audio
# path held in `d` (the last record iterated), leaving the corpus root folder.
'/'.join(os.path.split(d['audio'])[0].split('/')[:-1])

'/home/husein/ssd3/sg-podcast_processed'

In [44]:
# Eyeball the last 100 kept records (audio path + transcription).
data[-100:]

[{'audio': '/home/husein/ssd3/sg-podcast_processed/Let’s talk about infertility： Pregnancy, Miscarriages and Treatments ｜ Head Over Heels #46 [eBq6076W-SY]/Let’s talk about infertility： Pregnancy, Miscarriages and Treatments ｜ Head Over Heels #46 [eBq6076W-SY]_16.mp3',
  'transcription': 'Found out so many friends actually had miscarriages but they kept it down low and then we had to'},
 {'audio': '/home/husein/ssd3/sg-podcast_processed/Let’s talk about infertility： Pregnancy, Miscarriages and Treatments ｜ Head Over Heels #46 [eBq6076W-SY]/Let’s talk about infertility： Pregnancy, Miscarriages and Treatments ｜ Head Over Heels #46 [eBq6076W-SY]_17.mp3',
  'transcription': 'As doctor Google, as other doctors, um, what can we do? How do we, how do we get over this?'},
 {'audio': '/home/husein/ssd3/sg-podcast_processed/Let’s talk about infertility： Pregnancy, Miscarriages and Treatments ｜ Head Over Heels #46 [eBq6076W-SY]/Let’s talk about infertility： Pregnancy, Miscarriages and Treatments 

In [57]:
# Total number of segments that passed every filter in `loop`.
len(data)

1518322

In [60]:
# Inspect the first kept record.
data[0]

{'audio': 'malaysian-podcast_processed/Super. Sunday： Brand Jahat Local [hQkCidjHoVM]/Super. Sunday： Brand Jahat Local [hQkCidjHoVM]_1.mp3',
 'transcription': 'Ah, benda tu kalau jadi, memang, satu eksperimen yang besar, melibatkan kos yang besar, tapi, impact kita tak tahu, kita just nak buat satu benda gila lah, Voltron dengan Super Sunday. Insya Allah, tahun ni kalau sempat lah.'}

In [61]:
import IPython.display as ipd
# Render an inline audio player for the first record, to compare the clip
# against its transcription by ear.
ipd.Audio(data[0]['audio'])

In [62]:
import pandas as pd

# `data` is a list of {'audio', 'transcription'} dicts as built by `loop`,
# so the frame gets one row per kept segment with those two columns.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,audio,transcription
0,malaysian-podcast_processed/Super. Sunday： Bra...,"Ah, benda tu kalau jadi, memang, satu eksperim..."
1,malaysian-podcast_processed/Macamana Pramugari...,"Okay, I sebenarnya macam ni. Masa I lapan bela..."
2,malaysian-podcast_processed/Macamana Pramugari...,"Haa, lepas tu, kawan I. Yang kita pergi interv..."
3,malaysian-podcast_processed/Macamana Pramugari...,"Ada interview lagi, I tengok. I buka newspaper..."
4,malaysian-podcast_processed/Macamana Pramugari...,"ajar kat Tok I, boleh tak nak pergi dekat ruma..."


In [63]:
# Persist the filtered pairs to a parquet file in the current working directory.
df.to_parquet('verify-text.parquet')

In [64]:
# Check that the parquet file was written and how large it is.
!ls -lh verify-text.parquet

-rw-rw-r-- 1 husein husein 134M Jan   4 09:04 verify-text.parquet


In [65]:
from huggingface_hub import HfApi
# Upload the local parquet file into the dataset repository under data/.
# NOTE(review): no token is passed here, so this relies on credentials that
# huggingface_hub can already find (presumably a prior login or an
# environment token) -- confirm before re-running on another machine.
api = HfApi()
api.upload_file(
    path_or_fileobj="verify-text.parquet",
    path_in_repo="data/verify_text-00000-of-00001.parquet",
    repo_id="mesolitica/Malaysian-Voice-Conversion",
    repo_type="dataset",
)

verify-text.parquet:   0%|          | 0.00/140M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Malaysian-Voice-Conversion/commit/6be829ab3e54c94438dccd6098ad27dbc07c2131', commit_message='Upload data/verify_text-00000-of-00001.parquet with huggingface_hub', commit_description='', oid='6be829ab3e54c94438dccd6098ad27dbc07c2131', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Malaysian-Voice-Conversion', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Malaysian-Voice-Conversion'), pr_revision=None, pr_num=None)