In [1]:
import json
import pandas as pd
import os
from tqdm import tqdm
from glob import glob
from datasets import Dataset, Audio

audio = Audio(sampling_rate = 16000)

In [2]:
with open('instructions-keys.json') as fopen:
    instructions = json.load(fopen)

In [3]:
with open('fix-instructions-mixtral-multiturn.json') as fopen:
    mixtral_multiturn = json.load(fopen)

In [4]:
files = glob('short-coding-*.json')
files = [f for f in files if os.path.exists(f.replace('.json', ''))]
files

['short-coding-2.json', 'short-coding-0.json', 'short-coding-1.json']

In [5]:
filtered = []
for f in files:
    folder = f.replace('.json', '')
    with open(f) as fopen:
        data = json.load(fopen)
    
    for i, d in tqdm(enumerate(data)):
        filename = os.path.join(folder, f'{i}.mp3')
        if not os.path.exists(filename):
            continue
        
        d = {
            'prompt': json.dumps([
                {"role": "user", "content": [{"type": "audio", "audio_url": "audio.wav"}]},
                {'role': 'assistant', 'content': d['answer']},
            ]),
            'question': d['question'],
            'audio_filename': filename,
            'dataset': 'short-coding',
            'speaker': d['speaker']
        }
        filtered.append(d)
len(filtered)

18990it [00:00, 91460.81it/s] 
18991it [00:00, 117140.51it/s]
18991it [00:00, 83742.94it/s]


45415

In [6]:
files = glob('partition-instructions-part-*.json')
files = [f for f in files if os.path.exists(f.replace('.json', ''))]
files

['partition-instructions-part-7.json',
 'partition-instructions-part-16.json',
 'partition-instructions-part-8.json',
 'partition-instructions-part-0.json',
 'partition-instructions-part-2.json',
 'partition-instructions-part-15.json',
 'partition-instructions-part-4.json',
 'partition-instructions-part-5.json',
 'partition-instructions-part-6.json',
 'partition-instructions-part-9.json',
 'partition-instructions-part-14.json',
 'partition-instructions-part-10.json',
 'partition-instructions-part-3.json',
 'partition-instructions-part-11.json',
 'partition-instructions-part-1.json',
 'partition-instructions-part-13.json',
 'partition-instructions-part-17.json',
 'partition-instructions-part-12.json']

In [7]:
from collections import defaultdict

selected = []
already = defaultdict(set)
count = defaultdict(int)
for f in files:
    folder = f.replace('.json', '')
    with open(f) as fopen:
        data = json.load(fopen)
        
    for i, d in tqdm(enumerate(data)):
        filename = os.path.join(folder, f'{i}.mp3')
        if not os.path.exists(filename):
            continue
        
        d['prompt'] = json.dumps(d['prompt'])
        d['audio_filename'] = filename
        d['dataset'] = instructions.get(d['prompt'], 'unknown')
        if 'mixtral' in d['dataset'] and d['question'] in mixtral_multiturn:
            d['prompt'] = mixtral_multiturn[d['question']]
        
        count[d['dataset']] += 1
        
        q = d['question'].lower()
        if q in already[d['dataset']]:
            continue
        
        already[d['dataset']].add(q)
        filtered.append(d)
        
len(filtered)

30000it [00:00, 87804.87it/s]
30000it [00:00, 58165.87it/s]
30000it [00:00, 84969.50it/s]
30000it [00:00, 116673.19it/s]
30000it [00:00, 50024.84it/s]
30000it [00:00, 67247.16it/s] 
30000it [00:00, 80041.37it/s]
30000it [00:00, 142441.18it/s]
30000it [00:00, 177842.30it/s]
30000it [00:00, 87117.20it/s]
30000it [00:00, 91090.03it/s] 
30000it [00:00, 89711.98it/s]
30000it [00:00, 60648.23it/s]
30000it [00:00, 88069.07it/s]
30000it [00:00, 105905.80it/s]
30000it [00:00, 113934.27it/s]
30000it [00:01, 24961.49it/s]
30000it [00:00, 84347.12it/s]


464971

In [8]:
for k, v in already.items():
    print(k, len(v), count[k])

mixtral_critis_malaysia 88792 88831
unknown 489 495
force_jawi 23660 63930
mixtral_critis_politician 91244 91340
chatgpt4_malaysian_general_qa 26505 26573
malaysian_ultrachat 87266 89992
malaysian_alpaca 18243 18245
synthetic_coding 3282 3282
mixtral_conversation_stupid 43059 43506
mixtral_factually_wrong 35901 38166
force_tamil 40 40
force_mandarin 1075 1125


In [9]:
with open('tatabahasa.json') as fopen:
    tatabahasa = json.load(fopen)
    
for i, row in tqdm(enumerate(tatabahasa)):
    filename = os.path.join('tatabahasa', f'{i}.mp3')
    if not os.path.exists(filename):
        continue
    q = row['question']
    if 'IV' in q or 'II' in q:
        continue
    d = {
        'prompt': json.dumps([
            {"role": "user", "content": [{"type": "audio", "audio_url": "audio.wav"}]},
            {'role': 'assistant', 'content': row['answer']},
        ]),
        'question': row['question'],
        'audio_filename': filename,
        'dataset': 'tatabahasa',
        'speaker': row['speaker']
    }
    filtered.append(d)

1284it [00:00, 237581.01it/s]


In [10]:
filtered[-1]

{'prompt': '[{"role": "user", "content": [{"type": "audio", "audio_url": "audio.wav"}]}, {"role": "assistant", "content": "C. dengan, ke"}]',
 'question': 'Isi tempat kosong dalam ayat-ayat di bawah dengan jawapan yang paling sesuai.\nKami dijangka __________ terlewat sampai __________ Pulau Langkawi kerana kereta mengalami kerosakan.\n\nA. dari, di\nB. akan, di\nC. akan, dari\nD. dengan, ke',
 'audio_filename': 'tatabahasa/1283.mp3',
 'dataset': 'tatabahasa',
 'speaker': {'audio': 'dedup-parliament/parlimen-24k-LANGSUNG ： Persidangan Dewan Rakyat 11 November 2020 l Sesi Pagi [ImO1sg1QiG0]_000_25.mp3',
  'transcription': 'Dan kita ada satu kaji yang kita buat. Dia panggil model integrasi yang kita perkenalkan.'}}

In [11]:
with open('mallm.json') as fopen:
    mallm = json.load(fopen)
    
for i, row in tqdm(enumerate(mallm)):
    filename = os.path.join('mallm', f'{i}.mp3')
    if not os.path.exists(filename):
        continue
    q = row['question']
    if 'IV' in q or 'II' in q:
        continue
    d = {
        'prompt': json.dumps([
            {"role": "user", "content": [{"type": "audio", "audio_url": "audio.wav"}]},
            {'role': 'assistant', 'content': row['answer']},
        ]),
        'question': row['question'],
        'audio_filename': filename,
        'dataset': 'mallm',
        'speaker': row['speaker']
    }
    filtered.append(d)

6564it [00:00, 173054.49it/s]


In [12]:
len(filtered)

469477

In [17]:
pd.DataFrame(filtered).to_parquet('speech-instructions.parquet')

In [19]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj="speech-instructions.parquet",
    path_in_repo="without-audio/speech-instructions.parquet",
    repo_id="malaysia-ai/Speech-Instructions",
    repo_type="dataset",
)

speech-instructions.parquet:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/malaysia-ai/Speech-Instructions/commit/e7ea5cad290e2d44f57a898a08436b2576d315b9', commit_message='Upload without-audio/speech-instructions.parquet with huggingface_hub', commit_description='', oid='e7ea5cad290e2d44f57a898a08436b2576d315b9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/malaysia-ai/Speech-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='malaysia-ai/Speech-Instructions'), pr_revision=None, pr_num=None)

In [None]:
dataset = Dataset.from_list(filtered)

In [None]:
dataset = dataset.cast_column("audio_filename", audio)
dataset

In [None]:
dataset.push_to_hub('malaysia-ai/Speech-Instructions')