In [1]:
import pickle
from glob import glob

# Collect the processed ASR pickles from both output directories into one sorted list.
pickle_patterns = (
    '/home/husein/processed-youtube-asr-whisper-large-v2/*.pkl',
    '/home/husein/ssd2/processed-youtube-asr-whisper-large-v3/*.pkl',
)
files = sorted(path for pattern in pickle_patterns for path in glob(pattern))
len(files)

24390

In [2]:
from datetime import datetime, timedelta
import pathlib

# Keep only the pickles whose modification time is strictly earlier than the cutoff.
# NOTE(review): the name `atleast` reads like a lower bound, yet the comparison keeps
# files *older* than it -- confirm which direction is intended.
atleast = datetime(2023, 8, 5)

filtered = [
    f for f in files
    if datetime.fromtimestamp(pathlib.Path(f).stat().st_mtime) < atleast
]

files = sorted(filtered)
len(files)

17414

In [3]:
from datasets import Audio
from tqdm import tqdm
import numpy as np
import IPython.display as ipd

In [4]:
# Sampling rate; used below to convert segment offsets (seconds) into sample indices.
sr = 16000
# `datasets.Audio` feature built with that sampling rate; used below to decode audio files into arrays.
reader = Audio(sampling_rate = sr)

In [5]:
# Load the first pickle so its structure can be inspected in the next cells.
# NOTE(review): pickle.load can execute arbitrary code -- only open pickles from a trusted source.
with open(files[0], 'rb') as fopen:
    data = pickle.load(fopen)

In [6]:
# Compare the two transcripts stored at asr_model[1] and asr_model[2] (presumably the
# Malay and English decodes, going by the variable names -- confirm) and keep the
# segments of whichever has the higher mean `avg_logprob`.
ms_segments = data[0]['asr_model'][1]['segments']
en_segments = data[0]['asr_model'][2]['segments']

ms_score = [segment['avg_logprob'] for segment in ms_segments]
en_score = [segment['avg_logprob'] for segment in en_segments]

selected = ms_segments if np.mean(ms_score) > np.mean(en_score) else en_segments

In [7]:
# Pull start, end and text of the i-th selected segment for a quick look.
i = 0

segment = selected[i]
s, e, t = segment['start'], segment['end'], segment['text']
s, e, t

(0.0, 9.0, ' The Fashion Week.')

In [8]:
# Remap the recorded audio path to this machine, decode it with `reader`, then slice
# out the samples between the segment's start and end offsets.
f = data[0]['wav_data'].replace('/home/ubuntu/', '/home/husein/')
decoded = reader.decode_example(reader.encode_example(f))
audio = decoded['array']
start_sample, end_sample = int(s * sr), int(e * sr)
y = audio[start_sample:end_sample]

In [9]:
# Play back the sliced segment inside the notebook.
ipd.Audio(y, rate = sr)

In [10]:
import malaya_speech

2023-09-08 02:00:51.855651: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-08 02:00:52.303081: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-08 02:00:53.630295: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-09-08 02:00:53.630510: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not l

In [11]:
# WebRTC voice-activity detector from malaya_speech; it is called once per audio frame below.
vad = malaya_speech.vad.webrtc()

In [12]:
# Convert the slice to integer samples, split it into frames (the `30` is presumably a
# frame length in milliseconds -- confirm) and report the mean of the per-frame VAD outputs.
y_int = malaya_speech.astype.float_to_int(y)
frames_int = list(malaya_speech.utils.generator.frames(y_int, 30, sr))
speech_flags = [vad(frame) for frame in frames_int]
np.mean(speech_flags)

0.9833333333333333

In [13]:
import malaya
from malaya.text.normalization import cardinal

# Build the malaya tokenizer with every option listed here passed as False.
disabled_options = (
    'hypen', 'parliament', 'time', 'time_pukul',
    'temperature', 'distance', 'volume', 'duration',
    'weight', 'date', 'money',
)
tokenizer = malaya.tokenizer.Tokenizer(**dict.fromkeys(disabled_options, False))

In [14]:
import unicodedata
import re
import itertools

# Characters allowed in a cleaned transcript: space, lowercase a-z and digits 0-9.
# preprocessing_text replaces every character outside this list with a space.
vocabs = [" ", "a", "e", "n", "i", "t", "o", "u", "s", "k", "r", "l", "h", "d", "m", "g", "y", "b", "p", "w", "c", "f", "j", "v", "z", "0", "1", "x", "2", "q", "5", "3", "4", "6", "9", "8", "7"]

def preprocessing_text(string):
    """
    Reduce a transcript to the restricted character set listed in `vocabs`.

    The text is tokenized with the module-level `tokenizer`, re-joined with single
    spaces, lower-cased and NFC-normalised. Every character outside `vocabs` is
    replaced by a space, runs of spaces are squeezed and the ends trimmed, and
    finally each run of one repeated character is capped at two occurrences.

    Returns the cleaned string (possibly empty).
    """
    joined = ' '.join(tokenizer.tokenize(string))
    normalised = unicodedata.normalize('NFC', joined.lower())

    kept = []
    for ch in normalised:
        kept.append(ch if ch in vocabs else ' ')
    squeezed = re.sub(r'[ ]+', ' ', ''.join(kept)).strip()

    # Cap every run of identical characters at length two, e.g. 'aaaa' -> 'aa'.
    pieces = []
    for _, run in itertools.groupby(squeezed):
        pieces.append(''.join(run)[:2])
    return ''.join(pieces)

In [16]:
# Output directory that `loop` writes the extracted mp3 clips into; `du -hs` prints its current size on disk.
directory = '/home/husein/ssd3/stt/whisper-latest'
!du -hs {directory}

124G	/home/husein/ssd3/stt/whisper-latest


In [24]:
# !rm -rf {directory}
!mkdir {directory}

In [2]:
import torchaudio
import torch
import os

In [26]:
def loop(files):
    """
    Cut every selected ASR segment out of its source audio and save it as an mp3 clip.

    Parameters
    ----------
    files : tuple
        `(pickle_paths, outside_index)`. `pickle_paths` is the list of pickle files
        to process; `outside_index` becomes the filename prefix of every clip this
        call writes (presumably the chunk/worker id handed over by
        `mp.multiprocessing` -- confirm against mp.py).

    Returns
    -------
    list of dict
        One record per saved clip with the keys `file`, `text`, `cleaned`, `no`,
        `i`, `k`, `avg_logprob`, `no_speech_prob` and `vad`.

    Notes
    -----
    Processing is best-effort: an error on one pickle item is printed and that item
    is abandoned; an error while scoring a single segment is printed and only that
    segment is skipped. Relies on the notebook globals `reader`, `sr`, `vad`,
    `directory` and `preprocessing_text`.
    """
    files, outside_index = files
    dataset = []
    # Running clip counter; it advances even for a segment that fails scoring, so the
    # remaining clips keep the same filenames they would have had otherwise.
    index = 0

    for no in tqdm(range(len(files))):

        f = files[no]

        # NOTE(review): pickle.load can execute arbitrary code -- only feed trusted pickles.
        with open(f, 'rb') as fopen:
            data = pickle.load(fopen)

        for i in range(len(data)):

            try:
                # asr_model[0] maps language -> score; keep only items whose top language is ms or en.
                lang = max(data[i]['asr_model'][0], key=data[i]['asr_model'][0].get)

                if lang not in {'ms', 'en'}:
                    continue

                ms_score = [s['avg_logprob'] for s in data[i]['asr_model'][1]['segments']]
                en_score = [s['avg_logprob'] for s in data[i]['asr_model'][2]['segments']]

                # Use the transcript with the higher mean avg_logprob.
                if np.mean(ms_score) > np.mean(en_score):
                    selected = data[i]['asr_model'][1]['segments']
                else:
                    selected = data[i]['asr_model'][2]['segments']

                f_audio = data[i]['wav_data'].replace('/home/ubuntu/', '/home/husein/')
                if not os.path.exists(f_audio):
                    continue

                audio = reader.decode_example(reader.encode_example(f_audio))['array']
                for k in range(len(selected)):
                    s = selected[k]['start']
                    # Extend the segment end by 0.1 s before slicing.
                    e = selected[k]['end'] + 0.1
                    t = selected[k]['text']
                    cleaned_t = preprocessing_text(t)
                    if not len(cleaned_t):
                        continue

                    audio_path = os.path.join(directory, f'{outside_index}-{index}.mp3')
                    a = audio[int(s * sr): int(e * sr)]

                    # Score the clip and build its record BEFORE writing the mp3, so a
                    # failure here cannot leave an orphan file that has no dataset row.
                    try:
                        y_int = malaya_speech.astype.float_to_int(a)
                        frames_int = list(malaya_speech.utils.generator.frames(y_int, 30, sr, append_ending_trail = False))

                        # A clip shorter than one frame yields no frames; np.mean([]) would
                        # be NaN (emitted as non-standard JSON), so record 0.0 instead.
                        if frames_int:
                            vad_score = float(np.mean([vad(frame) for frame in frames_int]))
                        else:
                            vad_score = 0.0

                        record = {
                            'file': audio_path,
                            'text': t,
                            'cleaned': cleaned_t,
                            'no': no,
                            'i': i,
                            'k': k,
                            'avg_logprob': selected[k]['avg_logprob'],
                            'no_speech_prob': selected[k]['no_speech_prob'],
                            'vad': vad_score,
                        }
                    except Exception as err:
                        # Still best-effort, but no longer silent and no longer trapping KeyboardInterrupt.
                        print(f'{f_audio} [segment {k}]: skipped, scoring failed: {err}')
                        index += 1
                        continue

                    torchaudio.save(audio_path,
                            torch.tensor(a.astype('float32')).unsqueeze(0),
                            sr, format='mp3')
                    dataset.append(record)

                    index += 1
            except Exception as err:
                # Bound to `err` rather than `e`, which holds the segment end time above.
                print(f'{f} [item {i}]: {err}')

    return dataset

In [27]:
# a = loop((files[:1],0))

In [28]:
# !wget https://gist.githubusercontent.com/huseinzol05/98974ae8c6c7a65d4bc0af9f5003786a/raw/5aa5257608b61e8fcc828e99fbd070d5ca7358e3/mp.py
import mp

In [29]:
# Run `loop` over `files` through the helper in mp.py with cores = 10.
# presumably the helper splits `files` into chunks and passes each worker a (chunk, index)
# tuple, matching how `loop` unpacks its argument -- confirm against mp.py
a = mp.multiprocessing(files, loop, cores = 10)

 60%|████████████████████████████████████████████████████████████▍                                       | 1052/1741 [4:13:54<10:16:50, 53.72s/it]

[Errno 2] No such file or directory: '/home/husein/ssd2/processed-youtube-asr-whisper-large-v3/[LANGSUNG]_100_Hari_Kerajaan_Perpaduan'
[Errno 2] No such file or directory: '/home/husein/ssd2/processed-youtube-asr-whisper-large-v3/[LANGSUNG]_100_Hari_Kerajaan_Perpaduan'
[Errno 2] No such file or directory: '/home/husein/ssd2/processed-youtube-asr-whisper-large-v3/[LANGSUNG]_100_Hari_Kerajaan_Perpaduan'
[Errno 2] No such file or directory: '/home/husein/ssd2/processed-youtube-asr-whisper-large-v3/[LANGSUNG]_100_Hari_Kerajaan_Perpaduan'
[Errno 2] No such file or directory: '/home/husein/ssd2/processed-youtube-asr-whisper-large-v3/[LANGSUNG]_100_Hari_Kerajaan_Perpaduan'
[Errno 2] No such file or directory: '/home/husein/ssd2/processed-youtube-asr-whisper-large-v3/[LANGSUNG]_100_Hari_Kerajaan_Perpaduan'
[Errno 2] No such file or directory: '/home/husein/ssd2/processed-youtube-asr-whisper-large-v3/[LANGSUNG]_100_Hari_Kerajaan_Perpaduan'
[Errno 2] No such file or directory: '/home/husein/ssd2

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1741/1741 [4:49:25<00:00,  9.97s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [05:52<00:00, 88.07s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1741/1741 [4:58:18<00:00, 10.28s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1741/1741 [5:45:40<00:00, 11.91s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1741/1741 [6:04:42<00:00, 12.57s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1741/1741 [6:20:19<00:00, 13.11s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1741/174

In [31]:
# Inspect the first record returned by the run.
a[0]

{'file': '/home/husein/ssd3/stt/whisper-latest/0-0.mp3',
 'text': ' The Fashion Week.',
 'cleaned': 'the fashion week',
 'no': 0,
 'i': 0,
 'k': 0,
 'avg_logprob': -0.35195762232730265,
 'no_speech_prob': 0.06681658327579498,
 'vad': 0.9801980198019802}

In [33]:
import json

# Dump every record as one JSON object per line.
with open('dataset-whisper-stt-2023-06-19.jsonl', 'w') as jsonl_out:
    jsonl_out.writelines(f'{json.dumps(record)}\n' for record in tqdm(a))

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 7210208/7210208 [00:50<00:00, 142966.69it/s]


In [34]:
# Show the size of the JSONL file on disk.
!ls -lh dataset-whisper-stt-2023-06-19.jsonl

-rw-r--r-- 1 husein husein 2.1G Sep  10 10:29 dataset-whisper-stt-2023-06-19.jsonl


In [35]:
# Re-check the size of the clip directory after the run.
!du -hs {directory}

131G	/home/husein/ssd3/stt/whisper-latest


In [36]:
# Total number of records collected.
len(a)

7210208

In [43]:
# Inspect the third-from-last record.
a[-3]

{'file': '/home/husein/ssd3/stt/whisper-latest/10-5891.mp3',
 'text': ' M-Star, terbaik, terima kasih',
 'cleaned': 'm star terbaik terima kasih',
 'no': 3,
 'i': 18,
 'k': 8,
 'avg_logprob': -0.5078456566768622,
 'no_speech_prob': 0.1324165314435959,
 'vad': 0.0}

In [44]:
# Play the saved clip belonging to the third-from-last record.
ipd.Audio(a[-3]['file'], rate = sr)