In [1]:
# !wget https://huggingface.co/datasets/huseinzol05/Malay-TTS-Yasmin/resolve/main/populated-text.json
# !wget https://huggingface.co/datasets/huseinzol05/Malay-TTS-Yasmin/resolve/main/populated-parliament.json

In [2]:
import numpy as np
import os
import random
import IPython.display as ipd

# Environment flags are set here, before malaya is imported in a later cell.
# NOTE(review): an empty CUDA_VISIBLE_DEVICES presumably forces CPU-only execution — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = ''
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

In [3]:
import re

# Multi-character special tokens; they occupy the first three vocabulary slots.
_pad = 'pad'
_start = 'start'
_eos = 'eos'
_punctuation = "!'(),.:;? "
_special = '-'
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_numbers = '0123456789'

# Vocabulary used by tts_encode: special tokens, '-', punctuation, then letters.
# Digits (_numbers) are NOT part of it, so tts_encode drops any digit left in the text.
MALAYA_SPEECH_SYMBOLS = (
    [_pad, _start, _eos] + list(_special) + list(_punctuation) + list(_letters)
)

# Characters accepted as the first character of a cleaned sentence (see cleaning()).
INITIAL_SYMBOLS = list(_letters) + list(_numbers)

In [4]:
import json

# Main corpus; later cells read each entry's 'cleaned' field by integer position.
with open('populated-text.json') as fopen:
    texts = json.load(fopen)
    
# Parliament corpus; consumed the same way further down.
with open('populated-parliament.json') as fopen:
    parliament = json.load(fopen)

In [5]:
def tts_encode(string: str, add_eos: bool = True):
    """Map a string to a list of MALAYA_SPEECH_SYMBOLS indices.

    Characters that are not in the vocabulary are skipped silently. When
    `add_eos` is true the index of the 'eos' token is appended at the end.
    """
    ids = []
    for ch in string:
        if ch in MALAYA_SPEECH_SYMBOLS:
            ids.append(MALAYA_SPEECH_SYMBOLS.index(ch))
    if add_eos:
        ids = ids + [MALAYA_SPEECH_SYMBOLS.index('eos')]
    return ids

In [6]:
from unidecode import unidecode
from malaya.text.normalization import digit, cardinal
import malaya

# Shared malaya normalizer instance; cleaning() calls normalizer.normalize(...) on every sentence.
normalizer = malaya.normalize.normalizer()

def put_spacing_num(string):
    """Surround every run of ASCII letters with spaces, then squeeze spaces.

    The effect is that letters glued to other characters (e.g. digits) get
    separated; the result is stripped of leading/trailing whitespace.
    """
    def _pad_word(match):
        return ' ' + match[0] + ' '

    spaced = re.sub('[A-Za-z]+', _pad_word, string)
    squeezed = re.sub(r'[ ]+', ' ', spaced)
    return squeezed.strip()

def convert_to_ascii(string):
    """Return `string` passed through `unidecode`."""
    ascii_text = unidecode(string)
    return ascii_text

def collapse_whitespace(string):
    """Replace every run of whitespace in `string` with a single space.

    The original body referenced `_whitespace_re`, which is never defined in
    this notebook, so calling the function raised NameError. The pattern is
    now spelled inline.
    NOTE(review): `\\s+` is assumed to be the intended pattern — confirm
    against the module this helper was copied from.
    Leading/trailing whitespace is collapsed to one space, not stripped.
    """
    return re.sub(r'\s+', ' ', string)

def put_spacing(string, chars = '()-'):
    """Pad each occurrence of every character in `chars` with one space on both sides.

    Characters are processed one after another, in the order given; no
    whitespace squeezing is done here.
    """
    for symbol in chars:
        # split/join on a single character is equivalent to str.replace
        string = f' {symbol} '.join(string.split(symbol))
    return string

# Substitutions applied by cleaning() right after ASCII conversion, before any number handling.
# replace_chars pads every replacement with spaces, so an empty value yields two spaces.
before = {';': ',', '_': '', '=': 'sama dengan', '*': 'asterisk',
          "'": '', '~': '', '`': ''}

# Substitutions applied by cleaning() after normalizer.normalize(...) has run.
after = {'/': 'garis miring'}

def replace_chars(string, chars):
    """Apply each `old -> new` pair from the `chars` mapping to `string`.

    Pairs are applied sequentially in the mapping's iteration order and each
    replacement value is wrapped in a single space on both sides.
    """
    result = string
    for target in chars:
        padded = f' {chars[target]} '
        result = result.replace(target, padded)
    return result

# Each entry is (regex, separator to split the match on, spoken replacement for the separator);
# consumed by fix_pattern_num. Number/number uses 'garis miring', number-number uses nothing.
patterns_num = [(r"\b\d+(?:[\.,']\d+)?\b\/\b\d+(?:[\.,']\d+)?\b", '/', 'garis miring'),
           (r"\b\d+(?:[\.,']\d+)?\b\-\b\d+(?:[\.,']\d+)?\b", '-', '')]

# NOTE(review): not referenced by the visible code — fix_rm builds its own local patterns.
pattern_rm = r"RM \b\d+(?:[\.,']\d+)?\b (?:ribu|puluh|juta)"

# Word-order swaps applied last by cleaning() via replace_chars (which pads each replacement with spaces).
replaces = {'dollar bilion': 'bilion dollar', 'dollar ribu': 'ribu dollar', 'dollar juta': 'juta dollar'}

def fix_pattern_num(string):
    """Spell out `number<sep>number` spans listed in `patterns_num`.

    Both sides of the separator are converted with `digit` and the separator
    itself is replaced by the spoken form stored in the pattern entry. The
    result has runs of spaces squeezed and is stripped.
    """
    for regex, separator, spoken in patterns_num:
        for found in re.findall(regex, string):
            left, right = found.split(separator)
            string = string.replace(found, f'{digit(left)} {spoken} {digit(right)}')
    return re.sub(r'[ ]+', ' ', string).strip()

def fix_dash_num(string):
    """Spell out `-<digits>` tokens, reading the leading dash as 'dash'.

    The token is converted with `cardinal` and the word 'negatif' in its
    output is swapped for 'dash'. Spaces are squeezed and the result stripped.
    """
    for token in re.findall(r"-\d+", string):
        spoken = cardinal(token).replace('negatif', 'dash')
        string = string.replace(token, f" {spoken} ")
    return re.sub(r'[ ]+', ' ', string).strip()

def fix_num_dash(string):
    """Spell out `<digits>-` tokens, dropping the trailing dash.

    The digits are converted with `cardinal`; spaces are then squeezed and
    the result stripped.
    """
    for token in re.findall(r"\d+-", string):
        number = token.replace('-', '')
        string = string.replace(token, f" {cardinal(number)} ")
    return re.sub(r'[ ]+', ' ', string).strip()

def cleaning(string, add_eos = False):
    """Normalise one sentence for TTS and encode it.

    Runs the text through the fix_* helpers in a fixed order, then the malaya
    normalizer, then trims/repairs the sentence boundaries.

    Returns a tuple `(normalized_string, tts_encode(normalized_string))`.

    Raises IndexError when the text is empty (or a single character) by the
    time the boundary checks run; `process` relies on catching that.

    Note: fix_zeros, fix_dash_date, fix_date, fix_1900, fix_isbn and fix_rm
    are defined in later cells, so those cells must be executed before this
    function is called.
    """
    # NOTE(review): `sequence` is never used below.
    sequence = []
    string = convert_to_ascii(string)
    string = replace_chars(string, before)
    # Number/date rewrites; the order matters because each step rewrites text
    # that later regexes would otherwise match.
    string = fix_zeros(string)
    string = fix_dash_date(string)
    string = fix_date(string)
    string = fix_1900(string)
    string = fix_isbn(string)
    string = fix_pattern_num(string)
    string = fix_dash_num(string)
    string = fix_num_dash(string)
    string = fix_rm(string)
    string = put_spacing(string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    string = normalizer.normalize(string, normalize_text = False)['normalize']
    string = replace_chars(string, after)
    string = replace_chars(string, replaces)

    # Drop a dangling '-' or ',' at the very end.
    if string[-1] in '-,':
        string = string[:-1]
    # If the second-to-last character is punctuation, drop the last two characters.
    if string[-2] in '-,!:;':
        string = string[:-2]
    # Always end with a full stop.
    if string[-1] != '.':
        string = string + '.'
    # Drop a leading character that is not an ASCII letter or digit (one character only).
    if string[0] not in INITIAL_SYMBOLS:
        string = string[1:]
    string = put_spacing_num(string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    return string, tts_encode(string, add_eos = add_eos)

In [7]:
texts[22053]['cleaned']

'Pementasan Laskar Bayaran pada Jumat - Sabtu (25-26/8/2017) mengisahkan sebuah koloni yang dikuasai korporasi global bernama Paradize Capitol Corporation .'

In [8]:
def fix_dash_date(string):
    """Spell out day ranges written as `d-d/m/y` (e.g. `25-26/8/2017`).

    '-' is read as 'dash' and '/' as 'garis miring'; every remaining token of
    the span is passed through `cardinal`.
    """
    for found in re.findall(r'\b\d+-\d+\/\d+\/\d+', string):
        spaced = found.replace('-', ' dash ').replace('/', ' garis miring ')
        spaced = re.sub(r'[ ]+', ' ', spaced).strip()
        words = [cardinal(token) for token in spaced.split()]
        string = string.replace(found, ' '.join(words))
    return string
        
    
fix_dash_date(texts[22053]['cleaned'])

'Pementasan Laskar Bayaran pada Jumat - Sabtu (dua puluh lima dash dua puluh enam garis miring lapan garis miring dua ribu tujuh belas) mengisahkan sebuah koloni yang dikuasai korporasi global bernama Paradize Capitol Corporation .'

In [9]:
# NOTE(review): not referenced anywhere else in the visible notebook — presumably text
# fragments marking sentences to exclude; confirm intended use or remove.
rejected = ['Coppa Italia', 'Pak Ramli memerlukan']

In [10]:
def _rm_amount_to_words(number, bracket_follows, unit = ''):
    """Spell one ringgit amount; private helper of fix_rm.

    number: the numeric token that followed 'RM' (e.g. '8.10').
    bracket_follows: True when the match is immediately followed by '(' or '/';
        the amount is then read part by part and 'RM' stays in front.
    unit: optional magnitude word (ribu/puluh/juta/bilion) that followed the number.
    """
    if bracket_follows:
        spoken = ' , '.join(cardinal(part) for part in number.split('.'))
    else:
        spoken = cardinal(number)
    # A decimal ending in '0' gets an explicit trailing 'kosong'.
    if number[-1] == '0' and '.' in number:
        spoken = f'{spoken} kosong'
    if unit:
        spoken = f'{spoken} {unit}'
    if bracket_follows:
        return f'RM {spoken}'.replace('perpuluhan', ',')
    return f'{spoken} RM'


def fix_rm(string):
    """Spell out ringgit amounts written as `RM <number>[ <unit>]`.

    Amounts followed by a magnitude word (ribu, puluh, juta, bilion) are
    collected first, then plain `RM <number>` amounts; replacements are applied
    in that order so the longer compound form wins. Normally 'RM' is moved
    after the spoken amount; when the match is directly followed by '(' or '/'
    it stays in front. Spaces are squeezed and the result stripped.

    Fix: the trailing-zero check for compound amounts used to look at
    `t[-2][-1]`, which is always the padding space, so 'kosong' was never
    appended there. It now inspects the number token, exactly as the plain
    branch does. The unused `compound` list was removed.
    """
    # Padding guarantees that "the character right after the match" always exists.
    padded = string + ' .'
    replacements = {}

    compound_pattern = r"RM \b\d+(?:[\.,']\d+)?(?:[\.,']\d+)?\b (?:ribu|puluh|juta|bilion)"
    for found in re.findall(compound_pattern, string):
        parts = found.split()
        bracket_follows = padded[string.find(found) + len(found)] in '(/'
        replacements[found] = _rm_amount_to_words(parts[-2], bracket_follows, parts[-1])

    plain_pattern = r"RM \b\d+(?:[\.,']\d+)?(?:[\.,']\d+)?\b"
    for found in re.findall(plain_pattern, string):
        parts = found.split()
        bracket_follows = padded[string.find(found) + len(found)] in '(/'
        replacements[found] = _rm_amount_to_words(parts[-1], bracket_follows)

    for original, spoken in replacements.items():
        string = string.replace(original, spoken)

    return re.sub(r'[ ]+', ' ', string).strip()

fix_rm('KK RM 12.00; KN RM 8.10.')

'KK dua belas perpuluhan kosong kosong RM; KN lapan perpuluhan satu kosong RM.'

In [11]:
def fix_1900_year(string):
    """Read `19xx-<digits>` spans digit by digit, joining the two sides with 'dash'."""
    for span in re.findall(r'19\d\d-\d+', string):
        spoken_parts = [digit(part) for part in span.split('-')]
        string = string.replace(span, ' dash '.join(spoken_parts))
    return string

def fix_1900(string):
    """Spell 19xx years as 'sembilan belas' plus the last two digits.

    `19xx-<digits>` ranges are handled first by fix_1900_year. After that,
    every remaining `19dd` match is replaced: '00' becomes 'ratus', a leading
    zero in the last two digits is read as 'kosong', anything else goes
    through `cardinal`.
    """
    string = fix_1900_year(string)
    for year in re.findall(r'19\d\d', string):
        suffix = year[-2:]
        if suffix == '00':
            spoken = 'sembilan belas ratus'
        elif suffix[0] == '0':
            spoken = 'sembilan belas ' + ('kosong ' + cardinal(year[-1]))
        else:
            spoken = 'sembilan belas ' + cardinal(suffix)
        string = string.replace(year, spoken)
    return string

fix_1900('1902-'), fix_1900_year('1978-1978')

('sembilan belas kosong dua-',
 'satu sembilan tujuh lapan dash satu sembilan tujuh lapan')

In [12]:
def fix_isbn(string):
    """Read ISBN numbers digit by digit, with '-' spoken as 'dash'.

    A run of digits/dashes counts as an ISBN when the word right before its
    first occurrence in `string` is 'isbn' (case-insensitive).

    Changes from the previous version:
    * the bare `except: pass` is gone; "no preceding word" is checked explicitly;
    * a number at index 0 no longer makes the prefix slice wrap around to
      `string[:-1]`, which could mistake a trailing 'isbn' word for a
      preceding one.
    """
    isbn_numbers = []
    for candidate in re.findall(r'[0-9\-]+', string):
        start = string.find(candidate)
        # Skip the character right before the match, but never go negative.
        preceding_words = string[:max(start - 1, 0)].split()
        if preceding_words and preceding_words[-1].lower() == 'isbn':
            isbn_numbers.append(candidate)
    for number in isbn_numbers:
        spoken = ' dash '.join(digit(part) for part in number.split('-'))
        string = string.replace(number, spoken)
    return string
    
fix_isbn(texts[1122]['cleaned'])

'ISBN kosong lapan kosong tiga dua lapan tujuh lapan tiga enam * Pollack , Kenneth (2005).'

In [13]:
def fix_zeros(string):
    """Read tokens that start with a run of zeros ending at a word boundary digit by digit.

    The text is split on whitespace and re-joined with single spaces, so
    whitespace is normalised as a side effect.
    """
    tokens = string.split()
    spoken = [digit(tok) if re.search(r'^(0+)\b', tok) else tok for tok in tokens]
    return ' '.join(spoken)

fix_zeros(texts[2521]['cleaned']), fix_zeros('RM 50,000')

("kosong kosong kosong kosong kosong kosong 1000000 muaylaan muoy lean muay lan Dari Bahasa Thai laan '' lan ''.",
 'RM 50,000')

In [33]:
# Month-name lookup from malaya; fix_date indexes it with the integer month number.
bulan = malaya.text.tatabahasa.bulan

def fix_date(string):
    """Spell out dates found by malaya's `_short_date` and `_date` regexes.

    Short dates are handled by separator:
    * 'd/m/y' or 'm/y'  -> day (cardinal), month name from `bulan`, year;
    * 'd.m.y'           -> the three parts, comma-separated when the year is
                           not two digits;
    * 'd-m-y'           -> the three parts joined with 'dash'.
    Parts with a leading zero are read digit by digit. Matches preceded by '-'
    are left untouched. Afterwards every `_date` match is spelled token by
    token with `cardinal`.

    NOTE(review): the exact shapes matched depend on malaya.text.regex, which
    is not visible here. `int(...)` raises ValueError on a non-numeric part;
    the parliament run further down logs "invalid literal for int()" errors
    that possibly originate here — confirm.
    """
    short_date = re.findall(malaya.text.regex._short_date, string)
    if len(short_date):
        for r in short_date:
            # Character right before the first occurrence of the match; when the
            # match starts at index 0 this reads index -1, i.e. the last character.
            if string[string.find(r) - 1] == '-':
                continue
            if '/' in r:
                splitted = r.split('/')
                if len(splitted) == 3:
                    # A middle part above 12 cannot be a month: leave the text as is.
                    if int(splitted[1]) > 12:
                        continue
                    day = cardinal(splitted[0])
                    month = bulan[int(splitted[1])].title()
                    year = cardinal(fix_1900(splitted[2]))
                else:
                    # Otherwise treated as month/year; the day is left empty, so the
                    # replacement starts with a space.
                    if int(splitted[0]) > 12:
                        continue
                    day = ''
                    month = bulan[int(splitted[0])].title()
                    year = cardinal(fix_1900(splitted[1]))
                string = string.replace(r, f'{day} {month} {year}')
            elif '.'in r:
                splitted = r.split('.')
                if len(splitted) == 3:
                    if len(splitted[-1]) == 2:
                        # Two-digit year: plain space-separated reading.
                        day = digit(splitted[0]) if splitted[0][0] == '0' else cardinal(splitted[0])
                        month = digit(splitted[1]) if splitted[1][0] == '0' else cardinal(splitted[1])
                        year = digit(splitted[2]) if splitted[2][0] == '0' else cardinal(splitted[2])
                        s = f'{day} {month} {year}'
                    else:
                        # Longer year: goes through fix_1900 and parts are comma-separated.
                        day = digit(splitted[0]) if splitted[0][0] == '0' else cardinal(splitted[0])
                        month = digit(splitted[1]) if splitted[1][0] == '0' else cardinal(splitted[1])
                        year = cardinal(fix_1900(splitted[2]))
                        s = f'{day} , {month} , {year}'
                    string = string.replace(r, s)
            elif '-' in r:
                splitted = r.split('-')
                if len(splitted) == 3:
                    # Both branches join with 'dash'; they differ only in how the year is read.
                    if len(splitted[-1]) == 2:
                        day = digit(splitted[0]) if splitted[0][0] == '0' else cardinal(splitted[0])
                        month = digit(splitted[1]) if splitted[1][0] == '0' else cardinal(splitted[1])
                        year = digit(splitted[2]) if splitted[2][0] == '0' else cardinal(splitted[2])
                        s = f'{day} dash {month} dash {year}'
                    else:
                        day = digit(splitted[0]) if splitted[0][0] == '0' else cardinal(splitted[0])
                        month = digit(splitted[1]) if splitted[1][0] == '0' else cardinal(splitted[1])
                        year = cardinal(fix_1900(splitted[2]))
                        s = f'{day} dash {month} dash {year}'
                    string = string.replace(r, s)
                    
                
    # Second pass: `_date` matches, spelled token by token.
    date = re.findall(malaya.text.regex._date, string)
    if len(date):
        for r in date:
            if string[string.find(r) - 1] == '-':
                continue
            string = string.replace(r, ' '.join([cardinal(i) for i in r.split()]))
    return string

(fix_date('7/2014'), 
 fix_date('1/6/2014'), 
 fix_date('16 April 2013'), 
 fix_date('01.05.1991'), 
 fix_date('14.11.94'), 
 fix_date('1-12-1989'),
 fix_date('1-12-01'),
 fix_date('-1-12-01')
)

(' Julai dua ribu empat belas',
 'satu Jun dua ribu empat belas',
 'enam belas April dua ribu tiga belas',
 'kosong satu , kosong lima , sembilan belas sembilan puluh satu',
 'empat belas sebelas sembilan puluh empat',
 'satu dash dua belas dash sembilan belas lapan puluh sembilan',
 'satu dash dua belas dash kosong satu',
 '-1-12-01')

In [17]:
# Exploratory scan: print every sentence matching an ad-hoc regex with its cleaned form.
# NOTE(review): `txts` is only built in a later cell (`txts = [...]`), so on a fresh
# kernel this cell fails unless that cell is run first.
for t in txts:
    # r = re.findall(malaya.text.regex._date, t[1])
    r = re.findall(r'\b\d\d\/\d\d\/ \b', t[1])
    if len(r):
        print(t[2])
        print(t[1])
        print(cleaning(t[1])[0])
        print(r)
        print()

In [18]:
# Spot check: for 50 random entries print the index, the raw text and the cleaned text.
# No random seed is set, so the sample differs on every run.
for i in random.sample(txts, 50):
    print(i[-1])
    print(i[-2])
    print(cleaning(i[-2])[0])
    print()

32490
Proses kajian semula ini sedang dijalankan oleh Kementerian Luar Negeri dengan kerjasama Kementerian dan Agensi 5 .
Proses kajian semu lah ini sedang dijalankan oleh Kementerian Luar Negeri dengan kerjasama Kementerian dan Agensi lima .

50174
Bidang - bidang yang terlibat adalah perubatan , syariah dan hal ehwal Islam ( Mufti ), perkhidmatan tadbir dan diplomatik , penerbangan awam serta pendidikan .
Bidang - bidang yang terlibat adalah perubatan , syariah dan hal ehwal Islam ( Mufti ) , perkhidmatan tadbir dan diplomatik , penerbangan awam serta pendidikan .

36145
timbul isu lMDB sudah muflis .
timbul isu yang kekosong sudah muflis .

12060
KPM yakin kerajaan tidak akan mengabaikan penyediaan peruntukan penyelenggaraan .
KPM yakin kerajaan tidak akan mengabaikan penyediaan peruntukan penyelenggaraan .

17772
Selain penetapan sasaran KPI MKRA YB .
Selain penetapan sasaran KPI MKRA YB .

38211
Pengagihan secara baucar pula bermula pada 23 Februari dan dibuat secara berperingkat 

In [50]:
from tqdm import tqdm

def process(txts):
    """Normalise one chunk of `(wav_path, text, index)` tuples.

    `txts` is a 1-tuple whose only element is the list of entries. Entries
    whose cleaning raises are reported with `print(index, error)` and left
    out of the result.

    Returns a list of dicts with keys 'index', 'text' and 'normalized'.
    """
    records = txts[0]
    output = []

    for item in tqdm(records):
        index, text = item[2], item[1]
        try:
            output.append({'index': index, 'text': text, 'normalized': cleaning(text)[0]})
        except Exception as e:
            print(index, e)

    return output

In [40]:
cleaning(txts[12687][-2])[0]

'tiga belas garis miring sembilan belas sembilan puluh dua yang menegaskan bahawa pelabur swasta mahupun pemerintah daerah diberi peluang untuk menguruskan perkhidmatan pengangkutan kereta api di Indonesia .'

In [32]:
ipd.Audio('female/12687.wav')

In [41]:
o = process((txts[:10],))

100%|██████████| 10/10 [00:00<00:00, 664.67it/s]


In [42]:
# One (wav path, cleaned text, index) tuple per entry of the main corpus.
# Earlier exploratory cells and process() read these fields by position.
txts = [(f'female/{i}.wav', texts[i]['cleaned'], i) for i in range(len(texts))]

In [51]:
import mp

# NOTE(review): `mp` is a local helper module not shown here; presumably it splits `txts`
# into chunks, calls process((chunk,)) in 15 workers and concatenates the results — confirm.
out = mp.multiprocessing(txts, process, cores = 15, returned = True)

100%|██████████| 3333/3333 [00:04<00:00, 810.44it/s]
100%|██████████| 3333/3333 [00:04<00:00, 809.15it/s]
100%|██████████| 5/5 [00:00<00:00, 768.75it/s]9it/s]
100%|██████████| 3333/3333 [00:05<00:00, 631.90it/s]
100%|██████████| 3333/3333 [00:05<00:00, 629.81it/s]
100%|██████████| 3333/3333 [00:05<00:00, 623.61it/s]
100%|██████████| 3333/3333 [00:05<00:00, 606.97it/s]
100%|██████████| 3333/3333 [00:05<00:00, 599.49it/s]
100%|██████████| 3333/3333 [00:06<00:00, 555.31it/s]
100%|██████████| 3333/3333 [00:06<00:00, 519.26it/s]
100%|██████████| 3333/3333 [00:06<00:00, 506.23it/s]
100%|██████████| 3333/3333 [00:06<00:00, 503.20it/s]
100%|██████████| 3333/3333 [00:06<00:00, 499.63it/s]
100%|██████████| 3333/3333 [00:06<00:00, 485.03it/s]
100%|██████████| 3333/3333 [00:07<00:00, 453.78it/s]
100%|██████████| 3333/3333 [00:07<00:00, 443.30it/s]


In [52]:
len(out)

50000

In [56]:
# Persist the normalised main corpus (list of {'index', 'text', 'normalized'} dicts).
with open('normalized-texts.json', 'w') as fopen:
    json.dump(out, fopen)

In [55]:
out[2]

{'index': 2,
 'text': 'Berguru daripada memoir : Pendidikan kewartawanan dalam memoir A . Samad Ismail , Said Zahari dan Arena Wati .',
 'normalized': 'Berguru daripada memoir : Pendidikan kewartawanan dalam memoir A . Samad Ismail , Said Zahari dan Arena Wati .'}

In [57]:
# Same tuple layout as before, now for the parliament corpus.
# NOTE(review): this rebinds `txts`, so earlier cells that use it now see parliament data when re-run.
txts = [(f'female-parliament/{i}.wav', parliament[i]['cleaned'], i) for i in range(len(parliament))]

In [58]:
# Rebinds `out` with the parliament results; entries whose cleaning raised are only
# printed by process() and are missing from `out`.
out = mp.multiprocessing(txts, process, cores = 15, returned = True)

 97%|█████████▋| 3845/3973 [00:05<00:00, 710.91it/s]

22851 invalid literal for int() with base 10: '.7'


100%|██████████| 3973/3973 [00:05<00:00, 698.29it/s]


19767 invalid literal for int() with base 10: '.7'


100%|██████████| 9/9 [00:00<00:00, 798.24it/s]8it/s]
100%|██████████| 3973/3973 [00:05<00:00, 680.58it/s]
100%|██████████| 3973/3973 [00:05<00:00, 682.29it/s]
100%|██████████| 3973/3973 [00:05<00:00, 677.10it/s]
100%|██████████| 3973/3973 [00:05<00:00, 676.76it/s]
100%|██████████| 3973/3973 [00:05<00:00, 679.55it/s]
100%|██████████| 3973/3973 [00:05<00:00, 670.78it/s]
100%|██████████| 3973/3973 [00:06<00:00, 659.00it/s]
100%|██████████| 3973/3973 [00:06<00:00, 656.02it/s]
 85%|████████▌ | 3388/3973 [00:06<00:00, 708.91it/s]

6836 invalid literal for int() with base 10: '.1'


100%|██████████| 3973/3973 [00:06<00:00, 582.03it/s]
100%|██████████| 3973/3973 [00:06<00:00, 585.47it/s]
100%|██████████| 3973/3973 [00:07<00:00, 559.72it/s]
100%|██████████| 3973/3973 [00:07<00:00, 510.06it/s]
100%|██████████| 3973/3973 [00:08<00:00, 483.28it/s]
100%|██████████| 3973/3973 [00:09<00:00, 429.20it/s]


In [59]:
len(out)

59601

In [60]:
# Persist the normalised parliament corpus (same record layout as normalized-texts.json).
with open('normalized-parliaments.json', 'w') as fopen:
    json.dump(out, fopen)