# Prepare `iium-clean` Data

## Load Data Into Dataframes

In [2]:
import json
from glob import glob
from pathlib import Path

# Root directory that holds the `audio-iium/` folder and the label JSON file.
DIR_ROOT = '.'

# Collect every .wav file under audio-iium/. glob returns matches in arbitrary
# order, so nothing should rely on the order of this list.
audio_paths = glob(f'{DIR_ROOT}/audio-iium/*.wav')
# Placeholder; overwritten when the label file is read in a later cell.
labels = []

In [4]:
# Parse the label file with a real JSON parser. Manual slicing and splitting on
# '",' mis-splits labels that contain that sequence, leaves JSON escapes
# undecoded, and only reads the first line of the file.
# NOTE(review): assumes the file is a JSON array of strings, which matches the
# single-column frame pandas builds from the same file - confirm.
with open(f'{DIR_ROOT}/shuffled-iium.json', encoding='utf-8') as f:
    labels = json.load(f)

In [5]:
# Peek at one matched path to confirm the glob pattern found files.
audio_paths[0]

'./audio-iium/96.wav'

In [1]:
import pandas as pd

In [7]:
# Load the label JSON with pandas; the resulting frame has one column named 0.
labels_df = pd.read_json(f'{DIR_ROOT}/shuffled-iium.json')
labels_df

Unnamed: 0,0
0,"Kawan sy office boy, dia banyak kenal staff2 d..."
1,"Akhir2 ni, aku rasa macam dah semakin tenat."
2,"Eh, kau xkena macam aku, so jangan banyak buny..."
3,Aku bangun dan menghempas tubuh ke atas tilam ...
4,"Hmm menyesal tulis tajuk guna nombor roman, ke..."
...,...
1121973,Maafkanlah aku andai aku pesan tak kena cara k...
1121974,Teruskan berkawan dgn mereka yg nk berkawan dg...
1121975,Ingatan saya pada kak Na membuak-buak.
1121976,Alhamdulillah… Tapiii… kenapa hati saya tak ra...


In [8]:
# Give the default column 0 a descriptive name, then summarise the frame.
labels_df = labels_df.rename({0: 'label'}, axis='columns')
labels_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1121978 entries, 0 to 1121977
Data columns (total 1 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   label   1121978 non-null  object
dtypes: object(1)
memory usage: 8.6+ MB


In [9]:
# Wrap the globbed paths in a one-column frame (the column is named 0 by default).
audios_df = pd.DataFrame(audio_paths)
audios_df

Unnamed: 0,0
0,./audio-iium/96.wav
1,./audio-iium/87.wav
2,./audio-iium/92.wav
3,./audio-iium/55.wav
4,./audio-iium/112.wav
...,...
92,./audio-iium/81.wav
93,./audio-iium/80.wav
94,./audio-iium/89.wav
95,./audio-iium/31.wav


In [10]:
# Name the default column 0 'path'.
audios_df = audios_df.rename({0: 'path'}, axis='columns')

In [11]:
# Check the row count and that no path is missing.
audios_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97 entries, 0 to 96
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   path    97 non-null     object
dtypes: object(1)
memory usage: 904.0+ bytes


In [12]:
def generate_index(p: str) -> int:
    """Return the numeric clip id encoded in an audio file name.

    The id is the file name without its directory and extension, e.g.
    ``'./audio-iium/96.wav'`` -> ``96``. Taking the path stem keeps the
    function independent of ``DIR_ROOT`` and of the platform's path
    separator.

    Args:
        p: Path to an audio file whose base name is an integer.

    Returns:
        The integer parsed from the file's base name.

    Raises:
        ValueError: If the base name (minus extension) is not an integer.
    """
    return int(Path(p).stem)

In [13]:
# Derive the numeric clip id of every file from its path.
clip_ids = audios_df['path'].apply(generate_index)
audios_df['index'] = clip_ids
audios_df

Unnamed: 0,path,index
0,./audio-iium/96.wav,96
1,./audio-iium/87.wav,87
2,./audio-iium/92.wav,92
3,./audio-iium/55.wav,55
4,./audio-iium/112.wav,112
...,...,...
92,./audio-iium/81.wav,81
93,./audio-iium/80.wav,80
94,./audio-iium/89.wav,89
95,./audio-iium/31.wav,31


In [14]:
# Order the clips by numeric id and renumber the rows from 0.
audios_df = audios_df.sort_values(by='index', ignore_index=True)
audios_df

Unnamed: 0,path,index
0,./audio-iium/3.wav,3
1,./audio-iium/4.wav,4
2,./audio-iium/5.wav,5
3,./audio-iium/6.wav,6
4,./audio-iium/7.wav,7
...,...,...
92,./audio-iium/106.wav,106
93,./audio-iium/107.wav,107
94,./audio-iium/110.wav,110
95,./audio-iium/111.wav,111


## Align Audios and Labels

In [None]:
import IPython.display as ipd

# Play the clip in row 0 (lowest clip id after sorting) as a quick listening check.
ipd.Audio(audios_df['path'][0])

In [None]:
# Show the labels again to compare their row numbers with the clip ids.
labels_df

Unnamed: 0,label
0,"Kawan sy office boy, dia banyak kenal staff2 d..."
1,"Akhir2 ni, aku rasa macam dah semakin tenat."
2,"Eh, kau xkena macam aku, so jangan banyak buny..."
3,Aku bangun dan menghempas tubuh ke atas tilam ...
4,"Hmm menyesal tulis tajuk guna nombor roman, ke..."
...,...
1121973,Maafkanlah aku andai aku pesan tak kena cara k...
1121974,Teruskan berkawan dgn mereka yg nk berkawan dg...
1121975,Ingatan saya pada kak Na membuak-buak.
1121976,Alhamdulillah… Tapiii… kenapa hati saya tak ra...


In [None]:
# List the clip ids that actually have an audio file.
audios_df['index']

0       3
1       4
2       5
3       6
4       7
     ... 
92    106
93    107
94    110
95    111
96    112
Name: index, Length: 97, dtype: int64

In [None]:
# Look up the file path of the clip whose id is 106.
# NOTE: despite its name, `index` ends up holding a path string, not a number.
is_clip_106 = audios_df['index'] == 106
index = audios_df.loc[is_clip_106, 'path'].iloc[0]
index

'./audio-iium/106.wav'

In [None]:
import IPython.display as ipd

# Play the clip selected in the previous cell (`index` holds its path).
ipd.Audio(index)

In [None]:
# Case-insensitive search for a sentence (presumably the one heard in clip 106)
# to see which label row it lives in.
contains_phrase = labels_df['label'].str.contains('kali ini aku nak cuba kerja lain pula', case=False)
labels_df.loc[contains_phrase]

Unnamed: 0,label
106,Kali ini aku nak cuba kerja lain pula.


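The row index of each label matches the number in the audio file name (for example, label row 106 is the transcript of `106.wav`), so the two tables can be joined on that index.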

In [None]:
# Size of the label table before it is joined with the available audio files.
labels_df.shape

(1121978, 1)

In [None]:
import numpy as np

# Expose the row number as an 'index' column so it can be joined with the
# clip ids. Selecting only 'label' first keeps the cell idempotent: re-running
# it rebuilds the same two columns instead of stacking an extra index column.
labels_df = labels_df[['label']].reset_index()
labels_df

Unnamed: 0,index,label
0,0,"Kawan sy office boy, dia banyak kenal staff2 d..."
1,1,"Akhir2 ni, aku rasa macam dah semakin tenat."
2,2,"Eh, kau xkena macam aku, so jangan banyak buny..."
3,3,Aku bangun dan menghempas tubuh ke atas tilam ...
4,4,"Hmm menyesal tulis tajuk guna nombor roman, ke..."
...,...,...
1121973,1121973,Maafkanlah aku andai aku pesan tak kena cara k...
1121974,1121974,Teruskan berkawan dgn mereka yg nk berkawan dg...
1121975,1121975,Ingatan saya pada kak Na membuak-buak.
1121976,1121976,Alhamdulillah… Tapiii… kenapa hati saya tak ra...


In [None]:
# Audio table just before the merge, for comparison with the label table.
audios_df

Unnamed: 0,path,index
0,./audio-iium/3.wav,3
1,./audio-iium/4.wav,4
2,./audio-iium/5.wav,5
3,./audio-iium/6.wav,6
4,./audio-iium/7.wav,7
...,...,...
92,./audio-iium/106.wav,106
93,./audio-iium/107.wav,107
94,./audio-iium/110.wav,110
95,./audio-iium/111.wav,111


In [None]:
# Keep only the labels that have an audio file (inner join = intersection).
# The join key is explicit so a stray shared column cannot silently change
# the match, and `validate` fails loudly if an id appears twice on either side.
df = pd.merge(audios_df, labels_df, on='index', how='inner', validate='one_to_one')
df

Unnamed: 0,path,index,label
0,./audio-iium/3.wav,3,Aku bangun dan menghempas tubuh ke atas tilam ...
1,./audio-iium/4.wav,4,"Hmm menyesal tulis tajuk guna nombor roman, ke..."
2,./audio-iium/5.wav,5,"Sini few things lah kita kena take note, bagi ..."
3,./audio-iium/6.wav,6,Petang tu mak bawa aku jumpa Uncle Man.
4,./audio-iium/7.wav,7,Tangan aku disentuh lembut.
...,...,...,...
92,./audio-iium/106.wav,106,Kali ini aku nak cuba kerja lain pula.
93,./audio-iium/107.wav,107,c) Tudung/rambut: sama seperti di sektor swasta.
94,./audio-iium/110.wav,110,"Hari jumaat lepas kerja, siap-siap untuk nak n..."
95,./audio-iium/111.wav,111,Ok tu lupakan.


In [None]:
# Spot-check the merge: play clip 112. A boolean row mask selects the row
# directly instead of NaN-masking the whole frame and dropping the rest.
ipd.Audio(df.loc[df['index'] == 112, 'path'].iloc[0])

In [None]:
# Spot-check the merge: read the label paired with clip 112. A boolean row
# mask selects the row directly instead of NaN-masking the whole frame.
df.loc[df['index'] == 112, 'label'].iloc[0]

'Cerita ni berkenaan dengan pengorbanan mama pada opah.'

In [None]:
# Put the columns in their export order: id, audio path, transcript.
column_order = ['index', 'path', 'label']
df = df.loc[:, column_order]
df

Unnamed: 0,index,path,label
0,3,./audio-iium/3.wav,Aku bangun dan menghempas tubuh ke atas tilam ...
1,4,./audio-iium/4.wav,"Hmm menyesal tulis tajuk guna nombor roman, ke..."
2,5,./audio-iium/5.wav,"Sini few things lah kita kena take note, bagi ..."
3,6,./audio-iium/6.wav,Petang tu mak bawa aku jumpa Uncle Man.
4,7,./audio-iium/7.wav,Tangan aku disentuh lembut.
...,...,...,...
92,106,./audio-iium/106.wav,Kali ini aku nak cuba kerja lain pula.
93,107,./audio-iium/107.wav,c) Tudung/rambut: sama seperti di sektor swasta.
94,110,./audio-iium/110.wav,"Hari jumaat lepas kerja, siap-siap untuk nak n..."
95,111,./audio-iium/111.wav,Ok tu lupakan.


## Split Into Train, Dev, Test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for dev+test. A fixed random_state makes the split
# reproducible across kernel restarts.
train, devtest = train_test_split(df, test_size=0.3, random_state=42)
train, devtest

(    index                  path  \
 0       3    ./audio-iium/3.wav   
 86     97   ./audio-iium/97.wav   
 76     86   ./audio-iium/86.wav   
 27     31   ./audio-iium/31.wav   
 95    111  ./audio-iium/111.wav   
 ..    ...                   ...   
 84     95   ./audio-iium/95.wav   
 39     45   ./audio-iium/45.wav   
 29     33   ./audio-iium/33.wav   
 77     87   ./audio-iium/87.wav   
 63     70   ./audio-iium/70.wav   
 
                                                 label  
 0   Aku bangun dan menghempas tubuh ke atas tilam ...  
 86  Aku cuba banyak benda sebelum aku dapat kerja ...  
 76           apa yang aku kurang dan kawan aku lebih.  
 27                         sambil menyelam minum air.  
 95                                     Ok tu lupakan.  
 ..                                                ...  
 84                             Dia tak maki-maki kau.  
 39                                  Memang salah aku.  
 29                      Tak seindah khabar dari rupa

In [None]:
# Halve the held-out rows into dev and test. `devtest` is already a DataFrame,
# so no re-wrapping is needed; the fixed random_state keeps the split reproducible.
dev, test = train_test_split(devtest, test_size=0.5, random_state=42)
dev, test

(    index                  path  \
 10     13   ./audio-iium/13.wav   
 49     55   ./audio-iium/55.wav   
 92    106  ./audio-iium/106.wav   
 94    110  ./audio-iium/110.wav   
 51     57   ./audio-iium/57.wav   
 87     99   ./audio-iium/99.wav   
 50     56   ./audio-iium/56.wav   
 47     53   ./audio-iium/53.wav   
 62     69   ./audio-iium/69.wav   
 52     58   ./audio-iium/58.wav   
 8      11   ./audio-iium/11.wav   
 6       9    ./audio-iium/9.wav   
 11     14   ./audio-iium/14.wav   
 46     52   ./audio-iium/52.wav   
 22     26   ./audio-iium/26.wav   
 
                                                 label  
 10                            Kerana sayang gamaknya.  
 49  Benda ni sangat menganggu kehidupan seharian a...  
 92             Kali ini aku nak cuba kerja lain pula.  
 94  Hari jumaat lepas kerja, siap-siap untuk nak n...  
 51                   Tapi ini langsung tiada jawapan.  
 87                                         Paham dah?  
 50                akak

In [None]:
# Render the train split as a table.
train

Unnamed: 0,index,path,label
0,3,./audio-iium/3.wav,Aku bangun dan menghempas tubuh ke atas tilam ...
86,97,./audio-iium/97.wav,Aku cuba banyak benda sebelum aku dapat kerja ...
76,86,./audio-iium/86.wav,apa yang aku kurang dan kawan aku lebih.
27,31,./audio-iium/31.wav,sambil menyelam minum air.
95,111,./audio-iium/111.wav,Ok tu lupakan.
...,...,...,...
84,95,./audio-iium/95.wav,Dia tak maki-maki kau.
39,45,./audio-iium/45.wav,Memang salah aku.
29,33,./audio-iium/33.wav,Tak seindah khabar dari rupa.
77,87,./audio-iium/87.wav,"Menangis sendirian, menyesal atas sikap sendiri."


In [None]:
# Render the dev split as a table.
dev

Unnamed: 0,index,path,label
10,13,./audio-iium/13.wav,Kerana sayang gamaknya.
49,55,./audio-iium/55.wav,Benda ni sangat menganggu kehidupan seharian a...
92,106,./audio-iium/106.wav,Kali ini aku nak cuba kerja lain pula.
94,110,./audio-iium/110.wav,"Hari jumaat lepas kerja, siap-siap untuk nak n..."
51,57,./audio-iium/57.wav,Tapi ini langsung tiada jawapan.
87,99,./audio-iium/99.wav,Paham dah?
50,56,./audio-iium/56.wav,akak memang malu sangat nak mintak.
47,53,./audio-iium/53.wav,Belajar Bersyukur.
62,69,./audio-iium/69.wav,kenapa aku perlu diam?
52,58,./audio-iium/58.wav,Sebab aku percaya bila kita senangkan kerja or...


In [None]:
# Render the test split as a table.
test

Unnamed: 0,index,path,label
60,66,./audio-iium/66.wav,Saya tahu dia mengharapkan saya bermurah hati ...
91,103,./audio-iium/103.wav,Itu ujian hidup berkeluarga.
26,30,./audio-iium/30.wav,Bisnes berjalan macan biasa.
89,101,./audio-iium/101.wav,Betul tidak?
21,25,./audio-iium/25.wav,sebab tu mak aku.
24,28,./audio-iium/28.wav,Dakwah jugak tu tanpa kita sedar.
5,8,./audio-iium/8.wav,"Pengalaman aku, aku satu geng nak makan tengha..."
33,37,./audio-iium/37.wav,Sayangnya Rasulullah pada kita.
44,50,./audio-iium/50.wav,Walaupun mereka membawa air masuk ke dalam kel...
66,73,./audio-iium/73.wav,"Sebagai manusia yg sihat, aku percaya kau yg s..."


## Export as File

In [None]:
# Write each split to its own CSV, leaving out the DataFrame row labels.
exports = {
    f'{DIR_ROOT}/iium-clean-train.csv': train,
    f'{DIR_ROOT}/iium-clean-dev.csv': dev,
    f'{DIR_ROOT}/iium-clean-test.csv': test,
}
for csv_path, split_df in exports.items():
    split_df.to_csv(csv_path, index=False)

In [3]:
# Read the exported train split back to check the CSV round-trip.
train_loaded = pd.read_csv(f'{DIR_ROOT}/iium-clean-train.csv')
train_loaded

Unnamed: 0,index,path,label
0,3,./audio-iium/3.wav,Aku bangun dan menghempas tubuh ke atas tilam ...
1,97,./audio-iium/97.wav,Aku cuba banyak benda sebelum aku dapat kerja ...
2,86,./audio-iium/86.wav,apa yang aku kurang dan kawan aku lebih.
3,31,./audio-iium/31.wav,sambil menyelam minum air.
4,111,./audio-iium/111.wav,Ok tu lupakan.
...,...,...,...
62,95,./audio-iium/95.wav,Dia tak maki-maki kau.
63,45,./audio-iium/45.wav,Memang salah aku.
64,33,./audio-iium/33.wav,Tak seindah khabar dari rupa.
65,87,./audio-iium/87.wav,"Menangis sendirian, menyesal atas sikap sendiri."


In [4]:
# Read the exported dev split back to check the CSV round-trip.
dev_loaded = pd.read_csv(f'{DIR_ROOT}/iium-clean-dev.csv')
dev_loaded

Unnamed: 0,index,path,label
0,13,./audio-iium/13.wav,Kerana sayang gamaknya.
1,55,./audio-iium/55.wav,Benda ni sangat menganggu kehidupan seharian a...
2,106,./audio-iium/106.wav,Kali ini aku nak cuba kerja lain pula.
3,110,./audio-iium/110.wav,"Hari jumaat lepas kerja, siap-siap untuk nak n..."
4,57,./audio-iium/57.wav,Tapi ini langsung tiada jawapan.
5,99,./audio-iium/99.wav,Paham dah?
6,56,./audio-iium/56.wav,akak memang malu sangat nak mintak.
7,53,./audio-iium/53.wav,Belajar Bersyukur.
8,69,./audio-iium/69.wav,kenapa aku perlu diam?
9,58,./audio-iium/58.wav,Sebab aku percaya bila kita senangkan kerja or...


In [5]:
# Read the exported test split back to check the CSV round-trip.
test_loaded = pd.read_csv(f'{DIR_ROOT}/iium-clean-test.csv')
test_loaded

Unnamed: 0,index,path,label
0,66,./audio-iium/66.wav,Saya tahu dia mengharapkan saya bermurah hati ...
1,103,./audio-iium/103.wav,Itu ujian hidup berkeluarga.
2,30,./audio-iium/30.wav,Bisnes berjalan macan biasa.
3,101,./audio-iium/101.wav,Betul tidak?
4,25,./audio-iium/25.wav,sebab tu mak aku.
5,28,./audio-iium/28.wav,Dakwah jugak tu tanpa kita sedar.
6,8,./audio-iium/8.wav,"Pengalaman aku, aku satu geng nak makan tengha..."
7,37,./audio-iium/37.wav,Sayangnya Rasulullah pada kita.
8,50,./audio-iium/50.wav,Walaupun mereka membawa air masuk ke dalam kel...
9,73,./audio-iium/73.wav,"Sebagai manusia yg sihat, aku percaya kau yg s..."


In [30]:
def toList(series: pd.Series) -> list:
    """Return the values of ``series`` as a plain Python list.

    Args:
        series: The pandas Series to convert.

    Returns:
        A list holding the Series values in order.
    """
    return series.tolist()

# Collect the label column of each reloaded split as a plain list.
combined = [toList(split['label']) for split in (train_loaded, dev_loaded, test_loaded)]

# One flag per label: does the text contain the letter 'c'?
[('c' in text) for split_labels in combined for text in split_labels]

[False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 True,
 False,
 False,
 False,
 True,
 True,
 True,
 False,
 False,
 True,
 False,
 True,
 True,
 False,
 True,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 False,
 True]

In [64]:
from datasets import load_dataset

# Load the three exported CSVs as one DatasetDict. The paths are built from
# DIR_ROOT so they point at the same files the export cell wrote.
dataset = load_dataset(
    'csv',
    data_files={
        'train': f'{DIR_ROOT}/iium-clean-train.csv',
        'dev': f'{DIR_ROOT}/iium-clean-dev.csv',
        'test': f'{DIR_ROOT}/iium-clean-test.csv',
    },
)
dataset

Using custom data configuration default-6918cd0b291bc018
Reusing dataset csv (/home/dsonic/.cache/huggingface/datasets/csv/default-6918cd0b291bc018/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58)
100%|██████████| 3/3 [00:00<00:00, 303.48it/s]


DatasetDict({
    train: Dataset({
        features: ['index', 'path', 'label'],
        num_rows: 67
    })
    dev: Dataset({
        features: ['index', 'path', 'label'],
        num_rows: 15
    })
    test: Dataset({
        features: ['index', 'path', 'label'],
        num_rows: 15
    })
})

In [79]:
import re

def remove_special_chars(df: dict) -> dict:
    """Strip punctuation and digits from a record's label and lowercase it.

    Despite the parameter name, ``df['label']`` is passed straight to
    ``re.sub``, so ``df`` must be a mapping whose ``'label'`` value is a single
    string (one record), not a DataFrame.

    Args:
        df: Mapping with a string under the ``'label'`` key. Modified in place.

    Returns:
        The same mapping, with ``'label'`` cleaned, lowercased and followed by
        one trailing space.
    """
    # Raw string: the escapes belong to the regex, not to Python, which avoids
    # invalid-escape-sequence warnings on newer interpreters.
    chars_to_ignore = r'[\,\?\.\!\-\;\:\"\(\)/…0-9]'
    df['label'] = re.sub(chars_to_ignore, '', df['label']).lower() + ' '
    return df

def joinStr(s: list):
    """Concatenate the strings in ``s``, separated by single spaces."""
    separator = ' '
    return separator.join(s)

# Clean every label in every split.
dataset = dataset.map(remove_special_chars)
# Concatenate the cleaned labels of all splits into one long string.
# NOTE: `all` shadows the Python builtin of the same name; the name is kept
# because the vocabulary cell reads it.
all = ''.join(joinStr(dataset[split]['label']) for split in ('train', 'dev', 'test'))

100%|██████████| 67/67 [00:00<00:00, 6862.98ex/s]
100%|██████████| 15/15 [00:00<00:00, 2212.73ex/s]
100%|██████████| 15/15 [00:00<00:00, 5331.29ex/s]


In [82]:
# Distinct characters that occur in the cleaned labels.
vocabs = set(all)
# Number the characters in sorted order: set iteration order for strings can
# differ between interpreter runs (hash randomisation), which would otherwise
# give every run a different character-to-id mapping.
vocab_dict = {char: idx for idx, char in enumerate(sorted(vocabs))}
vocabs

{' ',
 "'",
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}