Import Library

In [2]:
# !pip install datasets
# !pip install transformers
# !pip install jiwer
# !apt install git-lfs

In [3]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from datasets import load_dataset, load_metric
import librosa
import numpy as np
import IPython.display as ipd
import torch
import re
import json
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2ForCTC, TrainingArguments, Trainer
from mutagen.mp3 import MP3

CUDA Check

In [5]:
print(f"cuDNN Version : {torch.backends.cudnn.version()}")
print(f"Check CUDA/GPU Can Be Used : {torch.cuda.is_available()}")

cuDNN Version : 8600
Check CUDA/GPU Can Be Used : True


Load Data

In [6]:
total_data = load_dataset('mozilla-foundation/common_voice_11_0', name='id')
print(total_data)

Found cached dataset common_voice_11_0 (/home/alckylzer/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/id/11.0.0/2c65b95d99ca879b1b1074ea197b65e0497848fd697fdb0582e0f6b75b6f4da0)


  0%|          | 0/5 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 5048
    })
    validation: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 3226
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 3618
    })
    other: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 24238
    })
    invalidated: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 2466
    })
})


In [7]:
# Main Dataset
data_train = load_dataset('mozilla-foundation/common_voice_11_0', name='id', split="train+validation", use_auth_token=True)
data_test = load_dataset('mozilla-foundation/common_voice_11_0', name='id', split="test", use_auth_token=True)

Found cached dataset common_voice_11_0 (/home/alckylzer/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/id/11.0.0/2c65b95d99ca879b1b1074ea197b65e0497848fd697fdb0582e0f6b75b6f4da0)
Found cached dataset common_voice_11_0 (/home/alckylzer/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/id/11.0.0/2c65b95d99ca879b1b1074ea197b65e0497848fd697fdb0582e0f6b75b6f4da0)


In [8]:
# Corpus for LM Dataset
oscar_corpus = load_dataset("oscar-corpus/OSCAR-2201", use_auth_token=True, language="id", split="train", streaming=True)

Using custom data configuration id-language=id


In [9]:
print(f"Data Train Features : \n {data_train.features}\n\n")
print(f"Data Test Features : \n {data_test.features}")

Data Train Features : 
 {'client_id': Value(dtype='string', id=None), 'path': Value(dtype='string', id=None), 'audio': Audio(sampling_rate=48000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string', id=None), 'up_votes': Value(dtype='int64', id=None), 'down_votes': Value(dtype='int64', id=None), 'age': Value(dtype='string', id=None), 'gender': Value(dtype='string', id=None), 'accent': Value(dtype='string', id=None), 'locale': Value(dtype='string', id=None), 'segment': Value(dtype='string', id=None)}


Data Test Features : 
 {'client_id': Value(dtype='string', id=None), 'path': Value(dtype='string', id=None), 'audio': Audio(sampling_rate=48000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string', id=None), 'up_votes': Value(dtype='int64', id=None), 'down_votes': Value(dtype='int64', id=None), 'age': Value(dtype='string', id=None), 'gender': Value(dtype='string', id=None), 'accent': Value(dtype='string', id=None), 'locale': Value(dtype='string', id=None), 's

In [10]:
total_dataset = data_train.num_rows + data_test.num_rows
data_train_percent =  data_train.num_rows / total_dataset * 100
data_test_percent = data_test.num_rows / total_dataset * 100

print(f"Total Data Train : {data_train.num_rows} atau {round(data_train_percent)}%")
print(f"Total Data Test : {data_test.num_rows} atau {round(data_test_percent)}%")

Total Data Train : 8274 atau 70%
Total Data Test : 3618 atau 30%


Remove Columns

In [11]:
data_train = data_train.remove_columns(['client_id', 'audio', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'])
data_test = data_test.remove_columns(['client_id',  'audio', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'])

In [12]:
print(f"Data Train Features : \n {data_train.features}\n\n")
print(f"Data Test Features : \n {data_test.features}")

Data Train Features : 
 {'path': Value(dtype='string', id=None), 'sentence': Value(dtype='string', id=None)}


Data Test Features : 
 {'path': Value(dtype='string', id=None), 'sentence': Value(dtype='string', id=None)}


In [13]:
data_train_path = list(data_train['path'])
data_train_length = 0
for path in data_train_path:
    audio = MP3(path)
    length = audio.info.length
    data_train_length += length
print(f"Data Train = {int(data_train_length)//3600} Jam : {int(data_train_length)%3600//60} Menit : {int(data_train_length)%3600%60} Detik")

data_test_path = list(data_test['path'])
data_test_length = 0
for path in data_test_path:
    audio = MP3(path)
    length = audio.info.length
    data_test_length += length
print(f"Data Test = {int(data_test_length)//3600} Jam : {int(data_test_length)%3600//60} Menit : {int(data_test_length)%3600%60} Detik")

Data Train = 11 Jam : 33 Menit : 28 Detik
Data Test = 4 Jam : 7 Menit : 27 Detik


Resample

In [14]:
import audioread
from librosa.util import buf_to_float

def audioread_load(path, offset=0.0, duration=None, dtype=np.float32):
    y = []
    with audioread.audio_open(path) as input_file:
        sr_native = input_file.samplerate
        n_channels = input_file.channels

        s_start = int(np.round(sr_native * offset)) * n_channels

        if duration is None:
            s_end = np.inf
        else:
            s_end = s_start + (int(np.round(sr_native * duration)) * n_channels)

        n = 0

        for frame in input_file:
            frame = buf_to_float(frame, dtype=dtype)
            n_prev = n
            n = n + len(frame)

            if n < s_start:
                # offset is after the current frame
                # keep reading
                continue

            if s_end < n_prev:
                # we're off the end.  stop reading
                break

            if s_end < n:
                # the end is in this frame.  crop.
                frame = frame[: s_end - n_prev]

            if n_prev <= s_start <= n:
                # beginning is in this frame
                frame = frame[(s_start - n_prev) :]

            # tack on the current frame
            y.append(frame)

    if y:
        y = np.concatenate(y)
        if n_channels > 1:
            y = y.reshape((-1, n_channels)).T
    else:
        y = np.empty(0, dtype=dtype)

    return y, sr_native

In [15]:
def resample(batch):
    audio, sample_rate = audioread_load(batch['path'])
    audio = librosa.to_mono(audio)
    batch['audio_resampled'] = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000) 

    return batch
    
data_train_resample = data_train.map(resample)
data_test_resample = data_test.map(resample) 

Loading cached processed dataset at /home/alckylzer/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/id/11.0.0/2c65b95d99ca879b1b1074ea197b65e0497848fd697fdb0582e0f6b75b6f4da0/cache-8df27d66ddf84352.arrow
Loading cached processed dataset at /home/alckylzer/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/id/11.0.0/2c65b95d99ca879b1b1074ea197b65e0497848fd697fdb0582e0f6b75b6f4da0/cache-32dbb2877c311299.arrow


In [None]:
data_train_resample['audio_resampled'][0]

In [None]:
print(f"Data Train Features : \n {data_train_resample.features}\n\n")
print(f"Data Test Features : \n {data_test_resample.features}")

Clean Sentence

In [None]:
print("Data Train Sentence Example : \n")
for i in range(10):
    print(data_train_resample['sentence'][i])

print("\n\nData Test Sentence Example : \n")
for i in range(10):
    print(data_test_resample['sentence'][i])

In [None]:
def cleaning_sentence(dataframe, oscar = False):
    chars_to_remove_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\[\]\(\)\~\|\\\]'
    if oscar == False:
      col = 'sentence'
    else:
      col = 'text'

    dataframe[col] = dataframe[col].encode("ascii", "ignore").decode()
    dataframe[col] = dataframe[col].lower()
    dataframe[col] = re.sub(chars_to_remove_regex, '', dataframe[col])
    dataframe[col] = re.sub('&', 'dan', dataframe[col])
    dataframe[col] = re.sub('#', 'hashtag', dataframe[col])
    dataframe[col] = re.sub('@', 'at', dataframe[col])
    dataframe[col] = ' '.join(dataframe[col].split())

    return dataframe

In [None]:
data_train_cleaned = data_train_resample.map(cleaning_sentence)
data_test_cleaned = data_test_resample.map(cleaning_sentence)
oscar_corpus_cleaned = oscar_corpus.map(lambda x: cleaning_sentence(x, True))

In [37]:
text = list(list(data_train_cleaned['sentence']) + list(data_test_cleaned['sentence']))

with open("text.txt", "w") as file:
  file.write(" ".join(text))

for data in oscar_corpus_cleaned.take(1_000_000):
    with open("text.txt", "a") as file:
      file.write(" ".join(data['text']))

In [38]:
print("Data Train Sentence Cleaned Example : \n")
for i in range(10):
    print(data_train_cleaned['sentence'][i])

print("\n\nData Test Sentence Cleaned Example : \n")
for i in range(10):
    print(data_test_cleaned['sentence'][i])

Data Train Sentence Cleaned Example : 

halo dunia
sudah makan sudah sholat
udah keluar hasil testnya
dimanakah sate paling enak di jakarta selatan
coba terus sampai berhasil
kapan saya harus melapor pajak
biro terdiri dari ketua komite dua wakil ketua dan pelapor
seorang anak lakilaki berkulit hitam mengenakan celana jins biru dan baju berwarna terang sedang menatap tangannya
dia mengakhiri musim bermain untuk wellington di liga birmingham
setiap dewan memiliki ketua yang dipilih oleh dewan


Data Test Sentence Cleaned Example : 

maha suci allah
inilah dunia kecil
nol
aku tidak tahu artinya kebencian
tugas saya belom kelar
itu tadi menarik ya
saya paling suka lagu ini
jalanan sepi dari kendaraan
jangan pergi
semua kamus mengandung kesalahan


Vocabulary List

In [39]:
vocab_list = []
for w in data_train_cleaned['sentence']:
    vocab_list.extend(" ".join(w))
for w in data_test_cleaned['sentence']:
    vocab_list.extend(" ".join(w))
vocab_list = list(set(vocab_list))

vocab_dict = {v: k for k, v in enumerate(vocab_list)}

vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)

In [40]:
print(f"Vocabulary List :\n {vocab_dict}")

Vocabulary List :
 {'j': 0, 'k': 1, 'e': 2, 'h': 3, 'd': 4, 's': 5, 'f': 6, 'n': 7, 'y': 8, 'i': 9, 'p': 10, 'g': 11, 'o': 12, 'b': 13, 'c': 14, 'v': 15, 'x': 16, 'z': 17, 'r': 18, 't': 19, 'l': 20, 'm': 21, 'q': 23, 'u': 24, 'a': 25, 'w': 26, '|': 22, '[UNK]': 27, '[PAD]': 28}


Save Vocab (To .json)

In [41]:
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

Tokenizer (Encode & Tokenize Every Letter) 

In [42]:
tokenizer = Wav2Vec2CTCTokenizer("vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

In [43]:
# tokenizer.push_to_hub("asr_skripsi_local_common_voice")

Feature Extractor (Return Tensor)

In [44]:
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16_000, padding_value=0.0, do_normalize=True, return_attention_mask=True)

Processor (Combine Tokenizer & Feature Extractor)

In [45]:
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor.save_pretrained("asr_skripsi_local_common_voice/")

Create Languange Modelling n-gram

In [46]:
import os
cwd = os.getcwd()
lm_text_path = cwd + "/text.txt"
kenlm_path = "kenlm/build/bin/"
os.system(f"{kenlm_path}lmplz -o 5 <'{lm_text_path}' > '5gram.arpa'")

=== 1/5 Counting and sorting n-grams ===
Reading /home/alckylzer/Desktop/ASR/text.txt
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Unigram tokens 5364062058 types 15518
=== 2/5 Calculating and sorting adjusted counts ===
Chain sizes: 1:186216 2:1138852352 3:2135348224 4:3416556800 5:4982479360
Statistics:
1 15517 D1=0.675778 D2=1.0492 D3+=1.2184
2 99862 D1=0.773385 D2=0.883375 D3+=1.11667
3 368178 D1=0.713328 D2=0.880577 D3+=1.1837
4 1851617 D1=0.642623 D2=0.962668 D3+=1.30063
5 8085148 D1=0.598595 D2=1.02009 D3+=1.39688
Memory estimate for binary LM:
type     MB
probing 192 assuming -p 1.5
probing 205 assuming -r models -p 1.5
trie     71 without quantization
trie     36 assuming -q 8 -b 8 quantization 
trie     67 assuming -a 22 array pointer compression
trie     32 assuming -a 22 -q 8 -b 8 array pointer compression and quantizat

0

In [47]:
os.system(f"head -20 {cwd}/5gram.arpa")

\data\
ngram 1=15517
ngram 2=99862
ngram 3=368178
ngram 4=1851617
ngram 5=8085148

\1-grams:
-5.0476174	<unk>	0
0	<s>	-0.11160436
-4.1142364	halo	-0.11160436
-3.541874	dunia	-0.15686736
-3.023158	sudah	-0.23925924
-3.09807	makan	-0.3598049
-4.913347	sholat	-0.11160436
-4.4339643	udah	-0.11160436
-3.435012	keluar	-0.32991758
-3.683923	hasil	-0.16463552
-4.7332344	testnya	-0.11160436
-4.4339643	dimanakah	-0.11160436


0

In [48]:
with open("5gram.arpa", "r") as read_file, open("5gram_correct.arpa", "w") as write_file:
  has_added_eos = False
  for line in read_file:
    if not has_added_eos and "ngram 1=" in line:
      count=line.strip().split("=")[-1]
      write_file.write(line.replace(f"{count}", f"{int(count)+1}"))
    elif not has_added_eos and "<s>" in line:
      write_file.write(line)
      write_file.write(line.replace("<s>", "</s>"))
      has_added_eos = True
    else:
      write_file.write(line)

In [49]:
os.system(f"head -20 {cwd}/5gram_correct.arpa")

\data\
ngram 1=15518
ngram 2=99862
ngram 3=368178
ngram 4=1851617
ngram 5=8085148

\1-grams:
-5.0476174	<unk>	0
0	<s>	-0.11160436
0	</s>	-0.11160436
-4.1142364	halo	-0.11160436
-3.541874	dunia	-0.15686736
-3.023158	sudah	-0.23925924
-3.09807	makan	-0.3598049
-4.913347	sholat	-0.11160436
-4.4339643	udah	-0.11160436
-3.435012	keluar	-0.32991758
-3.683923	hasil	-0.16463552
-4.7332344	testnya	-0.11160436


0

In [26]:
from pyctcdecode import build_ctcdecoder

vocab_dict = processor.tokenizer.get_vocab()
sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}

decoder = build_ctcdecoder(
    labels=list(sorted_vocab_dict.keys()),
    kenlm_model_path="5gram_correct.arpa",
)

['j', 'k', 'e', 'h', 'd', 's', 'f', 'n', 'y', 'i', 'p', 'g', 'o', 'b', 'c', 'v', 'x', 'z', 'r', 't', 'l', 'm', ' ', 'q', 'u', 'a', 'w', '[UNK]', '[PAD]', '<s>', '</s>']


In [51]:
sorted_vocab_dict

{'j': 0,
 'k': 1,
 'e': 2,
 'h': 3,
 'd': 4,
 's': 5,
 'f': 6,
 'n': 7,
 'y': 8,
 'i': 9,
 'p': 10,
 'g': 11,
 'o': 12,
 'b': 13,
 'c': 14,
 'v': 15,
 'x': 16,
 'z': 17,
 'r': 18,
 't': 19,
 'l': 20,
 'm': 21,
 '|': 22,
 'q': 23,
 'u': 24,
 'a': 25,
 'w': 26,
 '[unk]': 27,
 '[pad]': 28}

In [52]:
from transformers import Wav2Vec2ProcessorWithLM

processor_with_lm = Wav2Vec2ProcessorWithLM(
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    decoder=decoder
)
if os.path.exists(f"{cwd}/asr_LM_skripsi_local_common_voice/"):
    os.system(f"rm -rf {cwd}/asr_LM_skripsi_local_common_voice/")
    processor_with_lm.save_pretrained("asr_LM_skripsi_local_common_voice/")
                  
else:
    processor_with_lm.save_pretrained("asr_LM_skripsi_local_common_voice/")

In [53]:
!tree -h asr_LM_skripsi_local_common_voice/

[4.0K]  [01;34masr_LM_skripsi_local_common_voice/[0m
├── [ 178]  alphabet.json
├── [4.0K]  [01;34mlanguage_model[0m
│   ├── [238M]  5gram_correct.arpa
│   ├── [  78]  attrs.json
│   └── [120K]  unigrams.txt
├── [ 262]  preprocessor_config.json
├── [  96]  special_tokens_map.json
├── [ 339]  tokenizer_config.json
└── [ 320]  vocab.json

1 directory, 8 files


In [54]:
os.system(f"{kenlm_path}build_binary asr_LM_skripsi_local_common_voice/language_model/5gram_correct.arpa asr_LM_skripsi_local_common_voice/language_model/5gram.bin")
os.system("rm asr_LM_skripsi_local_common_voice/language_model/5gram_correct.arpa")

Reading asr_LM_skripsi_local_common_voice/language_model/5gram_correct.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
SUCCESS


0

Resample Demo

In [55]:
# for i in range(10):
#     print(f"Sentence : {data_train_cleaned['sentence'][i]}")
#     ipd.Audio(data=data_train_cleaned['audio_resampled'][i], autoplay=True, rate=16000)

Preprocesss Input with Processor

In [56]:
def prepare_dataset(batch):

    batch["input_values"] = processor(batch["audio_resampled"], sampling_rate=16_000).input_values[0]
    batch["input_length"] = len(batch["input_values"])
    
    with processor.as_target_processor():
        batch["labels"] = processor(batch["sentence"]).input_ids
    return batch


In [57]:
data_train_input = data_train_cleaned.map(prepare_dataset, remove_columns=data_train_cleaned.column_names)
data_test_input = data_test_cleaned.map(prepare_dataset, remove_columns=data_test_cleaned.column_names)

  0%|          | 0/8274 [00:00<?, ?ex/s]



  0%|          | 0/3618 [00:00<?, ?ex/s]

Remove Audio That Has More Than 5sec

In [58]:
max_input_length_in_sec = 5.0
data_train_filtered = data_train_input.filter(lambda x: x < max_input_length_in_sec * processor.feature_extractor.sampling_rate, input_columns=["input_length"])
data_test_filtered = data_test_input.filter(lambda x: x < max_input_length_in_sec * processor.feature_extractor.sampling_rate, input_columns=["input_length"])

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

Padding 

In [59]:
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [60]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

Word Error Rate Metric

In [61]:
wer_metric = load_metric("wer")

  wer_metric = load_metric("wer")


Computer WER While Training

In [62]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

Clear Cache and Trainer Model Variable

In [63]:
# import gc
# del trainer
# del model
# del training_args
# gc.collect()
# torch.cuda.empty_cache()

In [64]:
state_dict = torch.load(f"/home/alckylzer/.cache/huggingface/hub/models--indonesian-nlp--wav2vec2-large-xlsr-indonesian/snapshots/68fbcbd947e32184a704b401b71973d6c27de0c1/pytorch_model.bin", map_location='cpu')
state_dict.pop('lm_head.weight')
state_dict.pop('lm_head.bias')

tensor([-0.0160, -0.0121, -0.0101,  0.0007, -0.0418, -0.0089, -0.0063, -0.0200,
        -0.0106, -0.0140, -0.0244, -0.0264, -0.0341, -0.0121, -0.0232, -0.0013,
        -0.0211, -0.0273, -0.0090, -0.0077,  0.0071, -0.0165, -0.0511, -0.0129,
        -0.0185, -0.0085, -0.1896,  0.0354])

In [65]:
model = Wav2Vec2ForCTC.from_pretrained(
    "indonesian-nlp/wav2vec2-large-xlsr-indonesian",
    #############################
    # attention_dropout=0.094,
    # hidden_dropout=0.047,
    # feat_proj_dropout=0.04,
    # mask_time_prob=0.04,
    # layerdrop=0.041,
    # activation_dropout=0.055,
    #############################
    # attention_dropout=0.094,
    # hidden_dropout=0.047,
    # feat_proj_dropout=0.04,
    # mask_time_prob=0.082,
    # layerdrop=0.041,
    # activation_dropout=0.055,
    #############################
    # attention_dropout=0.3,
    # activation_dropout=0.2,
    # hidden_dropout=0.3,
    # mask_time_prob=0.05,
    #############################
    # attention_dropout=0.0,
    # hidden_dropout=0.0,
    # feat_proj_dropout=0.0,
    # mask_time_prob=0.05,
    # layerdrop=0.0,
    state_dict=state_dict,
    ctc_loss_reduction="mean",
    bos_token_id=processor.tokenizer.bos_token_id,
    eos_token_id=processor.tokenizer.eos_token_id,
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at indonesian-nlp/wav2vec2-large-xlsr-indonesian and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Freeze Parameter

In [66]:
model.freeze_feature_encoder()

In [67]:
training_args = TrainingArguments(
    dataloader_num_workers=2,
    output_dir="asr_skripsi_local_common_voice",
    group_by_length=True,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    evaluation_strategy="steps",
    num_train_epochs=20,
    gradient_checkpointing=True,
    save_steps=400,
    warmup_steps=500,
    eval_steps=400,
    logging_steps=400,
    save_total_limit=2,
    push_to_hub=False,
    optim='adamw_bnb_8bit',
    ###############################
    learning_rate=7.5e-5,
    # learning_rate=4.42184e-05,
    # learning_rate=1e-4,
    # weight_decay=0.0354792,
    # weight_decay=1e-2,
    # learning_rate=1e-4,
    weight_decay=0.0005,
)

In [68]:
# import bitsandbytes as bnb
# from transformers.trainer_pt_utils import get_parameter_names
# from torch import nn

# decay_parameters = get_parameter_names(model, [nn.LayerNorm])
# decay_parameters = [name for name in decay_parameters if "bias" not in name]

# optimizer_grouped_parameters = [
#     {
#         "params": [p for n, p in model.named_parameters() if n in decay_parameters],
#         "weight_decay": training_args.weight_decay,
#     },
#     {
#         "params": [p for n, p in model.named_parameters() if n not in decay_parameters],
#         "weight_decay": 0.0,
#     },
# ]

# adam_bnb_optim = bnb.optim.Adam8bit(
#     optimizer_grouped_parameters,
#     betas=(training_args.adam_beta1, training_args.adam_beta2),
#     eps=training_args.adam_epsilon,
#     lr=training_args.learning_rate,
# )

In [69]:
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    # optimizers=(adam_bnb_optim, None),
    compute_metrics=compute_metrics,
    train_dataset=data_train_filtered,
    eval_dataset=data_test_filtered,
    tokenizer=processor.feature_extractor,
)

In [70]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4384
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 2740
  Number of trainable parameters = 311258269


Step,Training Loss,Validation Loss,Wer
400,4.6896,0.314434,0.306355
800,0.5919,0.151268,0.272998
1200,0.4698,0.151505,0.274743
1600,0.4255,0.144252,0.272287
2000,0.4045,0.142727,0.268278
2400,0.3794,0.143434,0.268214


The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2917
  Batch size = 8
Saving model checkpoint to asr_skripsi_local_common_voice/checkpoint-400
Configuration saved in asr_skripsi_local_common_voice/checkpoint-400/config.json
Model weights saved in asr_skripsi_local_common_voice/checkpoint-400/pytorch_model.bin
Feature extractor saved in asr_skripsi_local_common_voice/checkpoint-400/preprocessor_config.json
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2917
  Batch size = 8
Saving model checkpoint to asr_sk

TrainOutput(global_step=2740, training_loss=1.062227254714409, metrics={'train_runtime': 17336.4129, 'train_samples_per_second': 5.058, 'train_steps_per_second': 0.158, 'total_flos': 1.0045163306413486e+19, 'train_loss': 1.062227254714409, 'epoch': 20.0})

In [71]:
# trainer.push_to_hub()

Evaluation

In [1]:
from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2Processor, Wav2Vec2ForCTC

processor = Wav2Vec2Processor.from_pretrained("asr_skripsi_local_common_voice")
model = Wav2Vec2ForCTC.from_pretrained("asr_skripsi_local_common_voice/checkpoint-2400").to("cuda")


2023-05-25 16:47:43.337241: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
processorLM = Wav2Vec2ProcessorWithLM.from_pretrained("asr_LM_skripsi_local_common_voice", eos_token=None, bos_token=None)

In [76]:
input_dict = processor(data_test_cleaned[7]["audio_resampled"], sampling_rate=16_000, return_tensors="pt", padding=True)

logits = model(input_dict.input_values.to("cuda")).logits

pred_ids = torch.argmax(logits, dim=-1)[0]
transcript = data_test_cleaned[7]["sentence"].lower()

In [77]:
# logits.shape

In [79]:
transcriptionLM = processorLM.batch_decode(logits.cpu().detach().numpy()).text

In [80]:
print("Prediction:")
print(processor.decode(pred_ids))

print("\nPrediction LM:")
print(transcriptionLM[0])

print("\nReference:")
print(transcript)

Prediction:
jalanan sepi dari kendaraan

Prediction LM:
jalanan sepi dari kendaraan

Reference:
jalanan sepi dari kendaraan


In [21]:
def evaluate(batch):

    input_dict = processor(batch["audio_resampled"], sampling_rate=16_000, return_tensors="pt", padding=True)


    logits = model(input_dict.input_values.to("cuda")).logits
    pred_ids = torch.argmax(logits, dim=-1)[0]

    decoded = processor.decode(pred_ids)
    batch["pred_strings"] = decoded
    return batch

result = data_test_cleaned.map(evaluate)



  0%|          | 0/3618 [00:00<?, ?ex/s]

In [22]:
wer = load_metric("wer")
print("WER Without LM : {:2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))

  wer = load_metric("wer")


WER Without LM : 12.940301


In [28]:
def evaluateLM(batch):

    input_dict = processor(batch["audio_resampled"], sampling_rate=16_000, return_tensors="pt", padding=True)


    logits = model(input_dict.input_values.to("cuda")).logits
    transcriptionLM = processorLM.batch_decode(logits.cpu().detach().numpy()).text[0]
    
    batch["pred_strings"] = transcriptionLM
    return batch

resultLM = data_test_cleaned.map(evaluateLM)

wer = load_metric("wer")
print("WER With LM : {:2f}".format(100 * wer.compute(predictions=resultLM["pred_strings"], references=result["sentence"])))

Loading cached processed dataset at /home/alckylzer/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/id/11.0.0/2c65b95d99ca879b1b1074ea197b65e0497848fd697fdb0582e0f6b75b6f4da0/cache-061ad0d2021ac2cb.arrow


WER With LM : 6.016472
