# Environment setup

In [None]:
# Hugging Face Hub id of the pretrained wav2vec2 checkpoint; it is loaded as the
# teacher model later in the notebook.
BASE_WAV2VEC_MODEL = "nguyenvulebinh/wav2vec2-base-vietnamese-250h"
# The processor is loaded from the same repository id as the model.
BASE_WAV2VEC_PROCESSOR = BASE_WAV2VEC_MODEL
# Directory that receives the train/test CSV files and the preprocessed datasets.
OUTPUT_DIR= '/data'
# Directory used for the distilled checkpoint, the n-gram LM file and the ONNX export.
MY_MODEL_DIR = "/checkpoints"

# Data preprocess

In [None]:
import os
import glob
import librosa
from tqdm import tqdm
import soundfile as sf
import numpy as np
import uuid

In [None]:
import re
import json

# Characters allowed to survive transcript cleaning; everything else becomes a space.
vocab = 'ẻ6ụí3ỹýẩởềõ7êứỏvỷalựqờjốàỗnéủуôuyằ4wbệễsìầỵ8dểrũcạ9ếùỡ2tiǵử̀á0ậeộmẳợĩhâúọồặfữắỳxóãổị̣zảđèừòẵ1ơkẫpấẽỉớẹăoư5|'
def clear_text(row):
  """Normalise the transcript stored under ``row['text']``.

  The text is lower-cased, '%' is spelled out, every character outside
  ``vocab`` is replaced by a space and runs of whitespace are collapsed.

  Args:
    row: mapping-like record (e.g. a DataFrame row) with a string 'text' entry.

  Returns:
    The same ``row`` object, with 'text' overwritten by the cleaned string.
  """
  replacements = (
    ('%', ' phần trăm '),
  )

  cleaned = row['text'].lower()
  for source, target in replacements:
    cleaned = cleaned.replace(source, target)

  # Out-of-alphabet characters turn into spaces before whitespace is squeezed.
  cleaned = re.sub('[^' + vocab + ']', ' ', cleaned).strip()
  row['text'] = ' '.join(cleaned.split())
  return row

#Create vocab+tokenizer+processor
# Punctuation and stray symbols removed from transcripts before the vocabulary
# is built. Written as a raw string: the pattern is the same one the plain
# literal produced, but it no longer relies on invalid string escapes (such as
# backslash-comma), which newer Python versions warn about.
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'

def remove_special_characters(batch):
    """Strip the ignored punctuation from ``batch["text"]`` and lower-case it.

    Args:
        batch: mapping with a string "text" entry.

    Returns:
        The same ``batch`` object with "text" replaced by the cleaned string.
    """
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower()
    return batch

def extract_all_chars(batch):
  """Collect the distinct characters used by a batch of transcripts.

  Args:
    batch: mapping whose "text" entry is an iterable of strings.

  Returns:
    A dict with "vocab" (a one-element list holding the list of unique
    characters) and "all_text" (a one-element list holding the transcripts
    joined by single spaces).
  """
  joined_text = " ".join(batch["text"])
  unique_chars = [*set(joined_text)]
  result = {"vocab": [unique_chars]}
  result["all_text"] = [joined_text]
  return result

def create_vocab(train_ds, test_ds, vocab_json):
  """Build a character vocabulary from both splits and write it as JSON.

  Args:
    train_ds: training dataset exposing ``map`` and ``column_names`` with a "text" column.
    test_ds: evaluation dataset with the same structure.
    vocab_json: path of the JSON file to write.

  Raises:
    KeyError: if no space character occurs in the collected text.
  """
  train_ds = train_ds.map(remove_special_characters)
  test_ds = test_ds.map(remove_special_characters)

  # batch_size=-1 is presumably meant to hand the whole split to
  # extract_all_chars as one batch, so row 0 holds every character of the split.
  vocab_train = train_ds.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=train_ds.column_names)
  vocab_test = test_ds.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=test_ds.column_names)

  # Sort the characters so the token ids are the same on every run; plain set
  # iteration order changes between interpreter sessions.
  vocab_list = sorted(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))
  vocab_dict = {char: idx for idx, char in enumerate(vocab_list)}

  # The word delimiter "|" takes over the id of the space character.
  vocab_dict["|"] = vocab_dict.pop(" ")
  vocab_dict["[UNK]"] = len(vocab_dict)
  vocab_dict["[PAD]"] = len(vocab_dict)

  with open(vocab_json, 'w', encoding='utf-8') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [None]:
from transformers import (
    AutoProcessor,
    Wav2Vec2Processor,
    Wav2Vec2ProcessorWithLM,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2ForCTC,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
def create_tokenizer(model_path, train_ds, test_ds):
  """Return a CTC tokenizer, loading it from ``model_path`` when possible.

  If the pretrained tokenizer cannot be loaded, a vocabulary is built from the
  two datasets, written to "./tmp_vocab.json" and used to construct a new
  tokenizer.

  Args:
    model_path: model id or directory passed to ``from_pretrained``.
    train_ds: training dataset, used only by the fallback.
    test_ds: evaluation dataset, used only by the fallback.

  Returns:
    A ``Wav2Vec2CTCTokenizer`` instance.
  """
  try:
    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(model_path, do_lower_case=True)
  except Exception:
    # Catch Exception rather than everything, so Ctrl-C and interpreter
    # shutdown are not mistaken for "no tokenizer available".
    vocab_json = "./tmp_vocab.json"
    create_vocab(train_ds, test_ds, vocab_json)

    tokenizer = Wav2Vec2CTCTokenizer(vocab_json, unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
  return tokenizer

def create_processor(model_path):
  """Return a processor for ``model_path``.

  ``AutoProcessor`` is tried first. If that fails, the processor is assembled
  from the pretrained feature extractor and the pretrained CTC tokenizer.

  Args:
    model_path: model id or directory passed to ``from_pretrained``.

  Returns:
    The loaded processor, or a ``Wav2Vec2Processor`` built in the fallback.
  """
  try:
    processor = AutoProcessor.from_pretrained(model_path)
  except Exception:
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_path)
    # Load the tokenizer directly: no datasets are available here, and calling
    # create_tokenizer(model_path) without them raises TypeError.
    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(model_path, do_lower_case=True)
    processor = Wav2Vec2Processor(feature_extractor, tokenizer)
  return processor

In [None]:
import jsonlines
import pandas as pd
from sklearn.model_selection import train_test_split
# Read the label file, attach the audio directory prefix, clean the
# transcripts and hold out 10% of the rows for evaluation.
label_path = 'label.jsonl'
with jsonlines.open(label_path) as jsonl_file:
    results = [
        {"file": 'D:/Downloads/SLU/SLU/train/data/' + line["file"], "text": line["sentence"]}
        for line in jsonl_file
    ]
df = pd.DataFrame(results)
# Drop rows whose transcript contains the '???' marker.
has_no_marker = df["text"].str.find('???') == -1
df = df[has_no_marker]
df = df.apply(clear_text, axis=1)
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)


In [None]:
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)
train_df.to_csv(os.path.join(OUTPUT_DIR, "train_df.csv"), index=False)
test_df.to_csv(os.path.join(OUTPUT_DIR, "test_df.csv"), index=False)

In [None]:
import datasets

## load train dataset from pre-step
# Each CSV is loaded on its own, so both are requested with split='train'.
train_ds = datasets.load_dataset('csv', data_files=os.path.join(OUTPUT_DIR, "train_df.csv"), keep_in_memory=True, split='train')
test_ds = datasets.load_dataset('csv', data_files=os.path.join(OUTPUT_DIR, "test_df.csv"), keep_in_memory=True, split='train')
# Notebook-level processor read by speech_file_to_array_fn.
processor = create_processor(BASE_WAV2VEC_PROCESSOR)
def speech_file_to_array_fn(batch):
  """Turn one example into model inputs and label ids.

  Reads the audio file named by ``batch["file"]``, runs it through the
  notebook-level ``processor`` and tokenises ``batch["text"]``.

  Args:
    batch: example with "file" (audio path) and "text" (transcript).

  Returns:
    The same ``batch`` with "input_values" and "labels" added.
  """
  waveform, sample_rate = sf.read(batch["file"])
  audio_features = processor(waveform, sampling_rate=sample_rate)
  batch["input_values"] = audio_features.input_values[0]
  # Inside this context the processor call is applied to the transcript text.
  with processor.as_target_processor():
      batch["labels"] = processor(batch["text"]).input_ids
  return batch
# Replace the raw columns with model-ready features, then persist both splits.
train_columns = train_ds.column_names
test_columns = test_ds.column_names
train_ds = train_ds.map(speech_file_to_array_fn, remove_columns=train_columns)
test_ds = test_ds.map(speech_file_to_array_fn, remove_columns=test_columns)
hf_dataset_root = os.path.join(OUTPUT_DIR, "hf_datastet")
train_ds.save_to_disk(os.path.join(hf_dataset_root, "train"))
test_ds.save_to_disk(os.path.join(hf_dataset_root, "test"))

# Train model

## Distilling Model

In [None]:
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import datasets

In [None]:
# Load the pretrained CTC model and its processor from the base checkpoint.
model = Wav2Vec2ForCTC.from_pretrained(BASE_WAV2VEC_MODEL)
# Rebinds the notebook-level `processor`; later cells use it for decoding and for saving.
processor = Wav2Vec2Processor.from_pretrained(BASE_WAV2VEC_MODEL)
print("Number of teacher model parameters: ", model.num_parameters())

In [None]:
# Display the module structure of the loaded base model.
model

In [None]:
import torch.nn as nn

def deleteEncodingLayers(model, num_layers_to_keep):
    """Truncate the wav2vec2 encoder to its first ``num_layers_to_keep`` layers.

    The model is modified in place. When the model carries a config with a
    ``num_hidden_layers`` field, that field is updated too, so a checkpoint
    saved afterwards declares the number of layers it actually contains.

    Args:
        model: object exposing ``model.wav2vec2.encoder.layers`` as an indexable
            sequence of modules.
        num_layers_to_keep: number of leading encoder layers to retain.

    Returns:
        The same ``model`` object.

    Raises:
        IndexError: if ``num_layers_to_keep`` exceeds the number of layers.
    """
    oldModuleList = model.wav2vec2.encoder.layers
    newModuleList = nn.ModuleList(
        [oldModuleList[i] for i in range(num_layers_to_keep)]
    )
    model.wav2vec2.encoder.layers = newModuleList

    # Keep the config consistent with the truncated encoder.
    config = getattr(model, "config", None)
    if config is not None and hasattr(config, "num_hidden_layers"):
        config.num_hidden_layers = len(newModuleList)
    return model


In [None]:
### Load distilled model from checkpoint
distilled_wav2vec2 = Wav2Vec2ForCTC.from_pretrained(MY_MODEL_DIR)
# Keep only the first 6 encoder layers of the loaded checkpoint.
distilled_wav2vec2 = deleteEncodingLayers(distilled_wav2vec2, 6)
# The count printed here belongs to the distilled (student) model, not the teacher.
print("Number of student model parameters: ", distilled_wav2vec2.num_parameters())

In [None]:
# Display the module structure of the truncated student model.
distilled_wav2vec2

## Distillation Train

In [None]:
# Reload the preprocessed splits from the "hf_datastet" folder under OUTPUT_DIR,
# the same location the preprocessing step saves them to.
train_ds = datasets.load_from_disk(os.path.join(OUTPUT_DIR, "hf_datastet", "train"))
test_ds = datasets.load_from_disk(os.path.join(OUTPUT_DIR, "hf_datastet", "test"))

In [None]:
from torch.utils.data import DataLoader

# Make both datasets return torch tensors.
train_ds.set_format("torch")
test_ds.set_format("torch")
# No collate function is supplied and batch_size is 1.
# NOTE(review): presumably because utterances differ in length and are not padded — confirm.
train_dataloader = DataLoader(train_ds, shuffle=True, batch_size=1)
eval_dataloader = DataLoader(test_ds, batch_size=1)

In [None]:
from torch.optim import AdamW
from transformers import get_scheduler

# Only the parameters of the distilled (student) model are optimised.
optimizer = AdamW(distilled_wav2vec2.parameters(), lr=5e-5)

epochs = 15
# Total number of batches across all epochs; used as the scheduler horizon
# and as the length of the training progress bar.
num_training_steps = epochs * len(train_dataloader)
# Linear schedule without warm-up steps.
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

$$ L_{\text{distil}} = \mathrm{KLLoss}\left(\sigma(z_{\text{student}}/T),\ \sigma(z_{\text{teacher}}/T)\right) \cdot T^2 $$

$$ L_{\text{final}} = \alpha L_{\text{distil}} + (1 - \alpha) L_{\text{student}} $$

In [None]:
# Softmax temperature applied to both models' logits.
T = 4
# Weight of the distillation term in the final loss; the student's own loss gets 1 - alpha.
alpha = 0.8

def compute_distil_loss(student_outputs, teacher_outputs):
  """Temperature-scaled KL divergence between student and teacher predictions.

  Both logit tensors are softened with temperature ``T`` and normalised over
  their last axis. The KL divergence is computed for every leading position and
  averaged over those positions, then multiplied by ``T**2``.

  Args:
    student_outputs: object with a ``logits`` tensor whose last axis indexes classes.
    teacher_outputs: object with a ``logits`` tensor of the same shape.

  Returns:
    A scalar tensor holding the distillation loss.
  """
  student_logits = student_outputs.logits
  teacher_logits = teacher_outputs.logits
  num_classes = student_logits.size(-1)

  # Normalise over the class axis (the last one), not over axis 1, and flatten
  # the leading axes so that "batchmean" averages the per-position KL values.
  student_log_probs = F.log_softmax(student_logits / T, dim=-1).reshape(-1, num_classes)
  teacher_probs = F.softmax(teacher_logits / T, dim=-1).reshape(-1, num_classes)

  distil_loss = F.kl_div(student_log_probs, teacher_probs, reduction="batchmean") * T * T
  return distil_loss

In [None]:
import torch
from tqdm.auto import tqdm

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# The teacher only supplies target logits, so it stays in eval mode.
teacher_model = Wav2Vec2ForCTC.from_pretrained(BASE_WAV2VEC_MODEL)
teacher_model.eval()
teacher_model.to(device)
distilled_wav2vec2.to(device)

progress_bar = tqdm(range(num_training_steps))

distilled_wav2vec2.train()
for epoch in range(epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        # The student receives the labels as well, so its output carries its own loss.
        student_outputs = distilled_wav2vec2(**batch)

        # The teacher is not trained: skip autograd bookkeeping and do not pass
        # the labels, since only its logits are used.
        teacher_inputs = {k: v for k, v in batch.items() if k != "labels"}
        with torch.no_grad():
            teacher_outputs = teacher_model(**teacher_inputs)

        distil_loss = compute_distil_loss(student_outputs, teacher_outputs)
        student_loss = student_outputs.loss
        # Weighted sum of the distillation loss and the student's own loss.
        final_loss = alpha * distil_loss + (1. - alpha) * student_loss
        final_loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    # Write the student and the processor to MY_MODEL_DIR after every epoch.
    distilled_wav2vec2.save_pretrained(save_directory=MY_MODEL_DIR)
    processor.save_pretrained(save_directory=MY_MODEL_DIR)

## Language Model

In [None]:
# !apt install build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev
# !wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz
# !mkdir kenlm/build && cd kenlm/build && cmake .. && make -j2
# !ls kenlm/build/bin

In [None]:
#! kenlm/build/bin/lmplz -o 4 <"/content/all_text.txt" > "4gram.arpa"

In [None]:
import kenlm
from pyctcdecode import Alphabet, BeamSearchDecoderCTC, LanguageModel

def get_decoder_ngram_model(tokenizer, ngram_lm_path):
    """Build a CTC beam-search decoder backed by a KenLM n-gram model.

    Args:
        tokenizer: tokenizer providing ``get_vocab()`` plus ``pad_token_id``,
            ``unk_token_id`` and ``word_delimiter_token_id``.
        ngram_lm_path: path of the language-model file handed to ``kenlm.Model``.

    Returns:
        A ``BeamSearchDecoderCTC`` using the tokenizer's labels and the n-gram LM.
    """
    # Order the tokens by id and drop the two entries with the highest ids.
    # NOTE(review): presumably these are the bos/eos tokens — confirm for the tokenizer in use.
    id_token_pairs = sorted((idx, token) for (token, idx) in tokenizer.get_vocab().items())
    labels = [token for _, token in id_token_pairs][:-2]

    # convert ctc blank character representation
    labels[tokenizer.pad_token_id] = ""
    # replace special characters
    labels[tokenizer.unk_token_id] = ""
    # convert space character representation
    labels[tokenizer.word_delimiter_token_id] = " "

    # The pad token's index is passed as the CTC blank index.
    alphabet = Alphabet.build_alphabet(labels, ctc_token_idx=tokenizer.pad_token_id)
    kenlm_model = kenlm.Model(ngram_lm_path)
    return BeamSearchDecoderCTC(alphabet,
                                language_model=LanguageModel(kenlm_model))

In [None]:
# Path of the "lm_4.arpa" language-model file inside the checkpoint directory.
my_lm_file = os.path.join(MY_MODEL_DIR, "lm_4.arpa")
# Beam-search decoder built on the tokenizer of the notebook-level processor.
ngram_lm_model = get_decoder_ngram_model(processor.tokenizer, my_lm_file)

# Evaluation

## Teacher

In [None]:
import time
from datasets import load_metric

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
teacher_model = Wav2Vec2ForCTC.from_pretrained(BASE_WAV2VEC_MODEL)
teacher_model.eval()
teacher_model.to(device)

def map_to_result(batch):
    """Run the teacher on one example and attach predictions and reference text.

    Args:
        batch: example with "input_values" (audio features) and "labels" (token ids).

    Returns:
        The same ``batch`` with "pred_str" (argmax decoding), "pred_str_lm"
        (beam search with the n-gram LM), "text" (reference decoded from the
        labels), "outputs" and "logits" added.
    """
    with torch.no_grad():
        # as_tensor accepts both plain sequences and existing tensors without
        # the copy-construct warning that torch.tensor emits for tensors.
        input_values = torch.as_tensor(batch["input_values"], device=device).unsqueeze(0)
        outputs = teacher_model(input_values)
        logits = outputs.logits

    pred_ids = torch.argmax(logits, dim=-1)
    # Keep the argmax transcript: a later cell reads teacher_results["pred_str"].
    batch["pred_str"] = processor.batch_decode(pred_ids)[0]
    batch["pred_str_lm"] = ngram_lm_model.decode(logits[0].cpu().detach().numpy(), beam_width=64)
    batch["text"] = processor.decode(batch["labels"], group_tokens=False)
    batch["outputs"] = outputs
    batch["logits"] = logits

    return batch

start_time = time.perf_counter()
teacher_results = test_ds.map(map_to_result, remove_columns=test_ds.column_names)
print("Inference time: {:.3f}".format(time.perf_counter() - start_time))

wer_metric = load_metric("wer")
print("Test WER without LM: {:.3f}".format(wer_metric.compute(predictions=teacher_results["pred_str"], references=teacher_results["text"])))
print("Test WER with LM: {:.3f}".format(wer_metric.compute(predictions=teacher_results["pred_str_lm"], references=teacher_results["text"])))

In [None]:
# Parameter count of the teacher model.
teacher_model.num_parameters()

In [None]:
# First ten entries of the teacher's "pred_str" column.
# NOTE(review): this lookup needs the teacher's map_to_result to populate
# batch["pred_str"]; verify that column exists, otherwise it raises KeyError.
teacher_results["pred_str"][:10]

In [None]:
# First ten teacher predictions decoded with the n-gram language model.
teacher_results["pred_str_lm"][:10]

In [None]:
# First ten reference transcripts, decoded from the label ids.
teacher_results["text"][:10]

## Student

In [None]:
import time
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from datasets import load_metric

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
distilled_wav2vec2.eval()
distilled_wav2vec2.to(device)
# processor = Wav2Vec2Processor.from_pretrained(MY_MODEL_DIR)

def map_to_result(batch):
    """Run the distilled model on one example and attach predictions and reference text.

    Args:
        batch: example with "input_values" (audio features) and "labels" (token ids).

    Returns:
        The same ``batch`` with "pred_str" (argmax decoding), "pred_str_lm"
        (beam search with the n-gram LM), "text" (reference decoded from the
        labels), "outputs" and "logits" added.
    """
    with torch.no_grad():
        # as_tensor accepts both plain sequences and existing tensors without
        # the copy-construct warning that torch.tensor emits for tensors.
        input_values = torch.as_tensor(batch["input_values"], device=device).unsqueeze(0)
        outputs = distilled_wav2vec2(input_values)
        logits = outputs.logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_str"] = processor.batch_decode(pred_ids)[0]
    batch["pred_str_lm"] = ngram_lm_model.decode(logits[0].cpu().detach().numpy(), beam_width=64)
    batch["text"] = processor.decode(batch["labels"], group_tokens=False)
    batch["outputs"] = outputs
    batch["logits"] = logits

    return batch

start_time = time.perf_counter()
student_results = test_ds.map(map_to_result, remove_columns=test_ds.column_names)
print("Inference time: {:.3f}".format(time.perf_counter() - start_time))

wer_metric = load_metric("wer")
print("Test WER without LM: {:.3f}".format(wer_metric.compute(predictions=student_results["pred_str"], references=student_results["text"])))
print("Test WER with LM: {:.3f}".format(wer_metric.compute(predictions=student_results["pred_str_lm"], references=student_results["text"])))

In [None]:
# Parameter count of the distilled model.
distilled_wav2vec2.num_parameters()

In [None]:
# First ten student predictions from argmax decoding (no language model).
student_results["pred_str"][:10]

In [None]:
# First ten student predictions decoded with the n-gram language model.
student_results["pred_str_lm"][:10]

In [None]:
# First ten reference transcripts, decoded from the label ids.
student_results["text"][:10]

In [None]:
import csv
test_csv = os.path.join(OUTPUT_DIR, 'test_df.csv')
audio_files = []
# Read with UTF-8 and the double-quote quoting that pandas' to_csv writes by
# default. A '|' quote character would misparse transcripts containing '|',
# and the platform default encoding may not be UTF-8.
with open(test_csv, newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    for row in reader:
        # Blank lines come back as empty lists; skip them instead of failing.
        if row:
            # The header row is not skipped, so the first entry is the column name.
            audio_files.append(row[0])

In [None]:
# Second entry of audio_files.
# NOTE(review): index 0 is presumably the CSV header, since the reader does not skip it — confirm.
audio_files[1]

# ONNX

In [None]:
def convert_to_onnx(model, onnx_model_path):
    """Export ``model`` to ONNX using a random dummy waveform.

    Args:
        model: torch module taking a (1, audio_len) float tensor as input.
        onnx_model_path: destination file path or file-like object.
    """
    print("Converting model to onnx")

    audio_len = 250000
    # Create the dummy input on the model's device; a CPU tensor fails during
    # export when the model has been moved to the GPU.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")
    x = torch.randn(1, audio_len, requires_grad=True, device=model_device)

    torch.onnx.export(model,                        # model being run
                    x,                              # model input (or a tuple for multiple inputs)
                    onnx_model_path,                # where to save the model (can be a file or file-like object)
                    export_params=True,             # store the trained parameter weights inside the model file
                    opset_version=11,               # the ONNX version to export the model to
                    do_constant_folding=True,       # whether to execute constant folding for optimization
                    input_names = ['input'],        # the model's input names
                    output_names = ['output'],      # the model's output names
                    dynamic_axes={'input' : {1 : 'audio_len'},    # variable length axes
                                'output' : {1 : 'audio_len'}})

def quantize_onnx_model(onnx_model_path, quantized_model_path):
    """Write a dynamically quantized copy of an ONNX model.

    Args:
        onnx_model_path: path of the ONNX model to quantize.
        quantized_model_path: path where the quantized model is written.
    """
    print("Starting quantization...")
    # Imported here so onnxruntime is only needed when quantization is requested.
    from onnxruntime.quantization import QuantType, quantize_dynamic

    weight_type = QuantType.QUInt8
    quantize_dynamic(onnx_model_path, quantized_model_path, weight_type=weight_type)

    print(f"Quantized model saved to: {quantized_model_path}")

# Export the distilled model to ONNX; the quantized copy is opt-in.
quantize = False
onnx_model_path = os.path.join(MY_MODEL_DIR, "wav2vec.onnx")
convert_to_onnx(distilled_wav2vec2, onnx_model_path)
if quantize:
    quantized_model_name = os.path.join(MY_MODEL_DIR, "wav2vec.quant.onnx")
    quantize_onnx_model(onnx_model_path, quantized_model_name)