<a href="https://colab.research.google.com/github/indra622/tutorials/blob/master/Nemo_finetuning_English_to_ZerothKorean_char.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prerequisites

In [None]:
!pip install nemo_toolkit['all']

In [None]:
import os
import glob
import subprocess
import tarfile
import wget
import copy
from omegaconf import OmegaConf, open_dict

In [None]:
import nemo
import nemo.collections.asr as nemo_asr
from nemo.collections.asr.metrics.wer import word_error_rate
from nemo.utils import logging, exp_manager
from collections import abc as container_abcs

In [None]:
!pip install datasets
from datasets import load_dataset

# Pre-trained model

In [None]:
# Fetch the pretrained "stt_en_quartznet15x5" checkpoint and map its weights onto the CPU.
char_model = nemo_asr.models.ASRModel.from_pretrained(
    "stt_en_quartznet15x5",
    map_location='cpu',
)

In [None]:
ds = load_dataset("kresnik/librispeech_asr_test", "clean")

In [None]:
test_ds = ds['test']
sample = test_ds[0]
sample

In [None]:
# Importing the submodule explicitly still binds the top-level `IPython` name.
import IPython.display

# The player is the cell's last expression, so it is rendered as the cell output.
IPython.display.Audio(sample['file'])

In [None]:
# Transcribe the single sample inspected above.
result = char_model.transcribe([sample['file']])
# Transcribe the first ten test files as one batch. Slicing the rows first
# (`test_ds[:10]`) avoids materialising the entire `file` column just to keep
# ten paths; the resulting list of paths is the same.
results = char_model.transcribe(test_ds[:10]['file'])

In [None]:
# Show the model output next to the lower-cased ground-truth transcript.
hypothesis = result[0]
reference = sample['text'].lower()
print("Hypothesis: " + hypothesis)
print("Reference:  " + reference)

# Fine-tuning

In [None]:
print(OmegaConf.to_yaml(char_model.cfg))

## Korean datasets

### label set (character set)

In [None]:
ds = load_dataset("kresnik/zeroth_korean", "clean")

In [None]:
train_ds = ds['train']
test_ds = ds['test']

In [None]:
test_ds

In [None]:
print(OmegaConf.to_yaml(char_model.cfg))

In [None]:
test_ds[0]

In [None]:
# Columns that the rest of the notebook does not use.
_columns_to_drop = ["speaker_id", "chapter_id", "id", "audio"]

# Only ask for columns that are still present: `remove_columns` raises when a
# requested column is missing, which used to make this cell fail on a re-run.
train_ds = train_ds.remove_columns([c for c in _columns_to_drop if c in train_ds.column_names])
test_ds = test_ds.remove_columns([c for c in _columns_to_drop if c in test_ds.column_names])

In [None]:
import soundfile as sf
def get_duration(batch):
  """Attach the clip length, in seconds, to one dataset example.

  Args:
    batch: Mapping with a 'file' entry that `sf.SoundFile` can open.

  Returns:
    The same mapping, with 'duration' set to frames / samplerate.
  """
  # Open through a context manager so the file handle is released right away;
  # this function is mapped over every example, and unclosed handles pile up.
  with sf.SoundFile(batch['file']) as speech:
    batch['duration'] = speech.frames / speech.samplerate
  return batch

#def rename_key(batch):
#  batch['audio_filepath'] = batch['file']
  #batch.remove_columns(['file'])

#  return batch


In [None]:
def _with_duration_and_path(split):
  """Add a `duration` column to a split, then rename `file` to `audio_filepath`."""
  timed = split.map(get_duration)
  return timed.rename_column(original_column_name='file', new_column_name='audio_filepath')


# Both splits go through exactly the same preparation.
train_ds = _with_duration_and_path(train_ds)
test_ds = _with_duration_and_path(test_ds)

### write manifest

In [None]:
import json
from tqdm.auto import tqdm

def read_manifest(path):
    """Load a JSON-lines manifest into a list of dicts.

    Args:
        path: Path of a text file holding one JSON object per line.

    Returns:
        list: The decoded objects, in file order. Blank lines are skipped.
    """
    manifest = []
    # Read as UTF-8 explicitly: the manifests in this notebook carry Korean
    # transcripts, and the platform default encoding is not always UTF-8.
    with open(path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Reading manifest data"):
            line = line.strip()
            if not line:
                # Tolerate empty lines (e.g. a trailing newline at end of file).
                continue
            manifest.append(json.loads(line))
    return manifest

In [None]:
import os

# Absolute locations of the two manifest files; the dataset configs further
# down refer to these same variables.
train_json_path = os.path.abspath('train.json')
test_json_path = os.path.abspath('test.json')

# Export each split to disk. `read_manifest` below parses the files line by
# line, one JSON object per line.
train_json = train_ds.to_json(train_json_path)
test_json = test_ds.to_json(test_json_path)

# Read back from the exact paths that were just written, rather than from
# relative literals that only match while the working directory is unchanged.
train_manifest = read_manifest(train_json_path)
test_manifest = read_manifest(test_json_path)

## extract chars

In [None]:
def extract_all_chars(batch):
  """Collect the distinct characters found in a batch of transcripts.

  Args:
    batch: Mapping whose "text" entry is an iterable of strings.

  Returns:
    Dict with two one-element lists: "vocab" wraps the list of distinct
    characters (in arbitrary order) and "all_text" wraps the transcripts
    joined by single spaces.
  """
  joined = " ".join(batch["text"])
  unique_chars = [*set(joined)]
  return dict(vocab=[unique_chars], all_text=[joined])

In [None]:
# batch_size=-1 hands the whole split to `extract_all_chars` in a single call,
# so each result holds exactly one vocabulary row.
_single_batch = dict(batched=True, batch_size=-1, keep_in_memory=True)

vocab_train = train_ds.map(extract_all_chars, remove_columns=train_ds.column_names, **_single_batch)
vocab_test = test_ds.map(extract_all_chars, remove_columns=test_ds.column_names, **_single_batch)

In [None]:
# Union of the characters seen in either split. Sorting fixes the order of the
# labels: iterating a set of strings yields a different order on each kernel
# start, which would silently change the label-to-index mapping between runs.
vocab_list = sorted(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))

In [None]:
vocab_list

In [None]:
char_model.change_vocabulary(new_vocabulary=vocab_list)

In [None]:
#@title Freeze Encoder { display-mode: "form" }
# Form-controlled switch: a later cell freezes the encoder when this is truthy
# and unfreezes it otherwise.
freeze_encoder = True #@param ["False", "True"] {type:"raw"}
# Coerce whatever the form supplied to a plain bool.
freeze_encoder = bool(freeze_encoder)

In [None]:
import torch
import torch.nn as nn

def enable_bn_se(m):
    """Switch a batch-norm or squeeze-excite module back to trainable.

    Written to be passed to `Module.apply`, which calls it on every submodule.
    A module qualifies when its exact type is `nn.BatchNorm1d` or when its
    class name contains 'SqueezeExcite'. Qualifying modules are put in train
    mode and all of their parameters get `requires_grad=True`; any other
    module is left untouched.

    Args:
        m: The module to inspect.
    """
    is_batch_norm = type(m) == nn.BatchNorm1d
    is_squeeze_excite = 'SqueezeExcite' in type(m).__name__
    if not (is_batch_norm or is_squeeze_excite):
        return

    m.train()
    for weight in m.parameters():
        weight.requires_grad_(True)

In [None]:
# Either fine-tune the whole encoder, or freeze it and re-enable only the
# modules that `enable_bn_se` selects.
if not freeze_encoder:
    char_model.encoder.unfreeze()
    logging.info("Model encoder has been un-frozen")
else:
    char_model.encoder.freeze()
    # `apply` visits every encoder submodule with `enable_bn_se`.
    char_model.encoder.apply(enable_bn_se)
    logging.info("Model encoder has been frozen, and batch normalization has been unfrozen")

In [None]:
char_model.cfg.labels = vocab_list

In [None]:
cfg = copy.deepcopy(char_model.cfg)

In [None]:
# Overrides for the training section: the Korean train manifest and label set.
train_overrides = {
    "manifest_filepath": f"{train_json_path}",
    "labels": vocab_list,
    "normalize_transcripts": False,
    "batch_size": 16,
    "num_workers": 2,
    "pin_memory": True,
    "trim_silence": True,
}

# Overrides for the validation section: the test split's manifest is used for validation.
validation_overrides = {
    "manifest_filepath": test_json_path,
    "labels": vocab_list,
    "normalize_transcripts": False,
    "batch_size": 8,
    "num_workers": 8,
    "pin_memory": True,
    "trim_silence": True,
}

# Apply the overrides to the copied config inside `open_dict`.
with open_dict(cfg):
    for key, value in train_overrides.items():
        cfg.train_ds[key] = value
    for key, value in validation_overrides.items():
        cfg.validation_ds[key] = value

In [None]:
# Hand the edited train/validation sections of `cfg` to the model's data setup methods.
train_section, validation_section = cfg.train_ds, cfg.validation_ds
char_model.setup_training_data(train_section)
char_model.setup_multiple_validation_data(validation_section)

In [None]:
# Original optimizer + scheduler
print(OmegaConf.to_yaml(char_model.cfg.optim))

In [None]:
# Edit the optimizer / scheduler section of the model config in place.
optim_cfg = char_model.cfg.optim
with open_dict(optim_cfg):
    optim_cfg.lr = 0.01
    optim_cfg.betas = [0.95, 0.5]  # from paper
    optim_cfg.weight_decay = 0.001  # Original weight decay

    sched_cfg = optim_cfg.sched
    sched_cfg.warmup_steps = None  # Drop the default fixed warmup step count...
    sched_cfg.warmup_ratio = 0.05  # ...and set the warmup ratio to 5 % instead
    sched_cfg.min_lr = 1e-5

In [None]:
print(OmegaConf.to_yaml(char_model.cfg.spec_augment))

In [None]:
# with open_dict(char_model.cfg.spec_augment):
#   char_model.cfg.spec_augment.freq_masks = 2
#   char_model.cfg.spec_augment.freq_width = 25
#   char_model.cfg.spec_augment.time_masks = 2
#   char_model.cfg.spec_augment.time_width = 0.05

char_model.spec_augmentation = char_model.from_config_dict(char_model.cfg.spec_augment)

In [None]:
#@title Metric
use_cer = True #@param ["False", "True"] {type:"raw"}
log_prediction = True #@param ["False", "True"] {type:"raw"}



In [None]:
char_model._wer.use_cer = use_cer
char_model._wer.log_prediction = log_prediction

In [None]:
import torch
import pytorch_lightning as ptl

# One GPU when CUDA is available, otherwise none (CPU training).
gpus = 1 if torch.cuda.is_available() else 0

EPOCHS = 50  # 100 epochs would provide better results, but would take an hour to train

# Checkpointing and logging are switched off on the Trainer itself.
# NOTE(review): presumably because exp_manager configures both later - confirm.
trainer_kwargs = dict(
    gpus=gpus,
    max_epochs=EPOCHS,
    accumulate_grad_batches=1,
    checkpoint_callback=False,
    logger=False,
    log_every_n_steps=50,
    check_val_every_n_epoch=10,  # validate once every 10 epochs
)
trainer = ptl.Trainer(**trainer_kwargs)

# Attach the trainer to the model.
char_model.set_trainer(trainer)

# Re-assign the model's internal config through the public `cfg` attribute.
char_model.cfg = char_model._cfg

In [None]:
# NEMO_EXPM_VERSION is generally meant for multi-node multi-GPU training.
# In a notebook it is unnecessary and can make the logs of separate training
# runs overwrite each other, so drop it if it is set.
os.environ.pop('NEMO_EXPM_VERSION', None)

# Checkpoint settings: monitor "val_wer" with mode "min".
checkpoint_params = exp_manager.CallbackParams(
    monitor="val_wer",
    mode="min",
    always_save_nemo=True,
    save_best_model=True,
)

# Build the experiment-manager config and convert it to an OmegaConf structured config.
config = OmegaConf.structured(
    exp_manager.ExpManagerConfig(
        exp_dir='experiments/lang/',
        name="ASR-Char-Model-Korean",
        checkpoint_callback_params=checkpoint_params,
    )
)

logdir = exp_manager.exp_manager(trainer, config)

In [None]:
# Detect Google Colab by attempting to import its helper package.
try:
    from google import colab
    COLAB_ENV = True
except (ImportError, ModuleNotFoundError):
    # ModuleNotFoundError is a subclass of ImportError, so either name alone would match.
    COLAB_ENV = False

# Load the TensorBoard notebook extension
if COLAB_ENV:
    %load_ext tensorboard
    # The log directory is an absolute path under /content, combining the
    # exp_dir and name given to ExpManagerConfig.
    # NOTE(review): assumes the notebook's working directory is /content - confirm.
    %tensorboard --logdir /content/experiments/lang/ASR-Char-Model-Korean/
else:
    print("To use tensorboard, please use this notebook in a Google Colab environment.")

In [None]:
%%time
# Run the training loop; %%time reports how long the whole cell took.
trainer.fit(char_model)