# Process corpus in preparation for MFA (Montreal Forced Aligner)

- copies wav files and writes lab files into the directory that MFA reads from
- processes transcript files to remove punctuation and convert the text to lowercase
- prints the commands that generate MFA alignments (these can be used to get word-aligned speechbrain and fastpitch mels)

In [22]:
import os
import shutil

from tqdm import tqdm

from fastpitch.common.text.cleaners import lowercase_no_punc

In [23]:
# --- corpus configuration ---
corpus_name = 'ljspeech'
corpus_dir = '/home/s1785140/speechbrain/templates/speech_recognition_CharTokens_NoLM/data/LJSpeech-1.1'

# names of the two sub-folders of corpus_dir used by this notebook
out_folder = 'wav_and_labs_for_MFA'
aligns_out_folder = 'MFA_alignments_lowercase_nopunc'

# absolute paths of those sub-folders
out_dir, aligns_out_dir = (
    os.path.join(corpus_dir, folder) for folder in (out_folder, aligns_out_folder)
)

# only the wav/lab folder is created here; aligns_out_dir is just a path at this point
os.makedirs(out_dir, exist_ok=True)

In [13]:
# TODO - make this a function that depends on the corpus

# load transcript file for ljspeech
transcript_file = os.path.join(corpus_dir, 'metadata.csv')

# grab normalised text from transcript file
# Each line is '|'-separated: field 0 is the utterance id and field 2 is the
# normalised text. The encoding is given explicitly so that decoding does not
# depend on the locale of the machine running the notebook, and blank lines
# are skipped instead of raising IndexError on the missing fields.
with open(transcript_file, encoding='utf-8') as f:
    rows = [row.split('|') for row in f.read().splitlines() if row.strip()]

# the file is already closed here; clean the text into (utterance id, text) pairs
lines = [(fields[0], lowercase_no_punc(fields[2])) for fields in rows]

print(lines[:5])

[('LJ001-0001', 'printing in the only sense with which we are at present concerned differs from most if not from all the arts and crafts represented in the exhibition'), ('LJ001-0002', 'in being comparatively modern'), ('LJ001-0003', 'for although the chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the netherlands by a similar process'), ('LJ001-0004', 'produced the block books which were the immediate predecessors of the true printed book'), ('LJ001-0005', 'the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing')]


In [14]:
# copy .wav files into outdir
# shutil.copy replaces the old shell `cp`: it needs no shell (so paths with
# spaces or special characters are safe), avoids spawning a process per file,
# and reports failures instead of silently dropping them.
failed_copies = []
for utt_id, _ in tqdm(lines):
    wav_name = utt_id + '.wav'
    wav_path = os.path.join(corpus_dir, 'wavs', wav_name)
    out_path = os.path.join(out_dir, wav_name)
    if os.path.exists(out_path):
        continue  # already copied on a previous run
    try:
        shutil.copy(wav_path, out_path)
    except OSError as err:
        # keep going so one bad file does not stop the whole copy, but remember it
        failed_copies.append((wav_path, err))

if failed_copies:
    first_path, first_err = failed_copies[0]
    print(f'WARNING: failed to copy {len(failed_copies)} wav file(s), e.g. {first_path}: {first_err}')

100%|██████████| 13100/13100 [00:03<00:00, 3797.88it/s]


In [15]:
# make .lab files in outdir
# One <utterance id>.lab transcript is written per utterance, into the same
# directory and with the same file stem as the copied .wav.
for utt_id, text in tqdm(lines):
    lab_path = os.path.join(out_dir, utt_id + '.lab')
    # explicit encoding: without it the bytes written depend on the machine's locale
    with open(lab_path, 'w', encoding='utf-8') as f:
        f.write(text)

100%|██████████| 13100/13100 [07:52<00:00, 27.75it/s] 


# print MFA command to run

Install MFA:
```bash
# update conda
conda update -n base -c defaults conda

# install MFA in a new conda env
conda create -n aligner -c conda-forge montreal-forced-aligner
source ~/.bashrc
conda activate aligner

# download models
mfa model download acoustic english_us_arpa
mfa model download dictionary english_us_arpa
```

In [26]:
# print a MFA command to run at the command line
# The instructions are collected first and emitted with a single print call;
# the leading "\n" on the two mfa commands puts a blank line before each.
mfa_instructions = (
    '# *** Run following command on GPU node: ***',
    'conda activate aligner',
    f"\nmfa validate {out_dir} english_us_arpa english_us_arpa",
    f"\nmfa align --clean {out_dir} english_us_arpa english_us_arpa {aligns_out_dir}",
)
print(*mfa_instructions, sep='\n')

# *** Run following command on GPU node: ***
conda activate aligner

mfa validate /home/s1785140/speechbrain/templates/speech_recognition_CharTokens_NoLM/data/LJSpeech-1.1/wav_and_labs_for_MFA english_us_arpa english_us_arpa

mfa align --clean /home/s1785140/speechbrain/templates/speech_recognition_CharTokens_NoLM/data/LJSpeech-1.1/wav_and_labs_for_MFA english_us_arpa english_us_arpa /home/s1785140/speechbrain/templates/speech_recognition_CharTokens_NoLM/data/LJSpeech-1.1/MFA_alignments_lowercase_nopunc
