## Part, the first
### Setting up MFA

In [1]:
%%capture
# Fetch the Montreal Forced Aligner v1.0.1 Linux release tarball and unpack it under /tmp.
# %%capture suppresses the (verbose) wget/tar output of this cell.
import os
os.chdir('/tmp')
!wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz
!tar zxvf montreal-forced-aligner_linux.tar.gz
# Make the bundled libpython3.6m.so.1.0 also reachable under the unversioned name.
# NOTE(review): presumably the MFA executables look up libpython3.6m.so by that name - confirm.
!ln -s /tmp/montreal-forced-aligner/lib/libpython3.6m.so.1.0 /tmp/montreal-forced-aligner/lib/libpython3.6m.so

In [2]:
# Work from /kaggle/working; the relative paths used by later cells
# (./munster-model, ../input/..., fuaimeanna-write.pl) resolve against it.
os.chdir('/kaggle/working')

# Put the MFA shared libraries and executables on the search paths.
# LD_LIBRARY_PATH is often unset, so read it with a default instead of indexing
# os.environ (which raises KeyError), and drop empty components so the result
# never contains an empty entry (which the loader treats as the current directory).
_mfa_root = '/tmp/montreal-forced-aligner'
os.environ['LD_LIBRARY_PATH'] = ':'.join(
    part for part in (os.environ.get('LD_LIBRARY_PATH', ''), f'{_mfa_root}/lib/') if part
)
os.environ['PATH'] = ':'.join(
    part for part in (os.environ.get('PATH', ''), f'{_mfa_root}/bin/') if part
)

In [3]:
%%capture
# Install the libgfortran3 system package; -y answers the confirmation prompt automatically.
# NOTE(review): presumably a runtime dependency of the bundled MFA binaries - confirm.
!apt-get -y install libgfortran3

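To create the same data, fork and run [this notebook](https://www.kaggle.com/jimregan/scrape-fuaimeanna-ie), then attach its output to this notebook as input data. The cells below read it from `../input/scrape-fuaimeanna-private`; adjust that path if your copy is attached under a different name.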

In [4]:
!mkdir /tmp/m
!mkdir /tmp/c
!mkdir /tmp/u

!cp ../input/scrape-fuaimeanna-private/wav/*s1.wav /tmp/u
!cp ../input/scrape-fuaimeanna-private/wav/*s2.wav /tmp/m
!cp ../input/scrape-fuaimeanna-private/wav/*s3.wav /tmp/c

In [5]:
%%writefile fuaimeanna-write.pl
#!/usr/bin/perl
# Reads tab-separated fuaimeanna data on STDIN and writes, per row, one transcript
# text file per output directory (/tmp/u, /tmp/c, /tmp/m) plus word/pronunciation
# pairs appended to three raw lexicon files under /tmp.
use warnings;
use strict;
use utf8;

# All standard streams carry UTF-8 text (Irish orthography and IPA symbols).
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

# Lower-cased orthographic form => replacement text used for the /tmp/c transcript
# and the Connaught lexicon instead of the form given in the input row.
my %cr_files = (
	'mo shmidiú' => 'mo chuid smidiú',
	'mo shmior' => 'mo chuid smior',
	'mo shmólach' => 'mo smólach',
	'shmachtaigh' => 'smachtaigh',
	'shmaoinigh' => 'smaoinigh',
	'shmear' => 'smear',
	'deamhain' => 'diabhail',
	'folach' => 'i bhfolach',
);
# Sound paths (fourth input column) for which no /tmp/c transcript or Connaught
# lexicon entry is written.
# NOTE(review): presumably these recordings are empty or unusable - confirm.
my %empty = (
	'/sounds/gob_i3_s3.mp3' => 1,
	'/sounds/iioctha_i3_s3.mp3' => 1,
	'/sounds/mo_shuiiochaan_i3_s3.mp3' => 1,
	'/sounds/riail_i3_s3.mp3' => 1
);

# Raw lexicon outputs, opened in append mode. A failure here would otherwise pass
# silently and leave the lexicons empty, so abort with the reason instead.
open(LEXM, '>>', '/tmp/lexicon-munster.raw')
	or die "Cannot open /tmp/lexicon-munster.raw for appending: $!\n";
binmode LEXM, ':utf8';
open(LEXU, '>>', '/tmp/lexicon-ulster.raw')
	or die "Cannot open /tmp/lexicon-ulster.raw for appending: $!\n";
binmode LEXU, ':utf8';
open(LEXC, '>>', '/tmp/lexicon-connaught.raw')
	or die "Cannot open /tmp/lexicon-connaught.raw for appending: $!\n";
binmode LEXC, ':utf8';

# write_text($file, $text)
#
# Appends $text, UTF-8 encoded and without adding a newline, to the file at
# path $file. Because the file is opened in append mode, calling this twice for
# the same path concatenates the two texts.
#
# If the file cannot be opened, a warning naming the path and the OS error is
# emitted and nothing is written; processing of further rows continues.
sub write_text {
	my ($file, $text) = @_;
	# Lexical handle: closed automatically and not shared through a global name.
	my $out;
	unless (open($out, '>>', $file)) {
		warn "write_text: cannot open $file for appending: $!\n";
		return;
	}
	binmode $out, ':utf8';
	print {$out} $text;
	close($out)
		or warn "write_text: error closing $file: $!\n";
}

# write_pron($file, $text, $pron)
#
# Writes lexicon lines of the form "<word> <pronunciation>" to the filehandle
# $file (despite its name, a filehandle or glob reference, not a path).
#
#   $text  orthographic text; words are separated by single spaces
#   $pron  space-separated pronunciation symbols; the parts belonging to
#          different words are separated by ' # '
#
# The stress marks (ˈ, ˌ) and ' . ' separators are removed from $pron before
# writing. If the number of words differs from the number of pronunciation
# parts, an ERROR line is printed to STDERR and no lexicon line is written, so
# that words are never paired with a missing or misaligned pronunciation.
sub write_pron {
	my ($file, $text, $pron) = @_;
	# For 'ar tí', each ' . ˈ ' sequence is rewritten to the ' # ' separator so
	# that the pronunciation splits into one part per word.
	if ($text eq 'ar tí') {
		$pron =~ s/ \. ˈ / # /g;
	}
	$pron =~ s/ [ˈˌ] / /g;
	$pron =~ s/^[ˈˌ] //g;
	$pron =~ s/ \. / /g;
	my @words = split/ /, $text;
	my @prons = split/ \# /, $pron;
	if (@words != @prons) {
		print STDERR "ERROR: $file $text $pron\n";
		# Do not emit entries that cannot be aligned word by word.
		return;
	}
	if (@words == 1) {
		print $file "$text $pron\n";
	} else {
		for my $i (0 .. $#words) {
			print $file "$words[$i] $prons[$i]\n";
		}
	}
}

# Main loop: one tab-separated row per input line.
# Column 0 is the orthographic form; columns 1/3/5 are the sound paths written
# under /tmp/u, /tmp/c and /tmp/m respectively, and columns 2/4/6 are the
# matching pronunciations.
while(<STDIN>) {
	chomp;
	my @line = split/\t/;
	# Skip the header row.
	next if($line[0] eq 'Orthographic');
	my $text = lc($line[0]);
	# This entry is left out of all outputs.
	next if($line[0] eq "d'fhág");

	# Turn each sound path "/sounds/<name>.mp3" into the transcript path
	# "<corpus dir>/<name>.txt".
	my $uout = $line[1];
	$uout =~ s!/sounds/!!;
	$uout =~ s/\.mp3$/.txt/;
	my $cout = $line[3];
	$cout =~ s!/sounds/!!;
	$cout =~ s/\.mp3$/.txt/;
	my $mout = $line[5];
	$mout =~ s!/sounds/!!;
	$mout =~ s/\.mp3$/.txt/;
	$uout = '/tmp/u/' . $uout;
	$cout = '/tmp/c/' . $cout;
	$mout = '/tmp/m/' . $mout;

	my $pronu = $line[2];
	my $pronc = $line[4];
	my $pronm = $line[6];

	# Each output gets its own word for this entry. $text has already been
	# lower-cased, so the comparison must use the lower-case form; comparing
	# against 'Gaeilge' could never match and left this branch unreachable.
	if($text eq 'gaeilge') {
		write_text($uout, "gaeilic");
		write_text($cout, "gaeilge");
		write_text($mout, "gaelainn");
		write_pron(\*LEXU, "gaeilic", $pronu);
		write_pron(\*LEXC, "gaeilge", $pronc);
		write_pron(\*LEXM, "gaelainn", $pronm);
		next;
	}
	# For these three words the first 'x t̪ˠ' in the /tmp/u pronunciation is
	# rewritten to 'ɾˠ t̪ˠ'.
	if($line[0] eq 'bocht' || $line[0] eq 'teacht' || $line[0] eq 'teocht') {
		$pronu =~ s/x t̪ˠ/ɾˠ t̪ˠ/;
	}
	write_text($uout, $text);
	write_pron(\*LEXU, $text, $pronu);
	write_text($mout, $text);
	write_pron(\*LEXM, $text, $pronm);
	# Rows whose fourth column is listed in %empty get no /tmp/c transcript or
	# lexicon entry; otherwise %cr_files may substitute a different text.
	if(!exists $empty{$line[3]}) {
		my $cfix = exists $cr_files{$text} ? $cr_files{$text} : $text;
		write_text($cout, $cfix);
		write_pron(\*LEXC, $cfix, $pronc);
	}
}

Writing fuaimeanna-write.pl


In [6]:
!cat ../input/scrape-fuaimeanna-private/all-fuaimeanna-data.tsv | perl fuaimeanna-write.pl

In [7]:
!cat /tmp/lexicon-connaught.raw | sort | uniq > /tmp/lexicon-connaught.txt
!cat /tmp/lexicon-ulster.raw | sort | uniq > /tmp/lexicon-ulster.txt
!cat /tmp/lexicon-munster.raw | sort | uniq > /tmp/lexicon-munster.txt
!cat /tmp/lexicon-connaught.raw /tmp/lexicon-ulster.raw /tmp/lexicon-munster.raw | sort | uniq > /tmp/lexicon-all.txt

In [8]:
!mkdir /tmp/all
!cp /tmp/c/* /tmp/all
!cp /tmp/m/* /tmp/all
!cp /tmp/u/* /tmp/all
!mkdir /tmp/mfa-temp

### Run MFA

In [9]:
!mfa_train_and_align -t /tmp/mfa-temp -o ./munster-model /tmp/m /tmp/lexicon-munster.txt ./textgrid-munster
!mfa_train_and_align -t /tmp/mfa-temp -o ./ulster-model /tmp/u /tmp/lexicon-ulster.txt ./textgrid-ulster
!mfa_train_and_align -t /tmp/mfa-temp -o ./connaught-model /tmp/c /tmp/lexicon-connaught.txt ./textgrid-connaught
!mfa_train_and_align -t /tmp/mfa-temp -o ./all-model /tmp/all /tmp/lexicon-all.txt ./textgrid-all

Setting up corpus information...
Creating dictionary information...
Setting up training data...
Calculating MFCCs...
Calculating CMVN...
Number of speakers in corpus: 1, average number of utterances per speaker: 760.0
b'number of phones 259\nnumber of pdfs 198\nnumber of transition-ids 1734\nnumber of transition-states 807\nfeature dimension 39\nnumber of gaussians 198\n'
None
b'number of phones 259\nnumber of pdfs 198\nnumber of transition-ids 1734\nnumber of transition-states 807\nfeature dimension 39\nnumber of gaussians 198\n'
None
Beginning monophone training...
100%|███████████████████████████████████████████| 39/39 [02:26<00:00,  2.47s/it]
Initializing triphone training...
Beginning triphone training...
100%|███████████████████████████████████████████| 34/34 [06:27<00:00,  8.53s/it]
Initializing speaker-adapted triphone training...
Beginning speaker-adapted triphone training...
100%|███████████████████████████████████████████| 34/34 [04:38<00:00,  7.20s/it]
Sav

In [10]:
!mfa_train_g2p -t /tmp/mfa-temp /tmp/lexicon-ulster.txt ./g2p-ulster
!mfa_train_g2p -t /tmp/mfa-temp /tmp/lexicon-munster.txt ./g2p-munster
!mfa_train_g2p -t /tmp/mfa-temp /tmp/lexicon-connaught.txt ./g2p-connaught
!mfa_train_g2p -t /tmp/mfa-temp /tmp/lexicon-all.txt ./g2p-all


GitRevision: kaldi-6-g64719c
Loading input file: /tmp/mfa-temp/g2p-ulster/input.txt
Starting EM...
Finished first iter...
Iteration: 1 Change: 2.79152
Iteration: 2 Change: 0.0378847
Iteration: 3 Change: 0.0240593
Iteration: 4 Change: 0.0121169
Iteration: 5 Change: 0.00603533
Iteration: 6 Change: 0.00318336
Iteration: 7 Change: 0.0019784
Iteration: 8 Change: 0.00152206
Iteration: 9 Change: 0.00115299
Iteration: 10 Change: 0.000938416
Iteration: 11 Change: 0.000809669
Last iteration: 
GitRevision: kaldi-6-g64719c
Initializing...
Converting...
Saved model to ./g2p-ulster
GitRevision: kaldi-6-g64719c
Loading input file: /tmp/mfa-temp/g2p-munster/input.txt
Starting EM...
Finished first iter...
Iteration: 1 Change: 2.83583
Iteration: 2 Change: 0.0388265
Iteration: 3 Change: 0.0203438
Iteration: 4 Change: 0.00895786
Iteration: 5 Change: 0.00451708
Iteration: 6 Change: 0.0029273
Iteration: 7 Change: 0.00200415
Iteration: 8 Change: 0.0013833
Iteration: 9 Change: 