% tutorial.tex -- Phylogenetic automata, pruning, and multiple alignment
\documentclass{article}
\bibliographystyle{unsrt}
% --- Cross-referencing helpers ---
% Each \Xref/\Xlabel pair wraps \ref/\label with a fixed prefix so labels
% are namespaced by kind (sec., app., tab., Equations., Figures.).
% Section
\newcommand{\secref}[1]{Subsection~\ref{sec.#1}}
\newcommand{\seclabel}[1]{\label{sec.#1}}
% Appendix
\newcommand{\appref}[1]{Appendix~\ref{app.#1}}
\newcommand{\applabel}[1]{\label{app.#1}}
% Table
\newcommand{\tabnum}[1]{\ref{tab.#1}}
\newcommand{\tabref}[1]{Table~\tabnum{#1}}
\newcommand{\tablabel}[1]{\label{tab.#1}}
% Equation
\newcommand{\eqnref}[1]{Equation~\ref{Equations.#1}}
\newcommand{\eqnlabel}[1]{\label{Equations.#1}}
% Figure
\newcommand{\figref}[1]{Figure~\ref{Figures.#1}}
\newcommand{\figlabel}[1]{\label{Figures.#1}}
% \easyfig{file}{includegraphics-options}{label}{caption}:
% one-stop figure float, centered, with caption and namespaced label.
% The % after the opening brace stops the line break from injecting a
% spurious space at every call site.
\newcommand{\easyfig}[4]{%
\begin{figure}
\centering
\includegraphics[#2]{#1}
\caption{\figlabel{#3} #4}%
\end{figure}}
% Convenience wrappers around \easyfig: #1 = file basename (also used as
% the figure label), #2 = caption text.
\newcommand{\pngfig}[2]{\easyfig{#1.png}{}{#1}{#2}}
\newcommand{\pdffig}[2]{\easyfig{#1-fig.pdf}{}{#1}{#2}}
\newcommand{\widepngfig}[2]{\easyfig{#1.png}{width=\textwidth}{#1}{#2}}
\newcommand{\tallpngfig}[2]{\easyfig{#1.png}{height=.8\textheight}{#1}{#2}}
\newcommand{\widepdffig}[2]{\easyfig{#1-fig.pdf}{width=\textwidth}{#1}{#2}}
\newcommand{\tallpdffig}[2]{\easyfig{#1-fig.pdf}{height=.8\textheight}{#1}{#2}}
% \sidewayspngfig{basename}{caption}: full-width rotated (sideways) figure;
% the PNG basename doubles as the figure label. Requires the rotating package
% (loaded below). Line ends are %-guarded to avoid stray spaces.
\newcommand{\sidewayspngfig}[2]{%
\begin{sidewaysfigure}
\centering
\includegraphics[width=\textwidth]{#1.png}
\caption{\figlabel{#1} #2}
\end{sidewaysfigure}%
}
% ToDo
% Draft-time placeholder macros, to be resolved before publication.
% (\textbf replaces the obsolete {\bf ...}; output is unchanged.)
\newcommand{\needfig}[1]{\textbf{Need figure: } #1 }
\newcommand{\needfigref}[1]{Figure~??? [#1] }
\newcommand{\needcite}[1]{[CITE #1]}
\newcommand{\todo}[1]{[TODO: #1]}
% Packages
\usepackage{graphicx}
\usepackage{url}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{color}
\usepackage{ifthen}
\usepackage{rotating}
% soul (highlighting) is disabled; \hl below falls back to a no-op.
%\usepackage{soul}
\newcommand{\hl}[1]{#1}
% Symbol used in the text for the profile-size limit parameter.
\newcommand{\profileSizeLimit}{p}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Imported parsetree.sty
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% parse trees
%\input{parsetree.sty}
%parsetree.sty - LaTeX style file with macros for typesetting parse trees
% See parsetree.doc for extensive section-by-section comments to the code.
% Eirik Hektoen, 1994-2-24
% Some Initial Variables and Auxiliary Macros:
% Report a fatal parsetree error via TeX's \errmessage.
\def\pterror#1{\errmessage{Parsetree ERROR: #1}}
% Dimen registers + setter macros for the horizontal/vertical node gaps.
\newdimen\pthgap\def\pthorgap#1{\pthgap=#1}
\newdimen\ptvgap\def\ptvergap#1{\ptvgap=#1}
% Struts copied into every node/leaf box to enforce a uniform line height.
\newbox\ptnodestrutbox\def\ptnodestrut{\unhcopy\ptnodestrutbox}
\newbox\ptleafstrutbox\def\ptleafstrut{\unhcopy\ptleafstrutbox}
% \ptnodefont{font}{height}{depth}: set the node font and strut dimensions.
\def\ptnodefont#1#2#3{\def\ptnodefn{#1}
\setbox\ptnodestrutbox=\hbox{\vrule height#2 width0pt depth#3}}
% \ptleaffont{font}{height}{depth}: likewise for leaves.
\def\ptleaffont#1#2#3{\def\ptleaffn{#1}
\setbox\ptleafstrutbox=\hbox{\vrule height#2 width0pt depth#3}}
% Customizable Definitions (- NOTE: MAY BE REDEFINED IN DOCUMENT -):
\pthorgap{12pt} % horizontal gap betw sisters
\ptvergap{12pt} % vertical gap betw mother/daughter
\ptnodefont{\normalsize\rm}{11pt}{3pt} % font and strut height/depth: nodes
\ptleaffont{\normalsize\it}{11pt}{3pt} % font and strut height/depth: leaves
% Hierarchy-building macros:
% Registers holding the (sub)tree under construction: up to one mother (m)
% and three daughters (a, b, c); \ptx/\ptxx pass the finished box upward.
\newcount\ptdepth % current bracketing depth
\newcount\ptn % 1: m, 2: ma, 3: mab, 4: mabc
\newbox\ptm \newdimen\ptmx % m = mother
\newbox\pta \newdimen\ptax % a = 1st daughter
\newbox\ptb \newdimen\ptbx % b = 2nd daughter
\newbox\ptc \newdimen\ptcx % c = 3rd daughter (max)
\newbox\ptx \newdimen\ptxx % x = used for passing results
\newif\ifpttri % choose triangular subtree with \pttritrue
% Move the just-built box \ptx (and its centre offset \ptxx) into the next
% free slot; a fifth daughter is an error.
\def\ptnext{\advance\ptn by 1 \ifcase\ptn
\or \setbox\ptm=\box\ptx \ptmx=\ptxx \or \setbox\pta=\box\ptx \ptax=\ptxx
\or \setbox\ptb=\box\ptx \ptbx=\ptxx \or \setbox\ptc=\box\ptx \ptcx=\ptxx
\else \pterror{More than 3 daughters in (sub)tree}\fi}
% Begin/end a whole tree; \ptendtree checks every '(' was closed.
\def\ptbegtree{\ptdepth=0}
\def\ptendtree
{\ifnum\ptdepth>0 \pterror{Mismatched bracketing: too few ')'s!}\fi}
% Open a (sub)tree: one level deeper, fresh slot counter, triangle flag off.
\def\ptbeg{\ifnum\ptdepth=0 \leavevmode\fi\begingroup
\advance\ptdepth1 \ptn=0\pttrifalse}
% Close a (sub)tree: assemble it, then pass it up (or emit it at depth 0).
\def\ptend{\ifnum\ptdepth=0 \pterror{Mismatched bracketing: too many ')'s!}
\else\ptcons\endgroup\ifnum\ptdepth=0 \box\ptx\else\ptnext\fi\fi}
% Box up a node/leaf label; \ptxx records half its width (centre offset).
\def\ptnodeaux#1{\setbox\ptx=\hbox{#1}\ptxx=0.5\wd\ptx\ptnext}
\def\ptnode#1{\ptnodeaux{\ptnodefn\ptnodestrut #1}}
\def\ptleaf#1{\ptnodeaux{\ptleaffn\ptleafstrut #1}}
% Append horizontal space #1 to the box in the most recently filled slot.
\def\pthoradjust#1{\ifcase\ptn
\or \pthadjbox{\ptm}{#1} \or \pthadjbox{\pta}{#1}
\or \pthadjbox{\ptb}{#1} \or \pthadjbox{\ptc}{#1}
\else \pterror{More than 3 daughters in (sub)tree}\fi}
\def\pthadjbox#1#2{\setbox#1=\hbox{\box#1\kern#2}}
% Subtree-constructing macros:
% Dispatch on daughter count (and the triangle flag) to build the subtree
% box, then centre the mother over the daughters; the result is exported
% globally through \ptx (box) and \ptxx (centre offset).
\def\ptcons
{\ifnum\ptn=0 \ptconsz
\else
\ifpttri\ptconstri\else
\ifcase\ptn \or\ptconsm\or\ptconsma\or\ptconsmab\or\ptconsmabc \fi \fi
\ptax=\ptxx \advance\ptax-\ptmx \ptbx=0pt
\ifdim\ptax<0pt \ptbx=-\ptax\ptax=0pt\ptxx=\ptmx \fi
\setbox\ptx=\vtop{\hbox{\kern\ptax\box\ptm}\nointerlineskip
\hbox{\kern\ptbx\box\ptx}}\fi
\global\ptxx=\ptxx\global\setbox\ptx=\box\ptx}
% Arithmetic helpers on dimen registers.
\def\ptavg#1#2#3{#1=#2\advance#1#3#1=0.5#1} % #1 := average(#2,#3)
\def\ptadv#1#2{\advance#1#2\advance#1\pthgap} % #1 := #1 + #2 + \pthgap
\def\ptconsz{\ptxx=0pt \setbox\ptx=\vtop{}} % empty (zero) tree
% Single node with no daughters.
\def\ptconsm{\ptxx=0pt
\setbox\ptx=\hbox{\ptedge{1}{0}{}{}}} % mother only
\def\ptconsma % mother and one daughter
{\ptxx=\ptax\setbox\ptx=\vtop{
\hbox{\ptedge{1}\ptax{}{}}\nointerlineskip
\hbox{\box\pta}}}
\def\ptconsmab % mother and two daughters
{\ptadv\ptbx{\wd\pta}\ptavg\ptxx\ptax\ptbx
\setbox\ptx=\vtop{
\hbox{\ptedge{2}\ptax\ptbx{}}\nointerlineskip
\hbox{\box\pta\kern\pthgap\box\ptb}}}
\def\ptconsmabc % mother and three daughters
{\ptadv\ptbx{\wd\pta}\ptadv\ptcx{\wd\pta}%
\ptadv\ptcx{\wd\ptb}\ptavg\ptxx\ptax\ptcx
\setbox\ptx=\vtop{
\hbox{\ptedge{3}\ptax\ptbx\ptcx}\nointerlineskip
\hbox{\box\pta\kern\pthgap\box\ptb\kern\pthgap\box\ptc}}}
\def\ptconstri % triangular subtree
{\ifcase\ptn\or\setbox\pta\hbox{\kern2\pthgap}\or
\or\setbox\pta=\hbox{\box\pta\kern\pthgap\box\ptb}
\or\setbox\pta=\hbox{\box\pta\kern\pthgap\box\ptb\kern\pthgap\box\ptc}\fi
\ptxx=0.5\wd\pta
\setbox\ptx=\vtop{
\hbox{\ptedge{0}{0}{\wd\pta}{}}\nointerlineskip
\box\pta}}
% Macros for `diagonal rules':
% Count registers used while drawing mother-to-daughter edges.
\newcount\pted % edge mode
\newcount\ptedm % position of mother
\newcount\pteda % position a
\newcount\ptedb % position b
\newcount\ptedc % position c
\newcount\ptedl % edge length
\newcount\ptedh % edge height
\newcount\ptedhs % horizontal slope
\newcount\ptedvs % vertical slope
\newcount\ptedtemp % temporary variable
% \ptedge{mode}{a}{b}{c}: draw the edges from the mother's midpoint down to
% each daughter position inside a zero-width LaTeX picture environment.
\def\ptedge#1#2#3#4{\pted=#1%
\pteda=#2\ifcase\pted\ptedb=#3\or\or\ptedb=#3\or\ptedb=#3\ptedc=#4\fi
\ptedm=\pteda\advance\ptedm\ifcase\pted\ptedb\or\pteda\or\ptedb\or\ptedc\fi
\divide\ptedm by 2
\ptedh=\ptvgap\ptedtemp=\ptedm\advance\ptedtemp-\pteda\divide\ptedtemp by 6
\ifnum\ptedh<\ptedtemp\ptedh=\ptedtemp\fi
\unitlength=1sp%
\begin{picture}(0,\ptedh)
\ifnum\pted=3 \ptedput\ptedc\fi
\ifnum\pted=1 \else\ptedput\ptedb\fi
\ptedput\pteda
\ifnum\pted=0 \ptedbot\fi
\end{picture}}
% Draw one edge from the mother's midpoint to daughter position #1,
% mirroring the slope sign for leftward edges.
\def\ptedput#1{\ptedl=#1\advance\ptedl-\ptedm
\ifnum\ptedl>0 \ptedslope\else
\ptedl=-\ptedl\ptedslope\ptedhs=-\ptedhs\fi
\ifnum\ptedhs=0 \ptedl=\ptedh\fi
\put(\ptedm,\ptedh){\line(\ptedhs,-\ptedvs){\ptedl}}}
% Draw the horizontal base line of a triangular subtree.
\def\ptedbot
{\ptedtemp=\ptedl\multiply\ptedtemp by \ptedvs\divide\ptedtemp by \ptedhs
\ifnum\ptedtemp>0\ptedtemp=-\ptedtemp\fi \advance\ptedtemp-1
\advance\ptedtemp\ptedh\multiply\ptedl by 2
\put(\pteda,\ptedtemp){\line(1,0){\ptedl}}}
% Approximate the ideal slope by the nearest (h,v) pair supported by the
% picture environment's \line primitive (slope components must be small
% integers); the final \ifnum swaps components for steep edges.
\def\ptedslope
{\ifnum\ptedl>\ptedh\ptedhs=\ptedl\ptedvs=\ptedh
\else \ptedvs=\ptedl\ptedhs=\ptedh \fi
\divide\ptedhs by 60 \divide\ptedvs by \ptedhs
\ifnum \ptedvs < 5 \ptedvs=0 \ptedhs=1 \else
\ifnum \ptedvs < 11 \ptedvs=1 \ptedhs=6 \else
\ifnum \ptedvs < 13 \ptedvs=1 \ptedhs=5 \else
\ifnum \ptedvs < 17 \ptedvs=1 \ptedhs=4 \else
\ifnum \ptedvs < 22 \ptedvs=1 \ptedhs=3 \else
\ifnum \ptedvs < 27 \ptedvs=2 \ptedhs=5 \else
\ifnum \ptedvs < 33 \ptedvs=1 \ptedhs=2 \else
\ifnum \ptedvs < 38 \ptedvs=3 \ptedhs=5 \else
\ifnum \ptedvs < 42 \ptedvs=2 \ptedhs=3 \else
\ifnum \ptedvs < 46 \ptedvs=3 \ptedhs=4 \else
\ifnum \ptedvs < 49 \ptedvs=4 \ptedhs=5 \else
\ifnum \ptedvs < 55 \ptedvs=5 \ptedhs=6 \else
\ptedvs=1 \ptedhs=1
\fi \fi \fi \fi \fi \fi \fi \fi \fi \fi \fi \fi
\ifnum \ptedl < \ptedh
\ptedtemp=\ptedhs \ptedhs=\ptedvs \ptedvs=\ptedtemp \fi}
% Convenient End-User Environment:
% Inside {parsetree}, the characters ( ) . ` ~ become active and implement
% the bracket syntax: ( ... ) opens/closes a subtree, .X. typesets a node,
% `x' typesets a leaf, and ~ switches the subtree to triangle mode.
\newenvironment{parsetree}{\ptactivechardefs\ptbegtree}{\ptendtree}
\def\ptcatcodes
{\catcode`(=\active \catcode`)=\active
\catcode`.=\active \catcode``=\active
\catcode`~=\active}
{\ptcatcodes
\gdef\ptactivechardefs
{\ptcatcodes
\def({\ptbeg\ignorespaces}
\def){\ptend\ignorespaces}
\def.##1.{\ptnode{##1}\ignorespaces}
\def`##1'{\ptleaf{##1}\ignorespaces}
\def~{\pttritrue\ignorespaces}}}
% to link empty nodes
%\def\ptlink{\rule{0.4pt}{\ht\ptnodestrutbox + \dt\ptnodestrutbox}}
\def\ptlink{\vrule height\ht\ptnodestrutbox depth\dp\ptnodestrutbox}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% End of imported parsetree.sty
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Algorithm package (boxed style, no explicit "end" lines).
\usepackage[boxed,noend]{algorithm2e}
% Tutorial transducers
% Stable permalink to the Wikipedia revision cited in the text.
\newcommand\wikipediaTransducerUrl{http://en.wikipedia.org/w/index.php?title=Finite\_state\_transducer\&oldid=486381386}
% Calligraphic names of the component transducers.
% (\mathcal replaces the obsolete plain-TeX \cal; rendering is unchanged.)
\newcommand\nullmodel{{\mathcal N}}
\newcommand\substitute{{\mathcal S}}
\newcommand\tkf{{\mathcal B}}
\newcommand\tkfroot{{\mathcal R}}
\newcommand\profile{{\mathcal P}}
% Transducer-related latex command definitions
% (\textbf/\mathcal replace the obsolete \bf/\cal; output is unchanged.)
\newcommand\formaldefs{\textbf{Formal definitions: }}
\newcommand\charat[2]{\mbox{symbol}(#1,#2)}
% Gapped alphabet: alphabet plus the empty symbol epsilon.
\newcommand\gappedalphabet[1]{(\Omega_{#1} \cup \{\epsilon\})}
\newcommand\gapsquared{\gappedalphabet{}^2}
\newcommand\gappedpair[2]{\gappedalphabet{#1} \times \gappedalphabet{#2}}
% Weighted transition #1 from state #2 to state #4 with I/O label #3.
\newcommand\wtrans[4]{#1(#2 : [#3] : #4)}
\newcommand\transequiv{\equiv}
\newcommand\compose{}
\newcommand\identity{{\mathcal I}}
\newcommand\fork{\circ}
\newcommand\idfork{\Upsilon}
\newcommand\forkn[1]{\idfork(#1)}
\newcommand\forkfun[2]{\forkn{#1, #2}}
\newcommand\generate{\Delta}
\newcommand\recognize{\nabla}
% State set, transition set, and distinguished start/end states.
\newcommand\States{\Phi}
\newcommand\statesof[1]{\States_{#1}}
\newcommand\Transitions{\tau}
\newcommand\transitionsof[1]{\Transitions_{#1}}
\newcommand\startstate{\phi_S}
\newcommand\laststate{\phi_E}
\newcommand\startstateof[1]{\phi_{S;#1}}
\newcommand\laststateof[1]{\phi_{E;#1}}
% Weight functions (transition, emission, and wait-state variants).
\newcommand\weight{{\mathcal W}}
\newcommand\weightfunof[1]{\weight_{#1}}
\newcommand\transweightfun[1]{\weightfunof{#1}^{\mbox{\small trans}}}
\newcommand\emitweightfun[1]{\weightfunof{#1}^{\mbox{\small emit}}}
\newcommand\sumoverpaths[1]{\transweightfun{#1}(\{\pi_{#1}\})}
\newcommand\transviawait[1]{\weightfunof{#1}^{\mbox{\small via-wait}}}
\newcommand\transtowait[1]{\weightfunof{#1}^{\mbox{\small to-wait}}}
\newcommand\numberofstates[1]{|\statesof{#1}|}
\newcommand\numberoftransitions[1]{|\transitionsof{#1}|}
\newcommand\statesoftype[1]{\States_{#1}}
\newcommand\statetype{\mbox{type}}
% Two-row column vector with identical entries.
\newcommand\dup[1]{\left( \begin{array}{l} #1 \\ #1 \end{array} \right)}
% Tree-size bookkeeping: kappa leaves on a binary tree.
\newcommand\numberofleaves{\kappa}
\newcommand\numberofinternalnodes{\numberofleaves - 1}
\newcommand\numberofnodes{2\numberofleaves - 1}
\newcounter{LeafIndex}
\newcommand\leafnode[1]{\numberofleaves \ifthenelse{\equal{#1}{1}}{}{\setcounter{LeafIndex}{#1} \addtocounter{LeafIndex}{-1} +\arabic{LeafIndex}}}
\newcommand\leaves{{\mathcal L}}
\newcommand\seqlen[1]{\mbox{len}(#1)}
\newcommand\outputs{{\mathcal D}}
\newcommand\outputn[1]{{\mathcal S}_{#1}}
\newcommand\outseqlen[1]{\seqlen{\outputn{#1}}}
\newcommand\order[1]{{\mathcal O}(#1)}
% State- and type-sets
% (\mathcal replaces the obsolete \cal; rendering is unchanged.)
\newcommand\typeset[1]{{\mathcal T}_{\mbox{\small #1}}}
\newcommand\stateset[1]{\statesof{\mbox{\small #1}}}
% H_n state tuples, state-sets and type-sets
\newcommand\hstate{(\upsilon,b_l,e_l,b_r,e_r)}
\newcommand\hstatedest{(\upsilon',b'_l,e'_l,b'_r,e'_r)}
% Suffix strings shared by the state-set and type-set macros below.
\newcommand\externalsuffix{ext}
\newcommand\internalsuffix{int}
\newcommand\leftsuffix{left-int}
\newcommand\rightsuffix{right-int}
\newcommand\waitsuffix{wait}
\newcommand\externalcascades{\stateset{\externalsuffix}}
\newcommand\internalcascades{\stateset{\internalsuffix}}
\newcommand\leftcascades{\stateset{\leftsuffix}}
\newcommand\rightcascades{\stateset{\rightsuffix}}
\newcommand\waitstates{\stateset{\waitsuffix}}
\newcommand\externaltypes{\typeset{\externalsuffix}}
\newcommand\internaltypes{\typeset{\internalsuffix}}
\newcommand\lefttypes{\typeset{\leftsuffix}}
\newcommand\righttypes{\typeset{\rightsuffix}}
\newcommand\waittypes{\typeset{\waitsuffix}}
% M_n state tuples, state-sets and type-sets
\newcommand\mstate{(\rho,\upsilon,b_l,e_l,b_r,e_r)}
\newcommand\mstatedest{(\rho',\upsilon',b'_l,e'_l,b'_r,e'_r)}
% Q_n state tuples, state-sets and type-sets
\newcommand\qstate{(\rho,\upsilon,b_l,b_r)}
\newcommand\qstatedest{(\rho',\upsilon',b'_l,b'_r)}
\newcommand\matchsuffix{match}
\newcommand\nullsuffix{null}
\newcommand\leftinsertsuffix{left-ins}
\newcommand\rightinsertsuffix{right-ins}
\newcommand\leftdeletesuffix{left-del}
\newcommand\rightdeletesuffix{right-del}
\newcommand\leftemitsuffix{left-emit}
\newcommand\rightemitsuffix{right-emit}
\newcommand\qwaitsuffix{wait}
\newcommand\matchstates{\stateset{\matchsuffix}}
\newcommand\nullstates{\stateset{\nullsuffix}}
\newcommand\leftinsertstates{\stateset{\leftinsertsuffix}}
\newcommand\rightinsertstates{\stateset{\rightinsertsuffix}}
\newcommand\leftdeletestates{\stateset{\leftdeletesuffix}}
\newcommand\rightdeletestates{\stateset{\rightdeletesuffix}}
\newcommand\leftemitstates{\stateset{\leftemitsuffix}}
\newcommand\rightemitstates{\stateset{\rightemitsuffix}}
\newcommand\qwaitstates{\stateset{\qwaitsuffix}}
\newcommand\matchtypes{\typeset{\matchsuffix}}
\newcommand\nulltypes{\typeset{\nullsuffix}}
\newcommand\leftinserttypes{\typeset{\leftinsertsuffix}}
\newcommand\rightinserttypes{\typeset{\rightinsertsuffix}}
\newcommand\leftdeletetypes{\typeset{\leftdeletesuffix}}
\newcommand\rightdeletetypes{\typeset{\rightdeletesuffix}}
\newcommand\leftemittypes{\typeset{\leftemitsuffix}}
\newcommand\rightemittypes{\typeset{\rightemitsuffix}}
\newcommand\qwaittypes{\typeset{\qwaitsuffix}}
% DP
% Notation used in the dynamic-programming recursions and pseudocode.
\newcommand\envelope[2]{\mbox{is\_in\_envelope}(#1,#2)}
\newcommand\newTransName[1]{t_{#1}}
\newcommand\numStates[1]{N_{#1}}
% From-/to- index and profile-state symbols (names follow the pseudocode).
\newcommand\leftFromI{i_l}
\newcommand\rightFromI{i_r}
\newcommand\rightToI{i'_r}
\newcommand\leftToI{i'_l}
\newcommand\rightProfTo{e'_r}
\newcommand\leftProfTo{e'_l}
\newcommand\rightProfFrom{e_r}
\newcommand\leftProfFrom{e_l}
\newcommand\mTo{m'}
\newcommand\mFrom{m}
\newcommand\qTo{q'}
\newcommand\qFrom{q}
\newcommand\emitProb{\mathcal{E}}
% Pseudocode function names, typeset upright via \mbox.
\newcommand\getprofiletype{\mbox{get\_state\_type}}
\newcommand\incomingLeftProfile[1]{\mbox{incoming\_left\_profile\_indices}(#1)}
\newcommand\incomingRightProfile[1]{\mbox{incoming\_right\_profile\_indices}(#1)}
\newcommand\incomingM[1]{\mbox{incoming\_match\_states}(#1)}
\newcommand\incomingL[1]{\mbox{incoming\_left\_emit\_states}(#1)}
\newcommand\incomingR[1]{\mbox{incoming\_right\_emit\_states}(#1)}
\newcommand\addToDPFunction{sum\_paths\_to}
\newcommand\addToDP[3]{\addToDPFunction(#1,#2,#3)}
% NOTE(review): \profTrans duplicates \newTransName above -- both expand
% to t_{#1}; confirm whether one can be retired.
\newcommand\profTrans[1]{t_{#1}}
\newcommand\profiledelete[1]{\phi_D^{(#1)}}
\newcommand\profileunknown[1]{\phi_{\tau}^{(#1)}}
\newcommand\profilewait[1]{\phi_W^{(#1)}}
\newcommand\profileterminate{\profilewait{\mbox{\small end}}}
\newcommand\currentstate{m'}
\newcommand\newstate{m}
\newcommand\instates{\mbox{states\_in}}
\newcommand\inweights{\mbox{weights\_in}}
\newcommand\sample{\mbox{sample}}
\newcommand\stateindex[2]{\mbox{index\_of}_{#1}(#2)}
\newcommand\emptyarray{ ( )}
% Begin
\begin{document}
\newcommand\authorstring{
Oscar Westesson$^{1}$,
Gerton Lunter$^{2}$,
Benedict Paten$^{3}$,
Ian Holmes$^{1,\ast}$
\\
\textbf{1} Department of Bioengineering, University of California, Berkeley, CA, USA; \\
\textbf{2} Wellcome Trust Centre for Human Genetics, University of Oxford, Oxford, UK;\\
\textbf{3} Baskin School of Engineering, UC Santa Cruz, Santa Cruz, CA, USA
\\
$\ast$ E-mail: ihh@berkeley.edu
}
\newcommand\titlestring{Phylogenetic automata, pruning, and multiple alignment}
\newcommand\shorttitlestring{Phylogenetic automata, pruning, and alignment}
\markboth{\shorttitlestring}{\shorttitlestring}
\begin{flushleft}
{\Large
\textbf{\titlestring}
}
\\
\authorstring
\end{flushleft}
\pagebreak
\section*{Abstract}
We present an extension of Felsenstein's algorithm to indel models defined on entire sequences,
without the need to condition on one multiple alignment.
The algorithm makes use of a generalization from probabilistic substitution matrices to weighted finite-state transducers.
Our approach may equivalently be viewed as a probabilistic formulation of progressive multiple sequence alignment,
using partial-order graphs to represent ensemble profiles of ancestral sequences.
We present a hierarchical stochastic approximation technique which makes this algorithm tractable for alignment analyses of reasonable size.
% Start of document
\paragraph{Keywords:} Multiple alignment, Felsenstein pruning, transducers, insertion/deletion, ancestral sequence reconstruction.
\tableofcontents
\section{Background}
Felsenstein's pruning algorithm is routinely used throughout bioinformatics and molecular evolution \cite{Felsenstein81}.
A few common applications include estimation of substitution rates \cite{Yang94b};
reconstruction of phylogenetic trees \cite{RannalaYang96};
identification of conserved (slow-evolving) or recently-adapted (fast-evolving) elements in proteins and DNA \cite{SiepelHaussler04b};
detection of different substitution matrix ``signatures''
(e.g. purifying vs diversifying selection at synonymous codon positions \cite{YangEtAl2000},
hydrophobic vs hydrophilic amino acid signatures \cite{ThorneEtAl96},
CpG methylation in genomes \cite{SiepelHaussler04},
or basepair covariation in RNA structures \cite{KnudsenHein99});
annotation of structures in genomes \cite{SiepelHaussler04c,PedersenEtAl2006};
and placement of metagenomic reads on phylogenetic trees \cite{MatsenEtAl2010}.
The pruning algorithm computes the likelihood of observing a single column of a multiple sequence alignment,
given knowledge of an underlying phylogenetic tree (including a map from leaf-nodes of the tree to rows in the alignment)
and a substitution probability matrix associated with each branch of the tree.
Crucially, the algorithm sums over all unobserved substitution histories on internal branches of the tree.
For a tree containing $N$ taxa, the algorithm achieves ${\cal O}(N)$ time and memory complexity by computing and tabulating intermediate probability functions of the form $G_n(x) = P(Y_n|x_n=x)$,
where $x_n$ represents the individual residue state of ancestral node $n$,
and $Y_n=\{y_m\}$ represents all the data at leaves $\{m\}$ descended from node $n$ in the tree (i.e. the observed residues at all leaf nodes $m$ whose ancestors include node $n$).
The pruning recursion visits all nodes in postorder.
Each $G_n$ function is computed in terms of the functions $G_l$ and $G_r$ of its immediate left and right children (assuming a binary tree):
\begin{eqnarray*}
G_n(x) & = & P(Y_n|x_n = x) \\
& = & \left\{
\begin{array}{ll}
\left( \sum_{x_l} B^{(l)}_{x,\ x_l} G_l(x_l) \right) \left( \sum_{x_r} B^{(r)}_{x,\ x_r} G_r(x_r) \right) & \mbox{if $n$ is not a leaf}
\\
\delta(x=y_n) & \mbox{if $n$ is a leaf}
\end{array}
\right.
\end{eqnarray*}
where $B^{(n)}_{ab} = P(x_n=b|x_m=a)$ is the probability that node $n$ has state $b$, given that its parent node $m$ has state $a$;
and $\delta(x=y_n)$ is a Kronecker delta function terminating the recursion at the leaf nodes of the tree.
The ``states'' in the above description may represent individual residues (nucleotides, amino acids), base-pairs (in RNA secondary structures) or base-triples (codons).
Sometimes, the state space is augmented to include gap characters, or latent variables.
In the machine learning literature, the $G_n$ functions are often described as ``messages'' propagated from the leaves to the root of the tree \cite{KschischangEtAl98},
and corresponding to a summary of the information in the subtree rooted at $n$.
The usual method for extending this approach from individual residues to full-length sequences assumes both that one knows the alignment of the sequences,
and that the columns of this alignment are each independent realizations of single-residue evolution.
One uses pruning to compute the above likelihood for a single alignment column,
then multiplies together the probabilities across every column in the alignment.
For an alignment of length $L$, the time complexity is ${\cal O}(LN)$ and the memory complexity ${\cal O}(N)$.
This approach works well for marginalizing substitution histories consistent with a single alignment,
but does not readily generalize to summation over indel histories or alignments.
The purpose of this manuscript is to introduce another way of extending Felsenstein's recursion from single residues (or small groups of residues)
to entire, full-length sequences, without needing to condition on a single alignment.
With no constraints on the algorithm, \hl{using a branch transducer with $c$ states,}
the time and memory complexities are ${\cal O}((cL)^N)$,
with close similarity to the algorithms of Sankoff \cite{SankoffCedergren83} and Hein \cite{Hein2001}.
% Complexity mentioned
\hl{For a user-specified maximum internal profile size $\profileSizeLimit \geq L$, }
the worst-case time complexity drops to \hl{${\cal O}(c^2\profileSizeLimit^4 N)$
(typical case is ${\cal O}((c\profileSizeLimit)^2 N)$)} when
a stochastic lower-bound approximation is used;
memory complexity is ${\cal O}((c\profileSizeLimit)^2 + pN)$.
In this form,
the algorithm is similar to the partial order graph for multiple sequence alignment \cite{LeeGrassoSharlow2002}.
\hl{If an ``alignment envelope'' is provided as a clue to the algorithm,
the typical-case time complexity drops further to
${\cal O}(c^2\profileSizeLimit^\alpha N)$,
with empirical tests indicating that $\alpha = 1.55$.
That is, the alignment envelope brings the complexity to sub-quadratic with respect to
sequence length. }
The alignment envelope is not a hard constraint,
and may be controllably relaxed, or dispensed with altogether.
The new algorithm is, essentially, algebraically equivalent to Felsenstein's algorithm,
if the concept of a ``substitution matrix'' over a particular alphabet is extended to the countably-infinite set of all sequences over that alphabet.
Our chosen class of ``infinite substitution matrix'' is one that has a finite representation:
namely, the {\em finite-state transducer}, a probabilistic automaton that transforms an input sequence to an output sequence,
a familiar tool of statistical linguistics \cite{MohriPereiraRiley2000}.
In vector form, Felsenstein's pruning recursion is
\[
G_n = \left\{
\begin{array}{ll}
\left( B^{(l)} G_l \right) \fork \left( B^{(r)} G_r \right) & \mbox{if $n$ is not a leaf} \\
\recognize(y_n) & \mbox{if $n$ is a leaf}
\end{array}
\right.
\]
where $A \fork B$ is the pointwise (Hadamard) product
and $\recognize(x)$ is the unit column vector in dimension $x$.
By generalizing a few key algebraic ideas from matrices to transducers
(matrix multiplication, the pointwise product, row vectors and column vectors),
we are able to interpret this vector-space form of Felsenstein's algorithm
as the specification of a composite phylogenetic transducer
that spans all possible alignments (see \secref{EvidenceExpandedModel}).
The transducer approach offers a natural generalization of Felsenstein's pruning recursion to indels,
since it can be used to calculate
\[
P(S|T,\theta) = \sum_A P(S,A|T,\theta)
\]
i.e. the likelihood of sequences $S$ given tree $T$ and parameters $\theta$, summed over all alignments $A$.
Previous attempts to address indels phylogenetically have mostly returned $P(S|\hat{A},T,\theta)$ where $\hat{A}$ represents a single alignment
(typically estimated by a separate alignment program, which may introduce undetermined biases).
The exceptions to this rule are the ``statistical alignment'' methods \cite{HeinEtal2000,Hein2001,HolmesBruno2001,Metzler2003,SuchardRedelings2006}
which also marginalize alignments in an unbiased way---albeit more slowly, since they use Markov Chain Monte Carlo methods (MCMC).
In this sense, the new algorithm may be thought of as a fast, non-MCMC approximation to statistical alignment.
The purpose of this manuscript is a clean theoretical presentation of the algorithm.
In separate work \cite{WestessonEtAl2012} we find that the algorithm appears to recover more accurate reconstructions of simulated phylogenetic indel histories,
as indicated by proxy statistics such as the estimated indel rate.
The use of transducers in bioinformatics has been reported before \cite{Searls95,Holmes2003,BradleyHolmes2007,SatijaEtAl2008,PatenEtAl2008}
including an application to genome reconstruction that is conceptually similar to what we do here for proteins \cite{PatenEtAl2008}.
In particular, to maximize accessibility, we have chosen to use a formulation of finite-state transducers
that closely mirrors the formulation available on Wikipedia at the time of writing ({\tt \wikipediaTransducerUrl}).
This presentation is consistent with others described in the computer science literature \cite{MohriPereiraRiley2000}.
\subsection{Document structure}
We will begin with a narrative, ``tutorial'' overview that introduces the main
theoretical concepts using a small worked example.
Following this we will present general, precise, technical definitions.
The informal overview makes extensive use of illustrative figures,
and is intended to be easily read, even by someone not familiar with transducer theory.
The technical description is intended primarily for those wishing to integrate the
internals of this method into their own algorithms.
Either section may be read in isolation.
Each example presented in this informal section
(e.g. single transducers, composite transducers, sampled paths)
corresponds to a rigorously-defined mathematical construct defined in the technical section.
Whenever possible, we provide references between the examples and their technical definitions.
The final section of the tutorial, \secref{FormalTutorialRelationship}, gives a detailed account of the connections between the tutorial and formal sections.
\section{Informal tutorial on transducer composition}
In this section we introduce (via verbal descriptions and graphical representations)
the various machines and manners of combining them necessary for the
task of modeling evolution on the tree shown in \figref{cs-mf-liv-tree}.
(The arrangement of machines required for this tree is shown in \figref{cs-mf-liv-machines}.)
While the conceptual underpinnings of our algorithm are not unusually complex,
a complete mathematical description demands a significant amount of technical notation (which we provide in \secref{Formal}).
For this reason, we aim to minimize notation in this section,
instead focusing on a selection of illustrative example machines ranging from simple to complex.
We first describe the sorts of state machines used, beginning
with simple linear machines
(which appear at the leaves of the tree in \figref{cs-mf-liv-machines})
and moving on to the various possibilities of the branch model.
Then we describe (and provide examples for) the techniques which allow us to
co-ordinate these machines on a phylogeny: composition and intersection.
Finally, we outline how combinations of these machines allow a straightforward definition
of Felsenstein's pruning algorithm for models allowing insertion/deletion events,
and a stochastic approximation technique which will allow inference on datasets
of common practical size.
\subsection{Transducers as input-output machines}
We begin with a brief definition of transducers from Wikipedia.
% crossref
These ideas are defined with greater mathematical precision in \secref{Transducer}.
% Begin quote from Wikipedia
{\em A finite state transducer is a finite state machine with two tapes: an input tape and an output tape. ... An automaton can be said to } recognize {\em a string if we view the content of its tape as input. In other words, the automaton computes a function that maps strings into the set $\{0,1\}$. $(\dagger\dagger\dagger)$ Alternatively, we can say that an automaton } generates {\em strings, which means viewing its tape as an output tape. On this view, the automaton generates a formal language, which is a set of strings. The two views of automata are equivalent: the function that the automaton computes is precisely the indicator function of the set of strings it generates... Finite State Transducers can be weighted, where each transition is labeled with a weight in addition to the input and output labels. }
{\tt \wikipediaTransducerUrl}
% End quote from Wikipedia
\\
$(\dagger\dagger\dagger)$ For a weighted transducer this mapping is,
more generally, to the nonnegative real axis $[0,\infty)$
rather than just the binary set $\{0,1\}$.
In this tutorial section we are going to work through a small example of using transducers on a tree
for three tiny protein sequences (MF, CS, LIV).
Specifically, we will compute the likelihood of the tree shown in \figref{cs-mf-liv-tree},
explaining the common descent of these three sequences
under the so-called TKF91 model (\figref{tkf91}),
as well as a simpler model that only allows point substitutions.
To do this we will construct (progressively, from the bottom up) the ensemble of transducer machines shown in \figref{cs-mf-liv-machines}.
We will see that the full state space of \figref{cs-mf-liv-machines} is equivalent to Hein's ${\cal O}(L^N)$ alignment algorithm for the TKF91 model \cite{Hein2001};
${\cal O}(NL^2)$ progressive alignment corresponds to a greedy Viterbi/maximum likelihood-traceback approximation,
and partial-order graph alignment corresponds to a Forward/stochastic-traceback approximation.
\pdffig{cs-mf-liv-tree}{Example tree used in this tutorial.
The TKF91 model is used as the branch transducer model, but our approach
is applicable to a wider range of string transducer models. }
\pdffig{cs-mf-liv-machines}{An ensemble of transducers modeling the likelihood of the tree shown in \figref{cs-mf-liv-tree}.
We write this as $\tkfroot \cdot (\tkf \cdot (\tkf \cdot \recognize(LIV))\fork(\tkf \cdot \recognize(MF)))\fork(\tkf \cdot \recognize(CS))$.
The terms in this expression represent individual component transducers:
$\tkfroot$ is shown in \figref{tkf91-root},
$\tkf$ is shown in \figref{tkf91-labeled},
$\recognize(LIV)$ is in \figref{liv-labeled},
$\recognize(MF)$ in \figref{mf-labeled},
and
$\recognize(CS)$ in \figref{cs-labeled}.
(The general notation $\recognize(\ldots)$ is
introduced in \secref{moore-genrec} and
formalized in \secref{ExactMatch}.)
The operations for combining these transducers, denoted ``$\cdot$'' and ``$\fork$'',
are---respectively---{\em transducer composition}
(introduced in \secref{Tutorial.Composition}, formalized in \secref{Composition})
and {\em transducer intersection}
(introduced in \secref{Tutorial.Intersection}, formalized in \secref{Fork}).
The full state graph of this transducer is not shown in this manuscript:
even for such a small tree and short sequences, it is too complex to visualize easily
(the closest thing is \figref{fork3-tkf91liv-tkf91mf-tkf91cs},
which represents this transducer configuration minus the root generator, $\tkfroot$).
}
\subsubsection{Generators and recognizers}
As noted in the Wikipedia quote, transducers can be thought of as generalizations of the related
concepts of {\em generators} (state machines that emit output sequences, such as HMMs)
and parsers or {\em recognizers} (state machines that match/parse input sequences, such as the UNIX `lex' program).
Both generators and recognizers are separate special cases of transducers.
Of particular use in our treatment are generators/recognizers that generate/recognize a single unique sequence.
% crossref
Generators and recognizers are defined with greater precision in \secref{GenSinglet} and \secref{ExactMatch}.
\figref{mf-generator} is an example of a generator that uniquely generates (inserts) the protein sequence MF.
\figref{liv-small} is an example of a recognizer that uniquely recognizes (and deletes) the protein sequence LIV.
\widepngfig{mf-generator}{Generator for protein sequence MF.
This is a trivial state machine which emits (generates) the sequence MF with weight (probability) 1.
The red circle indicates the Start state, and the red diamond the End state.
}
\widepngfig{liv-small}{Recognizer for protein sequence LIV.
This is a trivial state machine which absorbs (recognizes) the sequence LIV with weight (probability) 1, and all other sequences with weight 0.
The red circle indicates the Start state, and the red diamond the End state.
}
These Figures illustrate the visual notation we use throughout the illustrative Figures of this tutorial.
States and transitions are shown as a graph.
Transitions can be labeled with absorption/emission pairs,
written $x/y$ where $x$ is the absorbed character and $y$ the emitted character.
Either $x$ or $y$ is allowed to be the empty string (shown in these diagrams as the gap character, a hyphen).
In a Figure that shows absorption/emission pairs,
if there is no absorption/emission labeled on a transition, then it can be assumed to be $-/-$
(i.e. no character is absorbed or emitted) and the transition is said to be a ``null'' transition.
Some transitions are also labeled with weights.
If no transition label is present, the weight is usually 1
(some more complicated diagrams omit all the weights, to avoid clutter).
The weight of a path is defined to be the product of transition weights occurring on the path.
The weight of an input-output sequence-pair is the sum over all path weights that generate the
specified input and output sequences.
The weight of this sequence-pair can be interpreted as the joint probability of both
(a) successfully parsing the input sequence and
(b) emitting the specified output sequence.
Note sometimes this weight is zero---e.g. in \figref{liv-small} the weight is zero
except in the unique case that the input tape is LIV, when the weight is one---this
in fact makes \figref{liv-small} a special kind of recognizer:
one that only recognizes a single string
(and recognizes that string with weight one).
We call this an {\em exact-match} recognizer.
More generally, suppose that $G$, $R$ and $T$ are all probabilistically weighted finite-state transducers:
$G$ is a generator (output only), $R$ is a recognizer (input only) and $T$ is a general transducer (input {\em and} output).
Then, conventionally, $G$ defines a probability distribution $P(Y|G)$ over the emitted output sequence $Y$;
$R$ defines a probability $P(\mbox{accept}|X,R)$ of accepting a given input sequence $X$;
and $T$ defines a joint probability $P(\mbox{accept},Y|X,T)$
that input $X$ will be accepted and output $Y$ emitted.
According to this convention, it is reasonable to expect these weights to obey the following
(here $\Omega^\ast$ denotes the set of all sequences):
\begin{eqnarray*}
\sum_{Y \in \Omega^\ast} P(Y|G) & = & 1 \\
P(\mbox{accept}|X,R) & \leq & 1 \quad \forall X \in \Omega^\ast \\
\sum_{Y \in \Omega^\ast} P(\mbox{accept},Y|X,T) & \leq & 1 \quad \forall X \in \Omega^\ast
\end{eqnarray*}
{\bf It is important to state that these are just conventional interpretations of the computed weights:}
in principle the weights can mean anything we want,
but it is common to interpret them as probabilities in this way.
Thus, as noted in the Wikipedia quote, generators and recognizers are in some sense equivalent,
although the probabilistic interpretations of the weights are slightly different.
In particular, just as we can have a {\em generative profile}
that generates some sequences with higher probability than others (e.g. a profile HMM)
we can also have a {\em recognition profile}: a transducer
that recognizes some sequences with higher probability than others.
The exact-match transducer of \figref{liv-small} is a (trivial and deterministic) example of such a recognizer;
later we will see that the stored probability vectors in the Felsenstein pruning recursion can also
be thought of as recognition profiles.
\subsection{Moore machines}
In our mathematical descriptions, we will treat transducers as
{\em Mealy machines}, meaning that absorptions and emissions are associated with transitions between states.
In the {\em Moore machine} view, absorptions and emissions are associated with states.
In our case, the distinction between these two is primarily a semantic one, since the structure
of the machines and the I/O functions of the states are intimately tied.
The latter view (Moore) can be more useful in bioinformatics,
where rapid point substitution means that all combinations of input and output characters
are possible.
In such situations, the Mealy type of machine can suffer from an excess of transitions, complicating the presentation.
For example, consider the Mealy-machine-like view of \figref{fanned-emission},
and compare it with the more compact Moore-machine-like view of \figref{condensed-emission}.
The majority of this paper deals with Moore machines.
However, \appref{Mealy} reformulates the key algorithms of transducer composition (\secref{Composition})
and transducer intersection (\secref{Fork}) as Mealy machines.
\sidewayspngfig{fanned-emission}{All combinations of input and output characters are frequently observed.
The transitions in this diagram include all possible deletion, insertion, and substitution transitions between a pair of transducer states.
Each transition is labeled with I/O characters (blue) and selected transitions are labeled with the transition weights (green).
Insertions (I/O label ``{\tt -/y}'') have weight $fU(y)$,
deletions (I/O label ``{\tt x/-}'') have weight $hV(x)$,
and substitutions (I/O label ``{\tt x/y}'') have weight $gQ(x,y)$.
The large number of transitions complicates the visualization of such ``Mealy-machine'' transducers.
We therefore use a ``Moore-machine'' representation, where all transitions of each type between a pair of states
are collapsed into a single transition, and I/O weights are associated with states (\figref{condensed-emission}).
}
\widepngfig{condensed-emission}
{In a condensed Moore-machine-like representation, possible combinations of input and output characters are encoded in the distributions contained within each state,
simplifying the display.
In this diagram, all four insertion transitions from \figref{fanned-emission} ({\tt -/A}, {\tt -/C}, etc.) are collapsed into a single {\tt -/y} transition;
similarly, all four deletion transitions from \figref{fanned-emission} ({\tt A/-}, etc.) are collapsed into one {\tt x/-},
and all sixteen substitution transitions ({\tt A/A}, {\tt A/C}, \ldots {\tt G/G}) are collapsed to one {\tt x/y}.
To allow this, the transition weights for all the collapsed transitions must factorize into independent I/O- and transition-associated components.
In this example (corresponding to \figref{fanned-emission}), the I/O weights are $U(y)$ for insertions, $Q(x,y)$ for substitutions and $V(x)$ for deletions;
while the transition weights are $f$ for insertions, $g$ for substitutions, and $h$ for deletions.
{\bf Visual conventions.}
The destination state node is bordered in red, to indicate that transitions into it have been collapsed.
Instead of just a plain circle, the node shape is an upward house (insertions), a downward house (deletions), or a rectangle (substitutions).
Instead of specific I/O character labels ({\tt A/G} for a substitution of A to G, {\tt -/C} for an insertion of C, etc.)
we now have generic labels like {\tt x/y} representing the set of all substitutions; the actual characters (and their weights) are encoded by the I/O functions.
For most Figures in the remainder of this manuscript, we will omit these blue generic I/O labels,
as they are implied by the node shape of the destination state.}
\figref{legend} shows the visual notation we use in this tutorial for Moore-form transducer state types.
There are seven {\em state types}: Start, Match, Insert, Delete, End, Wait, and Null, frequently abbreviated to $S,M,I,D,E,W,N$.
% crossref
State types are defined precisely in \secref{StateTypes}.
Note that in the TKF91 model (\figref{tkf91}, the example we use for most of this tutorial)
there is exactly one state of each of these types, but this is not a requirement.
For instance,
the transducer in \figref{protpal-mix2} has two Insert, two Delete and three Wait states,
while \figref{substituter} has no Insert or Delete states at all;
\figref{moore-mf-generator} has no Match states; and so on.
\widepdffig{legend}{In a Moore machine, each state falls into one of several {\em types} (a transducer may contain more than one state of each type).
A state's type determines its I/O capabilities:
an Insert state emits (writes) an output character,
a Delete state absorbs (reads) an input character,
and
a Match state both absorbs an input character and emits an output character.
Mnemonics: Insert states point upwards, Delete states point downwards, Wait states are octagonal like U.S. Stop signs.
}
Some other features of this view include the following:
\begin{itemize}
\item The shape and color of states indicates their type.
The six Moore normal form states are all red.
Insert states point upwards; Delete states point downwards; Match states are rectangles; Wait states are octagons (like U.S. Stop signs);
Start is a circle and End is a diamond.
There is a seventh state type, Null, which is written as a black circle to distinguish it from the six Moore-form state types.
(Null states have no associated inputs or outputs; they arise as a side-effect of algorithmically-constructed transducers in \secref{Tutorial.Composition} and \secref{Tutorial.Intersection}.
In practice, they are nuisance states that must be eliminated by marginalization, or otherwise dealt with somehow.)
This visual shorthand will be used throughout.
\item We impose certain constraints on states that involve I/O:
they must be typed as Insert, Delete, or Match, and
their type determines what kinds of I/O happens on transitions into those states (e.g. a Match state always involves an absorption and an emission).
\item We impose certain constraints on transitions into I/O states:
their weights must be factorizable into transition and I/O components.
Suppose $j$ is a Match state and $i$ is a state that precedes $j$;
then all transitions $i \to j$ must both absorb a non-gap input character $x$
and emit a non-gap output character $y$,
so the transition can be written $i \stackrel{x/y}{\longrightarrow} j$
and the transition weight must take the form $t_{ij} \times e_j(x,y)$
where $t_{ij}$ is a component that can depend on the source and destination state
(but not the I/O characters)
and $e_j(x,y)$ is a component that can depend on the I/O characters and the destination state
(but not the source state).
\item We can then associate the ``{\em I/O weight function}'' $e_j$ with Match state $j$
and the ``{\em transition weight}'' $t_{ij}$ with a single conceptual transition $i \to j$
that summarizes all the transitions $i \stackrel{x/y}{\to} j$
(compare \figref{fanned-emission} and \figref{condensed-emission}).
\item The function $e_j$ can be thought of as a conditional-probability substitution matrix
(for Match states, c.f. $Q$ in \figref{condensed-emission}),
a row vector representing a probability distribution
(for Insert states, c.f. $U$ in \figref{condensed-emission}),
or a column vector of conditional probabilities
(for Delete states, c.f. $V$ in \figref{condensed-emission}).
\item Note that we call $e_j$ an ``I/O function'' rather than an ``emit function''.
The latter term is more common in bioinformatics HMM theory;
however, $e_j$ also describes probabilistic weights of {\em absorptions} as well as {\em emissions},
and we seek to avoid ambiguity.
\end{itemize}
\pngfig{transitions}
{Allowed transitions between types of transducer states, along with their I/O requirements.
In particular, note the Wait state(s) which must precede all
absorbing (Match and Delete) states---the primary departure from the familiar pair HMM
structure.
Wait states are useful in co-ordinating multiple connected transducers, since they indicate that
the transducer is ``waiting'' for an upstream transducer to emit a character before entering an
absorbing state.
Also note that this graph is not intended to depict a {\em particular} state machine,
but rather it shows the transitions which are permitted between the {\em types}
of states of arbitrary machines under our formalism.
Since the TKF91 model (\figref{tkf91-labeled}) contains exactly one state of each type,
its structure is similar to this graph
(but other transducers may have more or less than one state of each type). }
\figref{transitions} shows the allowed types of transition in Moore-normal form transducers.
In our ``Moore-normal form'' for transducers, we require that all input states (Match, Delete)
are immediately preceded in the transition graph by a Wait state.
This is useful for co-ordinating multiple transducers connected together as described in later sections,
since it requires
the transducer to ``wait'' for a parent transducer to emit a character before entering an
absorbing state.
%since it ensures that the transducer always goes through a Wait state before inputting a symbol (in a Match or Delete state).
The downside is that the state graph sometimes contains a few Wait states which appear redundant
(for example, compare \figref{moore-mf-generator} with \figref{mf-generator},
or \figref{liv-labeled} with \figref{liv-small}).
For most Figures in the remainder of this manuscript, we will leave out the blue ``x/y'' labels on transitions,
as they are implied by the state type of the destination state.
Note also that this graph depicts the transitions between {\em types} of states allowed under our formalism,
rather than a {\em particular} state machine.
It happens that the TKF91 model (\figref{tkf91-labeled}) contains exactly one state of each type,
so its state graph appears similar to \figref{transitions},
but this is not true of all transducers.
\subsubsection{Generators and recognizers in Moore normal form}
\seclabel{moore-genrec}
%getrefs
We provide here several examples of small transducers in Moore normal form,
including versions of the transducers in \figref{mf-generator}
and \figref{liv-small}.
We introduce a notation for generators ($\generate$) and recognizers ($\recognize$);
a useful mnemonic for this notation (and for the state types in \figref{legend})
is ``insertions point up, deletions point down''.
\begin{itemize}
\item \figref{moore-mf-generator} uses our Moore-machine visual representation
to depict the generator in \figref{mf-generator}.
We write this transducer as $\generate(MF)$.
\item \figref{liv-labeled} is a Moore-form recognizer for sequence LIV.
We write this transducer as $\recognize(LIV)$.
The state labeled $\delta_Z$ (for $Z \in \{L,I,V\}$) has I/O function $\delta(x=Z)$,
defined to be 1 if $x=Z$, 0 otherwise.
The machine recognizes sequence LIV with weight 1, and all other sequences with weight 0.
\item \figref{mf-labeled} is the Moore-machine recognizer for MF,
the same sequence whose generator is shown in \figref{moore-mf-generator}.
We write this transducer as $\recognize(MF)$.
\item \figref{cs-labeled} is the Moore-machine recognizer for sequence CS.
We write this transducer as $\recognize(CS)$.
\item \figref{null-model} is a ``null model'' generator that emits a single IID sequence
(with residue frequency distribution $\pi$)
of geometrically-distributed length (with geometric parameter $p$).
\end{itemize}
\widepdffig{moore-mf-generator}{Transducer $\generate(MF)$, the Moore-normal form generator for protein sequence MF.
The states are labeled $S$ (Start), $E$ (End),
$\imath_M$ and $\imath_F$ (Insert states that emit the respective amino acid symbols),
and $W_F$ (a Wait state that pauses after emitting the final amino acid;