/
hmm.pyx
3700 lines (2947 loc) · 147 KB
/
hmm.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#cython: boundscheck=False
#cython: cdivision=True
# hmm.pyx: Yet Another Hidden Markov Model library
# Authors: Jacob Schreiber <jmschreiber91@gmail.com>
# Adam Novak <anovak1@ucsc.edu>
from libc.math cimport exp as cexp
from operator import attrgetter
import math, random, itertools as it, sys, json
import networkx
import tempfile
import warnings
import time
from .base cimport GraphModel
from .base cimport Model
from .base cimport State
from distributions.distributions cimport Distribution
from distributions import MultivariateDistribution
from distributions.DiscreteDistribution cimport DiscreteDistribution
from distributions.IndependentComponentsDistribution cimport IndependentComponentsDistribution
from distributions.NeuralNetworkWrapper import NeuralNetworkWrapper
from .kmeans import Kmeans
from .callbacks import History
from .utils cimport _log
from .utils cimport pair_lse
from .utils cimport python_log_probability
from .utils cimport python_summarize
from .utils import check_random_state
from .utils import _check_nan
from .io import BaseGenerator
from .io import SequenceGenerator
from libc.stdlib cimport calloc
from libc.stdlib cimport free
from libc.string cimport memcpy
from libc.string cimport memset
import numpy
cimport numpy
from joblib import Parallel
from joblib import delayed
try:
import pygraphviz
import matplotlib.pyplot as plt
import matplotlib.image
except ImportError:
pygraphviz = None
# Define some useful constants
DEF NEGINF = float("-inf")
DEF INF = float("inf")
DEF SQRT_2_PI = 2.50662827463
def _check_input(sequence, model):
    """Coerce a sequence into a float64 ndarray for the model.

    Non-model objects and continuous models get a direct numeric
    conversion. Discrete models translate each observed symbol through
    the model's keymap, mapping missing values to NaN and raising a
    ValueError for symbols no distribution knows about.
    """
    length = len(sequence)

    # Anything that isn't a distribution or a model is converted directly.
    if not (isinstance(model, Distribution) or isinstance(model, Model)):
        return numpy.array(sequence, dtype=numpy.float64)

    # Continuous models take the raw numeric values as-is.
    if not model.discrete:
        return numpy.array(sequence, dtype=numpy.float64)

    if model.multivariate:
        # One keymap per dimension; translate each column separately.
        converted = numpy.empty((length, model.d), dtype=numpy.float64)
        for row in range(length):
            for col in range(model.d):
                symbol = sequence[row][col]
                keymap = model.keymap[col]
                if _check_nan(symbol):
                    converted[row, col] = numpy.nan
                elif symbol in keymap:
                    converted[row, col] = keymap[symbol]
                else:
                    raise ValueError("Symbol '{}' is not defined in a distribution"
                        .format(symbol))
        return converted

    # Univariate discrete: a single shared keymap.
    converted = numpy.empty(length, dtype=numpy.float64)
    keymap = model.keymap[0]
    for row in range(length):
        symbol = sequence[row]
        if _check_nan(symbol):
            converted[row] = numpy.nan
        elif symbol in keymap:
            converted[row] = keymap[symbol]
        else:
            raise ValueError("Symbol '{}' is not defined in a distribution"
                .format(symbol))
    return converted
cdef class HiddenMarkovModel(GraphModel):
"""A Hidden Markov Model
A Hidden Markov Model (HMM) is a directed graphical model where nodes are
hidden states which contain an observed emission distribution and edges
contain the probability of transitioning from one hidden state to another.
HMMs allow you to tag each observation in a variable length sequence with
the most likely hidden state according to the model.
Parameters
----------
name : str, optional
The name of the model. Default is None.
start : State, optional
An optional state to force the model to start in. Default is None.
end : State, optional
An optional state to force the model to end in. Default is None.
Attributes
----------
start : State
A state object corresponding to the initial start of the model
end : State
A state object corresponding to the forced end of the model
start_index : int
The index of the start object in the state list
end_index : int
The index of the end object in the state list
silent_start : int
The index of the beginning of the silent states in the state list
states : list
The list of all states in the model, with silent states at the end
Examples
--------
>>> from pomegranate import *
>>> d1 = DiscreteDistribution({'A' : 0.35, 'C' : 0.20, 'G' : 0.05, 'T' : 0.40})
>>> d2 = DiscreteDistribution({'A' : 0.25, 'C' : 0.25, 'G' : 0.25, 'T' : 0.25})
>>> d3 = DiscreteDistribution({'A' : 0.10, 'C' : 0.40, 'G' : 0.40, 'T' : 0.10})
>>>
>>> s1 = State(d1, name="s1")
>>> s2 = State(d2, name="s2")
>>> s3 = State(d3, name="s3")
>>>
>>> model = HiddenMarkovModel('example')
>>> model.add_states([s1, s2, s3])
>>> model.add_transition(model.start, s1, 0.90)
>>> model.add_transition(model.start, s2, 0.10)
>>> model.add_transition(s1, s1, 0.80)
>>> model.add_transition(s1, s2, 0.20)
>>> model.add_transition(s2, s2, 0.90)
>>> model.add_transition(s2, s3, 0.10)
>>> model.add_transition(s3, s3, 0.70)
>>> model.add_transition(s3, model.end, 0.30)
>>> model.bake()
>>>
>>> print(model.log_probability(list('ACGACTATTCGAT')))
-22.73896159971087
>>> print(", ".join(state.name for i, state in model.viterbi(list('ACGACTATTCGAT'))[1]))
example-start, s1, s2, s2, s2, s2, s2, s2, s2, s2, s2, s2, s2, s3, example-end
"""
cdef public object start, end
cdef public int start_index
cdef public int end_index
cdef public int silent_start
cdef double* in_transition_pseudocounts
cdef double* out_transition_pseudocounts
cdef double [:] state_weights
cdef public bint discrete
cdef public bint multivariate
cdef int summaries
cdef int cython
cdef int* tied_state_count
cdef int* tied
cdef int* tied_edge_group_size
cdef int* tied_edges_starts
cdef int* tied_edges_ends
cdef double* in_transition_log_probabilities
cdef double* out_transition_log_probabilities
cdef double* expected_transitions
cdef int* in_edge_count
cdef int* in_transitions
cdef int* out_edge_count
cdef int* out_transitions
cdef int finite, n_tied_edge_groups
cdef public list keymap
cdef object state_names
cdef dict state_name_mapping
cdef numpy.ndarray distributions
cdef void** distributions_ptr
def __init__(self, name=None, start=None, end=None):
    # Save the name or make one up from the object's id. Testing `name`
    # itself avoids the bug where `str(name) or ...` yields the string
    # "None" (which is truthy) when no name is passed, so unnamed models
    # were all literally called "None" instead of getting a unique id.
    self.name = str(name) if name else str(id(self))
    self.model = "HiddenMarkovModel"

    # This holds a directed graph between states. Nodes in that graph are
    # State objects, so they're guaranteed never to conflict when composing
    # two distinct models
    self.graph = networkx.DiGraph()

    # Save the start and end or make one up
    self.start = start or State(None, name=self.name + "-start")
    self.end = end or State(None, name=self.name + "-end")

    self.d = 0
    self.n_edges = 0
    self.n_states = 0
    self.discrete = 0
    self.multivariate = 0

    # Put start and end in the graph
    self.graph.add_node(self.start)
    self.graph.add_node(self.end)

    # NULL out every C-level buffer; bake() allocates them later and
    # free_bake_buffers() relies on free(NULL) being a no-op.
    self.in_edge_count = NULL
    self.in_transitions = NULL
    self.in_transition_pseudocounts = NULL
    self.in_transition_log_probabilities = NULL
    self.out_edge_count = NULL
    self.out_transitions = NULL
    self.out_transition_pseudocounts = NULL
    self.out_transition_log_probabilities = NULL
    self.expected_transitions = NULL

    self.summaries = 0

    self.tied_state_count = NULL
    self.tied = NULL
    self.tied_edge_group_size = NULL
    self.tied_edges_starts = NULL
    self.tied_edges_ends = NULL

    self.state_names = set()
    self.state_name_mapping = {}
def __dealloc__(self):
    # Cython finalizer: release all C-level buffers allocated by bake().
    self.free_bake_buffers()
def __getstate__(self):
    """Return model representation in a dictionary.

    Serializes the graph topology (states, edges with probabilities,
    pseudocounts and tied-edge group labels) plus the distribution tie
    information, in a form __setstate__ can rebuild from.
    """
    state = {
        'class' : 'HiddenMarkovModel',
        'name' : self.name,
        'start' : self.start,
        'end' : self.end,
        'states' : self.states,
        'end_index' : self.end_index,
        'start_index' : self.start_index,
        'silent_index' : self.silent_start
    }

    # Map each state object back to its index in the baked state list.
    indices = { state: i for i, state in enumerate(self.states)}

    # Get the number of groups of edges which are tied
    groups = []
    n = self.n_tied_edge_groups-1

    # Go through each group one at a time
    for i in range(n):
        # Create an empty list for that group
        groups.append([])

        # Go through each edge in that group. tied_edge_group_size is a
        # cumulative-sum index: group i's edges live in slots
        # [tied_edge_group_size[i], tied_edge_group_size[i+1]).
        start, end = self.tied_edge_group_size[i], self.tied_edge_group_size[i+1]

        # Add each edge as a tuple of indices
        for j in range(start, end):
            groups[i].append((self.tied_edges_starts[j], self.tied_edges_ends[j]))

    # Now reverse this into a dictionary, such that each pair of edges points
    # to a label (a number in this case)
    d = { tup : i for i in range(n) for tup in groups[i] }

    # Get all the edges from the graph
    edges = []
    for start, end, data in self.graph.edges(data=True):
        # If this edge is part of a group of tied edges, annotate this group
        # it is a part of
        s, e = indices[start], indices[end]
        # Edge probabilities are stored in log space; convert back to a
        # plain probability for serialization.
        prob, pseudocount = math.e**data['probability'], data['pseudocount']
        edge = (s, e)
        edges.append((s, e, prob, pseudocount, d.get(edge, None)))

    state['edges'] = edges

    # Get distribution tie information
    ties = []
    for i in range(self.silent_start):
        # tied_state_count is also a cumulative-sum index into self.tied.
        start, end = self.tied_state_count[i], self.tied_state_count[i+1]

        for j in range(start, end):
            ties.append((i, self.tied[j]))

    state['distribution ties'] = ties
    return state
def __reduce__(self):
    """Support the pickle protocol by delegating to __getstate__."""
    return (self.__class__, (), self.__getstate__())
def __setstate__(self, state):
    """Deserialize object for unpickling.

    Rebuilds the model from the dictionary produced by __getstate__:
    re-ties distributions, re-adds states and edges, restores the
    start/end states, and re-bakes the model.

    Parameters
    ----------
    state :
        The model state, (see `__reduce__()` documentation from the pickle protocol).
    """
    self.name = state['name']

    # Load all the states from JSON formatted strings
    states = state['states']
    for i, j in state['distribution ties']:
        # Tie appropriate states together
        states[i].tie(states[j])

    # Add all the states to the model
    self.add_states(states)

    # Indicate appropriate start and end states
    self.start = states[state['start_index']]
    self.end = states[state['end_index']]

    # Add all the edges to the model
    for start, end, probability, pseudocount, group in state['edges']:
        self.add_transition(states[start], states[end], probability,
            pseudocount, group)

    # Bake the model to rebuild all C-level buffers and indices
    self.bake(verbose=False)
def free_bake_buffers(self):
    # Release every C-level array allocated by bake(). Safe to call
    # before the first bake because the pointers are initialized to
    # NULL in __init__ and free(NULL) is a no-op in C.
    free(self.in_transition_pseudocounts)
    free(self.out_transition_pseudocounts)
    free(self.tied_state_count)
    free(self.tied)
    free(self.tied_edge_group_size)
    free(self.tied_edges_starts)
    free(self.tied_edges_ends)
    free(self.in_transition_log_probabilities)
    free(self.out_transition_log_probabilities)
    free(self.expected_transitions)
    free(self.in_edge_count)
    free(self.in_transitions)
    free(self.out_edge_count)
    free(self.out_transitions)
def add_state(self, state):
    """Add a single state to this model.

    The state must not already be in the model, nor may it be part of
    any other model that will eventually be combined with this one.
    State names must be unique within a model.

    Parameters
    ----------
    state : State
        A state object to be added to the model.

    Returns
    -------
    None
    """
    state_name = state.name
    # Reject duplicates up front so the graph never holds two states
    # with the same name.
    if state_name in self.state_names:
        raise ValueError("A state with name '{}' already exists".format(state_name))

    self.state_names.add(state_name)
    self.graph.add_node(state)
def add_states(self, *states):
    """Add multiple states to the model at the same time.

    Parameters
    ----------
    states : list or generator
        Either a list of states which are entered sequentially, or just
        comma separated values, for example model.add_states(a, b, c, d).

    Returns
    -------
    None
    """
    for entry in states:
        # Each positional argument may itself be a list of states.
        members = entry if isinstance(entry, list) else [entry]
        for member in members:
            self.add_state(member)
def add_transition(self, a, b, probability, pseudocount=None, group=None):
    """Add a transition from state a to state b.

    Add a transition from state a to state b with the given (non-log)
    probability. Both states must be in the HMM already. self.start and
    self.end are valid arguments here. Probabilities will be normalized
    such that every node has edges summing to 1 leaving that node, but
    only when the model is baked. Pseudocounts are allowed as a way of
    using edge-specific pseudocounts for training.

    By specifying a group as a string, you can tie edges together by giving
    them the same group. This means that a transition across one edge in the
    group counts as a transition across all edges in terms of training.

    Parameters
    ----------
    a : State
        The state that the edge originates from

    b : State
        The state that the edge goes to

    probability : double
        The probability of transitioning from state a to state b in [0, 1]

    pseudocount : double, optional
        The pseudocount to use for this specific edge if using edge
        pseudocounts for training. Defaults to the probability. Default
        is None.

    group : str, optional
        The name of the group of edges to tie together during training. If
        groups are used, then a transition across any one edge counts as a
        transition across all edges. Default is None.

    Returns
    -------
    None
    """
    # Only default to the probability when no pseudocount was given.
    # The previous `pseudocount or probability` treated an explicit
    # pseudocount of 0 as missing because 0 is falsy.
    if pseudocount is None:
        pseudocount = probability

    # Edge probabilities are stored in log space on the graph.
    self.graph.add_edge(a, b, probability=_log(probability),
        pseudocount=pseudocount, group=group)
def add_transitions(self, a, b, probabilities, pseudocounts=None,
    groups=None):
    """Add many transitions at the same time.

    Parameters
    ----------
    a : State or list
        Either a state or a list of states where the edges originate.

    b : State or list
        Either a state or a list of states where the edges go to.

    probabilities : list
        The probabilities associated with each transition.

    pseudocounts : list, optional
        The pseudocounts associated with each transition. Default is None.

    groups : list, optional
        The groups of each edge. Default is None.

    Returns
    -------
    None

    Examples
    --------
    >>> model.add_transitions([model.start, s1], [s1, model.end], [1., 1.])
    >>> model.add_transitions([model.start, s1, s2, s3], s4, [0.2, 0.4, 0.3, 0.9])
    >>> model.add_transitions(model.start, [s1, s2, s3], [0.6, 0.2, 0.05])
    """
    # Only fall back to the probabilities when pseudocounts were not
    # supplied; the previous `pseudocounts or probabilities` also
    # discarded any explicitly-passed falsy value.
    if pseudocounts is None:
        pseudocounts = probabilities

    n = len(a) if isinstance(a, list) else len(b)
    # A single group name (or None) applies to every edge.
    if groups is None or isinstance(groups, str):
        groups = [groups] * n

    # Allow addition of many transitions from many states
    if isinstance(a, list) and isinstance(b, list):
        edges = zip(a, b, probabilities, pseudocounts, groups)
        for start, end, probability, pseudocount, group in edges:
            self.add_transition(start, end, probability, pseudocount, group)

    # Allow for multiple transitions to a specific state
    elif isinstance(a, list) and isinstance(b, State):
        edges = zip(a, probabilities, pseudocounts, groups)
        for start, probability, pseudocount, group in edges:
            self.add_transition(start, b, probability, pseudocount, group)

    # Allow for multiple transitions from a specific state
    elif isinstance(a, State) and isinstance(b, list):
        edges = zip(b, probabilities, pseudocounts, groups)
        for end, probability, pseudocount, group in edges:
            self.add_transition(a, end, probability, pseudocount, group)
def dense_transition_matrix(self):
    """Returns the dense transition matrix.

    Parameters
    ----------
    None

    Returns
    -------
    matrix : numpy.ndarray, shape (n_states, n_states)
        A dense transition matrix containing the probability (not the
        log probability -- the log-space values are exponentiated
        before being returned) of transitioning from each state to
        each other state. Impossible transitions are 0.
    """
    m = len(self.states)
    # Start in log space with every transition impossible (-inf).
    transition_log_probabilities = numpy.zeros((m, m)) + NEGINF

    for i in range(m):
        # out_edge_count is a cumulative-sum index: the edges leaving
        # state i occupy slots [out_edge_count[i], out_edge_count[i+1]).
        for n in range(self.out_edge_count[i], self.out_edge_count[i+1]):
            transition_log_probabilities[i, self.out_transitions[n]] = \
                self.out_transition_log_probabilities[n]

    return numpy.exp(transition_log_probabilities)
def copy(self):
    """Returns a deep copy of the HMM.

    Implemented as a round trip through the JSON serialization, so the
    copy shares no objects with the original.

    Parameters
    ----------
    None

    Returns
    -------
    model : HiddenMarkovModel
        A deep copy of the model with entirely new objects.
    """
    serialized = self.to_json()
    return HiddenMarkovModel.from_json(serialized)
def freeze_distributions(self):
    """Freeze all the distributions in model.

    Upon training only edges will be updated. The parameters of
    distributions will not be affected.

    Parameters
    ----------
    None

    Returns
    -------
    None
    """
    for state in self.states:
        # Silent states carry no distribution, so skip them.
        if state.is_silent():
            continue
        state.distribution.freeze()
def thaw_distributions(self):
    """Thaw all distributions in the model.

    Upon training distributions will be updated again.

    Parameters
    ----------
    None

    Returns
    -------
    None
    """
    for state in self.states:
        # Silent states carry no distribution, so skip them.
        if state.is_silent():
            continue
        state.distribution.thaw()
def add_model(self, other):
    """Add the states and edges of another model to this model.

    The two graphs are merged with networkx.union, which requires the
    node sets to be disjoint.

    Parameters
    ----------
    other : HiddenMarkovModel
        The other model to add

    Returns
    -------
    None
    """
    combined = networkx.union(self.graph, other.graph)
    self.graph = combined
def concatenate(self, other, suffix='', prefix=''):
    """Concatenate this model to another model.

    Concatenate this model to another model in such a way that a single
    probability 1 edge is added between self.end and other.start. Rename
    all other states appropriately by adding a suffix or prefix if needed.

    Parameters
    ----------
    other : HiddenMarkovModel
        The other model to concatenate

    suffix : str, optional
        Add the suffix to the end of all state names in the other model.
        Default is ''.

    prefix : str, optional
        Add the prefix to the beginning of all state names in the other
        model. Default is ''.

    Returns
    -------
    None
    """
    def _decorate(name):
        # Wrap a single name with the requested prefix and suffix.
        return "{}{}{}".format(prefix, name, suffix)

    other.name = _decorate(other.name)
    for node in other.states:
        node.name = _decorate(node.name)

    # Merge the two graphs, bridge them with a probability-1 edge, and
    # adopt the other model's end state as our own.
    self.graph = networkx.union(self.graph, other.graph)
    self.add_transition(self.end, other.start, 1.00)
    self.end = other.end
def plot(self, precision=4, **kwargs):
    """Draw this model's graph.

    Uses pygraphviz/matplotlib when pygraphviz is installed, rendering
    the baked transition structure (including edge probabilities) to a
    temporary PNG and displaying it. Otherwise falls back to
    networkx.draw, which cannot draw self-loops; see
    networkx.draw_networkx() for the keywords you can pass in.

    Parameters
    ----------
    precision : int, optional
        The precision with which to round edge probabilities.
        Default is 4.

    **kwargs : any
        The arguments to pass into networkx.draw_networkx(), only used
        by the networkx fallback path.

    Returns
    -------
    None
    """
    if pygraphviz is not None:
        G = pygraphviz.AGraph(directed=True)
        out_edges = self.out_edge_count

        # Color-code the nodes: silent states grey, frozen distributions
        # blue, trainable distributions red.
        for state in self.states:
            if state.is_silent():
                color = 'grey'
            elif state.distribution.frozen:
                color = 'blue'
            else:
                color = 'red'

            G.add_node(state.name, color=color)

        # Add one labeled edge per baked transition; out_edge_count is a
        # cumulative-sum index into the out_transitions arrays.
        for i, state in enumerate(self.states):
            for l in range(out_edges[i], out_edges[i+1]):
                li = self.out_transitions[l]
                p = cexp(self.out_transition_log_probabilities[l])
                p = round(p, precision)
                G.add_edge(state.name, self.states[li].name, label=p)

        # Render through a temporary file and display with matplotlib.
        with tempfile.NamedTemporaryFile() as tf:
            G.draw(tf.name, format='png', prog='dot')
            img = matplotlib.image.imread(tf.name)
            plt.imshow(img)
            plt.axis('off')
    else:
        warnings.warn("Install pygraphviz for nicer visualizations")
        networkx.draw(self.graph, **kwargs)
def bake(self, verbose=False, merge="All"):
"""Finalize the topology of the model.
Finalize the topology of the model and assign a numerical index to
every state. This method must be called before any of the probability-
calculating methods.
This fills in self.states (a list of all states in order) and
self.transition_log_probabilities (log probabilities for transitions),
as well as self.start_index and self.end_index, and self.silent_start
(the index of the first silent state).
Parameters
----------
verbose : bool, optional
Return a log of changes made to the model during normalization
or merging. Default is False.
merge : "None", "Partial, "All"
Merging has three options:
"None": No modifications will be made to the model.
"Partial": A silent state which only has a probability 1 transition
to another silent state will be merged with that silent state.
This means that if silent state "S1" has a single transition
to silent state "S2", that all transitions to S1 will now go
to S2, with the same probability as before, and S1 will be
removed from the model.
"All": A silent state with a probability 1 transition to any other
state, silent or symbol emitting, will be merged in the manner
described above. In addition, any orphan states will be removed
from the model. An orphan state is a state which does not have
any transitions to it OR does not have any transitions from it,
except for the start and end of the model. This will iteratively
remove orphan chains from the model. This is sometimes desirable,
as all states should have both a transition in to get to that
state, and a transition out, even if it is only to itself. If
the state does not have either, the HMM will likely not work as
intended.
Default is 'All'.
Returns
-------
None
"""
self.free_bake_buffers()
in_edge_count = numpy.zeros(len(self.graph.nodes()),
dtype=numpy.int32)
out_edge_count = numpy.zeros(len(self.graph.nodes()),
dtype=numpy.int32)
merge = merge.lower() if merge else None
while merge == 'all':
merge_count = 0
# Reindex the states based on ones which are still there
prestates = list(self.graph.nodes)
indices = { prestates[i]: i for i in range(len(prestates)) }
# Go through all the edges, summing in and out edges
for a, b in list(self.graph.edges()):
out_edge_count[indices[a]] += 1
in_edge_count[indices[b]] += 1
# Go through each state, and if either in or out edges are 0,
# remove the edge.
for i in range(len(prestates)):
if prestates[i] is self.start or prestates[i] is self.end:
continue
if in_edge_count[i] == 0:
merge_count += 1
self.graph.remove_node(prestates[i])
if verbose:
print("Orphan state {} removed due to no edges \
leading to it".format(prestates[i].name))
elif out_edge_count[i] == 0:
merge_count += 1
self.graph.remove_node(prestates[i])
if verbose:
print("Orphan state {} removed due to no edges \
leaving it".format(prestates[i].name))
if merge_count == 0:
break
# Go through the model checking to make sure out edges sum to 1.
# Normalize them to 1 if this is not the case.
if merge in ['all', 'partial']:
for state in list(self.graph.nodes()):
# Perform log sum exp on the edges to see if they properly sum to 1
out_edges = round(sum(numpy.e**x['probability']
for x in self.graph.adj[state].values()), 8)
# The end state has no out edges, so will be 0
if out_edges != 1. and state != self.end:
# Issue a notice if verbose is activated
if verbose:
print("{} : {} summed to {}, normalized to 1.0"\
.format(self.name, state.name, out_edges))
# Reweight the edges so that the probability (not logp) sums
# to 1.
for edge in self.graph.adj[state].values():
edge['probability'] = edge['probability'] - _log(out_edges)
# Automatically merge adjacent silent states attached by a single edge
# of 1.0 probability, as that adds nothing to the model. Traverse the
# edges looking for 1.0 probability edges between silent states.
while merge in ['all', 'partial']:
# Repeatedly go through the model until no merges take place.
merge_count = 0
for a, b, e in self.graph.edges(data=True):
# Since we may have removed a or b in a previous iteration,
# a simple fix is to just check to see if it's still there
if a not in list(self.graph.nodes()) or b not in list(self.graph.nodes()):
continue
if a == self.start or b == self.end:
continue
# If a silent state has a probability 1 transition out
if e['probability'] == 0.0 and a.is_silent():
# Make sure the transition is an appropriate merger
if merge=='all' or (merge=='partial' and b.is_silent()):
# Go through every transition to that state
for x, y, d in self.graph.edges(data=True):
# Make sure that the edge points to the current node
if y is a:
# Increment the edge counter
merge_count += 1
# Remove the edge going to that node
self.graph.remove_edge(x, y)
pseudo = max(e['pseudocount'], d['pseudocount'])
group = e['group'] if e['group'] == d['group'] else None
# Add a new edge going to the new node
self.graph.add_edge(x, b, probability=d['probability'],
pseudocount=pseudo,
group=group)
# Log the event
if verbose:
print("{} : {} - {} merged".format(
self.name, a, b))
# Remove the state now that all edges are removed
self.graph.remove_node(a)
if merge_count == 0:
break
if merge in ['all', 'partial']:
# Detect whether or not there are loops of silent states by going
# through every pair of edges, and ensure that there is not a cycle
# of silent states.
for a, b, e in self.graph.edges(data=True):
for x, y, d in self.graph.edges(data=True):
if a is y and b is x and a.is_silent() and b.is_silent():
print("Loop: {} - {}".format(a.name, b.name))
states = self.graph.nodes()
n, m = len(states), len(self.graph.edges())
self.n_edges = m
self.n_states = n
silent_states, normal_states = [], []
for state in states:
if state.is_silent():
silent_states.append(state)
else:
normal_states.append(state)
normal_states = list(sorted(normal_states, key=attrgetter('name')))
silent_states = list(sorted(silent_states, key=attrgetter('name')))
silent_order = {state: i for i, state in enumerate(reversed(silent_states))}
# We need the silent states to be in topological sort order: any
# transition between silent states must be from a lower-numbered state
# to a higher-numbered state. Since we ban loops of silent states, we
# can get away with this.
# Get the subgraph of all silent states
silent_subgraph = self.graph.subgraph(silent_states)
# Get the sorted silent states. Isn't it convenient how NetworkX has
# exactly the algorithm we need?
silent_states_sorted = list(networkx.lexicographical_topological_sort(
silent_subgraph, silent_order.__getitem__))
# What's the index of the first silent state?
self.silent_start = len(normal_states)
# Save the master state ordering. Silent states are last and in
# topological order, so when calculationg forward algorithm
# probabilities we can just go down the list of states.
self.states = normal_states + silent_states_sorted
# We need a good way to get transition probabilities by state index that
# isn't N^2 to build or store. So we will need a reverse of the above
# mapping. It's awkward but asymptotically fine.
indices = { self.states[i]: i for i in range(n) }
# Create a sparse representation of the tied states in the model. This
# is done in the same way of the transition, by having a vector of
# counts, and a vector of the IDs that the state is tied to.
self.tied_state_count = <int*> calloc(self.silent_start+1, sizeof(int))
for i in range(self.silent_start+1):
self.tied_state_count[i] = 0
for i in range(self.silent_start):
for j in range(self.silent_start):
if i == j:
continue
if self.states[i].distribution is self.states[j].distribution:
self.tied_state_count[i+1] += 1
for i in range(1, self.silent_start+1):
self.tied_state_count[i] += self.tied_state_count[i-1]
self.tied = <int*> calloc(self.tied_state_count[self.silent_start], sizeof(int))
for i in range(self.tied_state_count[self.silent_start]):
self.tied[i] = -1
for i in range(self.silent_start):
for j in range(self.silent_start):
if i == j:
continue
if self.states[i].distribution is self.states[j].distribution:
# Begin at the first index which belongs to state i...
start = self.tied_state_count[i]
# Find the first non -1 entry in order to put our index.
while self.tied[start] != -1:
start += 1
# Now that we've found a non -1 entry, put the index of the
# state which this state is tied to in!
self.tied[start] = j
# Unpack the state weights
self.state_weights = numpy.zeros(self.silent_start)
for i in range(self.silent_start):
self.state_weights[i] = _log(self.states[i].weight)
# This holds numpy array indexed [a, b] to transition log probabilities
# from a to b, where a and b are state indices. It starts out saying all
# transitions are impossible.
self.in_transitions = <int*> calloc(m, sizeof(int))
self.in_edge_count = <int*> calloc(n+1, sizeof(int))
self.in_transition_pseudocounts = <double*> calloc(m,
sizeof(double))
self.in_transition_log_probabilities = <double*> calloc(m,
sizeof(double))
self.out_transitions = <int*> calloc(m, sizeof(int))
self.out_edge_count = <int*> calloc(n+1, sizeof(int))
self.out_transition_pseudocounts = <double*> calloc(m,
sizeof(double))
self.out_transition_log_probabilities = <double*> calloc(m,
sizeof(double))
self.expected_transitions = <double*> calloc(self.n_edges, sizeof(double))
memset(self.in_transitions, -1, m*sizeof(int))
memset(self.in_edge_count, 0, (n+1)*sizeof(int))
memset(self.in_transition_pseudocounts, 0, m*sizeof(double))
memset(self.in_transition_log_probabilities, 0, m*sizeof(double))
memset(self.out_transitions, -1, m*sizeof(int))
memset(self.out_edge_count, 0, (n+1)*sizeof(int))
memset(self.out_transition_pseudocounts, 0, m*sizeof(double))
memset(self.out_transition_log_probabilities, 0, m*sizeof(double))
# Now we need to find a way of storing in-edges for a state in a manner
# that can be called in the cythonized methods below. This is basically
# an inversion of the graph. We will do this by having two lists, one
# list size number of nodes + 1, and one list size number of edges.
# The node size list will store the beginning and end values in the
# edge list that point to that node. The edge list will be ordered in
# such a manner that all edges pointing to the same node are grouped
# together. This will allow us to run the algorithms in time
# nodes*edges instead of nodes*nodes.
for a, b in self.graph.edges():
# Increment the total number of edges going to node b.
self.in_edge_count[indices[b]+1] += 1
# Increment the total number of edges leaving node a.
self.out_edge_count[indices[a]+1] += 1
# Determine if the model is infinite or not based on the number of edges
# to the end state
if self.in_edge_count[indices[self.end]+1] == 0:
self.finite = 0
else:
self.finite = 1
# Take the cumulative sum so that we can associate array indices with
# in or out transitions
for i in range(1, n+1):
self.in_edge_count[i] += self.in_edge_count[i-1]
self.out_edge_count[i] += self.out_edge_count[i-1]
# We need to store the edge groups as name : set pairs.
edge_groups = {}
# Now we go through the edges again in order to both fill in the
# transition probability matrix, and also to store the indices sorted