-
Notifications
You must be signed in to change notification settings - Fork 4
/
ref_graph.ml
1310 lines (1200 loc) · 46.2 KB
/
ref_graph.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
(** Code and logic for constructing 'Reference' graphs of HLA genes. *)
open Util
open Biology
open Graph
type start = MSA.position * Alleles.t [@@deriving eq, ord]
type sequence = string [@@deriving eq, ord, show]
(* start end pairs *)
type sep = { start : start ; end_ : MSA.position } [@@deriving eq, ord]
let blts = Gene_region.to_string
module Nodes = struct
type t =
| S of start
| E of MSA.position
| B of MSA.position * Gene_region.t (* Boundary of position and label *)
| N of MSA.position * sequence (* Sequences *)
[@@deriving eq, ord]
let vertex_name ?(short=true) = function
| S (n, s) -> sprintf "S%d-%s" n s
| E n -> sprintf "E%d" n
| B (p, l) -> sprintf "B-%s-%d" (blts l) p
| N (p, s) -> sprintf "%d-%s(%d)"
p (if short then short_seq s else s)
(p + String.length s)
let position = function
| S (p, _) | E p | B (p, _) | N (p, _) -> p
let inside pos = function
| S (p, _) | E p | B (p, _) -> p = pos
| N (p, s) -> p <= pos && pos < p + String.length s
let inside_seq p s ~pos =
p < pos && pos < p + String.length s
let compare_by_position_first n1 n2 =
let compare_int (i1 : int) (i2 : int) = Pervasives.compare i1 i2 in
let r = compare_int (position n1) (position n2) in
if r = 0 then compare n1 n2 else r
let hash = Hashtbl.hash
let is_boundary = function
| B _ -> true
| N _ | S _ | E _ -> false
let is_seq = function
| N _ -> true
| S _ | E _ | B _ -> false
let is_seq_or_boundary = function
| N _ | B _ -> true
| S _ | E _ -> false
end (* Nodes *)
module Edges = struct
type t = Alleles.Set.t
let hash = Hashtbl.hash
let compare = Alleles.Set.compare
let equal = Alleles.Set.equal
let default = Alleles.Set.empty
end (* Edges *)
module G = Imperative.Digraph.ConcreteLabeled(Nodes)(Edges)
let between g start stop =
let ng = G.create () in
G.add_vertex ng start;
let rec add_from node =
G.iter_succ_e (fun ((_, _, n) as e) ->
G.add_vertex ng n;
G.add_edge_e ng e;
if n <> stop then add_from n)
g node
in
add_from start;
ng
(* We want to be able to have a 'by-position' array of all the nodes of the
graph that intersect that position. Since some positions might contain the
same set of nodes we distinguish two cases: *)
type by_position =
| NL of Nodes.t list (* A list of nodes that start at a given position. *)
| Redirect of int (* A redirect (index into array - NOT position) of
where to find the previous Node list. In
[by_position_array] below, that location should
contain the nodes that span this position. *)
and by_position_array = by_position array
module NodeSet = struct
include Set.Make (Nodes)
let to_string s =
sprintf "{%s}" (string_of_list (elements s) ~sep:"; " ~f:Nodes.vertex_name)
end (* NodeSet *)
module EdgeNodeSet = struct
include Set.Make (struct
(* edges first since these edges point into the respective node. *)
type t = Edges.t * Nodes.t
let compare (e1, n1) (e2, n2) =
let r = Nodes.compare n1 n2 in
if r = 0 then Edges.compare e1 e2 else r
end)
(* Expose these once we revert the Alleles.Set functorization
let to_string ?max_length ?complement s =
sprintf "{%s}"
(string_of_list (elements s) ~sep:"; "
~f:(fun (e, n) ->
sprintf "(%s -> %s)"
(Alleles.Set.to_human_readable ?max_length ?complement e)
(Nodes.vertex_name n)))
let to_table ?max_length ?complement s =
let uns = elements s in
let lst = List.sort uns ~cmp:(fun (_,n1) (_,n2) -> Nodes.compare n1 n2) in
sprintf "%s"
(string_of_list lst ~sep:"\n"
~f:(fun (e, n) ->
sprintf "%s <- %s"
(Nodes.vertex_name n)
(Alleles.Set.to_human_readable ?max_length ?complement e))) *)
end (* EdgeNodeSet *)
module AS = Alleles.Set
module AM = Alleles.Map
type t =
{ align_date : string (* When the alignment was created by IMGT. *)
; reference : string (* Name of reference allelle. *)
; g : G.t (* The actual graph. *)
; aindex : Alleles.index (* The allele index, for Sets and Maps. *)
; bounds : sep list AM.t (* Map of where the alleles start and stop. *)
; posarr : by_position_array (* Per position lookups. *)
; merge_map : (string * MSA.Alteration.t list) list
; offset : int
}
let number_of_alleles t =
t.aindex.Alleles.size
(** Some accessors that are necessary for output (and construction) but placed
here for convenience. *)
(** [starts_by_position g] returns an associated list of positions and
an a set of alleles that start at that position. *)
let starts_by_position { bounds; aindex; _ } =
AM.fold bounds ~init:[] ~f:(fun asc sep_lst allele ->
List.fold_left sep_lst ~init:asc ~f:(fun asc sep ->
let pos = fst sep.start in
try
let bts, rest = remove_and_assoc pos asc in
(pos, AS.set bts allele) :: rest
with Not_found ->
(pos, AS.singleton aindex allele) :: asc))
let add_allele_edge aindex g pv nv allele =
(* UGH. Unfortunately a necessary guard against
1. Not typing this logic better.
2. OCamlgraph find_edge stack-overflow *)
if pv = nv then () else
let new_edge =
try
let eset = G.find_edge g pv nv |> G.E.label in
G.remove_edge g pv nv;
G.E.create pv (AS.set eset allele) nv
with Not_found ->
G.E.create pv (AS.singleton aindex allele) nv
in
G.add_edge_e g new_edge
(** Compress the start nodes; join all the start nodes that have the same
alignment position into one node. *)
let create_compressed_starts t =
let start_asc = starts_by_position t in
let ng = G.copy t.g in
let open Nodes in
List.iter start_asc ~f:(fun (pos, allele_set) ->
if AS.cardinal allele_set > 1 then begin
let a_str = AS.to_string ~compress:true allele_set in
let node = G.V.create (S (pos, a_str)) in
G.add_vertex ng node;
AS.iter allele_set ~f:(fun allele ->
let rm = S (pos, allele) in
G.iter_succ (fun sv -> add_allele_edge t.aindex ng node sv allele) ng rm;
G.remove_vertex ng rm)
end);
{ t with g = ng }
(** Output
TODO: When constructing the dot files, it would be nice if the alleles (edges), were
in some kind of consistent order. *)
(**
* @args human_edges : use heuristics to shorten the name.
*)
let output_dot
?(human_edges=true)
?(compress_edges=true)
?(compress_start=true)
?(insert_newlines=true)
?short
?max_length
fname
t =
let { g; _} = if compress_start then create_compressed_starts t else t in
let oc = open_out fname in
let insert_newline = insert_chars ['\n'] in
let module Dot = Graphviz.Dot (
struct
include G
let graph_attributes _g = []
let default_vertex_attributes _g = []
let vertex_name v =
let s = sprintf "\"%s\"" (Nodes.vertex_name ?short v) in
if insert_newlines then insert_newline s else s
let vertex_attributes _v = [`Shape `Box]
let get_subgraph _v = None
let default_edge_attributes _t = [`Color 4711]
let edge_attributes e =
let compress = compress_edges in
let s =
if human_edges then
AS.to_human_readable ~compress ?max_length (G.E.label e)
else
AS.to_string ~compress (G.E.label e)
in
if insert_newlines then
[`Label ( insert_newline s) ]
else
[`Label s ]
end)
in
Dot.output_graph oc g;
close_out oc
let output
?human_edges
?compress_edges
?compress_start
?max_length
?(pdf=true)
?(open_=true)
~short
fname t =
output_dot ?human_edges ?compress_edges ?compress_start ?max_length ~short
(fname ^ ".dot") t;
if pdf then begin
let r = Sys.command (sprintf "dot -Tpdf %s.dot -o %s.pdf" fname fname) in
if r = 0 && open_ then
Sys.command (sprintf "open %s.pdf" fname)
else
r
end else
0
let save fname g =
let oc = open_out fname in
Marshal.to_channel oc g [];
close_out oc
let load fname =
let ic = open_in fname in
let g : G.t = (Marshal.from_channel ic) in
close_in ic;
g
(** Construction *)
let relationship pos v =
let open Nodes in
let end_pos p s = p + String.length s in
match v with
| S _ | E _ -> `Ignore
| B (p, _) when pos < p -> `Before p
| N (p, _) when pos < p -> `Before p
| B (p, _) when pos = p -> `Exact
| N (p, _) when pos = p -> `Exact
| B _ (* when pos > p *) -> `After
| N (p, s) when pos < end_pos p s -> `In (p, s)
| N _ (*p, s) when pos >= end_pos p s*) -> `After
exception GraphConstruction of string
let gc fmt =
ksprintf (fun s -> raise (GraphConstruction s))
fmt
(*first_start, last_end, end_to_next_start_assoc *)
let reference_starts_and_ends lst =
match lst with
| [] -> gc "Reference has no start and ends"
| {start; end_} :: [] -> start, end_, []
| {start; end_} :: t ->
let rec loop ep acc = function
| [] -> gc "stop before empty"
| {start; end_} :: [] -> end_, (ep, start) :: acc
| {start; end_} :: t -> loop end_ ((ep, start) :: acc) t
in
let e, l = loop end_ [] t in
start, e, l
let add_reference_elems aindex g allele ref_elems =
let open Nodes in
let bse () = AS.singleton aindex allele in
let add_start start_pos lst =
let st = start_pos, allele in
`Started (st, G.V.create (S st)) :: lst
in
let add_end end_pos ~st ~prev lst =
G.add_edge_e g (G.E.create prev (bse ()) (G.V.create (E end_pos)));
`Ended (st, end_pos) :: lst
in
let add_boundary ~st ~prev ~label ~pos lst =
let boundary_node = G.V.create (B (pos, label)) in
G.add_edge_e g (G.E.create prev (bse ()) boundary_node);
`Started (st, boundary_node) :: lst
in
let add_seq ~st ~prev start s lst =
let sequence_node = G.V.create (N (start, s)) in
G.add_edge_e g (G.E.create prev (bse ()) sequence_node);
`Started (st, sequence_node) :: lst
in
List.fold_left ref_elems ~init:[] ~f:(fun state e ->
let open MSA in
match state, e with
| [] , Start start_pos -> add_start start_pos state
| `Ended _ :: _ , Start start_pos -> add_start start_pos state
| [] , Boundary _
| `Ended _ :: _ , Boundary _ -> state (* ignore *)
| [] , al_el
| `Ended _ :: _ , al_el ->
gc "Unexpected %s after end for %s" (al_el_to_string al_el) allele
| `Started (st, prev) :: tl, End end_pos -> add_end end_pos ~st ~prev tl
| `Started (st, prev) :: tl, Boundary {label; pos } -> add_boundary ~st ~prev ~label ~pos tl
| `Started (st, prev) :: tl, Sequence {start; s } -> add_seq ~st ~prev start s tl
| `Started (_, _) :: _, Gap _ -> state (* ignore gaps *)
| `Started (_, _) :: _tl, Start sp ->
gc "Unexpected second start at %d for %s" sp allele)
|> List.map ~f:(function
| `Started _ -> gc "Still have a Started in %s ref" allele
| `Ended (start, end_) -> { start; end_})
|> List.sort ~cmp:(fun s1 s2 -> compare_start s1.start s2.start)
let test_consecutive_elements allele =
let open MSA in
let open Nodes in
function
| `Gap close_pos ->
begin function
| (End end_pos) :: tl when end_pos = close_pos -> `End (close_pos, tl)
| Sequence {start; s} :: tl when start = close_pos ->
let new_close = start + String.length s in
`Continue (Some (N (start, s)), (`Sequence new_close), tl)
| Start _ :: _ ->
gc "For %s another start after a Gap closes %d." allele close_pos
| [] ->
gc "For %s Empty list after Gap close %d." allele close_pos
| Boundary _ :: _
| Sequence _ :: _
| Gap _ :: _
| End _ :: _ -> `Close close_pos
end
| `Sequence close_pos ->
begin function
| End end_pos :: tl when end_pos = close_pos -> `End (close_pos, tl)
| Gap { gstart; length} :: tl when gstart = close_pos ->
let new_close = gstart + length in
`Continue (None, (`Gap new_close), tl)
| Start _ :: _ ->
gc "For %s another start after a Sequence close %d." allele close_pos
| [] ->
gc "For %s empty list after Sequence close %d." allele close_pos
| Boundary _ :: _
| Sequence _ :: _
| Gap _ :: _
| End _ :: _ -> `Close close_pos
end
exception Found of Nodes.t
let next_node_along g allele ~from =
try
G.fold_succ_e (fun (_, bt, vs) n ->
if AS.is_set bt allele then raise (Found vs) else n)
g from None
with Found v ->
Some v
let add_non_ref aindex g reference
(first_start, last_end, end_to_next_start_assoc)
allele alt_lst =
let open MSA in
let open Nodes in
let first_start_pos = fst first_start in
let first_start_node = S first_start in
let last_end_node = E last_end in
let end_to_start_nodes = List.map ~f:(fun (e, s) -> E e, S s) end_to_next_start_assoc in
let next_reference ~msg from =
match next_node_along g reference ~from with
| Some n -> n
| None -> match List.Assoc.get from end_to_start_nodes with
| Some n -> n
| None -> gc "In %s: %s" allele msg
in
let add_allele_edge pv nv =
(*let () = printf "adding allele edge %s -- (%s) -> %s"
(vertex_name pv) allele (vertex_name nv)
in *)
add_allele_edge aindex g pv nv allele in
let do_nothing _ _ = () in
let advance_until ~visit ~prev ~next pos =
let rec forward node msg =
loop node (next_reference ~msg node)
and loop prev next =
if next = first_start_node then
forward next "No next after reference Start!"
else if next = last_end_node then
`AfterLast prev
else
match relationship pos next with
| `Ignore -> forward next (sprintf "Skipping %s" (vertex_name next))
| `Before ap -> `InGap (prev, next, ap)
| `Exact -> `AtNext (prev, next)
| `In (p, s) -> `InsideNext (prev, next, p, s)
| `After -> visit prev next;
forward next
(sprintf "At pos: %d, next: %s not at End, should have next!"
pos (Nodes.vertex_name next))
in
loop prev next
in
let split_in ~prev ~visit ~next pos =
let split_and_rejoin p s node =
let open Nodes in
let index = pos - p in
let fs, sn = String.split_at s ~index in
let pr = G.pred_e g node in
let su = G.succ_e g node in
G.remove_vertex g node;
let v1 = N (p, fs) in
G.add_vertex g v1;
let s_inter =
List.fold_left pr ~init:(AS.init aindex)
~f:(fun bta (p, bt, _) ->
G.add_edge_e g (G.E.create p bt v1);
AS.union bt bta)
in
let v2 = N (pos, sn) in
G.add_vertex g v2;
let s_inter = AS.clear s_inter allele in
G.add_edge_e g (G.E.create v1 s_inter v2);
List.iter su ~f:(fun (_, e, s) -> G.add_edge_e g (G.E.create v2 e s));
(v1, v2)
in
match advance_until ~prev ~next ~visit pos with
| `InsideNext (pv, nv, p, s) ->
let v1, v2 = split_and_rejoin p s nv in
visit pv v1;
`AtNext (v1, v2)
| `AfterLast _ as al -> al
| `InGap _ as ig -> ig
| `AtNext _ as an -> an
in
let advance_until_boundary ~visit ~prev ~next pos label =
let rec forward node msg =
loop node (next_reference ~msg node)
and loop pv nv =
match nv with
| S _ | E _ -> forward nv "Skipping start End"
| B (p, c) when p = pos ->
if c <> label then
gc "Boundary at %d position diff from reference %s label %s"
p (blts c) (blts label)
else
pv, nv
| B (p, _)
| N (p, _) when p < pos ->
visit pv nv;
forward nv (sprintf "Trying to find B %d %s after %d"
pos (blts label) p)
| B (p, c) (*when p > pos*) ->
gc "Next Boundary %d %s after desired boundary %d %s"
p (blts c) pos (blts label)
| N (p, _) ->
gc "Next Sequence position: %d at or after desired boundary pos %d (label %s) %s"
p pos (blts label) allele
in
loop prev next
in
(* How we close back with the reference *)
let rec rejoin_after_split ~prev ~next split_pos state ~new_node lst =
match split_in ~prev ~next ~visit:do_nothing split_pos with
| `AfterLast _ -> solo_loop state new_node lst
| `InGap (_pv, next, ap) -> ref_gap_loop state ~prev:new_node next ap lst
| `AtNext (_pv, next) -> add_allele_edge new_node next;
main_loop state ~prev:new_node ~next lst
(* In the beginning we have not 'Start'ed ->
Loop through the alignment elemends:
- discarding Boundaries and Gaps
- on a Start find the position in reference and start construction loop
- complaining on anything other than a Start *)
and start_loop previous_starts_and_ends lst =
let rec find_start_loop = function
| Boundary _ :: t
| Gap _ :: t -> find_start_loop t (* Ignore Gaps & Boundaries before Start *)
| Start p :: t -> Some ( p, t)
| [] ->
begin
match previous_starts_and_ends with
| [] -> gc "Failed to find start for %s." allele
| _t -> None
end
| s :: _ -> gc "Encountered %s in %s instead of Start"
(al_el_to_string s) allele
in
match find_start_loop lst with
| None -> previous_starts_and_ends (* fin *)
| Some (start_pos, tl) ->
let start = start_pos, allele in
let state = start, previous_starts_and_ends in
let new_node = G.V.create (S start) in
if start_pos < first_start_pos then
ref_gap_loop state ~prev:new_node first_start_node first_start_pos tl
else
(* we can think of a start as as a Gap *)
close_position_loop state ~prev:first_start_node ~next:first_start_node
~allele_node:new_node (`Gap start_pos) tl
and add_end (start, os) end_ prev tl =
add_allele_edge prev (G.V.create (E end_));
let ns = { start; end_ } :: os in
if tl = [] then
ns (* fin *)
else
start_loop ns tl
(* When the only thing that matters is the previous node. *)
and solo_loop state prev = function
| [] -> gc "No End at allele sequence: %s" allele
| Start p :: _ -> gc "Another start %d in %s allele sequence." p allele
| End end_pos :: tl -> add_end state end_pos prev tl
| Boundary { label; pos} :: t -> let boundary_node = G.V.create (B (pos, label)) in
add_allele_edge prev boundary_node;
solo_loop state boundary_node t
| Sequence { start; s} :: t -> let sequence_node = G.V.create (N (start, s)) in
add_allele_edge prev sequence_node;
solo_loop state sequence_node t
| Gap _ :: t -> solo_loop state prev t
(* When traversing a reference gap. We have to keep track of allele element's
position to check when to join back with the next reference node. *)
and ref_gap_loop state ~prev ref_node ref_pos lst = match lst with
| [] -> gc "No End at allele sequence: %s" allele
| Start p :: _ -> gc "Another start %d in %s allele sequence." p allele
| End end_pos :: tl ->
if end_pos <= ref_pos then
add_end state end_pos prev tl
else (* end_pos > ref_pos *)
let () = add_allele_edge prev ref_node in
main_loop state ~prev ~next:ref_node lst
| Boundary { label; pos} :: tl ->
if pos < ref_pos then begin
if ref_pos = first_start_pos then begin
let bn = G.V.create (B (pos, label)) in
let () = add_allele_edge prev bn in
ref_gap_loop state ~prev:bn ref_node ref_pos tl
end else
gc "Allele %s has a boundary %s at %d that is in ref gap ending %d."
allele (blts label) pos ref_pos
end else if pos = ref_pos then
if ref_node = B (pos, label) then
let () = add_allele_edge prev ref_node in
main_loop state ~prev ~next:ref_node tl
else
gc "Allele %s has a boundary %s at %d where ref gap ends %d."
allele (blts label) pos ref_pos
else (* pos > ref_pos *)
let () = add_allele_edge prev ref_node in
main_loop state ~prev ~next:ref_node lst
| Sequence { start; s} :: tl ->
if start > ref_pos then begin
add_allele_edge prev ref_node;
main_loop state ~prev ~next:ref_node lst
end else
let new_node = G.V.create (N (start, s)) in
let () = add_allele_edge prev new_node in
let close_pos = start + String.length s in
if close_pos < ref_pos then begin
ref_gap_loop state ~prev:new_node ref_node ref_pos tl
end else (* close_pos => ref_pos *)
rejoin_after_split ~prev:ref_node ~next:ref_node close_pos state
~new_node tl
| Gap { gstart; length} :: tl ->
if gstart > ref_pos then begin
add_allele_edge prev ref_node;
main_loop state ~prev ~next:ref_node lst
end else
let close_pos = gstart + length in
if close_pos < ref_pos then
ref_gap_loop state ~prev ref_node ref_pos tl
else (* close_pos >= ref_pos *)
rejoin_after_split ~prev:ref_node ~next:ref_node close_pos state
~new_node:prev tl
(* The "run-length"-encoding nature of alignment implies that our edges have
to link to the next sequence element iff they are consecutive
(ex. "A..C", "...A..C" ) as opposed to linking back to the reference! *)
and close_position_loop state ~prev ~next ~allele_node pe lst =
match test_consecutive_elements allele pe lst with
| `End (end_pos, tl) ->
add_end state end_pos allele_node tl
| `Close pos ->
rejoin_after_split pos state ~prev ~next ~new_node:allele_node lst
| `Continue (new_node_opt, new_pe, lst) ->
let allele_node =
match new_node_opt with
| None -> allele_node
| Some nn -> add_allele_edge allele_node nn; nn
in
close_position_loop state ~prev ~next ~allele_node new_pe lst
(* When not in a reference gap *)
and main_loop state ~prev ~next = function
| [] -> gc "No End at allele sequence: %s" allele
| Start p :: _ -> gc "Another start %d in %s allele sequence." p allele
| End end_pos :: tl ->
let prev =
match split_in ~prev ~next ~visit:add_allele_edge end_pos with
| `AfterLast p
| `AtNext (p, _)
| `InGap (p, _, _) -> p
in
add_end state end_pos prev tl
| Boundary { label; pos} :: t ->
let prev, next =
advance_until_boundary ~prev ~next ~visit:add_allele_edge
pos label
in
let () = add_allele_edge prev next in
main_loop state ~prev:next ~next t
| Sequence { start; s} :: t ->
let new_node = G.V.create (N (start, s)) in
let open_res = split_in ~prev ~next ~visit:add_allele_edge start in begin
match open_res with
| `AfterLast prev ->
let () = add_allele_edge prev new_node in
solo_loop state new_node t
| `InGap (prev, next, _)
| `AtNext (prev, next) ->
let () = add_allele_edge prev new_node in
let close_pos = start + String.length s in
close_position_loop ~prev ~next ~allele_node:new_node state (`Sequence close_pos) t
end
| Gap {gstart; length} :: t ->
let open_res = split_in ~prev ~next ~visit:add_allele_edge gstart in begin
match open_res with
| `AfterLast _pre ->
solo_loop state next t
| `InGap (prev, next, _)
| `AtNext (prev, next) ->
let close_pos = gstart + length in
close_position_loop ~prev ~next ~allele_node:prev state (`Gap close_pos) t
end
in
start_loop [] alt_lst
(* TODO: use a real heap/pq ds?
- The one in Graph doesn't allow you to control not putting in duplicates
*)
module NodeQueue = struct
include Set.Make(struct
type t = Nodes.t
let compare = Nodes.compare_by_position_first
end)
let add_successors g = G.fold_succ add g
let at_min_position g q =
let rec loop p q acc =
if is_empty q then
q, acc
else
let me = min_elt q in
if Nodes.position me = p then
loop p (add_successors g me (remove me q)) (me :: acc)
else
q, acc
and start q =
let me = min_elt q in
let nq = remove me q in
loop (Nodes.position me) (add_successors g me nq) [me]
in
start q
end
module FoldAtSamePosition = struct
(* We know all of the start positions based upon the bounds map; so use that
to fold upon start nodes to seed a queue. *)
let after_start_nodes g bounds =
let open Nodes in
AM.fold bounds ~init:NodeQueue.empty
~f:(fun q sep_lst allele ->
List.fold_left sep_lst ~init:q
~f:(fun q sep ->
NodeQueue.add_successors g (S (fst sep.start, allele)) q))
let step = NodeQueue.at_min_position
let nl_to_string =
string_of_list ~sep:";" ~f:Nodes.vertex_name
let nq_to_string nq =
NodeQueue.fold nq ~init:[] ~f:(fun n acc -> n :: acc)
|> nl_to_string
|> sprintf "nq: %s"
(* Fold from an initialized queue. *)
let fold_from g ~f ~init q =
let rec loop a q =
if NodeQueue.is_empty q then
a
else
let nq, amp = step g q in
(*if !debug then printf "at: %s\n%!" (nl_to_string amp); *)
let na = f a amp in
loop na nq
in
loop init q
let fold_after_starts g bounds ~f ~init =
fold_from g ~f ~init (after_start_nodes g bounds)
let node_queue_with_start_and_successors_nodes g bounds =
let open Nodes in
AM.fold bounds ~init:NodeQueue.empty
~f:(fun q sep_lst allele ->
List.fold_left sep_lst ~init:q
~f:(fun q sep ->
let s = S (fst sep.start, allele) in
let nq = NodeQueue.add s q in
NodeQueue.add_successors g s nq))
let f g bounds ~f ~init =
fold_from g ~f ~init
(node_queue_with_start_and_successors_nodes g bounds)
end (* FoldAtSamePosition *)
let range_pr bounds =
AM.fold_wa bounds ~init:(max_int, min_int)
~f:(fun p sep_lst ->
List.fold_left sep_lst ~init:p ~f:(fun (st, en) sep ->
(min st (fst sep.start)), (max en sep.end_)))
(** [range g] returns the minimum and maximum alignment position in [g]. *)
let range { bounds; _ } =
range_pr bounds
let create_by_position g bounds =
let st, _en = range_pr bounds in
let rec redirects dest num acc =
if num <= 0 then
acc
else
redirects dest (num - 1) ((Redirect dest) :: acc)
in
FoldAtSamePosition.(f g bounds ~init:(0, []) ~f:(fun (i,acc) lst ->
let p = Nodes.position (List.hd_exn lst) in
let j = p - st in
let num_redirects = j - (i + 1) in
(j, NL lst :: (redirects i num_redirects acc))))
|> snd
|> List.rev
|> Array.of_list
module JoinSameSequencePaths = struct
let debug = ref false
(* Alignment and sequence pair, set used as queue *)
module Apsq =
Set.Make(struct
type t = MSA.position * sequence
let compare (a1, s1) (a2, s2) =
let r = MSA.compare_position a1 a2 in
if r = 0 then
compare_sequence s1 s2
else
r
end)
let peak_min_position q =
if Apsq.is_empty q then
None
else
Some (fst (Apsq.min_elt q))
let at_min_position q =
let rec loop p q acc =
if Apsq.is_empty q then
q, acc
else
let me = Apsq.min_elt q in
match acc with
| [] -> loop (fst me) (Apsq.remove me q) [me]
| _ -> if fst me = p then
loop p (Apsq.remove me q) (me :: acc)
else
q, acc
in
loop min_int q []
let rec add_node_successors_only g v q =
let open Nodes in
match v with
| N (a, s) -> Apsq.add (a, s) q
| E _ -> q
| S _
| B _ -> G.fold_succ (add_node_successors_only g) g v q
let after_starts g bounds =
let open Nodes in
let add_successors = G.fold_succ (add_node_successors_only g) g in
AM.fold bounds ~init:Apsq.empty
~f:(fun q sep_lst allele ->
List.fold_left sep_lst ~init:q
~f:(fun q sep ->
add_successors (S (fst sep.start, allele)) q))
(* assume the list isn't empty *)
let find_highest_diff ~must_split lst =
if !debug then
printf "find_highest_diff: ms:%s %s\n"
(match must_split with None -> "None" | Some l -> sprintf "Some %d" l)
(string_of_list lst ~sep:";" ~f:(fun (p, s) -> sprintf "%d,%s" p s));
let init = Option.value must_split ~default:max_int in
let max_compare_length =
List.fold_left lst ~init ~f:(fun l (_p, s) ->
min l (String.length s))
in
let arr = [| 0; 0; 0; 0; 0 |] in
let clear_arr () = for i = 0 to 4 do arr.(i) <- 0 done in
let k_char_to_int = function
| 'A' -> 0
| 'C' -> 1
| 'G' -> 2
| 'T' -> 3
| 'N' -> 4
| c -> eprintf "surprising base character %c, assuming N" c; 4
in
let fill_arr index =
List.iter lst ~f:(fun (_p, s) ->
let c = String.get_exn s ~index in
let j = k_char_to_int c in
arr.(j) <- arr.(j) + 1)
in
let rec diff_loop index =
if index >= max_compare_length then
index
else begin
clear_arr (); (* clear A,C,G,T,N counts in the beginning! *)
fill_arr index;
match arr with
| [| n; 0; 0; 0; 0|] when n >= 1 -> diff_loop (index + 1)
| [| 0; n; 0; 0; 0|] when n >= 1 -> diff_loop (index + 1)
| [| 0; 0; n; 0; 0|] when n >= 1 -> diff_loop (index + 1)
| [| 0; 0; 0; n; 0|] when n >= 1 -> diff_loop (index + 1)
| [| 0; 0; 0; 0; n|] when n >= 1 -> diff_loop (index + 1)
| _ ->
if !debug then
printf "found diff %d: [| %d; %d; %d; %d; %d |]\n"
index arr.(0) arr.(1) arr.(2) arr.(3) arr.(4);
if index > 0 then index else same_loop index
end
and same_loop index =
let nindex = index + 1 in
if nindex >= max_compare_length then
index
else begin
clear_arr ();
fill_arr nindex;
let mx = Array.fold_left ~init:0 ~f:max arr in
if !debug then
printf "found diff %d: [| %d; %d; %d; %d; %d |]\n"
index arr.(0) arr.(1) arr.(2) arr.(3) arr.(4);
if mx = 1 then
same_loop nindex
else
index
end
in
diff_loop 0
let do_it aindex g bounds =
let open Nodes in
let add_successors = G.fold_succ (add_node_successors_only g) g in
let qstart = after_starts g bounds in
let unite_edges pv nv e =
try
let ecur = G.find_edge g pv nv in
let into = G.E.label ecur in
AS.unite ~into e
with Not_found ->
G.add_edge_e g (G.E.create pv e nv)
in
let split_and_rejoin ~index q p s =
let node = N (p, s) in
let pr = G.pred_e g node in
let su = G.succ_e g node in
G.remove_vertex g node;
let fs, sn = String.split_at s ~index in
let p1 = p + index in
let v1 = N (p, fs) in
if not (G.mem_vertex g v1) then G.add_vertex g v1;
let s_inter =
List.fold_left pr ~init:(AS.init aindex)
~f:(fun bta (p, bt, _) ->
unite_edges p v1 bt;
AS.union bt bta)
in
let v2 = N (p1, sn) in
if not (G.mem_vertex g v2) then G.add_vertex g v2;
unite_edges v1 v2 s_inter;
List.iter su ~f:(fun (_, e, s) -> unite_edges v2 s e);
Apsq.add (p1, sn) q
in
let flatten ~next_pos q = function
| [] -> q
| (p, s) :: [] ->
begin
if !debug then
printf "one %d %s next_pos %d\n%!"
p s (Option.value next_pos ~default:(-1));
match next_pos with
| Some np when inside_seq p s ~pos:np ->
split_and_rejoin ~index:(np - p) q p s
| None | Some _ ->
add_successors (N (p, s)) q
end
| (p, _) :: _ as ls ->
let must_split = Option.map ~f:(fun np -> np - p) next_pos in
let index = find_highest_diff ~must_split ls in
if !debug then begin
printf "index:\t%d must_split:\t%d\n" index (Option.value must_split ~default:(-1));
List.iter ls ~f:(fun (p, s) -> printf "%d: %s\n" p s)
end;
if index = 0 then
List.fold_left ls ~init:q ~f:(fun q (p, s) ->
if String.length s <= 1 then
add_successors (N (p, s)) q
else
split_and_rejoin ~index:1 q p s)
else
List.fold_left ls ~init:q ~f:(fun q (p, s) ->
if String.length s = index + 1 then begin
if !debug then
printf "Not splitting %d %s because length is less than %d\n" p s index;
(* Don't split to avoid an empty Node!*)
add_successors (N (p, s)) q
end else begin
if !debug then
printf "splitting %d %s at %d\n" p (index_string s index) index;
split_and_rejoin ~index q p s
end)
in
let rec loop q =
if q = Apsq.empty then
()
else
let nq, amp = at_min_position q in
let next_pos = peak_min_position nq in
if !debug then
printf "popping [%s] peaking at %d\n"
(string_of_list amp ~sep:";" ~f:(fun (p, s) -> sprintf "(%d,%s)" p s))
(Option.value next_pos ~default:(-1));
let nq = flatten ~next_pos nq amp in
loop nq
in
loop qstart