/*
Populates complete pileup statistics for multiple samples in
tandem. Provides access to these statistics for a given sample at
the current position in three views: basecall counts, (b,q,s)
triplet counts, or indel counts. Maintains an internal 'iterator'
position and provides the user with a 'next' functionality for the
iterator.
*/
#include "batch_pileup.h"
#include "ksort.h"
#include "cache.h"
#include "bam_reader.h"
#include "bam_sample_info.h"
#include "fasta.h"
#include "locus_range.h"
#include "chunk_strategy.h"
#include "htslib/sam.h"
#include <pthread.h>
#include <stdint.h>
#include <limits.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <ctype.h>
struct pbqt {
uint32_t pos;
uint32_t tid; /* 20 bits: up to 1,048,576 contigs */
unsigned char base; /* BAM encoding. use hts.h: seq_nt16_str[base]
to get the base */
unsigned char qual; /* numeric quality score */
unsigned char strand; /* 1 for positive, 0 for negative */
};
union pair {
uint32_t c[2];
uint64_t v;
};
/* packs pbqt information into 64 bits as follows: pos: 32, tid: 20,
base: 4, qual: 7, strand: 1. does not check for overflow. */
static inline khint64_t
pack_pbqt(struct pbqt t)
{
union pair pr;
pr.c[0] = t.pos;
pr.c[1] = t.tid<<12 | (uint32_t)t.base<<8 | t.qual<<1 | t.strand;
return pr.v;
}
/* unpacks the 64-bit key information into the pbqt structure. */
static inline void
unpack_pbqt(khint64_t k, struct pbqt *t)
{
union pair pr;
pr.v = k;
t->pos = pr.c[0];
t->tid = pr.c[1]>>12;
t->base = pr.c[1]>>8 & 0x0f;
t->qual = pr.c[1]>>1 & 0x7f;
t->strand = pr.c[1] & 0x01;
}
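/* a minimal sketch (not part of the original API) that round-trips a
   tuple through pack/unpack to exercise the 32/20/4/7/1 bit layout
   described above. base 4 is the BAM nt16 code for 'G'. compile
   with -DBATCH_PILEUP_EXAMPLES to include. */
#ifdef BATCH_PILEUP_EXAMPLES
static void
example_pbqt_roundtrip(void)
{
    struct pbqt in = { .pos = 123456789u, .tid = 1000,
                       .base = 4, .qual = 37, .strand = 1 };
    struct pbqt out;
    unpack_pbqt(pack_pbqt(in), &out);
    assert(out.pos == in.pos && out.tid == in.tid && out.base == in.base
           && out.qual == in.qual && out.strand == in.strand);
}
#endif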
union pos_key {
khint64_t k;
struct contig_pos v;
};
struct pos_base_count {
struct contig_pos cpos;
struct base_count bct;
};
struct pos_bqs_count {
struct contig_pos cpos;
struct bqs_count bqs_ct;
};
struct pos_indel_count {
struct contig_pos cpos;
struct indel_count ict;
};
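/* total order on indels: deletions sort before insertions, then by
   length; equal-length insertions are further ordered by sequence.
   used below to merge two samples' indel counts in tandem. */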
static int
cmp_indel(struct indel a, struct indel b)
{
int ins_cmp, len_cmp;
int rv = (ins_cmp = (int)a.is_ins - (int)b.is_ins)
? ins_cmp
: (
(len_cmp = (int)a.length - (int)b.length)
? len_cmp
: (a.is_ins
? strcmp(a.seq, b.seq)
: 0)
);
return rv;
}
/* hash type for storing bqt (basecall, quality, strand) counts at a
position. encodes the p,b,q,t tuple in the khint64_t key. */
KHASH_MAP_INIT_INT64(pbqt_h, unsigned);
/* hash type for storing basecall counts at position. uses 'union
pos_key' as the key. */
KHASH_MAP_INIT_INT64(pb_h, struct base_count);
/* hash type for storing plain old strings for insertion sequences.
needed just to provide a place to own the insertion sequences. */
KHASH_SET_INIT_STR(str_h);
/* hash type for tallying indel events. uses 'union pos_key' as the key. */
KHASH_MAP_INIT_INT64(indel_ct_h, struct indel_count_ary);
/* overlapping mates hash, key is q_name. values are duplicated
raw BAM records (allocated by bam_duplicate_buf). */
KHASH_MAP_INIT_STR(olap_h, char *);
/* complete tally statistics for the set of BAM records overlapping a
given set of locus ranges. */
struct tally_stats {
/* Once tallying is complete, pbqt_hash and indel_ct_hash have
complete statistics for loci < tally_end, and
partial statistics for loci in [tally_end, MAX). */
khash_t(pbqt_h) *pbqt_hash;
khash_t(indel_ct_h) *indel_ct_hash;
khash_t(olap_h) *overlap_hash;
/* summary statistics are compiled for positions < tally_end */
struct pos_base_count *base_ct, *base_cur, *base_end;
struct pos_bqs_count *bqs_ct, *bqs_cur, *bqs_end;
struct pos_indel_count *indel_ct, *indel_cur, *indel_end;
};
static __thread struct tls {
khash_t(str_h) *seq_hash;
unsigned n_samples;
struct contig_pos tally_end;
struct tally_stats *ts; /* tally stats for each sample */
struct contig_pos cur_pos; /* current position being queried */
/* these fields are refreshed by pileup_tally_stats() and freed by
pileup_clear_finished_stats(). they define the region of
interest. */
struct contig_fragment *refseqs, *cur_refseq;
unsigned n_refseqs;
struct base_count null_ct;
struct base_count refsam_ct[5];
struct bqs_count refsam_bqs_ct[5];
} tls;
#define FRAGMENT_MAX_INLINE_SEQLEN 10
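/* a fragment of reference sequence. for fragments of at most
   FRAGMENT_MAX_INLINE_SEQLEN bases, seq points at the inline buf,
   avoiding a heap allocation; longer fragments own a heap-allocated
   string (see pileup_load_refseq_ranges / free_refseq_ranges). */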
struct contig_fragment {
struct contig_region reg;
char buf[FRAGMENT_MAX_INLINE_SEQLEN + 1];
char *seq;
};
static struct batch_pileup_params g_bp_par;
static struct bam_filter_params bam_filter;
void
batch_pileup_init(struct bam_filter_params bf_par,
struct batch_pileup_params bp_par)
{
bam_filter = bf_par;
g_bp_par = bp_par;
}
void
batch_pileup_free()
{
}
/* used to represent the position after we run out of loci in a
chunk. also defined as the maximum possible position. tls.cur_pos
is set to this value when pileup_next_pos() is called and
tls.cur_pos is incremented out of the region of interest */
static const struct contig_pos g_end_pos = { UINT_MAX, UINT_MAX };
/* used to represent the current position after a new batch was loaded
but before pileup_next_pos() is called. tls.cur_pos is set to this
value after calling pileup_clear_stats() and also after
batch_pileup_thread_init() */
static const struct contig_pos g_unset_pos = { UINT_MAX - 1, 0 };
/* initialize this thread's tally state for n_samples samples and set
up the constant null and reference-sample statistics. */
void
batch_pileup_thread_init(unsigned n_samples, const char *fasta_file)
{
fasta_thread_init(fasta_file);
tls.seq_hash = kh_init(str_h);
tls.n_samples = n_samples;
tls.tally_end = (struct contig_pos){ 0, 0 };
tls.cur_pos = g_unset_pos;
tls.ts = malloc(n_samples * sizeof(struct tally_stats));
tls.refseqs = NULL;
tls.n_refseqs = 0;
unsigned s;
for (s = 0; s != n_samples; ++s)
tls.ts[s] = (struct tally_stats){
.pbqt_hash = kh_init(pbqt_h),
.indel_ct_hash = kh_init(indel_ct_h),
.overlap_hash = kh_init(olap_h),
.base_ct = NULL,
.base_cur = NULL,
.base_end = NULL,
.bqs_ct = NULL,
.bqs_cur = NULL,
.bqs_end = NULL,
.indel_ct = NULL,
.indel_cur = NULL,
.indel_end = NULL
};
tls.null_ct = (struct base_count){ { 0, 0, 0, 0 }, 0, 0 };
unsigned pd = g_bp_par.pseudo_depth;
tls.refsam_ct[0] = (struct base_count){ { pd, 0, 0, 0 }, 0, pd };
tls.refsam_ct[1] = (struct base_count){ { 0, pd, 0, 0 }, 0, pd };
tls.refsam_ct[2] = (struct base_count){ { 0, 0, pd, 0 }, 0, pd };
tls.refsam_ct[3] = (struct base_count){ { 0, 0, 0, pd }, 0, pd };
tls.refsam_ct[4] = (struct base_count){ { 0, 0, 0, 0 }, 0, 0 };
tls.refsam_bqs_ct[0] = (struct bqs_count){ 1<<0, 50, 0, pd };
tls.refsam_bqs_ct[1] = (struct bqs_count){ 1<<1, 50, 0, pd };
tls.refsam_bqs_ct[2] = (struct bqs_count){ 1<<2, 50, 0, pd };
tls.refsam_bqs_ct[3] = (struct bqs_count){ 1<<3, 50, 0, pd };
tls.refsam_bqs_ct[4] = (struct bqs_count){ 15, 0, 0, 0 }; /* zero counts of base 'N' */
}
void
batch_pileup_thread_free()
{
fasta_thread_free();
khiter_t it;
for (it = kh_begin(tls.seq_hash); it != kh_end(tls.seq_hash); ++it)
if (kh_exist(tls.seq_hash, it))
free((char *)kh_key(tls.seq_hash, it));
kh_destroy(str_h, tls.seq_hash);
unsigned s;
for (s = 0; s != tls.n_samples; ++s) {
struct tally_stats *ts = &tls.ts[s];
kh_destroy(pbqt_h, ts->pbqt_hash);
khash_t(indel_ct_h) *ich = ts->indel_ct_hash;
for (it = kh_begin(ich); it != kh_end(ich); ++it)
if (kh_exist(ich, it)) {
struct indel_count_ary *ia = &kh_val(ich, it);
free(ia->i);
}
kh_destroy(indel_ct_h, ich);
khash_t(olap_h) *oh = ts->overlap_hash;
for (it = kh_begin(oh); it != kh_end(oh); ++it)
if (kh_exist(oh, it)) {
free((void *)kh_key(oh, it));
free((void *)kh_val(oh, it));
}
kh_destroy(olap_h, oh);
free(ts->base_ct);
free(ts->bqs_ct);
free(ts->indel_ct);
}
}
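/* a minimal usage sketch, assuming the call order implied by the
   comments in this file and by batch_pileup.h: global init once,
   per-thread init once, then per-chunk tallying and querying. the
   parameters and the chunk loop are placeholders supplied by the
   caller. compile with -DBATCH_PILEUP_EXAMPLES to include. */
#ifdef BATCH_PILEUP_EXAMPLES
static void
example_thread_lifecycle(struct bam_filter_params bf_par,
                         struct batch_pileup_params bp_par,
                         const char *fasta_file, unsigned n_samples)
{
    batch_pileup_init(bf_par, bp_par);               /* once, before workers start */
    batch_pileup_thread_init(n_samples, fasta_file); /* once in each worker thread */
    /* per chunk: pileup_load_refseq_ranges(bsi), then
       pileup_tally_stats(bam, bsi, s) for each sample s, then
       iterate loci with pileup_next_pos() ... */
    batch_pileup_thread_free();
    batch_pileup_free();
}
#endif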
static int
infer_read_pair_overlap(bam1_t *b1, bam1_t *b2);
static void
tweak_overlap_quality(int left_off,
bam1_t *b0, bam1_t *b1,
unsigned min_clash_qual);
static void
process_bam_block(char *rec, char *end,
const struct contig_region *qbeg,
const struct contig_region *qend,
struct contig_span loaded_span,
struct tally_stats *ts);
/* load specific ranges of reference sequence. [qbeg, qend) defines
the total set of (non-overlapping) query ranges to consider;
bsi->loaded_span selects the subset of those ranges, clipped to the
span, that is loaded into tls.refseqs. assumes each interval in
[qbeg, qend) lies on one contig, though the span may cross
contigs. */
void
pileup_load_refseq_ranges(struct bam_scanner_info *bsi)
{
if (cs_stats.n_query_regions == 0) return;
const struct contig_region *q, *qlo, *qhi;
const struct contig_region
*qbeg = cs_stats.query_regions,
*qend = qbeg + cs_stats.n_query_regions;
find_intersecting_span(qbeg, qend, bsi->loaded_span, &qlo, &qhi);
if (qlo == qhi) return;
/* copy the intersecting regions [qlo, qhi) into tls.refseqs */
unsigned r = 0, alloc = 0;
for (q = qlo; q != qhi; ++q) {
ALLOC_GROW(tls.refseqs, r + 1, alloc);
tls.refseqs[r++] = (struct contig_fragment){ .reg = *q, .seq = NULL };
}
tls.n_refseqs = r;
/* alter ranges of first and last (may be the same range) */
struct contig_fragment *adj = &tls.refseqs[0];
struct contig_pos new_beg =
MAX_CONTIG_POS(CONTIG_REGION_BEG(adj->reg), bsi->loaded_span.beg);
assert(new_beg.tid == adj->reg.tid);
adj->reg.beg = new_beg.pos;
/* adjust last fragment */
adj = &tls.refseqs[tls.n_refseqs - 1];
struct contig_pos new_end =
MIN_CONTIG_POS(CONTIG_REGION_END(adj->reg), bsi->loaded_span.end);
assert(new_end.tid == adj->reg.tid);
adj->reg.end = new_end.pos;
/* now load all sequences */
for (r = 0; r != tls.n_refseqs; ++r) {
struct contig_fragment *frag = &tls.refseqs[r];
if (frag->reg.beg == frag->reg.end) {
frag->seq = frag->buf;
*frag->buf = '\0';
} else {
char *seq =
fasta_fetch_iseq(frag->reg.tid,
frag->reg.beg,
frag->reg.end);
if ((frag->reg.end - frag->reg.beg) <= FRAGMENT_MAX_INLINE_SEQLEN) {
strcpy(frag->buf, seq);
frag->seq = frag->buf;
free(seq);
} else {
frag->seq = seq;
}
}
}
tls.cur_refseq = tls.refseqs;
}
/* release refseqs. */
static void
free_refseq_ranges()
{
unsigned r;
for (r = 0; r != tls.n_refseqs; ++r)
if (tls.refseqs[r].seq != tls.refseqs[r].buf)
free(tls.refseqs[r].seq);
free(tls.refseqs);
tls.refseqs = NULL;
tls.n_refseqs = 0;
}
/* perform the entire tally phase, for basecalls and for indels, for
one sample. updates tls.tally_end to the end of bsi->loaded_span. */
void
pileup_tally_stats(const struct managed_buf bam,
struct bam_scanner_info *bsi,
unsigned s)
{
const struct contig_region
*qbeg = cs_stats.query_regions,
*qend = qbeg + cs_stats.n_query_regions;
process_bam_block(bam.buf, bam.buf + bam.size,
qbeg, qend, bsi->loaded_span,
&tls.ts[s]);
}
/* return the 4-bit (nt16) BAM encoding of the current refbase. */
static unsigned
get_cur_refbase_code16()
{
assert(tls.cur_pos.tid == tls.cur_refseq->reg.tid);
unsigned offset = tls.cur_pos.pos - tls.cur_refseq->reg.beg;
char refbase = tls.cur_refseq->seq[offset];
return seq_nt16_table[(int)refbase];
}
/* return the 0-4 integer code (A,C,G,T,ambiguous) of the current refbase. */
static unsigned
get_cur_refbase_code5()
{
return seq_nt16_int[get_cur_refbase_code16()];
}
/* provide basecall stats for a sample at current position, or the
null statistic. */
struct base_count
pileup_current_basecalls(unsigned s)
{
if (s == REFERENCE_SAMPLE)
return tls.refsam_ct[get_cur_refbase_code5()];
struct tally_stats *ts = &tls.ts[s];
assert(ts->base_cur == ts->base_end ||
cmp_contig_pos(tls.cur_pos, ts->base_cur->cpos) < 1);
if (ts->base_cur == ts->base_end
|| cmp_contig_pos(tls.cur_pos, ts->base_cur->cpos) == -1)
return tls.null_ct;
else
return ts->base_cur->bct;
}
static int
pos_bqs_count_less(struct pos_bqs_count a, struct pos_bqs_count b)
{
return cmp_contig_pos(a.cpos, b.cpos) == -1;
}
KSORT_INIT(pbqt_sort, struct pos_bqs_count, pos_bqs_count_less);
/* transform the contents of the pbqt hash into a position-sorted
array of bqs counts, keeping only positions before tls.tally_end. */
void
pileup_prepare_bqs(unsigned s)
{
struct tally_stats *ts = &tls.ts[s];
khash_t(pbqt_h) *ph = ts->pbqt_hash;
unsigned n = kh_size(ph);
ts->bqs_ct = realloc(ts->bqs_ct, n * sizeof(struct pos_bqs_count));
khiter_t it;
unsigned i;
struct pbqt tup;
khint64_t key;
unsigned ct;
struct contig_pos pos;
for (it = kh_begin(ph), i = 0; it != kh_end(ph); ++it) {
if (! kh_exist(ph, it)) continue;
key = kh_key(ph, it);
unpack_pbqt(key, &tup);
pos = (struct contig_pos){ tup.tid, tup.pos };
if (cmp_contig_pos(pos, tls.tally_end) == -1) {
ct = kh_val(ph, it);
ts->bqs_ct[i++] = (struct pos_bqs_count){
.cpos = { tup.tid, tup.pos },
.bqs_ct = { .base = tup.base,
.qual = tup.qual,
.strand = tup.strand,
.ct = ct }
};
}
}
ks_introsort(pbqt_sort, i, ts->bqs_ct);
ts->bqs_cur = ts->bqs_ct;
ts->bqs_end = ts->bqs_ct + i;
}
/* provide (b,q,s) stats for a sample at current position. *cts is
reallocated as necessary. *n_cts set to number of distinct stats
that are populated. complete statistics are provided; there is no
filtering by quality score. */
void
pileup_current_bqs(unsigned s, struct bqs_count **cts, unsigned *n_cts)
{
if (s == REFERENCE_SAMPLE) {
*n_cts = 1;
*cts = realloc(*cts, 1 * sizeof(struct bqs_count));
(*cts)[0] = tls.refsam_bqs_ct[get_cur_refbase_code5()];
return;
}
struct tally_stats *ts = &tls.ts[s];
struct pos_bqs_count *pc = ts->bqs_cur;
while (pc != ts->bqs_end &&
cmp_contig_pos(pc->cpos, tls.cur_pos) == 0)
++pc;
unsigned n = pc - ts->bqs_cur;
*n_cts = n;
*cts = realloc(*cts, n * sizeof(struct bqs_count));
pc = ts->bqs_cur;
unsigned i;
for (i = 0; i != n; ++i, ++pc)
(*cts)[i] = pc->bqs_ct;
}
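/* a minimal caller sketch: since *cts is realloc'd in place, the
   caller starts from a NULL pointer (or a buffer from a previous
   call) and frees it once after the last call, mirroring the usage
   in pileup_current_data below. sample index 0 is a placeholder.
   compile with -DBATCH_PILEUP_EXAMPLES to include. */
#ifdef BATCH_PILEUP_EXAMPLES
static void
example_query_bqs(void)
{
    struct bqs_count *cts = NULL;
    unsigned n_cts, i, depth = 0;
    pileup_current_bqs(0, &cts, &n_cts);
    for (i = 0; i != n_cts; ++i)
        depth += cts[i].ct; /* unfiltered depth at the current position */
    (void)depth;
    free(cts);
}
#endif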
/* provide indel stats for a sample at current position. *cts is
reallocated as necessary, *n_cts is set to the number of distinct
indels found. */
void
pileup_current_indels(unsigned s, struct indel_count **cts, unsigned *n_cts)
{
if (s == REFERENCE_SAMPLE) {
*n_cts = 0;
return;
}
/* pre-scan to find total number of distinct indels (use linear
scan since this will be very small) */
struct tally_stats *ts = &tls.ts[s];
struct pos_indel_count *pc = ts->indel_cur;
while (pc != ts->indel_end &&
cmp_contig_pos(pc->cpos, tls.cur_pos) == 0)
++pc;
unsigned n = pc - ts->indel_cur;
*cts = realloc(*cts, n * sizeof(struct indel_count));
*n_cts = n;
unsigned i = 0;
pc = ts->indel_cur;
while (i != n)
(*cts)[i++] = pc++->ict;
}
/* merge information in indel counts 1 and 2, producing counts of
pairs based on indel type. cts1 and cts2 are the indels from two
samples at a given genomic position. */
void
pileup_make_indel_pairs(struct indel_count *cts1, unsigned n_cts1,
struct indel_count *cts2, unsigned n_cts2,
struct indel_pair_count **pair_cts, unsigned *n_pair_cts)
{
unsigned max_n_events = n_cts1 + n_cts2 + 2;
*pair_cts = realloc(*pair_cts, max_n_events * sizeof(struct indel_pair_count));
struct indel_count
*ic0 = cts1, *ie0 = ic0 + n_cts1,
*ic1 = cts2, *ie1 = ic1 + n_cts2;
struct indel_pair_count *ip = *pair_cts;
/* traverse both arrays in tandem, advancing one or both depending
on comparison order. */
int cmp;
while (ic0 != ie0 || ic1 != ie1) {
/* <0: first-in-pair only
0 : both pairs
>0 : second-in-pair only */
cmp = (ic1 == ie1 ? -1
: (ic0 == ie0 ? 1
: cmp_indel(ic0->idl, ic1->idl)));
if (cmp < 0) {
ip->count[0] = ic0->ct;
ip->count[1] = 0;
ip->indel = ic0++->idl;
} else if (cmp == 0) {
ip->count[0] = ic0->ct;
ip->count[1] = ic1->ct;
ip->indel = ic0->idl; ++ic0; ++ic1;
} else {
ip->count[1] = ic1->ct;
ip->count[0] = 0;
ip->indel = ic1++->idl;
}
++ip;
}
*n_pair_cts = ip - *pair_cts;
assert(*n_pair_cts <= max_n_events);
}
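/* a minimal sketch of the tandem merge, with field values that are
   illustrative only and inputs sorted consistently with cmp_indel:
   a 2-base deletion private to sample 1 plus an insertion of "A"
   shared by both samples yields two pair entries. compile with
   -DBATCH_PILEUP_EXAMPLES to include. */
#ifdef BATCH_PILEUP_EXAMPLES
static void
example_indel_pairs(void)
{
    struct indel del2 = { .is_ins = 0, .length = 2 };
    struct indel insA = { .is_ins = 1, .length = 1, .seq = "A" };
    struct indel_count s1[] = { { .idl = del2, .ct = 5 },
                                { .idl = insA, .ct = 2 } };
    struct indel_count s2[] = { { .idl = insA, .ct = 7 } };
    struct indel_pair_count *pairs = NULL;
    unsigned n_pairs;
    pileup_make_indel_pairs(s1, 2, s2, 1, &pairs, &n_pairs);
    assert(n_pairs == 2);          /* del2 -> {5,0}, insA -> {2,7} */
    free(pairs);
}
#endif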
/* render a basecall as a pileup call character: ',' or '.' for a
reference match on the negative or positive strand respectively,
otherwise the base letter in lower or upper case. */
static inline char
bqs_count_to_call(struct bqs_count bc, unsigned refbase_i)
{
static char match[] = ",.";
return bc.base == refbase_i
? match[bc.strand]
: (bc.strand
? seq_nt16_str[bc.base]
: tolower(seq_nt16_str[bc.base]));
}
/* convert a numeric base quality to its printable Phred+33 character, e.g. 30 -> '?' */
static inline char
qual_to_char(unsigned q)
{
return (char)(q + (unsigned)'!');
}
/* reify the indel, allocating and returning an indel_seq, using the
current position. Note: this function doesn't actually need the
current position in order to retrieve sequences for insertions, but
it does for deletions. */
struct indel_seq *
pileup_current_indel_seq(struct indel *idl)
{
struct indel_seq *isq = malloc(sizeof(struct indel_seq) + idl->length + 1);
isq->is_ins = idl->is_ins;
if (isq->is_ins) {
strcpy(isq->seq, idl->seq);
} else {
char *del_seq = fasta_fetch_iseq(tls.cur_pos.tid,
tls.cur_pos.pos,
tls.cur_pos.pos + idl->length);
strcpy(isq->seq, del_seq);
free(del_seq);
}
return isq;
}
/* produce pileup data (calls, quals, and read depths) from current
position for sample s. manage buffer reallocation of pd fields. */
void
pileup_current_data(unsigned s, struct pileup_data *pd)
{
if (s == REFERENCE_SAMPLE) {
pd->calls.size = 3;
ALLOC_GROW(pd->calls.buf, pd->calls.size, pd->calls.alloc);
strncpy(pd->calls.buf, "REF", 3);
pd->quals.size = 3;
ALLOC_GROW(pd->quals.buf, pd->quals.size, pd->quals.alloc);
strncpy(pd->quals.buf, "REF", 3);
pd->n_match_lo_q = 0;
pd->n_match_hi_q = g_bp_par.pseudo_depth;
pd->n_indel = 0;
return;
}
struct pileup_locus_info pli;
pileup_current_info(&pli);
unsigned n_bqs_ct;
struct bqs_count *bqs_ct = NULL;
pileup_current_bqs(s, &bqs_ct, &n_bqs_ct);
unsigned b, n_calls = 0;
pd->n_match_lo_q = 0;
pd->n_match_hi_q = 0;
for (b = 0; b != n_bqs_ct; ++b)
if (bqs_ct[b].qual < bam_filter.min_base_quality)
pd->n_match_lo_q += bqs_ct[b].ct;
else
pd->n_match_hi_q += bqs_ct[b].ct;
n_calls = pd->n_match_lo_q + pd->n_match_hi_q;
pd->quals.size = 0;
ALLOC_GROW(pd->quals.buf, n_calls, pd->quals.alloc);
struct indel_count *indel_ct = NULL;
unsigned n_indel_ct, indel_len_total = 0;
pileup_current_indels(s, &indel_ct, &n_indel_ct);
pd->n_indel = 0;
for (b = 0; b != n_indel_ct; ++b) {
indel_len_total += indel_ct[b].ct * (indel_ct[b].idl.length + 10);
pd->n_indel += indel_ct[b].ct;
}
pd->calls.size = 0;
ALLOC_GROW(pd->calls.buf, n_calls + indel_len_total, pd->calls.alloc);
/* print out the calls */
unsigned refbase_i = get_cur_refbase_code16();
unsigned p = 0, p_cur, p_end;
for (b = 0; b != n_bqs_ct; ++b) {
p_end = p + bqs_ct[b].ct;
char bc = bqs_count_to_call(bqs_ct[b], refbase_i);
char qs = qual_to_char(bqs_ct[b].qual);
p_cur = p;
for (; p != p_end; ++p) pd->calls.buf[p] = bc;
for (p = p_cur; p != p_end; ++p) pd->quals.buf[p] = qs;
assert(p_end <= pd->calls.alloc);
assert(p_end <= pd->quals.alloc);
}
pd->quals.size = p;
pd->calls.size = p;
assert(bqs_ct != NULL);
free(bqs_ct);
/* print out all indel representations */
struct indel_seq *isq;
static char sign[] = "-+";
unsigned c;
p = pd->calls.size;
for (b = 0; b != n_indel_ct; ++b) {
isq = pileup_current_indel_seq(&indel_ct[b].idl);
if (indel_ct[b].idl.is_ins)
assert(indel_ct[b].idl.length == strlen(isq->seq));
for (c = 0; c != indel_ct[b].ct; ++c)
p += sprintf(pd->calls.buf + p, "%c%zu%s",
sign[(int)isq->is_ins],
strlen(isq->seq), isq->seq);
free(isq);
}
pd->calls.size = p;
assert(indel_ct != NULL);
free(indel_ct);
}
void
init_pileup_data(struct pileup_data *pd)
{
pd->calls = (struct managed_buf){ NULL, 0, 0 };
pd->quals = (struct managed_buf){ NULL, 0, 0 };
pd->n_match_lo_q = 0;
pd->n_match_hi_q = 0;
pd->n_indel = 0;
}
void
free_pileup_data(struct pileup_data *pd)
{
if (pd->calls.buf != NULL) {
free(pd->calls.buf);
pd->calls.alloc = 0;
}
if (pd->quals.buf != NULL) {
free(pd->quals.buf);
pd->quals.alloc = 0;
}
}
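/* a minimal lifecycle sketch for pileup_data: zero-initialize, fill
   at the current position, then release. sample index 0 is a
   placeholder. compile with -DBATCH_PILEUP_EXAMPLES to include. */
#ifdef BATCH_PILEUP_EXAMPLES
static void
example_pileup_data(void)
{
    struct pileup_data pd;
    init_pileup_data(&pd);
    pileup_current_data(0, &pd);
    /* pd.calls.buf and pd.quals.buf now hold pd.calls.size and
       pd.quals.size bytes respectively; sizes are tracked in .size
       and the buffers are not guaranteed NUL-terminated */
    free_pileup_data(&pd);
}
#endif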
static void
incr_indel_count_aux(struct tally_stats *ts,
struct contig_pos pos,
int len,
uint8_t *bam_seq,
unsigned bam_rec_start);
static void
incr_insert_count(struct tally_stats *ts,
struct contig_pos pos,
unsigned len,
uint8_t *bam_seq,
unsigned bam_rec_start)
{
return incr_indel_count_aux(ts, pos, (int)len, bam_seq, bam_rec_start);
}
static void
incr_delete_count(struct tally_stats *ts,
struct contig_pos pos,
unsigned len)
{
return incr_indel_count_aux(ts, pos, -(int)len, NULL, 0);
}
/* traverse b, tallying the match blocks into ts->pbqt_hash, and the
indels into ts->indel_ct_hash and tls.seq_hash as necessary */
static void
process_bam_stats(bam1_t *b, struct tally_stats *ts)
{
int32_t tid = b->core.tid, qpos = 0, rpos = b->core.pos, q, r;
unsigned c;
uint32_t *cigar = bam_get_cigar(b), op, ln;
unsigned strand = bam_is_rev(b) ? 0 : 1;
int ret;
khash_t(pbqt_h) *ph = ts->pbqt_hash;
uint8_t *bam_seq = bam_get_seq(b);
uint8_t *bam_qual = bam_get_qual(b);
/* initialize the constant fields of 'stat' here; the varying
fields and the packed key are filled in inside the loop */
khint64_t key;
struct pbqt stat = { .tid = tid, .strand = strand };
for (c = 0; c != b->core.n_cigar; ++c) {
op = bam_cigar_op(cigar[c]);
ln = bam_cigar_oplen(cigar[c]);
khiter_t it;
int32_t qe;
if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
qe = qpos + ln;
for (q = qpos, r = rpos; q != qe; ++q, ++r) {
stat.pos = r;
stat.base = bam_seqi(bam_seq, q);
stat.qual = bam_qual[q];
key = pack_pbqt(stat);
it = kh_put(pbqt_h, ph, key, &ret);
if (ret != 0)
kh_val(ph, it) = 0;
kh_val(ph, it)++;
}
} else if (op == BAM_CINS) {
struct contig_pos ins_pos = { tid, rpos };
incr_insert_count(ts, ins_pos, ln, bam_seq, qpos);
} else if (op == BAM_CDEL) {
struct contig_pos del_pos = { tid, rpos };
incr_delete_count(ts, del_pos, ln);
} else
; /* all other operations (N, S, H, P) tally nothing */
/* but every op still advances the query and/or reference position
according to its consume flags: e.g. for CIGAR 3S5M2D4M, qpos
advances 3+5+0+4 and rpos advances 0+5+2+4 */
if (bam_cigar_type(op) & 1) qpos += ln;
if (bam_cigar_type(op) & 2) rpos += ln;
}
}
/* records are provided in [rec, end), ascending by start position.
these are the minimal set of records that bam_scanner identified as
overlapping the region of interest, defined by the intersection of
'loaded_span' and [qbeg, qend). due to the limited resolution of the
BSI index, [rec, end) may also contain records that do not overlap
the region of interest.
loads each record and tallies its statistics if the record overlaps
the region of interest. maintains a map of possibly overlapping read
pairs and resolves their quality scores. sets tls.tally_end to the
upper bound position of complete tally statistics. */
static void
process_bam_block(char *rec, char *end,
const struct contig_region *qbeg,
const struct contig_region *qend,
struct contig_span loaded_span,
struct tally_stats *ts)
{
/* this function tallies all data within loaded_span; setting
tls.tally_end to its end signals downstream consumers that
statistics are complete up to that point. */
tls.tally_end = loaded_span.end;
bam1_t b, b_mate;
int ret;
char *rec_next;
const struct contig_region *qcur, *qhi;
find_intersecting_span(qbeg, qend, loaded_span, &qcur, &qhi);
if (qcur == qhi) return;
struct contig_pos rec_beg;
khiter_t itr;
while (rec != end) {
rec_next = bam_parse(rec, &b);
if (! rec_overlaps(&b, &qcur, qhi)) {
if (qcur == qhi) break;
else {
rec = rec_next;
continue;
}
}
if (bam_rec_exclude(&b, bam_filter)) {
rec = rec_next;
continue;
}
rec_beg = (struct contig_pos){ b.core.tid, b.core.pos };
if (cmp_contig_pos(loaded_span.end, rec_beg) == -1)
break; /* no more records overlapping the region of interest. done. */
if (! (b.core.flag & BAM_FPAIRED) || ! (b.core.flag & BAM_FPROPER_PAIR)) {
/* either this read is not paired or it is paired but not
mapped in a proper pair. tally and don't store. */
process_bam_stats(&b, ts);
} else { /* b is paired and mapped in proper pair */
if (b.core.pos < b.core.mpos) { /* b is upstream mate */
if (bam_endpos(&b) < b.core.mpos) { /* does not overlap with mate */
process_bam_stats(&b, ts);
} else {
/* b overlaps mate. since b is upstream, it
shouldn't be in overlaps hash. store; do not
tally yet. */
char *qname = strdup(bam_get_qname(&b));
itr = kh_put(olap_h, ts->overlap_hash, qname, &ret);
assert(ret == 1 || ret == 2);
kh_val(ts->overlap_hash, itr) = bam_duplicate_buf(rec);
}
} else {
/* b is downstream mate. must search overlaps hash to
see if it overlaps with its upstream mate. */
itr = kh_get(olap_h, ts->overlap_hash, bam_get_qname(&b));
if (itr != kh_end(ts->overlap_hash)) {
(void)bam_parse(kh_val(ts->overlap_hash, itr), &b_mate);
/* int left_offset = infer_read_pair_overlap(&b_mate, &b); */
/* if (left_offset != -1) */
/* tweak_overlap_quality(left_offset, &b_mate, &b, g_bp_par.min_clash_qual); */
process_bam_stats(&b, ts);
process_bam_stats(&b_mate, ts);
free((void *)kh_key(ts->overlap_hash, itr));
free((void *)kh_val(ts->overlap_hash, itr));
kh_del(olap_h, ts->overlap_hash, itr);
} else {
if (b.core.pos != b.core.mpos) {
/* downstream mate does not overlap with its
upstream partner. or, upstream mate was
never loaded since it was not within the
region of interest. */
process_bam_stats(&b, ts);
} else {
/* b is first in the pair to be encountered. store it */
char *qname = strdup(bam_get_qname(&b));
itr = kh_put(olap_h, ts->overlap_hash, qname, &ret);
assert(ret == 1 || ret == 2);
kh_val(ts->overlap_hash, itr) = bam_duplicate_buf(rec);
}
}
}
}
rec = rec_next;
}
}
/* marginalize out q and t from pbqt_hash, storing results in pb_hash.
per-base counts (ct_filt) include only calls with q >=
bam_filter.min_base_quality (global var); lower-quality matches are
tallied into n_match_lo_q. pb_hash must be freed by the caller. */
static khash_t(pb_h) *
summarize_base_counts(unsigned s, struct contig_pos tally_end)
{
khint64_t src_k;
struct pbqt stat;
khash_t(pbqt_h) *src_h = tls.ts[s].pbqt_hash;
union pos_key trg_k;
khash_t(pb_h) *trg_h = kh_init(pb_h);
unsigned ct;
kh_clear(pb_h, trg_h);
khiter_t src_itr, trg_itr;
int ret;
for (src_itr = kh_begin(src_h); src_itr != kh_end(src_h); ++src_itr) {
if (! kh_exist(src_h, src_itr)) continue;
src_k = kh_key(src_h, src_itr);
unpack_pbqt(src_k, &stat);
trg_k.v = (struct contig_pos){ stat.tid, stat.pos };
if (cmp_contig_pos(trg_k.v, tally_end) != -1) continue;
ct = kh_val(src_h, src_itr);
trg_itr = kh_put(pb_h, trg_h, trg_k.k, &ret);
if (ret != 0)
kh_val(trg_h, trg_itr) =
(struct base_count){ .ct_filt = { 0, 0, 0, 0 },
.n_match_lo_q = 0,
.n_match_hi_q = 0,
.n_match_fuzzy = 0 };
int pure_base = seq_nt16_int[stat.base];
if (pure_base == 4)
kh_val(trg_h, trg_itr).n_match_fuzzy += ct;
else {
if (stat.qual >= bam_filter.min_base_quality) {
kh_val(trg_h, trg_itr).ct_filt[pure_base] += ct;
kh_val(trg_h, trg_itr).n_match_hi_q += ct;
}
else
kh_val(trg_h, trg_itr).n_match_lo_q += ct;
}
}
return trg_h;
}