Permalink
Browse files

Multiple changes:

1. Removed bwa.{h,c}. I am not going to finish them anyway.
2. Updated to the latest khash.h, which should be faster.
3. Define 64-bit vector and 128-bit integer/vector in utils.h.
  • Loading branch information...
lh3 committed Feb 12, 2013
1 parent 13288e2 commit e5ab59db5327f628bf688952e1c16f4a4f038b4f
Showing with 261 additions and 549 deletions.
  1. +2 −3 Makefile
  2. +0 −272 bwa.c
  3. +0 −107 bwa.h
  4. +6 −9 bwamem_pair.c
  5. +12 −21 bwape.c
  6. +2 −2 bwtsw2_pair.c
  7. +193 −89 khash.h
  8. +35 −44 utils.c
  9. +11 −2 utils.h
View
@@ -1,10 +1,9 @@
-CC= gcc
-CXX= g++
+CC= clang
CFLAGS= -g -Wall -O2
CXXFLAGS= $(CFLAGS)
AR= ar
DFLAGS= -DHAVE_PTHREAD #-D_NO_SSE2 #-D_FILE_OFFSET_BITS=64
-LOBJS= bwa.o bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o bwamem_pair.o stdaln.o \
+LOBJS= bamlite.o utils.o bwt.o bwtio.o bwtaln.o bwtgap.o bntseq.o bwamem.o bwamem_pair.o stdaln.o \
bseq.o bwaseqio.o bwase.o kstring.o
AOBJS= QSufSort.o bwt_gen.o \
is.o bwtmisc.o bwtindex.o ksw.o simple_dp.o \
View
272 bwa.c
@@ -1,272 +0,0 @@
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-#include <math.h>
-#include "bwa.h"
-#include "bwt.h"
-#include "bwtgap.h"
-#include "bntseq.h"
-
-#ifndef kroundup32
-#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
-#endif
-
-extern unsigned char nst_nt4_table[256];
-extern void seq_reverse(int len, uint8_t *seq, int is_comp);
-
-bwa_opt_t bwa_def_opt = { 11, 4, -1, 1, 6, 32, 2, 0.04 };
-
-struct bwa_idx_t {
- bwt_t *bwt;
- bntseq_t *bns;
- uint8_t *pac;
-};
-
-struct bwa_buf_t {
- int max_buf;
- bwa_pestat_t pes;
- gap_stack_t *stack;
- gap_opt_t *opt;
- int *diff_tab;
- uint8_t *buf;
- int *logn;
-};
-
-bwa_idx_t *bwa_idx_load(const char *prefix)
-{
- bwa_idx_t *p;
- int l;
- char *str;
- l = strlen(prefix);
- p = calloc(1, sizeof(bwa_idx_t));
- str = malloc(l + 10);
- strcpy(str, prefix);
- p->bns = bns_restore(str);
- strcpy(str + l, ".bwt");
- p->bwt = bwt_restore_bwt(str);
- str[l] = 0;
- strcpy(str + l, ".sa");
- bwt_restore_sa(str, p->bwt);
- free(str);
- p->pac = calloc(p->bns->l_pac/4+1, 1);
- fread(p->pac, 1, p->bns->l_pac/4+1, p->bns->fp_pac);
- fclose(p->bns->fp_pac);
- p->bns->fp_pac = 0;
- return p;
-}
-
-void bwa_idx_destroy(bwa_idx_t *p)
-{
- bns_destroy(p->bns);
- bwt_destroy(p->bwt);
- free(p->pac);
- free(p);
-}
-
-bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score)
-{
- extern gap_opt_t *gap_init_opt(void);
- extern int bwa_cal_maxdiff(int l, double err, double thres);
- int i;
- bwa_buf_t *p;
- p = malloc(sizeof(bwa_buf_t));
- p->stack = gap_init_stack2(max_score);
- p->opt = gap_init_opt();
- p->opt->s_gapo = opt->s_gapo;
- p->opt->s_gape = opt->s_gape;
- p->opt->max_diff = opt->max_diff;
- p->opt->max_gapo = opt->max_gapo;
- p->opt->max_gape = opt->max_gape;
- p->opt->seed_len = opt->seed_len;
- p->opt->max_seed_diff = opt->max_seed_diff;
- p->opt->fnr = opt->fnr;
- p->diff_tab = calloc(BWA_MAX_QUERY_LEN, sizeof(int));
- for (i = 1; i < BWA_MAX_QUERY_LEN; ++i)
- p->diff_tab[i] = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr);
- p->logn = calloc(256, sizeof(int));
- for (i = 1; i != 256; ++i)
- p->logn[i] = (int)(4.343 * log(i) + 0.499);
- return p;
-}
-
-void bwa_buf_destroy(bwa_buf_t *p)
-{
- gap_destroy_stack(p->stack);
- free(p->diff_tab); free(p->logn); free(p->opt);
- free(p);
-}
-
-bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq)
-{
- extern int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width);
- int i, seq_len, buf_len;
- bwt_width_t *w, *seed_w;
- uint8_t *s;
- gap_opt_t opt2 = *buf->opt;
- bwa_sai_t sai;
-
- seq_len = strlen(seq);
- // estimate the buffer length
- buf_len = (buf->opt->seed_len + seq_len + 1) * sizeof(bwt_width_t) + seq_len;
- if (buf_len > buf->max_buf) {
- buf->max_buf = buf_len;
- kroundup32(buf->max_buf);
- buf->buf = realloc(buf->buf, buf->max_buf);
- }
- memset(buf->buf, 0, buf_len);
- seed_w = (bwt_width_t*)buf->buf;
- w = seed_w + buf->opt->seed_len;
- s = (uint8_t*)(w + seq_len + 1);
- if (opt2.fnr > 0.) opt2.max_diff = buf->diff_tab[seq_len];
- // copy the sequence
- for (i = 0; i < seq_len; ++i)
- s[i] = nst_nt4_table[(int)seq[i]];
- seq_reverse(seq_len, s, 0);
- // mapping
- bwt_cal_width(idx->bwt, seq_len, s, w);
- if (opt2.seed_len >= seq_len) opt2.seed_len = 0x7fffffff;
- if (seq_len > buf->opt->seed_len)
- bwt_cal_width(idx->bwt, buf->opt->seed_len, s + (seq_len - buf->opt->seed_len), seed_w);
- for (i = 0; i < seq_len; ++i) // complement; I forgot why...
- s[i] = s[i] > 3? 4 : 3 - s[i];
- sai.sai = (bwa_sai1_t*)bwt_match_gap(idx->bwt, seq_len, s, w, seq_len <= buf->opt->seed_len? 0 : seed_w, &opt2, &sai.n, buf->stack);
- return sai;
-}
-
-static void compute_NM(const uint8_t *pac, uint64_t l_pac, uint8_t *seq, int64_t pos, int n_cigar, uint32_t *cigar, int *n_mm, int *n_gaps)
-{
- uint64_t x = pos, z;
- int k, y = 0;
- *n_mm = *n_gaps = 0;
- for (k = 0; k < n_cigar; ++k) {
- int l = cigar[k]>>4;
- int op = cigar[k]&0xf;
- if (op == 0) { // match/mismatch
- for (z = 0; z < l && x + z < l_pac; ++z) {
- int c = pac[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3;
- if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) ++(*n_mm);
- }
- }
- if (op == 1 || op == 2) (*n_gaps) += l;
- if (op == 0 || op == 2) x += l;
- if (op == 0 || op == 1 || op == 4) y += l;
- }
-}
-
-void bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps, bwa_aln_t *aln)
-{
- extern bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand);
- extern bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const uint8_t *seq, bwtint_t *_pos, int ext, int *n_cigar, int is_end_correct);
- int strand, seq_len, i, n_gap, n_mm;
- uint64_t pos3, pac_pos;
- uint8_t *s[2];
-
- memset(aln, 0, sizeof(bwa_aln_t));
- seq_len = strlen(seq);
- if (seq_len<<1 > buf->max_buf) {
- buf->max_buf = seq_len<<1;
- kroundup32(buf->max_buf);
- buf->buf = realloc(buf->buf, buf->max_buf);
- }
- s[0] = buf->buf;
- s[1] = s[0] + seq_len;
- for (i = 0; i < seq_len; ++i)
- s[0][i] = s[1][i] = nst_nt4_table[(int)seq[i]];
- seq_reverse(seq_len, s[1], 1);
- pac_pos = bwa_sa2pos(idx->bns, idx->bwt, sa, seq_len, &strand);
- if (strand) aln->flag |= 16;
- if (n_gaps) { // only for gapped alignment
- int n_cigar;
- bwa_cigar_t *cigar16;
- cigar16 = bwa_refine_gapped_core(idx->bns->l_pac, idx->pac, seq_len, s[strand], &pac_pos, strand? n_gaps : -n_gaps, &n_cigar, 1);
- aln->n_cigar = n_cigar;
- aln->cigar = malloc(n_cigar * 4);
- for (i = 0, pos3 = pac_pos; i < n_cigar; ++i) {
- int op = cigar16[i]>>14;
- int len = cigar16[i]&0x3fff;
- if (op == 3) op = 4; // the 16-bit CIGAR is different from the 32-bit CIGAR
- aln->cigar[i] = len<<4 | op;
- if (op == 0 || op == 2) pos3 += len;
- }
- free(cigar16);
- } else { // ungapped
- aln->n_cigar = 1;
- aln->cigar = malloc(4);
- aln->cigar[0] = seq_len<<4 | 0;
- pos3 = pac_pos + seq_len;
- }
- aln->n_n = bns_cnt_ambi(idx->bns, pac_pos, pos3 - pac_pos, &aln->ref_id);
- aln->offset = pac_pos - idx->bns->anns[aln->ref_id].offset;
- if (pos3 - idx->bns->anns[aln->ref_id].offset > idx->bns->anns[aln->ref_id].len) // read mapped beyond the end of a sequence
- aln->flag |= 4; // read unmapped
- compute_NM(idx->pac, idx->bns->l_pac, s[strand], pac_pos, aln->n_cigar, aln->cigar, &n_mm, &n_gap);
- aln->n_mm = n_mm;
- aln->n_gap = n_gap;
-}
-
-/************************
- * Single-end alignment *
- ************************/
-
-bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar)
-{
- bwa_one_t *one;
- int best, cnt, i, seq_len;
-
- seq_len = strlen(seq);
- one = calloc(1, sizeof(bwa_one_t));
- one->sai = bwa_sai(idx, buf, seq);
- if (one->sai.n == 0) return one;
- // count number of hits; randomly select one alignment
- best = one->sai.sai[0].score;
- for (i = cnt = 0; i < one->sai.n; ++i) {
- bwa_sai1_t *p = &one->sai.sai[i];
- if (p->score > best) break;
- if (drand48() * (p->l - p->k + 1 + cnt) > (double)cnt) {
- one->which = p;
- one->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48());
- }
- cnt += p->l - p->k + 1;
- }
- one->c1 = cnt;
- for (; i < one->sai.n; ++i)
- cnt += one->sai.sai[i].l - one->sai.sai[i].k + 1;
- one->c2 = cnt - one->c1;
- // estimate single-end mapping quality
- one->mapQs = -1;
- if (one->c1 == 0) one->mapQs = 23; // FIXME: is it possible?
- else if (one->c1 > 1) one->mapQs = 0;
- else {
- int diff = one->which->n_mm + one->which->n_gapo + one->which->n_gape;
- if (diff >= buf->diff_tab[seq_len]) one->mapQs = 25;
- else if (one->c2 == 0) one->mapQs = 37;
- }
- if (one->mapQs < 0) {
- cnt = (one->c2 >= 255)? 255 : one->c2;
- one->mapQs = 23 < buf->logn[cnt]? 0 : 23 - buf->logn[cnt];
- }
- one->mapQ = one->mapQs;
- // compute CIGAR on request
- one->one.ref_id = -1;
- if (gen_cigar) bwa_sa2aln(idx, buf, seq, one->sa, one->which->n_gapo + one->which->n_gape, &one->one);
- return one;
-}
-
-void bwa_one_destroy(bwa_one_t *one)
-{
- free(one->sai.sai);
- free(one->one.cigar);
- free(one);
-}
-
-/************************
- * Paired-end alignment *
- ************************/
-
-void bwa_pestat(bwa_buf_t *buf, int n, bwa_one_t **o[2])
-{
-}
-
-void bwa_pe(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq[2], bwa_one_t *o[2])
-{
-}
View
107 bwa.h
@@ -1,107 +0,0 @@
-#ifndef BWA_H_
-#define BWA_H_
-
-#include <stdint.h>
-
-#define BWA_DEF_MAX_SCORE 2048
-#define BWA_MAX_QUERY_LEN 1024
-
-// BWA index
-struct bwa_idx_t;
-typedef struct bwa_idx_t bwa_idx_t;
-
-// Buffer for BWA alignment
-struct bwa_buf_t;
-typedef struct bwa_buf_t bwa_buf_t;
-
-// BWA alignment options
-typedef struct {
- int s_gapo, s_gape; // gap open and extension penalties; the mismatch penalty is fixed at 3
- int max_diff, max_gapo, max_gape; // max differences (-1 to use fnr for length-adjusted max diff), gap opens and gap extensions
- int seed_len, max_seed_diff; // seed length and max differences allowed in the seed
- float fnr; // parameter for automatic length-adjusted max differences
-} bwa_opt_t;
-
-// default BWA alignment options
-extern bwa_opt_t bwa_def_opt; // = { 11, 4, -1, 1, 6, 32, 2, 0.04 }
-
-// an interval hit in the SA coordinate; basic unit in .sai files
-typedef struct {
- uint32_t n_mm:16, n_gapo:8, n_gape:8;
- int score;
- uint64_t k, l; // [k,l] is the SA interval; each interval has l-k+1 hits
-} bwa_sai1_t;
-
-// all interval hits in the SA coordinate
-typedef struct {
- int n; // number of interval hits
- bwa_sai1_t *sai;
-} bwa_sai_t;
-
-// an alignment
-typedef struct {
- uint32_t n_n:8, n_gap:12, n_mm:12; // number of ambiguous bases, gaps and mismatches in the alignment
- int32_t ref_id; // referece sequence index (the first seq is indexed by 0)
- uint32_t offset; // coordinate on the reference; zero-based
- uint32_t n_cigar:16, flag:16; // number of CIGAR operations; SAM flag
- uint32_t *cigar; // CIGAR in the BAM 28+4 encoding; having n_cigar operations
-} bwa_aln_t;
-
-typedef struct {
- int mapQs, mapQ, c1, c2;
- uint64_t sa;
- bwa_sai1_t *which;
- bwa_sai_t sai;
- bwa_aln_t one;
-} bwa_one_t;
-
-typedef struct {
- double avg, std, ap_prior;
- uint64_t low, high, high_bayesian;
-} bwa_pestat_t;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- // load a BWA index
- bwa_idx_t *bwa_idx_load(const char *prefix);
- void bwa_idx_destroy(bwa_idx_t *p);
-
- // allocate a BWA alignment buffer; if unsure, set opt to &bwa_def_opt and max_score to BWA_DEF_MAX_SCORE
- bwa_buf_t *bwa_buf_init(const bwa_opt_t *opt, int max_score);
- void bwa_buf_destroy(bwa_buf_t *p);
-
- /**
- * Find all the SA intervals
- *
- * @param idx BWA index; multiple threads can share the same index
- * @param buf BWA alignment buffer; each thread should have its own buffer
- * @param seq NULL terminated C string, consisting of A/C/G/T/N only
- *
- * @return SA intervals seq is matched to
- */
- bwa_sai_t bwa_sai(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq);
-
- /**
- * Construct an alignment in the base-pair coordinate
- *
- * @param idx BWA index
- * @param buf BWA alignment buffer
- * @param seq NULL terinated C string
- * @param sa Suffix array value
- * @param n_gaps Number of gaps (typically equal to bwa_sai1_t::n_gapo + bwa_sai1_t::n_gape
- *
- * @return An alignment
- */
- void bwa_sa2aln(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, uint64_t sa, int n_gaps, bwa_aln_t *aln);
-
- bwa_one_t *bwa_se(const bwa_idx_t *idx, bwa_buf_t *buf, const char *seq, int gen_cigar);
-
- void bwa_one_destroy(bwa_one_t *one);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
Oops, something went wrong.

0 comments on commit e5ab59d

Please sign in to comment.