Permalink
Browse files

Add filtered background frequency to output VCF when calling. Availab…

…le for visualizing limits of callability
  • Loading branch information...
1 parent 15f1d3b commit db245c95289b870a70857573017affcac07a715f @chapmanb chapmanb committed Apr 15, 2012
@@ -14,8 +14,10 @@
(defn vrns-by-pos
"Lazy sequence of variation calls at each position."
[rdr classifier aa-finder config]
- (letfn [(vrns-as-vc [[[contig pos] freqs count]]
- (convert-to-vc contig pos freqs :depth count :aa-finder aa-finder))]
+ (letfn [(vrns-as-vc [x]
+ (let [[contig pos] (:position x)]
+ (convert-to-vc contig pos (:calls x) :depth (:total x)
+ :aa-finder aa-finder :back-filter-freq (:back-filter-freq x))))]
(let [classifier-passes? (classifier-checker classifier config)]
(->> (raw-reads-by-pos rdr config)
(map (fn [xs] (call-vrns-at-pos xs classifier-passes? config)))
@@ -180,23 +180,31 @@
flatten
frequencies
percents))
- (remove-low [min-freq [orig total]]
+ (organize-filter-calls [min-freq [orig total]]
(let [min-freq-percent (* 100.0 min-freq)
want (->> orig
(filter #(> (second %) min-freq-percent))
- (map first))]
- (list (select-keys orig want) total)))]
+ (map first))
+ filter-calls (select-keys orig want)
+ back-filter-freq (->> orig
+ (remove (fn [[k v]] (contains? (set want) k)))
+ (sort-by second <)
+ first
+ second)]
+ {:calls filter-calls
+ :total total
+ :back-filter-freq back-filter-freq}))]
(let [position ((juxt :space :pos) (first reads))
clean-reads (remove #(= "N" (:base %)) reads)
majority-base (ffirst (sort-by second > (first (percents-by-base clean-reads))))
- [base-calls total] (->> clean-reads
- (filter (fn [xs]
- (or (= (:base xs) majority-base)
- (apply passes?
- ((juxt :qual :kmer-pct :map-score) xs)))))
- percents-by-base
- (remove-low (:min-freq config)))]
- [position base-calls total])))
+ vrns-at-pos (->> clean-reads
+ (filter (fn [xs]
+ (or (= (:base xs) majority-base)
+ (apply passes?
+ ((juxt :qual :kmer-pct :map-score) xs)))))
+ percents-by-base
+ (organize-filter-calls (:min-freq config)))]
+ (assoc vrns-at-pos :position position))))
(defn classifier-checker [classifier config]
"Calculate probability of read inclusion using pre-built classifier."
@@ -253,7 +261,7 @@
(with-open [rdr (reader data-file)]
(->> (raw-reads-by-pos rdr config)
(map (fn [xs] (call-vrns-at-pos xs (fn [_ _ _] true) config)))
- vec)))
+ doall)))
(defn assess-classifier [data-file pos-file ref-file classifier config]
"Determine rates of true/false positive/negatives with trained classifier"
@@ -262,7 +270,8 @@
(with-open [rdr (reader data-file)]
(->> (raw-reads-by-pos rdr config)
(map (fn [xs] (call-vrns-at-pos xs classifier-passes? config)))
- (map (fn [[pos bases total]] (assign-position-type pos bases total known-vrns config)))
+ (map (fn [x] (assign-position-type (:position x) (:calls x)
+ (:total x) known-vrns config)))
(remove #(nil? (:class %)))
(map (fn [xs] (if (:verbose config) (println xs)) xs))
vec))))
@@ -88,8 +88,8 @@
(defn coverage-by-pos-plot [data-file image-dir config]
"Plot of read coverage by position."
(let [raw-freqs (raw-data-frequencies data-file (assoc config :min-freq 0.0))
- cov-x (map #(-> % first second) raw-freqs)
- cov-y (map #(/ (last %) 1e6) raw-freqs)]
+ cov-x (map #(-> % :position second) raw-freqs)
+ cov-y (map #(/ (:total %) 1e6) raw-freqs)]
(doto (xy-plot cov-x cov-y
:title (fs/base-name data-file true)
:x-label "Position" :y-label "Coverage (million reads)")
@@ -58,7 +58,7 @@
(defn convert-to-vc
"Convert base frequency information into a VCF VariantContext."
- [contig pos base-freqs & {:keys [depth aa-finder]}]
+ [contig pos base-freqs & {:keys [depth aa-finder back-filter-freq]}]
(letfn [(to-alleles [bases]
(map-indexed (fn [i [base _]]
(Allele/create (str base) (= i 0)))
@@ -73,13 +73,14 @@
(#(if (> (count ordered-bases) 1)
(assoc % "AF" (to-freqs ordered-bases))
%))
- (#(if-not (nil? depth)
- (assoc % "DP" depth)
- %))
+ (#(if (nil? depth) %
+ (assoc % "DP" depth)))
(#(if (and (not (nil? aa-finder))
(> (count ordered-bases) 1))
(assoc % "AA_CHANGE" (aa-changes ordered-bases))
- %))))
+ %))
+ (#(if (nil? back-filter-freq) %
+ (assoc % "AFBACK" back-filter-freq)))))
(.make)))))
(defn- to-seq-dict [in-fasta]
@@ -100,7 +101,10 @@
VCFHeaderLineType/Integer "Total Depth")
(VCFInfoHeaderLine. "AA_CHANGE" VCFHeaderLineCount/A
VCFHeaderLineType/String
- "Amino acid change caused by a variant")}))
+ "Amino acid change caused by a variant")
+ (VCFInfoHeaderLine. "AFBACK" 1
+ VCFHeaderLineType/Float
+ "Filtered background allele frequency")}))
(defn write-vcf-calls
"Write output calls in VCF format"
@@ -90,8 +90,8 @@
(assoc :min-freq 0.0))]
(let [raw-freqs (raw-data-frequencies count-file config)]
(count raw-freqs) => 3
- (ffirst raw-freqs) => ["HXB2" 5] ; read position
- (-> raw-freqs first last) => 71169 ; number of total reads
- (-> raw-freqs first second (get "G")) => (roughly 99.9747)
- (-> raw-freqs first second (get "T")) => (roughly 0.009835))))
+ (-> raw-freqs first :position) => ["HXB2" 5] ; read position
+ (-> raw-freqs first :total) => 71169 ; number of total reads
+ (-> raw-freqs first :calls (get "G")) => (roughly 99.9747)
+ (-> raw-freqs first :calls (get "T")) => (roughly 0.009835))))

0 comments on commit db245c9

Please sign in to comment.