Skip to content

Commit

Permalink
First pass at seq-specific bias correction
Browse files Browse the repository at this point in the history
  • Loading branch information
rob-p committed Dec 14, 2015
1 parent 920f559 commit be0760e
Show file tree
Hide file tree
Showing 11 changed files with 614 additions and 356 deletions.
169 changes: 16 additions & 153 deletions include/KmerDist.hpp
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -23,10 +23,8 @@
#include <map> #include <map>
#include <algorithm> #include <algorithm>
#include <cstdint> #include <cstdint>
// from http://stackoverflow.com/questions/17719674/c11-fast-constexpr-integer-powers #include "UtilityFunctions.hpp"
constexpr int64_t constExprPow(int64_t base, unsigned int exp, int64_t result = 1) {
return exp < 1 ? result : constExprPow(base*base, exp/2, (exp % 2) ? result*base : result);
}
// templatized on the k-mer length // templatized on the k-mer length
template <unsigned int K, typename CountT=uint32_t> template <unsigned int K, typename CountT=uint32_t>
class KmerDist { class KmerDist {
Expand All @@ -37,147 +35,15 @@ class KmerDist {
//constexpr const uint64_t numKmers{constExprPow(4, K)}; //constexpr const uint64_t numKmers{constExprPow(4, K)};
std::array<CountT, constExprPow(4,K)> counts; std::array<CountT, constExprPow(4,K)> counts;
std::map<uint64_t,bool> hexamers; std::map<uint64_t,bool> hexamers;

const uint64_t numPossKmers = constExprPow(4,K); const uint64_t numPossKmers = constExprPow(4,K);

KmerDist() : haveBias_(false) { KmerDist() : haveBias_(false) {
// set a pseudo-count for each k-mer // set a pseudo-count for each k-mer
for (size_t i = 0; i < counts.size(); ++i) { counts[i] = 0; } for (size_t i = 0; i < counts.size(); ++i) { counts[i] = 0; }
} }
inline uint64_t indexForKmer(const char* s, bool reverseComplement) {
// The index we'll return
uint64_t idx{0};
// The number of bits we need to shift the
// current mask to the left.
uint64_t pos{0};
if(!reverseComplement)
{
for (size_t i = 0; i < K; ++i) {
char c = s[i];
switch (c) {
case 'A':
case 'a':
// don't need to do anything; or'ing with 0
break;
case 'C':
case 'c':
idx |= (0x1 << pos);
break;
case 'G':
case 'g':
idx |= (0x2 << pos);
break;
case 'T':
case 't':
case 'U':
case 'u':
idx |= (0x3 << pos);
break;
default:
// treat anything else (e.g. an 'N' as an 'A')
break;
}
pos += 2;
}
}
else
{
for(size_t i=K-1 ; i>=0 ; i--)
{
switch(s[i])
{
case 'T':
case 't':
case 'u':
case 'U': break;
case 'C':
case 'c': idx |= (0x2 << pos);
break;
case 'G':
case 'g': idx |= (0x1 << pos);
break;
case 'A':
case 'a': idx |= (0x3 <<pos);
break;
}
pos+=2;
}
}
return idx;
}

inline uint64_t nextKmerIndex(uint64_t idx, const char *next,bool reverseComplement)
{
idx = idx & (0x3FF);
idx = idx << 2;
char n=*next;
if(reverseComplement)
{
switch(n)
{
case 'A':
case 'a': n='T'
break;
case 'C':
case 'c': n='G';break;
case 'g':
case 'G': n='C';break;
case 'T':
case 't':
case 'U':
case 'u': n='A';break;


}
}
switch(n)
{
case 'A':
case 'a': break;
case 'C':
case 'c': idx = idx + 1;
break;
case 'G':
case 'g': idx = idx + 2;
break;
case 'T':
case 't':
case 'U':
case 'u':
idx = idx + 3;
break;
}
return idx;
}

inline std::string kmerForIndex(uint64_t idx) {
std::string kmer(K, 'X');
// The number of bits we need to shift the
// current mask to the left.
uint64_t pos{0};
for (size_t i = 0; i < K; ++i) {
uint8_t c = (idx >> pos) & 0x3;
switch (c) {
case 0:
kmer[i] = 'A';
break;
case 1:
kmer[i] = 'C';
break;
case 2:
kmer[i] = 'G';
break;
case 3:
kmer[i] = 'T';
break;
default:
break;
}
pos += 2;
}
return kmer;
}

inline void addKmers(const char* s, const char* end, bool reverseComplement) { inline void addKmers(const char* s, const char* end, bool reverseComplement) {
// Make sure there are at least k bases left // Make sure there are at least k bases left


Expand All @@ -192,38 +58,35 @@ class KmerDist {
} }
return true; return true;
*/ */
if(!reverseComplement) if(!reverseComplement) {
{ auto idx = indexForKmer(start, K, false);
auto idx = indexForKmer(start,false);
hexamers[idx]=true; hexamers[idx]=true;
counts[idx]++; counts[idx]++;
uint32_t i=1; uint32_t i=1;
while(std::distance(start+i,end)>=K) { while(std::distance(start+i,end)>=K) {

idx = nextKmerIndex(idx,start+i,false); idx = nextKmerIndex(idx, start+i, K, false);
auto it = hexamers.find(idx); auto it = hexamers.find(idx);
if(it==hexamers.end()) if(it==hexamers.end())
hexamers[idx] = true; hexamers[idx] = true;
counts[idx]++; counts[idx]++;
i++; i++;

} }
} } else {
else auto idx = indexForKmer(end-1, K, true);
{
auto idx = indexForKmer(end-1,true);
hexamers[idx]=true; hexamers[idx]=true;
counts[idx]++; counts[idx]++;
uint32_t i=2; uint32_t i=2;
while(end-i>=start) while(end-i>=start)
{ {
idx = nextKmerIndex(idx,end-i,true); idx = nextKmerIndex(idx, end-i, K, true);
auto it = hexamers.find(idx); auto it = hexamers.find(idx);
if(it==hexamers.end()) if(it==hexamers.end())
hexamers[idx] = true; hexamers[idx] = true;
counts[idx]++; counts[idx]++;
i--; i--;

} }
} }
} }
Expand All @@ -232,4 +95,4 @@ class KmerDist {
//std::vector<Bias> biases_; //std::vector<Bias> biases_;
//std::unordered_map<std::string, size_t> offsetMap_; //std::unordered_map<std::string, size_t> offsetMap_;
}; };
#endif // KMER_DIST_HPP__ #endif // KMER_DIST_HPP__
14 changes: 14 additions & 0 deletions include/ReadExperiment.hpp
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@
#include <memory> #include <memory>
#include <fstream> #include <fstream>


//S_AYUSH_CODE
#include "ReadKmerDist.hpp"
//T_AYUSH_CODE

/** /**
* This class represents a library of reads used to quantify * This class represents a library of reads used to quantify
* a set of target transcripts. * a set of target transcripts.
Expand Down Expand Up @@ -139,6 +143,10 @@ class ReadExperiment {
std::vector<ReadLibrary>& readLibraries() { return readLibraries_; } std::vector<ReadLibrary>& readLibraries() { return readLibraries_; }
//FragmentLengthDistribution* fragmentLengthDistribution() { return fragLengthDist_.get(); } //FragmentLengthDistribution* fragmentLengthDistribution() { return fragLengthDist_.get(); }


//S_AYUSH_CODE
ReadKmerDist<6, std::atomic<uint32_t>>& readBias() { return readBias_; }
//T_AYUSH_CODE

private: private:
/** /**
* The file from which the alignments will be read. * The file from which the alignments will be read.
Expand Down Expand Up @@ -167,6 +175,12 @@ class ReadExperiment {
double effectiveMappingRate_{0.0}; double effectiveMappingRate_{0.0};
//std::unique_ptr<FragmentLengthDistribution> fragLengthDist_; //std::unique_ptr<FragmentLengthDistribution> fragLengthDist_;
EquivalenceClassBuilder eqBuilder_; EquivalenceClassBuilder eqBuilder_;

//S_AYUSH_CODE
// Since multiple threads can touch this dist, we
// need atomic counters.
ReadKmerDist<6, std::atomic<uint32_t>> readBias_;
//T_AYUSH_CODE
}; };


#endif // EXPERIMENT_HPP #endif // EXPERIMENT_HPP
Loading

0 comments on commit be0760e

Please sign in to comment.