Skip to content

Commit

Permalink
changed _max to _any
Browse files Browse the repository at this point in the history
  • Loading branch information
ctb committed Jul 7, 2010
1 parent 2f7fff2 commit 5e3ec0d
Show file tree
Hide file tree
Showing 7 changed files with 155 additions and 19 deletions.
76 changes: 73 additions & 3 deletions lib/hashtable.cc
Expand Up @@ -81,10 +81,11 @@ MinMaxTable * Hashtable::fasta_file_to_minmax(const std::string &inputfile,
}

//
// filter_fasta_file_max: filters and trims a FASTA file into a new one
// filter_fasta_file_any: filters a FASTA file based on whether any (at
// least one) k-mer in a sequence has 'threshold' counts in the hashtable.
//

ReadMaskTable * Hashtable::filter_fasta_file_max(const std::string &inputfile,
ReadMaskTable * Hashtable::filter_fasta_file_any(const std::string &inputfile,
MinMaxTable &minmax,
BoundedCounterType threshold,
ReadMaskTable * old_readmask,
Expand Down Expand Up @@ -128,7 +129,76 @@ ReadMaskTable * Hashtable::filter_fasta_file_max(const std::string &inputfile,
// run callback, if specified
if (read_num % CALLBACK_PERIOD == 0 && callback) {
try {
callback("filter_fasta_file_max", callback_data, read_num, 0);
callback("filter_fasta_file_any", callback_data, read_num, 0);
} catch (...) {
infile.close();
throw;
}
}
}
else {
name = line.substr(1, line.length()-1);
}

isRead = isRead ? 0 : 1;
}
}

infile.close();

return readmask;
}

//
// filter_fasta_file_all: filters a FASTA file based on whether all
// k-mers in a sequence have at least 'threshold' counts in the hashtable.
//

ReadMaskTable * Hashtable::filter_fasta_file_all(const std::string &inputfile,
MinMaxTable &minmax,
BoundedCounterType threshold,
ReadMaskTable * old_readmask,
CallbackFn callback,
void * callback_data)

{
string line;
ifstream infile(inputfile.c_str());
int isRead = 0;
string name;
string seq;
unsigned int read_num = 0;
ReadMaskTable * readmask = new ReadMaskTable(minmax.get_tablesize());

if (old_readmask) {
readmask->merge(*old_readmask);
}

if (infile.is_open()) {
while(!infile.eof()) {
getline(infile, line);
if (line.length() == 0) {
break;
}

if (isRead) {
seq = line;
if (readmask->get(read_num)) {
BoundedCounterType minval = minmax.get_min(read_num);

if (minval < threshold) {
readmask->set(read_num, false);
}
name.clear();
seq.clear();
}

read_num += 1;

// run callback, if specified
if (read_num % CALLBACK_PERIOD == 0 && callback) {
try {
callback("filter_fasta_file_all", callback_data, read_num, 0);
} catch (...) {
infile.close();
throw;
Expand Down
9 changes: 8 additions & 1 deletion lib/hashtable.hh
Expand Up @@ -99,7 +99,14 @@ namespace khmer {
CallbackFn callback = NULL,
void * callback_data = NULL);

ReadMaskTable * filter_fasta_file_max(const std::string &inputfile,
ReadMaskTable * filter_fasta_file_any(const std::string &inputfile,
MinMaxTable &minmax,
BoundedCounterType threshold,
ReadMaskTable * readmask = NULL,
CallbackFn callback = NULL,
void * callback_data = NULL);

ReadMaskTable * filter_fasta_file_all(const std::string &inputfile,
MinMaxTable &minmax,
BoundedCounterType threshold,
ReadMaskTable * readmask = NULL,
Expand Down
57 changes: 54 additions & 3 deletions python/_khmermodule.cc
Expand Up @@ -653,7 +653,7 @@ static PyObject * hash_fasta_file_to_minmax(PyObject * self, PyObject *args)
return (PyObject *) minmax_obj;
}

static PyObject * hash_filter_fasta_file_max(PyObject * self, PyObject *args)
static PyObject * hash_filter_fasta_file_any(PyObject * self, PyObject *args)
{
khmer_KHashtableObject * me = (khmer_KHashtableObject *) self;
khmer::Hashtable * hashtable = me->hashtable;
Expand Down Expand Up @@ -688,7 +688,57 @@ static PyObject * hash_filter_fasta_file_max(PyObject * self, PyObject *args)

khmer::ReadMaskTable * readmask;
try {
readmask = hashtable->filter_fasta_file_max(filename, *mmt, threshold,
readmask = hashtable->filter_fasta_file_any(filename, *mmt, threshold,
old_readmask,
_report_fn, callback_obj);
} catch (_khmer_signal &e) {
return NULL;
}

khmer_ReadMaskObject * readmask_obj = (khmer_ReadMaskObject *) \
PyObject_New(khmer_ReadMaskObject, &khmer_ReadMaskType);

readmask_obj->mask = readmask;

return (PyObject *) readmask_obj;
}

static PyObject * hash_filter_fasta_file_all(PyObject * self, PyObject *args)
{
khmer_KHashtableObject * me = (khmer_KHashtableObject *) self;
khmer::Hashtable * hashtable = me->hashtable;

char * filename;
unsigned int threshold;

PyObject * o1 = NULL, * o2 = NULL;
PyObject * callback_obj = NULL;

if (!PyArg_ParseTuple(args, "sOi|OO", &filename, &o1, &threshold, &o2,
&callback_obj)) {
return NULL;
}

if (!is_minmax_obj(o1)) {
PyErr_SetString(PyExc_TypeError,
"second argument must be a minmax object");
return NULL;
}
khmer::MinMaxTable * mmt = ((khmer_MinMaxObject *) o1)->mmt;

khmer::ReadMaskTable * old_readmask = NULL;
if (o2 && o2 != Py_None) {
if (!is_readmask_obj(o2)) {
PyErr_SetString(PyExc_TypeError,
"fourth argument must be None or a readmask object");
return NULL;
}
old_readmask = ((khmer_ReadMaskObject *) o2)->mask;
}

khmer::ReadMaskTable * readmask;
try {
readmask = hashtable->filter_fasta_file_all(filename, *mmt, threshold,
old_readmask,
_report_fn, callback_obj);
} catch (_khmer_signal &e) {
Expand Down Expand Up @@ -910,7 +960,8 @@ static PyMethodDef khmer_hashtable_methods[] = {
{ "consume_fasta", hash_consume_fasta, METH_VARARGS, "Count all k-mers in a given file" },
{ "consume_fasta_build_readmask", hash_consume_fasta_build_readmask, METH_VARARGS, "Count all k-mers in a given file, creating a readmask object to mask off bad reads" },
{ "fasta_file_to_minmax", hash_fasta_file_to_minmax, METH_VARARGS, "" },
{ "filter_fasta_file_max", hash_filter_fasta_file_max, METH_VARARGS, "" },
{ "filter_fasta_file_any", hash_filter_fasta_file_any, METH_VARARGS, "" },
{ "filter_fasta_file_all", hash_filter_fasta_file_all, METH_VARARGS, "" },
{ "get", hash_get, METH_VARARGS, "Get the count for the given k-mer" },
{ "get_min_count", hash_get_min_count, METH_VARARGS, "Get the smallest count of all the k-mers in the string" },
{ "get_max_count", hash_get_max_count, METH_VARARGS, "Get the largest count of all the k-mers in the string" },
Expand Down
2 changes: 1 addition & 1 deletion python/khmer/__init__.py
Expand Up @@ -9,7 +9,7 @@
from _khmer import forward_hash, forward_hash_no_rc, reverse_hash
from _khmer import set_reporting_callback

from filter_utils import filter_fasta_file
from filter_utils import filter_fasta_file_any, filter_fasta_file_all

###

Expand Down
12 changes: 10 additions & 2 deletions python/khmer/filter_utils.py
@@ -1,6 +1,14 @@
def filter_fasta_file(ht, filename, total_reads, outname, threshold):
def filter_fasta_file_any(ht, filename, total_reads, outname, threshold):
minmax = ht.fasta_file_to_minmax(filename, total_reads)
readmask = ht.filter_fasta_file_max(filename, minmax, threshold)
readmask = ht.filter_fasta_file_any(filename, minmax, threshold)

n_kept = readmask.filter_fasta_file(filename, outname)

return total_reads, n_kept

def filter_fasta_file_all(ht, filename, total_reads, outname, threshold):
minmax = ht.fasta_file_to_minmax(filename, total_reads)
readmask = ht.filter_fasta_file_all(filename, minmax, threshold)

n_kept = readmask.filter_fasta_file(filename, outname)

Expand Down
6 changes: 3 additions & 3 deletions python/test_c_wrapper.py
Expand Up @@ -69,7 +69,7 @@ def test_raise_in_filter_fasta_file_max():
mmt = ht.fasta_file_to_minmax(reads_filename, N_READS)

try:
ht.filter_fasta_file_max(reads_filename, mmt, 2, None, callback_raise)
ht.filter_fasta_file_any(reads_filename, mmt, 2, None, callback_raise)
assert 0
except GoodException:
pass
Expand All @@ -80,7 +80,7 @@ def test_bad_mmt_in_filter_fasta_file_max():
ht = khmer.new_hashtable(4, 4**4)

try:
ht.filter_fasta_file_max(reads_filename, "hi", 2)
ht.filter_fasta_file_any(reads_filename, "hi", 2)
assert 0
except TypeError:
pass # expected
Expand All @@ -92,7 +92,7 @@ def test_bad_readmask_in_filter_fasta_file_max():
mmt = ht.fasta_file_to_minmax(reads_filename, N_READS)

try:
ht.filter_fasta_file_max(reads_filename, mmt, 2, "hi")
ht.filter_fasta_file_any(reads_filename, mmt, 2, "hi")
assert 0
except TypeError:
pass # expected
Expand Down
12 changes: 6 additions & 6 deletions python/test_filter.py
Expand Up @@ -32,9 +32,9 @@ def test_filter(self):
assert total_reads == 3, total_reads
assert n_consumed == 63, n_consumed

(total_reads, n_seq_kept) = khmer.filter_fasta_file(ht, filename,
total_reads,
outname, 2)
(total_reads, n_seq_kept) = khmer.filter_fasta_file_any(ht, filename,
total_reads,
outname, 2)
assert n_seq_kept == 2, n_seq_kept

names = load_fa_seq_names(outname)
Expand All @@ -50,9 +50,9 @@ def test_filter_n(self):
assert total_reads == 4, total_reads
assert n_consumed == 63, n_consumed

(total_reads, n_seq_kept) = khmer.filter_fasta_file(ht, filename,
total_reads,
outname, 1)
(total_reads, n_seq_kept) = khmer.filter_fasta_file_any(ht, filename,
total_reads,
outname, 1)
assert n_seq_kept == 3, n_seq_kept

names = load_fa_seq_names(outname)
Expand Down

0 comments on commit 5e3ec0d

Please sign in to comment.