From b78addc9a6f7e30be29f0d4cbe3238b1cd24b9b3 Mon Sep 17 00:00:00 2001 From: jmschrei Date: Wed, 23 Aug 2017 15:59:29 -0700 Subject: [PATCH] ADD sparse prediction --- rambutan/rambutan.py | 58 +++++++++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/rambutan/rambutan.py b/rambutan/rambutan.py index 9926380..e5ddb27 100644 --- a/rambutan/rambutan.py +++ b/rambutan/rambutan.py @@ -182,7 +182,7 @@ def __init__(self, name='rambutan', iteration=None, model=None, self.use_dist = use_dist self.verbose = verbose - def predict(self, sequence, dnase, regions=None, ctxs=[0]): + def predict(self, sequence, dnase, regions=None, ctxs=[0], sparse=False): """Make predictions and return the matrix of probabilities. Rambutan will make a prediction for each pair of genomic loci defined in @@ -211,7 +211,7 @@ def predict(self, sequence, dnase, regions=None, ctxs=[0]): where there are no n or N symbols in the fasta file. Default is None. - ctxs: list, optional + ctxs : list, optional The contexts of the gpus to use for prediction. Currently prediction is only supported on gpus and not cpus due to the time it would take for prediction. For example, if you @@ -220,6 +220,10 @@ def predict(self, sequence, dnase, regions=None, ctxs=[0]): ctxs=[0, 1, 3] and the prediction task will be naturally parallelized across your 3 gpus with a linear speedup. + sparse : bool, optional + Whether to return three arrays, the rows, columns, and values, + or the full dense matrix. Sparse is useful for large matrices. + Returns ------- y : numpy.ndarray, shape=(m, m) @@ -255,21 +259,41 @@ def predict(self, sequence, dnase, regions=None, ctxs=[0]): self.use_seq, self.use_dnase, self.use_dist, self.min_dist, self.max_dist, self.batch_size, self.verbose) for ctx in ctxs) - n = int(regions.max()) / 1000 + 1 - y = numpy.zeros((n, n)) - - for ctx in ctxs: - with open('.rambutan.predictions.{}.txt'.format(ctx), 'r') as infile: - for line in infile: - mid1, mid2, p = line.split() - mid1 = (int(float(mid1)) - 500) / 1000 - mid2 = (int(float(mid2)) - 500) / 1000 - p = float(p) - y[mid1, mid2] = p - - os.system('rm .rambutan.predictions.{}.txt'.format(ctx)) - - return y + if sparse == False: + n = int(regions.max()) / 1000 + 1 + y = numpy.zeros((n, n)) + + for ctx in ctxs: + with open('.rambutan.predictions.{}.txt'.format(ctx), 'r') as infile: + for line in infile: + mid1, mid2, p = line.split() + mid1 = (int(float(mid1)) - 500) / 1000 + mid2 = (int(float(mid2)) - 500) / 1000 + p = float(p) + y[mid1, mid2] = p + + os.system('rm .rambutan.predictions.{}.txt'.format(ctx)) + + return y + + else: + rows, cols, values = [], [], [] + for ctx in ctxs: + with open('.rambutan.predictions.{}.txt'.format(ctx), 'r') as infile: + for line in infile: + mid1, mid2, p = line.split() + mid1, mid2, p = int(mid1), int(mid2), float(p) + + rows.append(mid1) + cols.append(mid2) + values.append(p) + + os.system('rm .rambutan.predictions.{}.txt'.format(ctx)) + + rows = numpy.array(rows) + cols = numpy.array(cols) + values = numpy.array(values) + return rows, cols, values def fit(self, sequence, dnase, contacts, regions=None, validation_contacts=None, training_chromosome=None, validation_chromosome=None, ctxs=[0],