From 66603dc4d090aaacaa61821e43d9ebcc16318814 Mon Sep 17 00:00:00 2001
From: irinaColgiu
Date: Thu, 26 Jul 2012 13:19:14 +0100
Subject: [PATCH] Optimization: iterating only on the columns intended to be
 kept, instead of iterating on all the columns.

---
Reviewer notes (text between the "---" line and the diff is not part of the
commit message):

* The header line and the data lines of the report are now split after
  rstrip("\r\n") instead of strip(). A bare strip() also removes leading and
  trailing tabs, so a line whose first or last fields are empty would lose
  columns; the indices stored in okColumns would then select the wrong field
  or raise IndexError. The code being replaced only removed "\n".
* The line structure of this patch was rebuilt from a whitespace-collapsed
  copy. The 4-space indentation and the position of the blank context line
  at the top of the hunk are assumptions -- TODO confirm against the target
  file before applying. The paired blank "-"/"+" lines presumably reflect a
  whitespace or line-ending difference that the collapsed copy did not keep.
* The post-image blob id on the "index" line (6cde1fb) predates this revision.

 dropSamplesFromReport_FasterVersion.py | 78 ++++++++++++--------------
 1 file changed, 37 insertions(+), 41 deletions(-)

diff --git a/dropSamplesFromReport_FasterVersion.py b/dropSamplesFromReport_FasterVersion.py
index ec69191..6cde1fb 100644
--- a/dropSamplesFromReport_FasterVersion.py
+++ b/dropSamplesFromReport_FasterVersion.py
@@ -1,44 +1,40 @@
 #! /usr/bin/python
 
 # Thank you to Josh Randall (Sanger) for providing this script!
-
-import sys
-
-reportFileName = sys.argv[1]
-dropSamplesFileName = sys.argv[2]
-
-# read sample exclusion file and populate samples dictionary
-dropSamples = dict()
-for line in open(dropSamplesFileName, 'r'):
-    line = line.replace("\n", "")
-    dropSamples[line] = 1
-
-# open file and connect to iterator
-reportIter = iter(open(reportFileName, 'r').readline, '')
-
-dropColumns = dict()
-# read first line and fill dropColumns with column indices to drop
-headerline = reportIter.next()
-headerline = headerline.replace("\n", "")
-headers = headerline.split("\t")
-headerout = [headers[0],headers[1],headers[2]]
-for i in range(3, len(headers)):
-    id = headers[i].split(".")[0]
-    if id in dropSamples:
-        dropColumns[i] = 1;
-    else:
-        headerout.append(headers[i])
-
-# print new header
-print "\t".join(headerout)
-
-# read the rest of the report file lines, dropping columns present in dropColumns dict
-for line in reportIter:
-    line = line.replace("\n", "")
-    fields = line.split("\t")
-    out = [fields[0],fields[1],fields[2]]
-    for i in range(3, len(fields)):
-        if i not in dropColumns:
-            out.append(fields[i])
-    print "\t".join(out)
-
+
+import sys
+
+reportFileName = sys.argv[1]
+dropSamplesFileName = sys.argv[2]
+
+# read the sample exclusion file (one sample id per line) into a set for fast membership tests
+dropSamples = set()
+for line in open(dropSamplesFileName, 'r'):
+    line = line.strip()
+    dropSamples.add(line)
+
+# open the report file; the file object is itself a line-by-line iterator
+reportFile = open(reportFileName, 'r')
+
+okColumns = []
+# read first line and fill okColumns with column indices to be kept (the columns not present in the dropSamples)
+headerline = reportFile.next()
+headers = headerline.rstrip("\r\n").split("\t")  # drop only the line ending; strip() would also eat empty leading/trailing fields
+headerout = headers[:3]
+for i in range(3, len(headers)):
+    id = headers[i].split(".")[0]
+    if id not in dropSamples:
+        okColumns.append(i)
+        headerout.append(headers[i])
+
+# print new header
+print "\t".join(headerout)
+
+# read the rest of the report file lines, printing the first three columns plus the kept ones (okColumns)
+for line in reportFile:
+    fields = line.rstrip("\r\n").split("\t")  # keep empty edge fields so the okColumns indices stay aligned
+    out = fields[:3]
+    for i in okColumns:
+        out.append(fields[i])
+    print "\t".join(out)
+