From 66603dc4d090aaacaa61821e43d9ebcc16318814 Mon Sep 17 00:00:00 2001
From: irinaColgiu <irina.colgiu@gmail.com>
Date: Thu, 26 Jul 2012 13:19:14 +0100
Subject: [PATCH] Optimization: iterating only on the columns intended to be
 kept, instead of iterating on all the columns.

---
 dropSamplesFromReport_FasterVersion.py | 78 ++++++++++++--------------
 1 file changed, 37 insertions(+), 41 deletions(-)

diff --git a/dropSamplesFromReport_FasterVersion.py b/dropSamplesFromReport_FasterVersion.py
index ec69191..6cde1fb 100644
--- a/dropSamplesFromReport_FasterVersion.py
+++ b/dropSamplesFromReport_FasterVersion.py
@@ -1,44 +1,40 @@
 #! /usr/bin/python
 
 # Thank you to Josh Randall (Sanger) for providing this script!
-
-import sys
-
-reportFileName = sys.argv[1]
-dropSamplesFileName = sys.argv[2]
-
-# read sample exclusion file and populate samples dictionary 
-dropSamples = dict() 
-for line in open(dropSamplesFileName, 'r'):
-	line = line.replace("\n", "")
-	dropSamples[line] = 1
- 
-# open file and connect to iterator
-reportIter = iter(open(reportFileName, 'r').readline, '')
-
-dropColumns = dict()
-# read first line and fill dropColumns with column indices to drop 
-headerline = reportIter.next() 
-headerline = headerline.replace("\n", "") 
-headers = headerline.split("\t") 
-headerout = [headers[0],headers[1],headers[2]]
-for i in range(3, len(headers)):
-	id = headers[i].split(".")[0]
-	if id in dropSamples:
-		dropColumns[i] = 1;
-	else:
-		headerout.append(headers[i])
-
-# print new header
-print "\t".join(headerout)
- 
-# read the rest of the report file lines, dropping columns present in dropColumns dict
-for line in reportIter:
-	line = line.replace("\n", "")
-	fields = line.split("\t")
-	out = [fields[0],fields[1],fields[2]]
-	for i in range(3, len(fields)):
-		if i not in dropColumns:
-			out.append(fields[i])
-	print "\t".join(out)
-
+
+import sys
+
+# usage: <script> <report file> <drop-samples file>; the filtered report is printed to stdout
+reportFileName = sys.argv[1]
+dropSamplesFileName = sys.argv[2]
+
+# read sample exclusion file (one sample id per line) and populate samples set
+dropSamples = set()
+for line in open(dropSamplesFileName, 'r'):
+	dropSamples.add(line.strip())
+
+# open file => returns an iterator over its lines
+reportFile = open(reportFileName, 'r')
+
+# read first line and fill okColumns with column indices to be kept (the columns not present in the dropSamples)
+# NOTE: only the line terminator is removed; a plain strip() would also eat leading/trailing
+# tabs, i.e. empty first/last fields, and so shift or truncate the columns
+okColumns = []
+headerline = reportFile.next()
+headers = headerline.rstrip("\r\n").split("\t")
+headerout = headers[:3]
+for i in range(3, len(headers)):
+	# the sample id is the part of the column name before the first "."
+	sampleId = headers[i].split(".")[0]
+	if sampleId not in dropSamples:
+		okColumns.append(i)
+		headerout.append(headers[i])
+
+# print new header
+print "\t".join(headerout)
+
+# read the rest of the report file lines, printing only the first three columns and the kept ones (okColumns)
+for line in reportFile:
+	fields = line.rstrip("\r\n").split("\t")
+	print "\t".join(fields[:3] + [fields[i] for i in okColumns])
+