Skip to content

Commit

Permalink
Merge pull request #1 from irinaColgiu/master
Browse files Browse the repository at this point in the history
I optimized the Python script so that it no longer iterates over all the columns, checking whether each one is to be dropped; instead, it iterates only over the columns of interest and collects just that information. It runs 30% faster than the latest version.
  • Loading branch information
jigold committed Aug 3, 2012
2 parents 747c472 + 66603dc commit 8639cd7
Showing 1 changed file with 37 additions and 41 deletions.
78 changes: 37 additions & 41 deletions dropSamplesFromReport_FasterVersion.py
@@ -1,44 +1,40 @@
#! /usr/bin/python

# Thank you to Josh Randall (Sanger) for providing this script!

import sys

# Usage: <script> REPORT_FILE DROP_SAMPLES_FILE
#
# Copies the tab-separated report to stdout, omitting every column (from the
# fourth onwards) whose header, up to its first ".", is listed in the
# drop-samples file. The first three columns are always kept.
reportFileName = sys.argv[1]
dropSamplesFileName = sys.argv[2]

# Read the sample exclusion file (one sample ID per line) into a set; only
# membership is needed, so a set replaces the dict-with-dummy-values.
with open(dropSamplesFileName, 'r') as dropSamplesFile:
    dropSamples = set(line.rstrip("\n") for line in dropSamplesFile)

# The report is streamed line by line; the "with" block guarantees the file
# handle is closed even if a malformed row raises part-way through.
with open(reportFileName, 'r') as reportFile:
    # Read the header line and record the indices of the columns to drop.
    # An empty report raises StopIteration here.
    headers = next(reportFile).rstrip("\n").split("\t")
    dropColumns = set()
    headerout = headers[:3]
    for i in range(3, len(headers)):
        # The sample ID is the part of the column name before the first ".".
        sampleId = headers[i].split(".")[0]
        if sampleId in dropSamples:
            dropColumns.add(i)
        else:
            headerout.append(headers[i])

    # Write the new header.
    sys.stdout.write("\t".join(headerout) + "\n")

    # Copy the remaining rows, skipping the columns recorded in dropColumns.
    # Only the newline is removed so that empty trailing fields survive.
    for line in reportFile:
        fields = line.rstrip("\n").split("\t")
        out = fields[:3]
        out.extend(fields[i] for i in range(3, len(fields))
                   if i not in dropColumns)
        sys.stdout.write("\t".join(out) + "\n")


import sys

def readDropSamples(dropSamplesFileName):
    """Return the set of sample IDs listed one per line in the given file.

    Surrounding whitespace (including the line terminator) is stripped from
    each ID. The file is closed before returning.
    """
    with open(dropSamplesFileName, 'r') as dropSamplesFile:
        return set(line.strip() for line in dropSamplesFile)


def filterReport(lines, dropSamples):
    """Yield the report rows with the columns of dropped samples removed.

    lines       -- iterable of tab-separated report lines; the first one is
                   the header.
    dropSamples -- container of sample IDs to exclude.

    The first three columns are always kept. Every later column is kept only
    if the part of its header before the first "." is not in dropSamples.
    Rows are yielded without a line terminator. An empty input yields
    nothing. A data row with fewer fields than a kept column index raises
    IndexError.
    """
    lines = iter(lines)
    headerline = next(lines, None)
    if headerline is None:
        return

    # Strip only the line terminator: a plain strip() would also eat leading
    # and trailing tabs, silently discarding empty first/last fields and
    # shifting (or losing) columns.
    headers = headerline.rstrip("\r\n").split("\t")

    # Indices of the columns to keep, computed once from the header so the
    # per-row work touches only the columns of interest.
    okColumns = [i for i in range(3, len(headers))
                 if headers[i].split(".")[0] not in dropSamples]

    yield "\t".join(headers[:3] + [headers[i] for i in okColumns])

    for line in lines:
        fields = line.rstrip("\r\n").split("\t")
        yield "\t".join(fields[:3] + [fields[i] for i in okColumns])


def main(argv):
    """Filter the report named by argv[1] using the sample list in argv[2].

    The filtered report is written to standard output. Exits with a usage
    message when fewer than two file arguments are supplied.
    """
    if len(argv) < 3:
        sys.exit("usage: %s REPORT_FILE DROP_SAMPLES_FILE" % argv[0])

    dropSamples = readDropSamples(argv[2])
    with open(argv[1], 'r') as reportFile:
        for row in filterReport(reportFile, dropSamples):
            sys.stdout.write(row + "\n")


if __name__ == "__main__":
    main(sys.argv)

0 comments on commit 8639cd7

Please sign in to comment.