Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from irinaColgiu/master
I optimized the Python script so that doesn't iterate over all the columns and check if the current column is to be dropped, but instead it iterates over and gets only the information of interest. It runs 30% faster than the latest version.
- Loading branch information
Showing
1 changed file
with
37 additions
and
41 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,44 +1,40 @@ | ||
#! /usr/bin/python | ||
|
||
# Thank you to Josh Randall (Sanger) for providing this script! | ||
|
||
import sys | ||
|
||
reportFileName = sys.argv[1] | ||
dropSamplesFileName = sys.argv[2] | ||
|
||
# read sample exclusion file and populate samples dictionary | ||
dropSamples = dict() | ||
for line in open(dropSamplesFileName, 'r'): | ||
line = line.replace("\n", "") | ||
dropSamples[line] = 1 | ||
|
||
# open file and connect to iterator | ||
reportIter = iter(open(reportFileName, 'r').readline, '') | ||
|
||
dropColumns = dict() | ||
# read first line and fill dropColumns with column indices to drop | ||
headerline = reportIter.next() | ||
headerline = headerline.replace("\n", "") | ||
headers = headerline.split("\t") | ||
headerout = [headers[0],headers[1],headers[2]] | ||
for i in range(3, len(headers)): | ||
id = headers[i].split(".")[0] | ||
if id in dropSamples: | ||
dropColumns[i] = 1; | ||
else: | ||
headerout.append(headers[i]) | ||
|
||
# print new header | ||
print "\t".join(headerout) | ||
|
||
# read the rest of the report file lines, dropping columns present in dropColumns dict | ||
for line in reportIter: | ||
line = line.replace("\n", "") | ||
fields = line.split("\t") | ||
out = [fields[0],fields[1],fields[2]] | ||
for i in range(3, len(fields)): | ||
if i not in dropColumns: | ||
out.append(fields[i]) | ||
print "\t".join(out) | ||
|
||
|
||
import sys | ||
|
||
reportFileName = sys.argv[1] | ||
dropSamplesFileName = sys.argv[2] | ||
|
||
# read sample exclusion file and populate samples set | ||
dropSamples = set() | ||
for line in open(dropSamplesFileName, 'r'): | ||
line = line.strip() | ||
dropSamples.add(line) | ||
|
||
# open file => returns an iterator | ||
reportFile = open(reportFileName, 'r') | ||
|
||
okColumns = [] | ||
# read first line and fill okColumns with column indices to be kept (the columns not present in the dropSamples) | ||
headerline = reportFile.next() | ||
headers = headerline.strip().split("\t") | ||
headerout = headers[:3] | ||
for i in range(3, len(headers)): | ||
id = headers[i].split(".")[0] | ||
if id not in dropSamples: | ||
okColumns.append(i) | ||
headerout.append(headers[i]) | ||
|
||
# print new header | ||
print "\t".join(headerout) | ||
|
||
# read the rest of the report file lines, printing only the important columns (okColumns) | ||
for line in reportFile: | ||
fields = line.strip().split("\t") | ||
out = fields[:3] | ||
for i in okColumns: | ||
out.append(fields[i]) | ||
print "\t".join(out) | ||
|