From 67f4fbcccff476b943746168cf0d0e6209a724f4 Mon Sep 17 00:00:00 2001 From: Kishori Date: Sun, 2 Mar 2014 16:59:56 -0800 Subject: [PATCH] fixed the large sample processing bottleneck --- MetaPathways.py | 6 +- libs/python_modules/LCAComputation.py | 175 +++++- libs/python_modules/metapaths.py | 44 +- libs/python_modules/metapaths_utils.py | 130 ++-- .../MetaPathways_annotate_fast.py | 19 +- .../MetaPathways_create_reports_fast.py | 565 ++++++++++++------ .../MetaPathways_parse_blast.py | 48 +- .../MetaPathways_rRNA_stats_calculator.py | 19 +- resources/Dsignal | 8 + resources/TPCsignal | 15 + template_config.txt | 1 + template_param.txt | 7 +- 12 files changed, 729 insertions(+), 308 deletions(-) create mode 100755 resources/Dsignal create mode 100755 resources/TPCsignal diff --git a/MetaPathways.py b/MetaPathways.py index 7f02d01..dcf5625 100755 --- a/MetaPathways.py +++ b/MetaPathways.py @@ -257,9 +257,6 @@ def main(argv): sorted_input_output_list = sorted(input_output_list.keys()) # PART1 before the blast if len(input_output_list): - print 'length ' - print input_output_list - for input_file in sorted_input_output_list: output_dir = input_output_list[input_file] if run_type=='overwrite' and path.exists(output_dir): @@ -283,6 +280,9 @@ def main(argv): ncbi_sequin_sbt = ncbi_sequin_sbt, run_type = run_type ) + else: + print "ERROR :No input files were found in the input folder" + sys.exit(0) # blast the files diff --git a/libs/python_modules/LCAComputation.py b/libs/python_modules/LCAComputation.py index 83db9b4..790e921 100755 --- a/libs/python_modules/LCAComputation.py +++ b/libs/python_modules/LCAComputation.py @@ -1,17 +1,40 @@ #!/usr/bin/python -import re -import sys +try: + import sys, traceback + import re + import sys + from metapaths_utils import fprintf, printf, GffFileParser +except: + print """ Could not load some user defined module functions""" + print """ Make sure your typed \"source MetaPathwaysrc\"""" + print """ """ + print traceback.print_exc(10) + sys.exit(3) + + + +def copyList(a, b): + [ b.append(x) for x in a ] class LCAComputation: begin_pattern = re.compile("#") + # a readable taxon name to numeric string id map as ncbi name_to_id={} + + # a readable taxon ncbi tax id to name map id_to_name={} + + # this is the tree structure in a id to parent map, you can traverse it to go to the root taxid_to_ptaxid = {} - min_score = 50 - top_percent = 10 - min_support = 5 + + lca_min_score = 50 # an LCA parameter for min score for a hit to be considered + lca_top_percent = 10 # an LCA param to confine the hits to within the top hits score upto the top_percent% + lca_min_support = 5 # a minimum number of reads in the sample to consider a taxon to be present + results_dictionary = None + + # initialize with the ncbi tree file def __init__(self, filename): taxonomy_file = open(filename, 'r') lines = taxonomy_file.readlines() @@ -25,16 +48,22 @@ def __init__(self, filename): continue self.name_to_id[str(fields[0])] = str(fields[1]) self.id_to_name[str(fields[1])] = str(fields[0]) - self.taxid_to_ptaxid[str(fields[1])] = [ str(fields[2]), 0] + # the taxid to ptax map has for each taxid a corresponding 3-tuple + # the first location is the pid, the second is used as a counter for + # lca while a search is traversed up the tree and the third is used for + # the min support + self.taxid_to_ptaxid[str(fields[1])] = [ str(fields[2]), 0, 0] + def setParameters(self, min_score, top_percent, min_support): - self.min_score = min_score - self.top_percent =top_percent - self.min_support = 
min_support + self.lca_min_score = min_score + self.lca_top_percent =top_percent + self.lca_min_support = min_support def sizeTaxnames(self ): return len(self.name_to_id) + def sizeTaxids(self): return len(self.taxid_to_ptaxid) @@ -44,17 +73,20 @@ def get_a_Valid_ID(self, name_group): return self.name_to_id[name] return -1 + # given a taxon name it returns the correcponding unique ncbi tax id def translateNameToID(self, name): if not name in self.name_to_id: return None return self.name_to_id[name] + # given a taxon id to taxon name map def translateIdToName(self, id): if not id in self.id_to_name: return None return self.id_to_name[id] + # given a name it returns the parents name def getParentName(self, name): if not name in self.name_to_id: return None @@ -63,12 +95,22 @@ def getParentName(self, name): return self.translateIdToName( pid ) + # given a ncbi tax id returns the parents tax id def getParentTaxId(self, ID): if not ID in self.taxid_to_ptaxid: return None return self.taxid_to_ptaxid[ID][0] + # given a set of ids it returns the lowest common ancenstor + # without caring about min support + # here LCA for a set of ids are computed as follows + # first we consider one ID at a time + # for each id we traverse up the ncbi tree using the id to parent id map + # at the same time increasing the count on the second value of the 3-tuple + # note that at the node where all the of the individual ids ( limit in number) + # converges the counter matches the limit for the first time, while climbing up. + # This also this enables us to make the selection of id arbitrary def get_lca(self, IDs): limit = len(IDs) for id in IDs: @@ -78,31 +120,134 @@ def get_lca(self, IDs): if self.taxid_to_ptaxid[tid][1]==limit: return self.id_to_name[tid] tid = self.taxid_to_ptaxid[tid][0] - return "" + return "root" + + def update_taxon_support_count(self, taxonomy): + id = self.get_a_Valid_ID( [taxonomy ]) + tid = id + while( tid in self.taxid_to_ptaxid and tid !='1' ): + self.taxid_to_ptaxid[tid][2]+=1 + tid = self.taxid_to_ptaxid[tid][0] + + def get_supported_taxon(self, taxonomy): + id = self.get_a_Valid_ID( [taxonomy ]) + tid = id + while( tid in self.taxid_to_ptaxid and tid !='1' ): + if self.lca_min_support > self.taxid_to_ptaxid[tid][2] : + tid = self.taxid_to_ptaxid[tid][0] + else: + return self.translateIdToName(tid) + + return self.translateIdToName(tid) + + # need to call this to clear the counts of reads at every node def clear_cells(self, IDs): limit = len(IDs) for id in IDs: tid = id while( tid in self.taxid_to_ptaxid and tid !='1' ): - if self.taxid_to_ptaxid[tid][1]==0: - return self.id_to_name[tid] + #if self.taxid_to_ptaxid[tid][1]==0: + # return self.id_to_name[tid] self.taxid_to_ptaxid[tid][1]=0 tid = self.taxid_to_ptaxid[tid][0] return "" + #given a set of sets of names it computes an lca + # in the format [ [name1, name2], [name3, name4,....namex] ...] 
+ # here name1 and name2 are synonyms and so are name3 through namex def getTaxonomy(self, name_groups): - IDs = [] for name_group in name_groups: - #print name_group id = self.get_a_Valid_ID(name_group) if id!=-1: IDs.append(id) consensus = self.get_lca(IDs) - #print "==============" self.clear_cells(IDs) return consensus + + # extracts taxon names for a refseq annotation + def get_species(self, hit): + if not 'product' in hit: + return None + species = [] + try: + m = re.findall(r'\[([^\[]+)\]', hit['product']) + if m != None: + copyList(m,species) + except: + return None + + if species: + return species + else: + return None + + # used for optimization + def set_results_dictionary(self, results_dictionary): + self.results_dictionary= results_dictionary + + # this returns the megan taxonomy, i.e., it computes the lca but at the same time + # takes into consideration the parameters, min score, min support and top percent + def getMeganTaxonomy(self, orfid): + #compute the top hit wrt score + names = [] + species = [] + if 'refseq' in self.results_dictionary: + if orfid in self.results_dictionary['refseq']: + + top_score = 0 + for hit in self.results_dictionary['refseq'][orfid]: + if hit['bitscore'] >= self.lca_min_score and hit['bitscore'] >= top_score: + top_score = hit['bitscore'] + + for hit in self.results_dictionary['refseq'][orfid]: + if (100-self.lca_top_percent)*top_score/100 < hit['bitscore']: + names = self.get_species(hit) + #if 'MD_2_95' == orfid: + # for hit in self.results_dictionary['refseq'][orfid]: + # print orfid + ':' + str(names) + # else: + # print orfid + ':' + str([]) + if names: + species.append(names) + + taxonomy = self.getTaxonomy(species) + meganTaxonomy = self.get_supported_taxon( taxonomy) + return meganTaxonomy + + + # this is use to compute the min support for each taxon in the tree + # this is called before the getMeganTaxonomy + def compute_min_support_tree(self, annotate_gff_file, pickorfs): + gffreader = GffFileParser(annotate_gff_file) + try: + for contig in gffreader: + for orf in gffreader.orf_dictionary[contig]: + if not orf['id'] in pickorfs: + continue + taxonomy = None + species = [] + if 'refseq' in self.results_dictionary: + if orf['id'] in self.results_dictionary['refseq']: + #compute the top hit wrt score + top_score = 0 + for hit in self.results_dictionary['refseq'][orf['id']]: + if hit['bitscore'] >= self.lca_min_score and hit['bitscore'] >= top_score: + top_score = hit['bitscore'] + + for hit in self.results_dictionary['refseq'][orf['id']]: + if (100-self.lca_top_percent)*top_score/100 < hit['bitscore']: + names = self.get_species(hit) + if names: + species.append(names) + taxonomy=self.getTaxonomy(species) + self.update_taxon_support_count(taxonomy) + pickorfs[orf['id']] = taxonomy + except: + print "ERROR : Cannot read annotated gff file " + + diff --git a/libs/python_modules/metapaths.py b/libs/python_modules/metapaths.py index ac5b72b..32e7cd2 100755 --- a/libs/python_modules/metapaths.py +++ b/libs/python_modules/metapaths.py @@ -107,8 +107,7 @@ def format_db(formatdb_executable, seqType, raw_sequence_file, formatted_db, al if algorithm=='LAST': # dirname = os.path.dirname(raw_sequence_file) - cmd='%s -p -c %s %s' %(formatdb_executable, formatted_db, raw_sequence_file) - print cmd + cmd='%s -s 5G -p -c %s %s' %(formatdb_executable, formatted_db, raw_sequence_file) result= getstatusoutput(cmd) if result[0]==0: @@ -187,8 +186,9 @@ def create_refscores_compute_cmd(input, output, config_settings, algorithm): def 
formatted_db_exists(dbname, suffixes): for suffix in suffixes: fileList = glob(dbname + suffix) - if not fileList: - return False + if len(fileList)==0 : + print 'ERROR : if formatted correctely then expected the files with pattern ' + dbname + suffix + return False return True def check_if_raw_sequences_exist(filename): @@ -233,7 +233,7 @@ def check_an_format_refdb(dbname, seqType, config_settings, config_params): formattedDBPath = functional_formatted + PATHDELIM + dbname else: print "WARNING : Undefined sequnce type for " + dbname + "!" - return + sys.exit(0) # database formatting executables paths if algorithm == 'LAST' and seqType =='prot': @@ -244,18 +244,18 @@ def check_an_format_refdb(dbname, seqType, config_settings, config_params): if not (formatted_db_exists(formattedDBPath, suffixes) ): print "WARNING : You do not seem to have Database " + dbname + " formatted!" if check_if_raw_sequences_exist(seqPath): - print " Found raw sequences for Database " + dbname + "!" - print " Trying to format on the fly ....!" + print " Found raw sequences for Database " + dbname + " in folder " + seqPath + " !" + print " Trying to format on the fly .... for " + algorithm + "!" result =format_db(executable, seqType, seqPath, formattedDBPath, algorithm) if result ==True: print " Formatting successful!" return else: - print " Formatting failed! Please consider formatting manually or do not try to annotated with this database!" + print " Formatting failed! Please consider formatting manually or do not try to annotate with this database!" sys.exit(0) - print 'seqpath ' + seqPath print "ERROR : You do not even have the raw sequence for Database " + dbname + " to format!" + print " in the folder "+ seqPath print " Please put the appropriate files in folder \"blastDB\"" sys.exit(0) @@ -305,8 +305,6 @@ def create_blastp_against_refdb_cmd(input, output, output_dir, sample_name, check_an_format_refdb(dbfilename, 'prot', config_settings, config_params) if system=='grid': - - batch_size = get_parameter(config_params, 'grid_engine', 'batch_size', default=500) max_concurrent_batches = get_parameter(config_params, 'grid_engine', 'max_concurrent_batches', default=500) user = get_parameter(config_params, 'grid_engine', 'user', default='') @@ -385,9 +383,6 @@ def create_annotate_genebank_cmd(sample_name, mapping_txt, input_unannotated_gff return cmd def create_genbank_ptinput_sequin_cmd(input_annotated_gff, nucleotide_fasta, amino_fasta, outputs, config_settings, ncbi_params_file, ncbi_sequin_sbt): - - #tbl2asn_exe = config_settings['METAPATHWAYS_PATH'] + config_settings['TBL2ASN_EXECUTABLE'] - cmd="%s -g %s -n %s -p %s " %((config_settings['METAPATHWAYS_PATH'] + config_settings['GENBANK_FILE']), input_annotated_gff, nucleotide_fasta, amino_fasta) ; if 'gbk' in outputs: @@ -396,10 +391,8 @@ def create_genbank_ptinput_sequin_cmd(input_annotated_gff, nucleotide_fasta, ami if ncbi_params_file and 'sequin' in outputs: cmd += ' --out-sequin ' + outputs['sequin'] cmd += ' --sequin-params-file ' + ncbi_params_file - # cmd += ' --sequin-tbl2asn ' + tbl2asn_exe cmd += ' --ncbi-sbt-file ' + ncbi_sequin_sbt - if 'ptinput' in outputs: cmd += ' --out-ptinput ' + outputs['ptinput'] @@ -490,8 +483,8 @@ def create_rRNA_scan_statistics(blastoutput, refdbname, bscore_cutoff, eval_cuto def create_tRNA_scan_statistics(in_file,stat_file, fasta_file, config_settings): cmd= "%s -o %s -F %s -i %s -T %s -D %s" %((config_settings['METAPATHWAYS_PATH'] + config_settings['SCAN_tRNA']) ,\ stat_file, fasta_file, in_file,\ - 
(config_settings['METAPATHWAYS_PATH'] + config_settings['EXECUTABLES_DIR'])+ PATHDELIM + 'TPCsignal',\ - (config_settings['METAPATHWAYS_PATH'] + config_settings['EXECUTABLES_DIR'])+ PATHDELIM + 'Dsignal') + (config_settings['METAPATHWAYS_PATH'] + config_settings['RESOURCES_DIR'])+ PATHDELIM + 'TPCsignal',\ + (config_settings['METAPATHWAYS_PATH'] + config_settings['RESOURCES_DIR'])+ PATHDELIM + 'Dsignal') return cmd # create the command to make the MLTreeMap Images @@ -591,7 +584,7 @@ def write_run_parameters_file(fileName, parameters): # checks if the necessary files, directories and executables really exists or not def check_config_settings(config_settings, file): - essentialItems= ['METAPATHWAYS_PATH', 'EXECUTABLES_DIR'] + essentialItems= ['METAPATHWAYS_PATH', 'EXECUTABLES_DIR', 'RESOURCES_DIR'] missingItems = [] for key, value in config_settings.items(): # make sure MetaPathways directory is present @@ -619,6 +612,13 @@ def check_config_settings(config_settings, file): missingItems.append(key) continue + # make sure RESOURCES_DIR directories are present + if key in [ 'RESOURCES_DIR']: + if not path.isdir( config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings[key]) : + print "ERROR: Path for \"%s\" is NOT set properly in configuration file \"%s\"" %(key, file) + print "ERROR: Currently it is set to \"%s\"\n" %( config_settings[key] ) + missingItems.append(key) + continue # make sure MetaPaths directory is present if key in ['PERL_EXECUTABLE', 'PYTHON_EXECUTABLE' , 'PATHOLOGIC_EXECUTABLE' ]: @@ -791,8 +791,8 @@ def checkMissingParam_values(config_params, choices, logger): for category in choices: for parameter in choices[category]: if not config_params[category][parameter]: - logger.write('ERROR: Empty paramter %s of type %s' %(parameter, category)) - print('ERROR: Empty paramter %s of type %s' %(parameter, category)) + logger.write('ERROR: Empty parameter %s of type %s' %(parameter, category)) + print('ERROR: Empty parameter %s of type %s' %(parameter, category)) success = False return success @@ -1457,7 +1457,7 @@ def run_metapathways_after_BLAST(input_fp, output_dir, command_handler, command_ command_Status= get_parameter( config_params,'metapaths_steps','PATHOLOGIC') enable_flag = False - if (run_type in ['overwrite', 'overlay'] and command_Status== 'yes'): + if (run_type in ['overwrite', 'overlay']) and (command_Status in ['redo', 'yes']): enable_flag = True #remove the pathologic dir for the pgdb with the same sample name since a new pgdb is requested remove_existing_pgdb( sample_name, config_settings['PATHOLOGIC_EXECUTABLE']) diff --git a/libs/python_modules/metapaths_utils.py b/libs/python_modules/metapaths_utils.py index 75c9b91..088093f 100755 --- a/libs/python_modules/metapaths_utils.py +++ b/libs/python_modules/metapaths_utils.py @@ -21,42 +21,100 @@ import sys import os import math -#from copy import deepcopy -#from numpy import min, max, median, mean -#import numpy -#from numpy.ma import MaskedArray -#from numpy.ma.extras import apply_along_axis -#from numpy import array, zeros, argsort, shape, vstack,ndarray, asarray, \ -# float, where, isnan -#from cogent import LoadSeqs, Sequence -#from cogent.cluster.procrustes import procrustes -#from cogent.core.alignment import Alignment -#from cogent.core.moltype import MolType, IUPAC_DNA_chars, IUPAC_DNA_ambiguities,\ -# IUPAC_DNA_ambiguities_complements, DnaStandardPairs, ModelDnaSequence -#from cogent.data.molecular_weight import DnaMW -#from cogent.core.sequence import DnaSequence -#from cogent.app.blast import 
Blastall -#from cogent.app.util import get_tmp_filename -#from cogent.parse.blast import BlastResult -#from cogent.parse.fasta import MinimalFastaParser -#from cogent.util.misc import remove_files -#from cogent.util.dict2d import Dict2D -#from cogent.app.formatdb import build_blast_db_from_fasta_path,\ -# build_blast_db_from_fasta_file -#from cogent import LoadSeqs -#from cogent.util.misc import (parse_command_line_parameters, -# create_dir, -# handle_error_codes) -# -# -#from qiime.parse import (parse_otu_table, -# parse_qiime_config_files, -# parse_coords, -# parse_newick, -# PhyloNode, -# parse_mapping_file) -#from qiime.format import format_otu_table -# +import re + + +class GffFileParser(object): + def __init__(self, gff_filename): + self.Size = 10000 + self.i=0 + self.orf_dictionary = {} + self.gff_beg_pattern = re.compile("^#") + self.lines= [] + self.size=0 + try: + self.gff_file = open( gff_filename,'r') + except AttributeError: + print "Cannot read the map file for database :" + dbname + sys.exit(0) + + def __iter__(self): + return self + + def refillBuffer(self): + self.orf_dictionary = {} + i = 0 + while i < self.Size: + line=self.gff_file.readline() + if not line: + break + if self.gff_beg_pattern.search(line): + continue + self.insert_orf_into_dict(line, self.orf_dictionary) + i += 1 + + self.orfs = self.orf_dictionary.keys() + self.size = len(self.orfs) + self.i = 0 + + def next(self): + if self.i == self.size: + self.refillBuffer() + + if self.size==0: + self.gff_file.close() + raise StopIteration() + + #print self.i + if self.i < self.size: + self.i = self.i + 1 + return self.orfs[self.i-1] + + + + def insert_orf_into_dict(self, line, contig_dict): + rawfields = re.split('\t', line) + fields = [] + for field in rawfields: + fields.append(field.strip()); + + + if( len(fields) != 9): + return + + attributes = {} + attributes['seqname'] = fields[0] # this is a bit of a duplication + attributes['source'] = fields[1] + attributes['feature'] = fields[2] + attributes['start'] = int(fields[3]) + attributes['end'] = int(fields[4]) + + try: + attributes['score'] = float(fields[5]) + except: + attributes['score'] = fields[5] + + attributes['strand'] = fields[6] + attributes['frame'] = fields[7] + + self.split_attributes(fields[8], attributes) + + if not fields[0] in contig_dict : + contig_dict[fields[0]] = [] + + contig_dict[fields[0]].append(attributes) + + def insert_attribute(self, attributes, attribStr): + rawfields = re.split('=', attribStr) + if len(rawfields) == 2: + attributes[rawfields[0].strip().lower()] = rawfields[1].strip() + + def split_attributes(self, str, attributes): + rawattributes = re.split(';', str) + for attribStr in rawattributes: + self.insert_attribute(attributes, attribStr) + + return attributes class Performance: def __init__(self): diff --git a/libs/python_scripts/MetaPathways_annotate_fast.py b/libs/python_scripts/MetaPathways_annotate_fast.py index 633f9c4..937aae4 100755 --- a/libs/python_scripts/MetaPathways_annotate_fast.py +++ b/libs/python_scripts/MetaPathways_annotate_fast.py @@ -137,7 +137,6 @@ def insert_orf_into_dict(line, contig_dict): fields = [] for field in rawfields: fields.append(field.strip()); - if( len(fields) != 9): return @@ -171,7 +170,7 @@ def __init__(self, gff_filename): self.Size = 10000 self.i=0 self.orf_dictionary = {} - self.gff_beg_pattern = re.compile("#") + self.gff_beg_pattern = re.compile("^#") self.lines= [] self.size=0 try: @@ -185,14 +184,13 @@ def __iter__(self): def refillBuffer(self): self.orf_dictionary = {} - 
line = self.gff_file.readline() i = 0 - while line and i < self.Size: + while i < self.Size: line=self.gff_file.readline() - if self.gff_beg_pattern.search(line): - continue if not line: break + if self.gff_beg_pattern.search(line): + continue insert_orf_into_dict(line, self.orf_dictionary) i += 1 @@ -375,12 +373,12 @@ def add_16S_genes(rRNA_16S_dictionary, rRNA_dictionary, contig_lengths) : for rRNA in rRNA_16S_dictionary: try: - orf_length = abs(int( tRNA_dictionary[tRNA][1] )-int( tRNA_dictionary[tRNA][0] )) + 1 + orf_length = abs(int( tRNA_dictionary[rRNA][1] )-int( tRNA_dictionary[rRNA][0] )) + 1 except: orf_length = 0 - if tRNA in contig_lengths: - contig_length = contig_lengths[tRNA] + if rRNA in contig_lengths: + contig_length = contig_lengths[rRNA] else: contig_length = 0 @@ -417,7 +415,7 @@ def create_annotation(dbname_weight, results_dictionary, input_gff, rRNA_16S_st for contig in gffreader: count = 0 for orf in gffreader.orf_dictionary[contig]: - #print orf + #print orf['id'] value = 0.0001 success =False output_comp_annot_file1_Str = '' @@ -464,7 +462,6 @@ def create_annotation(dbname_weight, results_dictionary, input_gff, rRNA_16S_st else: output_comp_annot_file2_Str += '{0}\t{1}\t{2}\t{3}'.format(orf['id'], '','','','') - if success: # there was a database hit fprintf(output_comp_annot_file1,'%s\n', output_comp_annot_file1_Str) fprintf(output_comp_annot_file2,'%s\n', output_comp_annot_file2_Str) diff --git a/libs/python_scripts/MetaPathways_create_reports_fast.py b/libs/python_scripts/MetaPathways_create_reports_fast.py index 39d7cc7..de7f222 100755 --- a/libs/python_scripts/MetaPathways_create_reports_fast.py +++ b/libs/python_scripts/MetaPathways_create_reports_fast.py @@ -16,7 +16,7 @@ from python_modules.LCAComputation import * from python_modules.MeganTree import * - from python_modules.metapaths_utils import parse_command_line_parameters, fprintf, printf + from python_modules.metapaths_utils import parse_command_line_parameters, fprintf, printf, GffFileParser from python_modules.sysutil import getstatusoutput except: print """ Could not load some user defined module functions""" @@ -120,100 +120,6 @@ def check_arguments(opts, args): return True -def insert_attribute(attributes, attribStr): - rawfields = re.split('=', attribStr) - if len(rawfields) == 2: - attributes[rawfields[0].strip().lower()] = rawfields[1].strip() - -def split_attributes(str, attributes): - rawattributes = re.split(';', str) - for attribStr in rawattributes: - insert_attribute(attributes, attribStr) - - return attributes - -def insert_orf_into_dict(line, contig_dict): - rawfields = re.split('\t', line) - fields = [] - for field in rawfields: - fields.append(field.strip()); - - - if( len(fields) != 9): - return - - attributes = {} - attributes['seqname'] = fields[0] # this is a bit of a duplication - attributes['source'] = fields[1] - attributes['feature'] = fields[2] - attributes['start'] = int(fields[3]) - attributes['end'] = int(fields[4]) - - try: - attributes['score'] = float(fields[5]) - except: - attributes['score'] = fields[5] - - attributes['strand'] = fields[6] - attributes['frame'] = fields[7] - - split_attributes(fields[8], attributes) - - if not fields[0] in contig_dict : - contig_dict[fields[0]] = [] - - contig_dict[fields[0]].append(attributes) - - -class GffFileParser(object): - - def __init__(self, gff_filename): - self.Size = 10000 - self.i=0 - self.orf_dictionary = {} - self.gff_beg_pattern = re.compile("#") - self.lines= [] - self.size=0 - try: - self.gff_file = open( 
gff_filename,'r') - except AttributeError: - print "Cannot read the map file for database :" + dbname - sys.exit(0) - - def __iter__(self): - return self - - def refillBuffer(self): - self.orf_dictionary = {} - line = self.gff_file.readline() - i = 0 - while line and i < self.Size: - line=self.gff_file.readline() - if self.gff_beg_pattern.search(line): - continue - if not line: - break - insert_orf_into_dict(line, self.orf_dictionary) - i += 1 - - self.orfs = self.orf_dictionary.keys() - self.size = len(self.orfs) - self.i = 0 - - def next(self): - if self.i == self.size: - self.refillBuffer() - - if self.size==0: - self.gff_file.close() - raise StopIteration() - - #print self.i - if self.i < self.size: - self.i = self.i + 1 - return self.orfs[self.i-1] - - def process_gff_file(gff_file_name, orf_dictionary): try: gfffile = open(gff_file_name, 'r') @@ -269,43 +175,35 @@ def get_species(hit): return None -def create_annotation(results_dictionary, annotated_gff, output_dir, ncbi_taxonomy_tree_file, min_score, top_percent, min_support): +def create_annotation(results_dictionary, dbnames, annotated_gff, output_dir, Taxons, orfsPicked, orfToContig): meganTree = None - lca = None - if 'refseq' in results_dictionary: - lca = LCAComputation(ncbi_taxonomy_tree_file) - lca.setParameters(min_score, top_percent, min_support) - meganTree = MeganTree(lca) - + #lca.set_results_dictionary(results_dictionary) if not path.exists(output_dir): makedirs(output_dir) - orf_dictionary={} #process_gff_file(annotated_gff, orf_dictionary) gffreader = GffFileParser(annotated_gff) - output_table_file = open(output_dir + '/functional_and_taxonomic_table.txt', 'w') - fprintf(output_table_file, "ORF_ID\tORF_length\tstart\tend\tContig_Name\tContig_length\tstrand\tec\ttaxonomy\tproduct\n") + output_table_file = open(output_dir + '/functional_and_taxonomic_table.txt', 'a') count = 0 for contig in gffreader: for orf in gffreader.orf_dictionary[contig]: + if not orf['id'] in orfsPicked: + continue + + orfToContig[orf['id']] = contig + taxonomy = None if count%10000==0 : pass - species = [] - if 'refseq' in results_dictionary: - if orf['id'] in results_dictionary['refseq']: - for hit in results_dictionary['refseq'][orf['id']]: - if hit['bitscore'] >= min_score: - names = get_species(hit) - if names: - species.append(names) - #print '---------------------------' - # else: - # print "hit " + hit['query'] + ' ' + hit['dbname'] + ' ' + str(hit['bitscore'] ) - if lca: - taxonomy=lca.getTaxonomy(species) + + #update the sequence name + + #taxonomy=lca.getMeganTaxonomy(orf['id']) + #print orf['id'] + taxonomy=Taxons[orf['id']] + fprintf(output_table_file, "%s", orf['id']) fprintf(output_table_file, "\t%s", orf['orf_length']) fprintf(output_table_file, "\t%s", orf['start']) @@ -317,24 +215,22 @@ def create_annotation(results_dictionary, annotated_gff, output_dir, ncbi_taxon # fprintf(output_table_file, "\t%s", str(species)) fprintf(output_table_file, "\t%s", taxonomy) fprintf(output_table_file, "\t%s\n", orf['product']) - if meganTree and taxonomy != '': - meganTree.insertTaxon(taxonomy) - # print 'inserted taxon of taxonomy : ', taxonomy + + # adding taxons to the megan tree + #if meganTree and taxonomy != '': + # meganTree.insertTaxon(taxonomy) #print meganTree.getChildToParentMap() + output_table_file.close() # print meganTree.getParentToChildrenMap() - if meganTree: - print output_dir + '/megan_tree.tre' - megan_tree_file = open(output_dir + '/megan_tree.tre', 'w') - #print meganTree.printTree('1') - # exit() - 
fprintf(megan_tree_file, "%s;", meganTree.printTree('1')) - # print 'wrote out megan_tree_file' - megan_tree_file.close() + # this prints out the megan tree +# if meganTree: +# megan_tree_file = open(output_dir + '/megan_tree.tre', 'w') +# fprintf(megan_tree_file, "%s;", meganTree.printTree('1')) +# megan_tree_file.close() - #write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight, results_dictionary, orf_dictionary, contig, candidate_orf_pos, orf['id']) @@ -439,14 +335,16 @@ def remove_repeats(filtered_words): class BlastOutputTsvParser(object): def __init__(self, dbname, blastoutput): + self.lineToProcess = "" self.dbname = dbname self.blastoutput = blastoutput self.i=0 self.SIZE = 10000 self.data = {} self.fieldmap={} - self.seq_beg_pattern = re.compile("#") + self.seq_beg_pattern = re.compile("^#") self.lines = [] + self.headerline = None try: self.blastoutputfile = open( blastoutput,'r') @@ -454,32 +352,43 @@ def __init__(self, dbname, blastoutput): if not self.seq_beg_pattern.search(line) : print "First line must have field header names and begin with \"#\"" sys.exit(0) - header = re.sub('#','',line) + self.headerline = line.strip() + self.lineToProcess = self.headerline + header = re.sub('^#','',line) fields = [ x.strip() for x in header.rstrip().split('\t')] k = 0 for x in fields: - self.fieldmap[x] = k - k+=1 + self.fieldmap[x] = k + k += 1 except AttributeError: print "Cannot read the map file for database :" + dbname sys.exit(0) + def getHeaderLine(self): + return self.headerline + + def getProcessedLine(self): + return self.lineToProcess + def refillBuffer(self): i = 0 self.lines = [] - line = self.blastoutputfile.readline() - while line and i < self.SIZE: - line=self.blastoutputfile.readline() + while i < self.SIZE: + line = self.blastoutputfile.readline().strip() if not line: break + if self.seq_beg_pattern.search(line): + continue + self.lines.append(line) i += 1 - self.size = len(self.lines) - + def rewind(self): + self.i = self.i - 1 + def __iter__(self): return self @@ -501,24 +410,26 @@ def next(self): self.data['identity'] = float(fields[self.fieldmap['identity']]) self.data['ec'] = fields[self.fieldmap['ec']] self.data['product'] = re.sub(r'=',' ',fields[self.fieldmap['product']]) + self.lineToProcess = self.lines[self.i % self.SIZE] except: print "<<<<<<-------" print 'self size ' + str(self.size) print 'line ' + self.lines[self.i % self.SIZE] + print 'next line ' + self.lines[(self.i + 1) % self.SIZE] + print ' field map ' + str(self.fieldmap) print 'index ' + str(self.i) print 'data ' + str(self.data) + print 'fields ' + str(fields) print ">>>>>>-------" -# import traceback -# print traceback.print_exc() + import traceback + print traceback.print_exc() self.i = self.i + 1 - return None + sys.exit(0) self.i = self.i + 1 - try: - return self.data - except: - return None + return self.data else: + self.lineToProcess = None self.blastoutputfile.close() raise StopIteration() @@ -548,31 +459,29 @@ def isWithinCutoffs(data, cutoffs): return True -# compute the refscores -def process_parsed_blastoutput(dbname, blastoutput, cutoffs, annotation_results): - blastparser = BlastOutputTsvParser(dbname, blastoutput) - +def process_parsed_blastoutput(dbname, blastparser, cutoffs, annotation_results, pickorfs): fields = ['target', 'q_length', 'bitscore', 'bsr', 'expect', 'identity', 'ec', 'query' ] fields.append('product') - - count = 0 for data in blastparser: - if data!=None and isWithinCutoffs(data, cutoffs) : + if data!=None and isWithinCutoffs(data, cutoffs) : # 
if dbname=='refseq': # print data['query'] + '\t' + str(data['q_length']) +'\t' + str(data['bitscore']) +'\t' + str(data['expect']) +'\t' + str(data['identity']) + '\t' + str(data['bsr']) + '\t' + data['ec'] + '\t' + data['product'] annotation = {} for field in fields: if field in data: annotation[field] = data[field] + + if not data['query'] in pickorfs: + blastparser.rewind() + return None + annotation['dbname'] = dbname if not data['query'] in annotation_results: annotation_results[data['query']] = [] annotation_results[data['query']].append(annotation) - count += 1 - # if count%100==0: - # print count + return None def beginning_valid_field(line): @@ -628,7 +537,7 @@ def kegg_id(product): kegg_id=results.group(0) return kegg_id -def create_table(results, dbname_map_filename, dbname, output_dir): +def create_table(results, dbname_map_filename, dbname, output_dir, filePermType = 'w'): if not path.exists(output_dir): makedirs(output_dir) @@ -657,33 +566,33 @@ def create_table(results, dbname_map_filename, dbname, output_dir): add_counts_to_hierarchical_map(hierarchical_map, orthology_count) if dbname=='cog': - outputfile = open( output_dir +'/COG_stats_1.txt', 'w') + outputfile = open( output_dir +'/COG_stats_1.txt', filePermType) print_counts_at_level(hierarchical_map, field_to_description, 0, 0, outputfile, printKey=False,\ header="Functional Category\tGene Count") outputfile.close() - outputfile = open( output_dir +'/COG_stats_2.txt', 'w') + outputfile = open( output_dir +'/COG_stats_2.txt', filePermType) print_counts_at_level(hierarchical_map, field_to_description, 0, 1, outputfile,\ header="Function Abbr\tFunctional Category\tGene Count") outputfile.close() - outputfile = open( output_dir +'/COG_stats_3.txt', 'w') + outputfile = open( output_dir +'/COG_stats_3.txt', filePermType) print_counts_at_level(hierarchical_map, field_to_description, 0, 2, outputfile,\ header="COGID\tFunction\tGene Count") outputfile.close() if dbname=='kegg': - outputfile = open( output_dir +'/KEGG_stats_1.txt', 'w') + outputfile = open( output_dir +'/KEGG_stats_1.txt', filePermType) print_counts_at_level(hierarchical_map, field_to_description, 0, 0, outputfile, printKey=False,\ header="Function Category Level 1\tGene Count") outputfile.close() - outputfile = open( output_dir +'/KEGG_stats_2.txt', 'w') + outputfile = open( output_dir +'/KEGG_stats_2.txt', filePermType) print_counts_at_level(hierarchical_map, field_to_description, 0, 1, outputfile, printKey=False,\ header="Function Category Level 2a\tGene Count") outputfile.close() - outputfile = open( output_dir +'/KEGG_stats_3.txt', 'w') + outputfile = open( output_dir +'/KEGG_stats_3.txt', filePermType) print_counts_at_level(hierarchical_map, field_to_description, 0, 2, outputfile,\ header="ID\tFunction Category Level 3\tGene Count" ) outputfile.close() - outputfile = open( output_dir +'/KEGG_stats_4.txt', 'w') + outputfile = open( output_dir +'/KEGG_stats_4.txt', filePermType) print_counts_at_level(hierarchical_map, field_to_description, 0, 3, outputfile,\ header="KO\tFunction Category Level 4\tGene Count") outputfile.close() @@ -726,6 +635,181 @@ def add_counts_to_hierarchical_map(hierarchical_map, orthology_count): else: add_counts_to_hierarchical_map(hierarchical_map[key], orthology_count) +def get_list_of_queries(annotated_gff): + orfList = {} + gffreader = GffFileParser(annotated_gff) + count = 0 + for contig in gffreader: + for orf in gffreader.orf_dictionary[contig]: + orfList[orf['id']] = 1 + count += 1 + # if count%500000==0: + # print count + + 
return orfList.keys() + + +def Heapify(A, i, S): + while True: + l = 2*i + 1 + r = l + 1 + max = i + if l < S and A[l][1] > A[max][1]: # should be > + max = l + if r < S and A[r][1] > A[max][1]: # should be > + max = r + + if max != i and i < S: + temp = A[i] + A[i] = A[max] + A[max] = temp + else: + break + i = max + +def BuildHeap(S, A): + i = int(S/2) + while i >= 0: + Heapify(A, i, S) + i = i - 1 + +def writeParsedLines(fieldmapHeaderline, parsedLines, list, names, outputfilename): + try: + outputfile = open(outputfilename, 'w') + except OSError: + print "ERROR: Cannot create sequence file : " + faa_file + sys.exit(0) + + outputStr=fieldmapHeaderline + "\n" + i = 0 + for item in list: + outputStr += parsedLines[item[0]]+'\n' + if i > 1000: + fprintf(outputfile, "%s", outputStr) + i= 0 + outputStr="" + i += 1 + if i > 0: + fprintf(outputfile, "%s", outputStr) + outputfile.close() + +def merge_sorted_parsed_files(dbname, filenames, outputfilename, orfRanks): + linecount = 0 + readerhandles = [] + + try: + for i in range(len(filenames)): + readerhandles.append(BlastOutputTsvParser(dbname, filenames[i]) ) + except OSError: + print "ERROR: Cannot read sequence file : " + filenames[i] + sys.exit(1) + + try: + outputfile = open(outputfilename, 'w') + fieldmapHeaderLine = readerhandles[0].getHeaderLine() + fprintf(outputfile, "%s\n",fieldmapHeaderLine) + + except OSError: + print "ERROR: Cannot create sequence file : " + outputfilename + sys.exit(1) + + values = [] + for i in range(len(filenames)): + iterate = iter(readerhandles[i]) + next(iterate) + line = readerhandles[i].getProcessedLine() + fields = [ x.strip() for x in line.split('\t') ] + values.append( (i, orfRanks[fields[0]], line) ) + + S = len(filenames) + BuildHeap(S, values) + + while S>0: + try: + iterate = iter(readerhandles[values[0][0]]) + line = readerhandles[values[0][0]].getProcessedLine() + fields = [ x.strip() for x in line.split('\t') ] + fprintf(outputfile, "%s\n",line) + next(iterate) + + line = readerhandles[values[0][0]].getProcessedLine() + fields = [ x.strip() for x in line.split('\t') ] + values[0] = (values[0][0], orfRanks[fields[0]], line) + except: + #import traceback + #traceback.print_exc() + #print 'finished ' + str(S) + values[0] = values[S-1] + S = S - 1 + + if S>0: + Heapify(values, 0, S) + + #print 'line count ' + str(linecount) + outputfile.close() + + +def create_sorted_parse_blast_files(dbname, blastoutput, listOfOrfs, size = 100000): + orfRanks = {} + count = 0 + for orf in listOfOrfs: + orfRanks[orf] = count + count += 1 + + sorted_parse_file = blastoutput + ".tmp" + + try: + inputfilehandle = open(blastoutput, 'r') + except OSError: + print "ERROR: Cannot read parsed b/last results : " + blastoutput + sys.exit(1) + + + currSize = 0 + parsedLines={} + list = [] + names = {} + seqid =0 + batch = 0 + filenames = [] + + blastparser = BlastOutputTsvParser(dbname, blastoutput) + + for data in blastparser: + names[seqid] = data['query'] + parsedLines[seqid] = blastparser.getProcessedLine() + list.append( (seqid, names[seqid]) ) + + seqid +=1 + currSize += 1 + if currSize > size: + list.sort(key=lambda tup: tup[1], reverse=False) + fieldmapHeaderLine = blastparser.getHeaderLine() + writeParsedLines(fieldmapHeaderLine, parsedLines, list, names, sorted_parse_file + "." + str(batch)) + filenames.append(sorted_parse_file + "." 
+ str(batch)) + batch += 1 + currSize = 0 + sequences={} + list = [] + names = {} + seqid =0 + + if currSize > 0: + list.sort(key=lambda tup: tup[1], reverse=False) + fieldmapHeaderLine = blastparser.getHeaderLine() + writeParsedLines(fieldmapHeaderLine, parsedLines, list, names, sorted_parse_file + "." + str(batch)) + filenames.append(sorted_parse_file + "." + str(batch)) + + inputfilehandle.close() + merge_sorted_parsed_files(dbname, filenames, sorted_parse_file, orfRanks) + + for file in filenames: + remove(file) + + +import gc +import resource + # the main function def main(argv): global parser @@ -736,35 +820,129 @@ def main(argv): results_dictionary={} dbname_weight={} - #import traceback - for dbname, blastoutput in zip( opts.database_name, opts.input_blastout): - print "Processing database : " + dbname - try: - results_dictionary[dbname]={} - process_parsed_blastoutput(dbname, blastoutput, opts, results_dictionary[dbname]) - except: - import traceback - traceback.print_exc() - print "Error: " + dbname - pass - create_annotation(results_dictionary, opts.input_annotated_gff, opts.output_dir,\ - opts.ncbi_taxonomy_map, opts.lca_min_score, opts.lca_top_percent, opts.lca_min_support) +# print "memory used = %s" %(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss /1000000) + listOfOrfs = get_list_of_queries(opts.input_annotated_gff) + listOfOrfs.sort(key=lambda tup: tup, reverse=False) - # print results_dictionary['cog'] - for dbname in results_dictionary: - if dbname=='cog': - create_table(results_dictionary[dbname], opts.input_cog_maps, 'cog', opts.output_dir) +##### uncomment the following lines + for dbname, blastoutput in zip(opts.database_name, opts.input_blastout): + create_sorted_parse_blast_files(dbname, blastoutput, listOfOrfs) +##### - if dbname=='kegg': - create_table(results_dictionary[dbname], opts.input_kegg_maps, 'kegg', opts.output_dir) + # process in blocks of size _stride + output_table_file = open(opts.output_dir + '/functional_and_taxonomic_table.txt', 'w') + fprintf(output_table_file, "ORF_ID\tORF_length\tstart\tend\tContig_Name\tContig_length\tstrand\tec\ttaxonomy\tproduct\n") + output_table_file.close() - peg2subsystem = {} + lca = LCAComputation(opts.ncbi_taxonomy_map) + lca.setParameters(opts.lca_min_score, opts.lca_top_percent, opts.lca_min_support) + + blastParsers={} + for dbname, blastoutput in zip( opts.database_name, opts.input_blastout): + blastParsers[dbname] = BlastOutputTsvParser(dbname, blastoutput + '.tmp') + + #this part of the code computes the occurence of each of the taxons + # which is use in the later stage is used to evaluate the min support + # as used in the MEGAN software + + start = 0 + Length = len(listOfOrfs) + _stride = 100000 + Taxons = {} + while start < Length: + pickorfs= {} + last = min(Length, start + _stride) + for i in range(start, last): + pickorfs[listOfOrfs[i]]= 'all' + start = last + print 'min support ' + str(start) + + results_dictionary={} + for dbname, blastoutput in zip( opts.database_name, opts.input_blastout): + if dbname=='refseq': + try: + results_dictionary[dbname]={} + process_parsed_blastoutput(dbname, blastParsers[dbname], opts, results_dictionary[dbname], pickorfs) + lca.set_results_dictionary(results_dictionary) + lca.compute_min_support_tree(opts.input_annotated_gff, pickorfs ) + for key, taxon in pickorfs.iteritems(): + Taxons[key] = taxon + except: + print "Error: while training for min support tree " + dbname + import traceback + traceback.print_exc() + + blastParsers={} + for dbname, blastoutput in zip( 
opts.database_name, opts.input_blastout): + blastParsers[dbname] = BlastOutputTsvParser(dbname, blastoutput + '.tmp') + + # this loop determines the actual/final taxonomy of each of the ORFs + # taking into consideration the min support + filePermTypes= {} + orfTablefilePermType = False + start = 0 + while start < Length: + pickorfs= {} + last = min(Length, start + _stride) + for i in range(start, last): + pickorfs[listOfOrfs[i]]= True + start = last + + gc.collect() + printf("memory used = %s MB", str(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1000000)) + results_dictionary={} + for dbname, blastoutput in zip( opts.database_name, opts.input_blastout): + try: + results_dictionary[dbname]={} + printf("Processing database %s ", dbname) + process_parsed_blastoutput(dbname, blastParsers[dbname], opts, results_dictionary[dbname], pickorfs) + printf("done\n") + except: + import traceback + traceback.print_exc() + print "Error: " + dbname + pass + print dbname + ' ' + str(len(results_dictionary[dbname])) + + print 'done = ' + str(start) + + # create the annotations now + orfToContig = {} + create_annotation(results_dictionary, opts.database_name, opts.input_annotated_gff, opts.output_dir, Taxons, pickorfs, orfToContig) + + for dbname in results_dictionary: + if dbname in filePermTypes: + filePermType = 'a' + else: + filePermType = 'w' + filePermTypes[dbname] = True + + if dbname=='cog': + create_table(results_dictionary[dbname], opts.input_cog_maps, 'cog', opts.output_dir, filePermType) + if dbname=='kegg': + create_table(results_dictionary[dbname], opts.input_kegg_maps, 'kegg', opts.output_dir, filePermType) + + #peg2subsystem = {} # this feature is useless with the new refseq - if opts.subsystems2peg_file: - process_subsys2peg_file(peg2subsystem, opts.subsystems2peg_file) + #if opts.subsystems2peg_file: + # process_subsys2peg_file(peg2subsystem, opts.subsystems2peg_file) - print_orf_table(results_dictionary, opts.output_dir) + orfTableFilePermType = False + if orfTableFilePermType: + filePermType = 'a' + orfTablefilePermType = True + else: + filePermType = 'w' + + print_orf_table(results_dictionary, orfToContig, opts.output_dir, filePermType) + + # now remove the temporary files + for dbname, blastoutput in zip( opts.database_name, opts.input_blastout): + try: + remove( blastoutput + '.tmp') + except: + pass def refseq_id(product): results = re.search(r'gi\|[0-9.]*', product) @@ -773,13 +951,11 @@ def refseq_id(product): refseq_id=results.group(0) return refseq_id - def process_subsys2peg_file(subsystems2peg, subsystems2peg_file): try: orgfile = open(subsystems2peg_file,'r') except IOError: print "Cannot open " + str(org_file) - lines = orgfile.readlines() orgfile.close() for line in lines: @@ -791,16 +967,16 @@ def process_subsys2peg_file(subsystems2peg, subsystems2peg_file): except: print "Cannot close " + str(org_file) -def print_orf_table(results, output_dir): +def print_orf_table(results, orfToContig, output_dir, filePermType): if not path.exists(output_dir): makedirs(output_dir) - outputfile = open( output_dir +'/ORF_annotation_table.txt', 'w') + outputfile = open( output_dir +'/ORF_annotation_table.txt', filePermType) orf_dict = {} for dbname in results.iterkeys(): - for seqname in results[dbname]: - for orf in results[dbname][seqname]: + for orfname in results[dbname]: + for orf in results[dbname][orfname]: if not orf['query'] in orf_dict: orf_dict[orf['query']] = {} @@ -817,7 +993,7 @@ def print_orf_table(results, output_dir): orf_dict[orf['query']][dbname] = 
re.sub(r'\[.*\]','', seed).strip() - orf_dict[orf['query']]['contig'] = seqname + orf_dict[orf['query']]['contig'] = orfToContig[orfname] for orfn in orf_dict: if 'cog' in orf_dict[orfn]: @@ -853,4 +1029,3 @@ def MetaPathways_create_reports_fast(argv): if __name__ == "__main__": createParser() main(sys.argv[1:]) - diff --git a/libs/python_scripts/MetaPathways_parse_blast.py b/libs/python_scripts/MetaPathways_parse_blast.py index b20d91e..bff01b7 100755 --- a/libs/python_scripts/MetaPathways_parse_blast.py +++ b/libs/python_scripts/MetaPathways_parse_blast.py @@ -115,22 +115,30 @@ def check_arguments(opts, args): def create_query_dictionary(blastoutputfile, query_dictionary, algorithm ): seq_beg_pattern = re.compile("^#") - blastoutfh = open( blastoutputfile,'r') - - for line in blastoutfh: - if not seq_beg_pattern.search(line): - words = line.rstrip().split('\t') - if len(words) != 12: - continue - - if algorithm =='BLAST': - query_dictionary[words[0]] = 1 - - if algorithm =='LAST': - query_dictionary[words[1]]= 1 - blastoutfh.close() - - + try: + blastoutfh = open( blastoutputfile,'r') + except: + print "" + print "ERROR : cannot open B/LAST output file " + blastoutputfile + " to parse " + pass + + try: + for line in blastoutfh: + if not seq_beg_pattern.search(line): + words = line.rstrip().split('\t') + if len(words) != 12: + continue + + if algorithm =='BLAST': + query_dictionary[words[0]] = 1 + + if algorithm =='LAST': + query_dictionary[words[1]]= 1 + blastoutfh.close() + except: + print "ERROR : while reading B/LAST output file " + blastoutputfile + " to partse " + print " : make sure B/LAST ing was done for the particular database" + pass def create_dictionary(databasemapfile, annot_map, query_dictionary): # print "query size " + str(len(query_dictionary)) @@ -184,8 +192,14 @@ def __init__(self, dbname, blastoutput, database_mapfile, refscore_file, opts): try: query_dictionary = {} create_query_dictionary(self.blastoutput, query_dictionary, self.opts.algorithm) + try: + self.blastoutputfile = open(self.blastoutput,'r') + except: + print "" + print "ERROR : cannot open B/LAST output file " + blastoutput + " to parse " + print " : make sure \"B/LAST\"ing was done for the particular database" + sys.exit(0) - self.blastoutputfile = open(self.blastoutput,'r') create_refscores(refscore_file, self.refscores) # print "Going to read the dictionary\n" diff --git a/libs/python_scripts/MetaPathways_rRNA_stats_calculator.py b/libs/python_scripts/MetaPathways_rRNA_stats_calculator.py index 7809526..067caff 100755 --- a/libs/python_scripts/MetaPathways_rRNA_stats_calculator.py +++ b/libs/python_scripts/MetaPathways_rRNA_stats_calculator.py @@ -53,6 +53,16 @@ def getFormat(dbstring): format = 2 return format +#sequences with no seperate taxonomic name gets the sequences name +def parse_Format_0(line): + fields = re.split(' ', line) + if len( fields ) ==1: + name = fields[0].replace('>','') + taxonomy = name + else: + return( None, None ) + return( name.strip(), taxonomy.strip() ) + #silva def parse_Format_1(line): fields = re.split(' ', line) @@ -84,12 +94,10 @@ def parse_Format_2(line): def getName_and_Taxonomy(line, format=0): - - if format==0: - print "ERROR: Unknown format for the rRNA database files!" 
- sys.exit(3) - if format==1: + if format==0: + name, taxonomy = parse_Format_0(line) + elif format==1: name, taxonomy = parse_Format_1(line) # print name + " " + taxonomy elif format==2: @@ -244,7 +252,6 @@ def main(argv): options, args = parser.parse_args(argv) - print options if not len(options.blast_files): parser.error('At least one taxonomic BLAST output is required') diff --git a/resources/Dsignal b/resources/Dsignal new file mode 100755 index 0000000..c7ea146 --- /dev/null +++ b/resources/Dsignal @@ -0,0 +1,8 @@ +0.0000 0.0000 0.0000 1.0000 +0.5100 0.0400 0.4200 0.0200 +0.0000 0.0000 1.0000 0.0000 +0.0400 0.6400 0.0500 0.2800 +0.0500 0.2500 0.3000 0.4000 +0.0900 0.4900 0.1300 0.2900 +1.0000 0.0000 0.0000 0.0000 +0.1800 0.0100 0.7500 0.0600 diff --git a/resources/TPCsignal b/resources/TPCsignal new file mode 100755 index 0000000..4a28d41 --- /dev/null +++ b/resources/TPCsignal @@ -0,0 +1,15 @@ +0.0124 0.8100 0.0083 0.1694 +0.1618 0.3320 0.4772 0.0290 +0.0992 0.4008 0.2934 0.2066 +0.1818 0.1818 0.5083 0.1281 +0.1612 0.0289 0.7768 0.0331 +0.0000 0.0000 1.0000 0.0000 +0.0581 0.0000 0.0000 0.9419 +0.0000 0.0000 0.0000 1.0000 +0.0000 1.0000 0.0000 0.0000 +0.1818 0.0000 0.8182 0.0000 +0.9876 0.0000 0.0083 0.0041 +0.4959 0.0620 0.1694 0.2727 +0.0620 0.1859 0.0083 0.7438 +0.0000 1.0000 0.0000 0.0000 +0.0289 0.7686 0.0330 0.1694 diff --git a/template_config.txt b/template_config.txt index d5d3397..187ae31 100644 --- a/template_config.txt +++ b/template_config.txt @@ -18,6 +18,7 @@ FORMATDB_EXECUTABLE 'executables/mac/bit64/makeblastdb' BLASTP_EXECUTABLE 'executables/mac/bit64/blastp' BLASTN_EXECUTABLE 'executables/mac/bit64/blastn' EXECUTABLES_DIR 'executables/mac/bit64/' +RESOURCES_DIR 'resources/' LASTDB_EXECUTABLE 'executables/mac/bit64/lastdb' LAST_EXECUTABLE 'executables/mac/bit64/lastal' ORF_PREDICTION 'executables/mac/bit64/prodigal' diff --git a/template_param.txt b/template_param.txt index df6aac8..db06313 100644 --- a/template_param.txt +++ b/template_param.txt @@ -17,16 +17,17 @@ orf_prediction:min_length 60 # ORF annotation parameters annotation:algorithm last # e.g. blast or last +#annotation:dbs metacyc-v4-2011-07-03,COG-2013-02-05,refseq-nr-2014-01-18,kegg-pep-2011-06-18,seed-2014-01-30 annotation:dbs metacyc-v4-2011-07-03,COG-2013-02-05,refseq-nr-2014-01-18,kegg-pep-2011-06-18,seed-2014-01-30 # e.g. annotation:dbs cog,kegg,refseq,metacyc annotation:min_bsr 0 annotation:max_evalue 0.000001 -annotation:min_score 47 +annotation:min_score 50 annotation:min_length 60 annotation:max_hits 5 # rRNA annotation parameters -rRNA:refdbs SSURef_111_NR_tax_silva-2012-11-06 +rRNA:refdbs # e.g. rRNA:refdbs GREENGENES_gg16S,SSURef_111_NR_tax_silva,LSURef_111_tax_silva rRNA:max_evalue 0.000001 rRNA:min_identity 20 @@ -55,7 +56,7 @@ metapaths_steps:PARSE_BLAST skip metapaths_steps:SCAN_rRNA skip metapaths_steps:STATS_rRNA skip metapaths_steps:SCAN_tRNA skip -metapaths_steps:ANNOTATE redo +metapaths_steps:ANNOTATE skip metapaths_steps:PATHOLOGIC_INPUT skip metapaths_steps:GENBANK_FILE skip metapaths_steps:CREATE_SEQUIN_FILE skip
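
A few notes on the techniques this patch relies on, with minimal sketches for orientation. These are illustrative re-statements under stated assumptions, not code lifted from the repository.

The LCAComputation.py changes implement a MEGAN-style taxonomy assignment: refseq hits whose bit score clears `lca_min_score` and falls within `lca_top_percent` of the best hit contribute candidate taxa, the lowest common ancestor of those taxa is found by walking each one up the NCBI id-to-parent map, and the reported taxon is then pushed toward the root until it has at least `lca_min_support` supporting reads (the counts accumulated by compute_min_support_tree). The sketch below restates that logic; `parent` and `support` are stand-ins for the first and third slots of the patch's `taxid_to_ptaxid` 3-tuples, and the function names are mine, not the module's.

```python
def lowest_common_ancestor(taxids, parent):
    """LCA over an id -> parent-id map rooted at '1' (NCBI style).

    Each taxon is walked toward the root while a per-node counter is
    incremented; the first node whose counter reaches len(taxids) is the
    lowest node shared by every input taxon, mirroring get_lca()."""
    counts = {}
    limit = len(taxids)
    for tid in taxids:
        node = tid
        while node in parent and node != '1':
            counts[node] = counts.get(node, 0) + 1
            if counts[node] == limit:
                return node
            node = parent[node]
    return '1'

def supported_taxon(taxid, parent, support, min_support=5):
    """Climb from the LCA toward the root until a node has at least
    min_support reads, as get_supported_taxon() does once
    compute_min_support_tree() has filled in the per-node read counts."""
    node = taxid
    while node in parent and node != '1':
        if support.get(node, 0) >= min_support:
            return node
        node = parent[node]
    return node

# toy tree: 1 <- 2 <- {4, 5}, 1 <- 3
parent  = {'2': '1', '3': '1', '4': '2', '5': '2'}
support = {'4': 1, '5': 1, '2': 6, '3': 2}
lca = lowest_common_ancestor(['4', '5'], parent)   # '2'
print supported_taxon(lca, parent, support)        # '2' (support 6 >= 5)
```

Returning "root" instead of an empty string when no common ancestor is found (the get_lca change) presumably keeps downstream table writers from emitting blank taxonomy fields.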
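
metapaths_utils.py gains a GffFileParser that the annotation and report steps now share. Instead of reading the whole annotated GFF into one dictionary, it refills a buffer of up to 10,000 records at a time and yields contig names, so memory stays flat regardless of sample size. Below is a compressed sketch of that buffered-iterator pattern; attribute handling is reduced to a plain column split, whereas the real class also unpacks the key=value attribute string in column nine, and the class name here is illustrative only.

```python
import re

class BufferedGffReader(object):
    """Iterate over contig names while holding at most `chunk` GFF records
    in memory; orf_dictionary maps contig -> parsed records for the current
    buffer only, as in the patch's GffFileParser."""

    def __init__(self, gff_filename, chunk=10000):
        self.fh = open(gff_filename, 'r')
        self.chunk = chunk
        self.comment = re.compile(r'^#')
        self.orf_dictionary = {}
        self.contigs = []
        self.i = 0

    def __iter__(self):
        return self

    def _refill(self):
        self.orf_dictionary = {}
        count = 0
        while count < self.chunk:
            line = self.fh.readline()
            if not line:                      # end of file
                break
            if self.comment.search(line):     # skip comment/header lines
                continue
            fields = [f.strip() for f in line.split('\t')]
            if len(fields) != 9:              # not a valid GFF record
                continue
            self.orf_dictionary.setdefault(fields[0], []).append(fields)
            count += 1
        self.contigs = self.orf_dictionary.keys()
        self.i = 0

    def next(self):
        if self.i == len(self.contigs):
            self._refill()
        if not self.contigs:
            self.fh.close()
            raise StopIteration()
        self.i += 1
        return self.contigs[self.i - 1]
```

The calling code keeps a reference to the reader so it can look up `orf_dictionary[contig]` for the contig just yielded, which is how compute_min_support_tree() and create_annotation() now walk the annotated GFF.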
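
The report generator no longer loads an entire parsed B/LAST table per database into memory. create_sorted_parse_blast_files() writes the parsed hits out in ORF-sorted batches of 100,000 lines, and merge_sorted_parsed_files() merges those batch files back into one `<blastoutput>.tmp` ordered by each ORF's rank, using a hand-rolled binary heap (BuildHeap/Heapify). The sketch below shows the same external-sort idea; for brevity it leans on Python's heapq.merge rather than the patch's explicit heap, and it assumes, as merge_sorted_parsed_files() does via fields[0], that the ORF id is the first column of the parsed TSV and that every query appears in `orf_rank`.

```python
import heapq, os

def _flush_batch(batch, base, n):
    # batch holds (rank, line) pairs; write one ORF-sorted chunk to disk
    batch.sort()
    path = '%s.tmp.%d' % (base, n)
    out = open(path, 'w')
    for rank, line in batch:
        out.write(line + '\n')
    out.close()
    return path

def _ranked_lines(path, orf_rank):
    # yield (rank, line) so heapq.merge interleaves chunks in ORF order
    for line in open(path):
        line = line.rstrip('\n')
        yield (orf_rank[line.split('\t', 1)[0]], line)

def sort_parsed_blast_by_orf(parsed_tsv, orf_rank, batch_size=100000):
    """Rewrite a parsed B/LAST TSV as parsed_tsv + '.tmp', ordered by the
    rank of each ORF (orf_rank: ORF id -> position in the sorted ORF list)."""
    batch, chunks = [], []
    fh = open(parsed_tsv, 'r')
    header = fh.readline().rstrip('\n')        # '#'-prefixed field-name line
    for line in fh:
        line = line.rstrip('\n')
        if not line or line.startswith('#'):
            continue
        batch.append((orf_rank[line.split('\t', 1)[0]], line))
        if len(batch) >= batch_size:
            chunks.append(_flush_batch(batch, parsed_tsv, len(chunks)))
            batch = []
    fh.close()
    if batch:
        chunks.append(_flush_batch(batch, parsed_tsv, len(chunks)))

    out = open(parsed_tsv + '.tmp', 'w')
    out.write(header + '\n')
    for rank, line in heapq.merge(*[_ranked_lines(c, orf_rank) for c in chunks]):
        out.write(line + '\n')
    out.close()
    for c in chunks:
        os.remove(c)
    return parsed_tsv + '.tmp'
```

Because every per-database .tmp file ends up in the same ORF order as the sorted ORF list, the report step can stream all of them one window of ORFs at a time instead of holding every hit in memory, which is the bottleneck this commit addresses.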
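
Finally, the rewritten main() in MetaPathways_create_reports_fast.py processes the ORF list in windows of 100,000 (`_stride`), making one pass to accumulate taxon support counts and a second to write the functional/taxonomic, COG, KEGG and ORF tables, switching to append mode after the first window. Because the .tmp files share the ORF order, process_parsed_blastoutput() can stop at the first hit outside the current window, and BlastOutputTsvParser.rewind() pushes that hit back for the next window. The helper names below (`orf_windows`, `ResumableHits`) are hypothetical and only illustrate the windowing and push-back pattern.

```python
def orf_windows(orf_list, stride=100000):
    # yield successive sets of ORF ids, as main() builds pickorfs per window
    start = 0
    while start < len(orf_list):
        last = min(len(orf_list), start + stride)
        yield set(orf_list[start:last])
        start = last

class ResumableHits(object):
    """Stream hit lines from an ORF-sorted parsed table, stopping at the first
    query outside the current window and pushing it back for the next call --
    the same push-back idea as BlastOutputTsvParser.rewind()."""

    def __init__(self, sorted_tsv):
        self.fh = open(sorted_tsv, 'r')
        self.fh.readline()                    # skip the '#'-prefixed header
        self.pushed = None

    def take_window(self, window):
        while True:
            if self.pushed is not None:
                line, self.pushed = self.pushed, None
            else:
                line = self.fh.readline()
                if not line:
                    return                    # end of file
                line = line.rstrip('\n')
                if not line:
                    continue
            if line.split('\t', 1)[0] not in window:
                self.pushed = line            # belongs to a later window
                return
            yield line

# usage: one ResumableHits per database, driven window by window
# hits = ResumableHits('refseq.blastout.parsed.txt.tmp')   # hypothetical path
# for window in orf_windows(sorted(all_orf_ids)):
#     for line in hits.take_window(window):
#         pass   # accumulate annotations for this window only
```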