From 67f4fbcccff476b943746168cf0d0e6209a724f4 Mon Sep 17 00:00:00 2001 From: Kishori Date: Sun, 2 Mar 2014 16:59:56 -0800 Subject: [PATCH] fixed the large sample processing bottleneck --- MetaPathways.py | 6 +- libs/python_modules/LCAComputation.py | 175 +++++- libs/python_modules/metapaths.py | 44 +- libs/python_modules/metapaths_utils.py | 130 ++-- .../MetaPathways_annotate_fast.py | 19 +- .../MetaPathways_create_reports_fast.py | 565 ++++++++++++------ .../MetaPathways_parse_blast.py | 48 +- .../MetaPathways_rRNA_stats_calculator.py | 19 +- resources/Dsignal | 8 + resources/TPCsignal | 15 + template_config.txt | 1 + template_param.txt | 7 +- 12 files changed, 729 insertions(+), 308 deletions(-) create mode 100755 resources/Dsignal create mode 100755 resources/TPCsignal diff --git a/MetaPathways.py b/MetaPathways.py index 7f02d01..dcf5625 100755 --- a/MetaPathways.py +++ b/MetaPathways.py @@ -257,9 +257,6 @@ def main(argv): sorted_input_output_list = sorted(input_output_list.keys()) # PART1 before the blast if len(input_output_list): - print 'length ' - print input_output_list - for input_file in sorted_input_output_list: output_dir = input_output_list[input_file] if run_type=='overwrite' and path.exists(output_dir): @@ -283,6 +280,9 @@ def main(argv): ncbi_sequin_sbt = ncbi_sequin_sbt, run_type = run_type ) + else: + print "ERROR :No input files were found in the input folder" + sys.exit(0) # blast the files diff --git a/libs/python_modules/LCAComputation.py b/libs/python_modules/LCAComputation.py index 83db9b4..790e921 100755 --- a/libs/python_modules/LCAComputation.py +++ b/libs/python_modules/LCAComputation.py @@ -1,17 +1,40 @@ #!/usr/bin/python -import re -import sys +try: + import sys, traceback + import re + import sys + from metapaths_utils import fprintf, printf, GffFileParser +except: + print """ Could not load some user defined module functions""" + print """ Make sure your typed \"source MetaPathwaysrc\"""" + print """ """ + print traceback.print_exc(10) + sys.exit(3) + + + +def copyList(a, b): + [ b.append(x) for x in a ] class LCAComputation: begin_pattern = re.compile("#") + # a readable taxon name to numeric string id map as ncbi name_to_id={} + + # a readable taxon ncbi tax id to name map id_to_name={} + + # this is the tree structure in a id to parent map, you can traverse it to go to the root taxid_to_ptaxid = {} - min_score = 50 - top_percent = 10 - min_support = 5 + + lca_min_score = 50 # an LCA parameter for min score for a hit to be considered + lca_top_percent = 10 # an LCA param to confine the hits to within the top hits score upto the top_percent% + lca_min_support = 5 # a minimum number of reads in the sample to consider a taxon to be present + results_dictionary = None + + # initialize with the ncbi tree file def __init__(self, filename): taxonomy_file = open(filename, 'r') lines = taxonomy_file.readlines() @@ -25,16 +48,22 @@ def __init__(self, filename): continue self.name_to_id[str(fields[0])] = str(fields[1]) self.id_to_name[str(fields[1])] = str(fields[0]) - self.taxid_to_ptaxid[str(fields[1])] = [ str(fields[2]), 0] + # the taxid to ptax map has for each taxid a corresponding 3-tuple + # the first location is the pid, the second is used as a counter for + # lca while a search is traversed up the tree and the third is used for + # the min support + self.taxid_to_ptaxid[str(fields[1])] = [ str(fields[2]), 0, 0] + def setParameters(self, min_score, top_percent, min_support): - self.min_score = min_score - self.top_percent =top_percent - self.min_support = 
min_support + self.lca_min_score = min_score + self.lca_top_percent =top_percent + self.lca_min_support = min_support def sizeTaxnames(self ): return len(self.name_to_id) + def sizeTaxids(self): return len(self.taxid_to_ptaxid) @@ -44,17 +73,20 @@ def get_a_Valid_ID(self, name_group): return self.name_to_id[name] return -1 + # given a taxon name it returns the correcponding unique ncbi tax id def translateNameToID(self, name): if not name in self.name_to_id: return None return self.name_to_id[name] + # given a taxon id to taxon name map def translateIdToName(self, id): if not id in self.id_to_name: return None return self.id_to_name[id] + # given a name it returns the parents name def getParentName(self, name): if not name in self.name_to_id: return None @@ -63,12 +95,22 @@ def getParentName(self, name): return self.translateIdToName( pid ) + # given a ncbi tax id returns the parents tax id def getParentTaxId(self, ID): if not ID in self.taxid_to_ptaxid: return None return self.taxid_to_ptaxid[ID][0] + # given a set of ids it returns the lowest common ancenstor + # without caring about min support + # here LCA for a set of ids are computed as follows + # first we consider one ID at a time + # for each id we traverse up the ncbi tree using the id to parent id map + # at the same time increasing the count on the second value of the 3-tuple + # note that at the node where all the of the individual ids ( limit in number) + # converges the counter matches the limit for the first time, while climbing up. + # This also this enables us to make the selection of id arbitrary def get_lca(self, IDs): limit = len(IDs) for id in IDs: @@ -78,31 +120,134 @@ def get_lca(self, IDs): if self.taxid_to_ptaxid[tid][1]==limit: return self.id_to_name[tid] tid = self.taxid_to_ptaxid[tid][0] - return "" + return "root" + + def update_taxon_support_count(self, taxonomy): + id = self.get_a_Valid_ID( [taxonomy ]) + tid = id + while( tid in self.taxid_to_ptaxid and tid !='1' ): + self.taxid_to_ptaxid[tid][2]+=1 + tid = self.taxid_to_ptaxid[tid][0] + + def get_supported_taxon(self, taxonomy): + id = self.get_a_Valid_ID( [taxonomy ]) + tid = id + while( tid in self.taxid_to_ptaxid and tid !='1' ): + if self.lca_min_support > self.taxid_to_ptaxid[tid][2] : + tid = self.taxid_to_ptaxid[tid][0] + else: + return self.translateIdToName(tid) + + return self.translateIdToName(tid) + + # need to call this to clear the counts of reads at every node def clear_cells(self, IDs): limit = len(IDs) for id in IDs: tid = id while( tid in self.taxid_to_ptaxid and tid !='1' ): - if self.taxid_to_ptaxid[tid][1]==0: - return self.id_to_name[tid] + #if self.taxid_to_ptaxid[tid][1]==0: + # return self.id_to_name[tid] self.taxid_to_ptaxid[tid][1]=0 tid = self.taxid_to_ptaxid[tid][0] return "" + #given a set of sets of names it computes an lca + # in the format [ [name1, name2], [name3, name4,....namex] ...] 
+ # here name1 and name2 are synonyms and so are name3 through namex def getTaxonomy(self, name_groups): - IDs = [] for name_group in name_groups: - #print name_group id = self.get_a_Valid_ID(name_group) if id!=-1: IDs.append(id) consensus = self.get_lca(IDs) - #print "==============" self.clear_cells(IDs) return consensus + + # extracts taxon names for a refseq annotation + def get_species(self, hit): + if not 'product' in hit: + return None + species = [] + try: + m = re.findall(r'\[([^\[]+)\]', hit['product']) + if m != None: + copyList(m,species) + except: + return None + + if species: + return species + else: + return None + + # used for optimization + def set_results_dictionary(self, results_dictionary): + self.results_dictionary= results_dictionary + + # this returns the megan taxonomy, i.e., it computes the lca but at the same time + # takes into consideration the parameters, min score, min support and top percent + def getMeganTaxonomy(self, orfid): + #compute the top hit wrt score + names = [] + species = [] + if 'refseq' in self.results_dictionary: + if orfid in self.results_dictionary['refseq']: + + top_score = 0 + for hit in self.results_dictionary['refseq'][orfid]: + if hit['bitscore'] >= self.lca_min_score and hit['bitscore'] >= top_score: + top_score = hit['bitscore'] + + for hit in self.results_dictionary['refseq'][orfid]: + if (100-self.lca_top_percent)*top_score/100 < hit['bitscore']: + names = self.get_species(hit) + #if 'MD_2_95' == orfid: + # for hit in self.results_dictionary['refseq'][orfid]: + # print orfid + ':' + str(names) + # else: + # print orfid + ':' + str([]) + if names: + species.append(names) + + taxonomy = self.getTaxonomy(species) + meganTaxonomy = self.get_supported_taxon( taxonomy) + return meganTaxonomy + + + # this is use to compute the min support for each taxon in the tree + # this is called before the getMeganTaxonomy + def compute_min_support_tree(self, annotate_gff_file, pickorfs): + gffreader = GffFileParser(annotate_gff_file) + try: + for contig in gffreader: + for orf in gffreader.orf_dictionary[contig]: + if not orf['id'] in pickorfs: + continue + taxonomy = None + species = [] + if 'refseq' in self.results_dictionary: + if orf['id'] in self.results_dictionary['refseq']: + #compute the top hit wrt score + top_score = 0 + for hit in self.results_dictionary['refseq'][orf['id']]: + if hit['bitscore'] >= self.lca_min_score and hit['bitscore'] >= top_score: + top_score = hit['bitscore'] + + for hit in self.results_dictionary['refseq'][orf['id']]: + if (100-self.lca_top_percent)*top_score/100 < hit['bitscore']: + names = self.get_species(hit) + if names: + species.append(names) + taxonomy=self.getTaxonomy(species) + self.update_taxon_support_count(taxonomy) + pickorfs[orf['id']] = taxonomy + except: + print "ERROR : Cannot read annotated gff file " + + diff --git a/libs/python_modules/metapaths.py b/libs/python_modules/metapaths.py index ac5b72b..32e7cd2 100755 --- a/libs/python_modules/metapaths.py +++ b/libs/python_modules/metapaths.py @@ -107,8 +107,7 @@ def format_db(formatdb_executable, seqType, raw_sequence_file, formatted_db, al if algorithm=='LAST': # dirname = os.path.dirname(raw_sequence_file) - cmd='%s -p -c %s %s' %(formatdb_executable, formatted_db, raw_sequence_file) - print cmd + cmd='%s -s 5G -p -c %s %s' %(formatdb_executable, formatted_db, raw_sequence_file) result= getstatusoutput(cmd) if result[0]==0: @@ -187,8 +186,9 @@ def create_refscores_compute_cmd(input, output, config_settings, algorithm): def 
formatted_db_exists(dbname, suffixes): for suffix in suffixes: fileList = glob(dbname + suffix) - if not fileList: - return False + if len(fileList)==0 : + print 'ERROR : if formatted correctely then expected the files with pattern ' + dbname + suffix + return False return True def check_if_raw_sequences_exist(filename): @@ -233,7 +233,7 @@ def check_an_format_refdb(dbname, seqType, config_settings, config_params): formattedDBPath = functional_formatted + PATHDELIM + dbname else: print "WARNING : Undefined sequnce type for " + dbname + "!" - return + sys.exit(0) # database formatting executables paths if algorithm == 'LAST' and seqType =='prot': @@ -244,18 +244,18 @@ def check_an_format_refdb(dbname, seqType, config_settings, config_params): if not (formatted_db_exists(formattedDBPath, suffixes) ): print "WARNING : You do not seem to have Database " + dbname + " formatted!" if check_if_raw_sequences_exist(seqPath): - print " Found raw sequences for Database " + dbname + "!" - print " Trying to format on the fly ....!" + print " Found raw sequences for Database " + dbname + " in folder " + seqPath + " !" + print " Trying to format on the fly .... for " + algorithm + "!" result =format_db(executable, seqType, seqPath, formattedDBPath, algorithm) if result ==True: print " Formatting successful!" return else: - print " Formatting failed! Please consider formatting manually or do not try to annotated with this database!" + print " Formatting failed! Please consider formatting manually or do not try to annotate with this database!" sys.exit(0) - print 'seqpath ' + seqPath print "ERROR : You do not even have the raw sequence for Database " + dbname + " to format!" + print " in the folder "+ seqPath print " Please put the appropriate files in folder \"blastDB\"" sys.exit(0) @@ -305,8 +305,6 @@ def create_blastp_against_refdb_cmd(input, output, output_dir, sample_name, check_an_format_refdb(dbfilename, 'prot', config_settings, config_params) if system=='grid': - - batch_size = get_parameter(config_params, 'grid_engine', 'batch_size', default=500) max_concurrent_batches = get_parameter(config_params, 'grid_engine', 'max_concurrent_batches', default=500) user = get_parameter(config_params, 'grid_engine', 'user', default='') @@ -385,9 +383,6 @@ def create_annotate_genebank_cmd(sample_name, mapping_txt, input_unannotated_gff return cmd def create_genbank_ptinput_sequin_cmd(input_annotated_gff, nucleotide_fasta, amino_fasta, outputs, config_settings, ncbi_params_file, ncbi_sequin_sbt): - - #tbl2asn_exe = config_settings['METAPATHWAYS_PATH'] + config_settings['TBL2ASN_EXECUTABLE'] - cmd="%s -g %s -n %s -p %s " %((config_settings['METAPATHWAYS_PATH'] + config_settings['GENBANK_FILE']), input_annotated_gff, nucleotide_fasta, amino_fasta) ; if 'gbk' in outputs: @@ -396,10 +391,8 @@ def create_genbank_ptinput_sequin_cmd(input_annotated_gff, nucleotide_fasta, ami if ncbi_params_file and 'sequin' in outputs: cmd += ' --out-sequin ' + outputs['sequin'] cmd += ' --sequin-params-file ' + ncbi_params_file - # cmd += ' --sequin-tbl2asn ' + tbl2asn_exe cmd += ' --ncbi-sbt-file ' + ncbi_sequin_sbt - if 'ptinput' in outputs: cmd += ' --out-ptinput ' + outputs['ptinput'] @@ -490,8 +483,8 @@ def create_rRNA_scan_statistics(blastoutput, refdbname, bscore_cutoff, eval_cuto def create_tRNA_scan_statistics(in_file,stat_file, fasta_file, config_settings): cmd= "%s -o %s -F %s -i %s -T %s -D %s" %((config_settings['METAPATHWAYS_PATH'] + config_settings['SCAN_tRNA']) ,\ stat_file, fasta_file, in_file,\ - 
(config_settings['METAPATHWAYS_PATH'] + config_settings['EXECUTABLES_DIR'])+ PATHDELIM + 'TPCsignal',\ - (config_settings['METAPATHWAYS_PATH'] + config_settings['EXECUTABLES_DIR'])+ PATHDELIM + 'Dsignal') + (config_settings['METAPATHWAYS_PATH'] + config_settings['RESOURCES_DIR'])+ PATHDELIM + 'TPCsignal',\ + (config_settings['METAPATHWAYS_PATH'] + config_settings['RESOURCES_DIR'])+ PATHDELIM + 'Dsignal') return cmd # create the command to make the MLTreeMap Images @@ -591,7 +584,7 @@ def write_run_parameters_file(fileName, parameters): # checks if the necessary files, directories and executables really exists or not def check_config_settings(config_settings, file): - essentialItems= ['METAPATHWAYS_PATH', 'EXECUTABLES_DIR'] + essentialItems= ['METAPATHWAYS_PATH', 'EXECUTABLES_DIR', 'RESOURCES_DIR'] missingItems = [] for key, value in config_settings.items(): # make sure MetaPathways directory is present @@ -619,6 +612,13 @@ def check_config_settings(config_settings, file): missingItems.append(key) continue + # make sure RESOURCES_DIR directories are present + if key in [ 'RESOURCES_DIR']: + if not path.isdir( config_settings['METAPATHWAYS_PATH'] + PATHDELIM + config_settings[key]) : + print "ERROR: Path for \"%s\" is NOT set properly in configuration file \"%s\"" %(key, file) + print "ERROR: Currently it is set to \"%s\"\n" %( config_settings[key] ) + missingItems.append(key) + continue # make sure MetaPaths directory is present if key in ['PERL_EXECUTABLE', 'PYTHON_EXECUTABLE' , 'PATHOLOGIC_EXECUTABLE' ]: @@ -791,8 +791,8 @@ def checkMissingParam_values(config_params, choices, logger): for category in choices: for parameter in choices[category]: if not config_params[category][parameter]: - logger.write('ERROR: Empty paramter %s of type %s' %(parameter, category)) - print('ERROR: Empty paramter %s of type %s' %(parameter, category)) + logger.write('ERROR: Empty parameter %s of type %s' %(parameter, category)) + print('ERROR: Empty parameter %s of type %s' %(parameter, category)) success = False return success @@ -1457,7 +1457,7 @@ def run_metapathways_after_BLAST(input_fp, output_dir, command_handler, command_ command_Status= get_parameter( config_params,'metapaths_steps','PATHOLOGIC') enable_flag = False - if (run_type in ['overwrite', 'overlay'] and command_Status== 'yes'): + if (run_type in ['overwrite', 'overlay']) and (command_Status in ['redo', 'yes']): enable_flag = True #remove the pathologic dir for the pgdb with the same sample name since a new pgdb is requested remove_existing_pgdb( sample_name, config_settings['PATHOLOGIC_EXECUTABLE']) diff --git a/libs/python_modules/metapaths_utils.py b/libs/python_modules/metapaths_utils.py index 75c9b91..088093f 100755 --- a/libs/python_modules/metapaths_utils.py +++ b/libs/python_modules/metapaths_utils.py @@ -21,42 +21,100 @@ import sys import os import math -#from copy import deepcopy -#from numpy import min, max, median, mean -#import numpy -#from numpy.ma import MaskedArray -#from numpy.ma.extras import apply_along_axis -#from numpy import array, zeros, argsort, shape, vstack,ndarray, asarray, \ -# float, where, isnan -#from cogent import LoadSeqs, Sequence -#from cogent.cluster.procrustes import procrustes -#from cogent.core.alignment import Alignment -#from cogent.core.moltype import MolType, IUPAC_DNA_chars, IUPAC_DNA_ambiguities,\ -# IUPAC_DNA_ambiguities_complements, DnaStandardPairs, ModelDnaSequence -#from cogent.data.molecular_weight import DnaMW -#from cogent.core.sequence import DnaSequence -#from cogent.app.blast import 
Blastall -#from cogent.app.util import get_tmp_filename -#from cogent.parse.blast import BlastResult -#from cogent.parse.fasta import MinimalFastaParser -#from cogent.util.misc import remove_files -#from cogent.util.dict2d import Dict2D -#from cogent.app.formatdb import build_blast_db_from_fasta_path,\ -# build_blast_db_from_fasta_file -#from cogent import LoadSeqs -#from cogent.util.misc import (parse_command_line_parameters, -# create_dir, -# handle_error_codes) -# -# -#from qiime.parse import (parse_otu_table, -# parse_qiime_config_files, -# parse_coords, -# parse_newick, -# PhyloNode, -# parse_mapping_file) -#from qiime.format import format_otu_table -# +import re + + +class GffFileParser(object): + def __init__(self, gff_filename): + self.Size = 10000 + self.i=0 + self.orf_dictionary = {} + self.gff_beg_pattern = re.compile("^#") + self.lines= [] + self.size=0 + try: + self.gff_file = open( gff_filename,'r') + except AttributeError: + print "Cannot read the map file for database :" + dbname + sys.exit(0) + + def __iter__(self): + return self + + def refillBuffer(self): + self.orf_dictionary = {} + i = 0 + while i < self.Size: + line=self.gff_file.readline() + if not line: + break + if self.gff_beg_pattern.search(line): + continue + self.insert_orf_into_dict(line, self.orf_dictionary) + i += 1 + + self.orfs = self.orf_dictionary.keys() + self.size = len(self.orfs) + self.i = 0 + + def next(self): + if self.i == self.size: + self.refillBuffer() + + if self.size==0: + self.gff_file.close() + raise StopIteration() + + #print self.i + if self.i < self.size: + self.i = self.i + 1 + return self.orfs[self.i-1] + + + + def insert_orf_into_dict(self, line, contig_dict): + rawfields = re.split('\t', line) + fields = [] + for field in rawfields: + fields.append(field.strip()); + + + if( len(fields) != 9): + return + + attributes = {} + attributes['seqname'] = fields[0] # this is a bit of a duplication + attributes['source'] = fields[1] + attributes['feature'] = fields[2] + attributes['start'] = int(fields[3]) + attributes['end'] = int(fields[4]) + + try: + attributes['score'] = float(fields[5]) + except: + attributes['score'] = fields[5] + + attributes['strand'] = fields[6] + attributes['frame'] = fields[7] + + self.split_attributes(fields[8], attributes) + + if not fields[0] in contig_dict : + contig_dict[fields[0]] = [] + + contig_dict[fields[0]].append(attributes) + + def insert_attribute(self, attributes, attribStr): + rawfields = re.split('=', attribStr) + if len(rawfields) == 2: + attributes[rawfields[0].strip().lower()] = rawfields[1].strip() + + def split_attributes(self, str, attributes): + rawattributes = re.split(';', str) + for attribStr in rawattributes: + self.insert_attribute(attributes, attribStr) + + return attributes class Performance: def __init__(self): diff --git a/libs/python_scripts/MetaPathways_annotate_fast.py b/libs/python_scripts/MetaPathways_annotate_fast.py index 633f9c4..937aae4 100755 --- a/libs/python_scripts/MetaPathways_annotate_fast.py +++ b/libs/python_scripts/MetaPathways_annotate_fast.py @@ -137,7 +137,6 @@ def insert_orf_into_dict(line, contig_dict): fields = [] for field in rawfields: fields.append(field.strip()); - if( len(fields) != 9): return @@ -171,7 +170,7 @@ def __init__(self, gff_filename): self.Size = 10000 self.i=0 self.orf_dictionary = {} - self.gff_beg_pattern = re.compile("#") + self.gff_beg_pattern = re.compile("^#") self.lines= [] self.size=0 try: @@ -185,14 +184,13 @@ def __iter__(self): def refillBuffer(self): self.orf_dictionary = {} - 
line = self.gff_file.readline() i = 0 - while line and i < self.Size: + while i < self.Size: line=self.gff_file.readline() - if self.gff_beg_pattern.search(line): - continue if not line: break + if self.gff_beg_pattern.search(line): + continue insert_orf_into_dict(line, self.orf_dictionary) i += 1 @@ -375,12 +373,12 @@ def add_16S_genes(rRNA_16S_dictionary, rRNA_dictionary, contig_lengths) : for rRNA in rRNA_16S_dictionary: try: - orf_length = abs(int( tRNA_dictionary[tRNA][1] )-int( tRNA_dictionary[tRNA][0] )) + 1 + orf_length = abs(int( tRNA_dictionary[rRNA][1] )-int( tRNA_dictionary[rRNA][0] )) + 1 except: orf_length = 0 - if tRNA in contig_lengths: - contig_length = contig_lengths[tRNA] + if rRNA in contig_lengths: + contig_length = contig_lengths[rRNA] else: contig_length = 0 @@ -417,7 +415,7 @@ def create_annotation(dbname_weight, results_dictionary, input_gff, rRNA_16S_st for contig in gffreader: count = 0 for orf in gffreader.orf_dictionary[contig]: - #print orf + #print orf['id'] value = 0.0001 success =False output_comp_annot_file1_Str = '' @@ -464,7 +462,6 @@ def create_annotation(dbname_weight, results_dictionary, input_gff, rRNA_16S_st else: output_comp_annot_file2_Str += '{0}\t{1}\t{2}\t{3}'.format(orf['id'], '','','','') - if success: # there was a database hit fprintf(output_comp_annot_file1,'%s\n', output_comp_annot_file1_Str) fprintf(output_comp_annot_file2,'%s\n', output_comp_annot_file2_Str) diff --git a/libs/python_scripts/MetaPathways_create_reports_fast.py b/libs/python_scripts/MetaPathways_create_reports_fast.py index 39d7cc7..de7f222 100755 --- a/libs/python_scripts/MetaPathways_create_reports_fast.py +++ b/libs/python_scripts/MetaPathways_create_reports_fast.py @@ -16,7 +16,7 @@ from python_modules.LCAComputation import * from python_modules.MeganTree import * - from python_modules.metapaths_utils import parse_command_line_parameters, fprintf, printf + from python_modules.metapaths_utils import parse_command_line_parameters, fprintf, printf, GffFileParser from python_modules.sysutil import getstatusoutput except: print """ Could not load some user defined module functions""" @@ -120,100 +120,6 @@ def check_arguments(opts, args): return True -def insert_attribute(attributes, attribStr): - rawfields = re.split('=', attribStr) - if len(rawfields) == 2: - attributes[rawfields[0].strip().lower()] = rawfields[1].strip() - -def split_attributes(str, attributes): - rawattributes = re.split(';', str) - for attribStr in rawattributes: - insert_attribute(attributes, attribStr) - - return attributes - -def insert_orf_into_dict(line, contig_dict): - rawfields = re.split('\t', line) - fields = [] - for field in rawfields: - fields.append(field.strip()); - - - if( len(fields) != 9): - return - - attributes = {} - attributes['seqname'] = fields[0] # this is a bit of a duplication - attributes['source'] = fields[1] - attributes['feature'] = fields[2] - attributes['start'] = int(fields[3]) - attributes['end'] = int(fields[4]) - - try: - attributes['score'] = float(fields[5]) - except: - attributes['score'] = fields[5] - - attributes['strand'] = fields[6] - attributes['frame'] = fields[7] - - split_attributes(fields[8], attributes) - - if not fields[0] in contig_dict : - contig_dict[fields[0]] = [] - - contig_dict[fields[0]].append(attributes) - - -class GffFileParser(object): - - def __init__(self, gff_filename): - self.Size = 10000 - self.i=0 - self.orf_dictionary = {} - self.gff_beg_pattern = re.compile("#") - self.lines= [] - self.size=0 - try: - self.gff_file = open( 
gff_filename,'r') - except AttributeError: - print "Cannot read the map file for database :" + dbname - sys.exit(0) - - def __iter__(self): - return self - - def refillBuffer(self): - self.orf_dictionary = {} - line = self.gff_file.readline() - i = 0 - while line and i < self.Size: - line=self.gff_file.readline() - if self.gff_beg_pattern.search(line): - continue - if not line: - break - insert_orf_into_dict(line, self.orf_dictionary) - i += 1 - - self.orfs = self.orf_dictionary.keys() - self.size = len(self.orfs) - self.i = 0 - - def next(self): - if self.i == self.size: - self.refillBuffer() - - if self.size==0: - self.gff_file.close() - raise StopIteration() - - #print self.i - if self.i < self.size: - self.i = self.i + 1 - return self.orfs[self.i-1] - - def process_gff_file(gff_file_name, orf_dictionary): try: gfffile = open(gff_file_name, 'r') @@ -269,43 +175,35 @@ def get_species(hit): return None -def create_annotation(results_dictionary, annotated_gff, output_dir, ncbi_taxonomy_tree_file, min_score, top_percent, min_support): +def create_annotation(results_dictionary, dbnames, annotated_gff, output_dir, Taxons, orfsPicked, orfToContig): meganTree = None - lca = None - if 'refseq' in results_dictionary: - lca = LCAComputation(ncbi_taxonomy_tree_file) - lca.setParameters(min_score, top_percent, min_support) - meganTree = MeganTree(lca) - + #lca.set_results_dictionary(results_dictionary) if not path.exists(output_dir): makedirs(output_dir) - orf_dictionary={} #process_gff_file(annotated_gff, orf_dictionary) gffreader = GffFileParser(annotated_gff) - output_table_file = open(output_dir + '/functional_and_taxonomic_table.txt', 'w') - fprintf(output_table_file, "ORF_ID\tORF_length\tstart\tend\tContig_Name\tContig_length\tstrand\tec\ttaxonomy\tproduct\n") + output_table_file = open(output_dir + '/functional_and_taxonomic_table.txt', 'a') count = 0 for contig in gffreader: for orf in gffreader.orf_dictionary[contig]: + if not orf['id'] in orfsPicked: + continue + + orfToContig[orf['id']] = contig + taxonomy = None if count%10000==0 : pass - species = [] - if 'refseq' in results_dictionary: - if orf['id'] in results_dictionary['refseq']: - for hit in results_dictionary['refseq'][orf['id']]: - if hit['bitscore'] >= min_score: - names = get_species(hit) - if names: - species.append(names) - #print '---------------------------' - # else: - # print "hit " + hit['query'] + ' ' + hit['dbname'] + ' ' + str(hit['bitscore'] ) - if lca: - taxonomy=lca.getTaxonomy(species) + + #update the sequence name + + #taxonomy=lca.getMeganTaxonomy(orf['id']) + #print orf['id'] + taxonomy=Taxons[orf['id']] + fprintf(output_table_file, "%s", orf['id']) fprintf(output_table_file, "\t%s", orf['orf_length']) fprintf(output_table_file, "\t%s", orf['start']) @@ -317,24 +215,22 @@ def create_annotation(results_dictionary, annotated_gff, output_dir, ncbi_taxon # fprintf(output_table_file, "\t%s", str(species)) fprintf(output_table_file, "\t%s", taxonomy) fprintf(output_table_file, "\t%s\n", orf['product']) - if meganTree and taxonomy != '': - meganTree.insertTaxon(taxonomy) - # print 'inserted taxon of taxonomy : ', taxonomy + + # adding taxons to the megan tree + #if meganTree and taxonomy != '': + # meganTree.insertTaxon(taxonomy) #print meganTree.getChildToParentMap() + output_table_file.close() # print meganTree.getParentToChildrenMap() - if meganTree: - print output_dir + '/megan_tree.tre' - megan_tree_file = open(output_dir + '/megan_tree.tre', 'w') - #print meganTree.printTree('1') - # exit() - 
fprintf(megan_tree_file, "%s;", meganTree.printTree('1')) - # print 'wrote out megan_tree_file' - megan_tree_file.close() + # this prints out the megan tree +# if meganTree: +# megan_tree_file = open(output_dir + '/megan_tree.tre', 'w') +# fprintf(megan_tree_file, "%s;", meganTree.printTree('1')) +# megan_tree_file.close() - #write_annotation_for_orf(outputgff_file, candidatedbname, dbname_weight, results_dictionary, orf_dictionary, contig, candidate_orf_pos, orf['id']) @@ -439,14 +335,16 @@ def remove_repeats(filtered_words): class BlastOutputTsvParser(object): def __init__(self, dbname, blastoutput): + self.lineToProcess = "" self.dbname = dbname self.blastoutput = blastoutput self.i=0 self.SIZE = 10000 self.data = {} self.fieldmap={} - self.seq_beg_pattern = re.compile("#") + self.seq_beg_pattern = re.compile("^#") self.lines = [] + self.headerline = None try: self.blastoutputfile = open( blastoutput,'r') @@ -454,32 +352,43 @@ def __init__(self, dbname, blastoutput): if not self.seq_beg_pattern.search(line) : print "First line must have field header names and begin with \"#\"" sys.exit(0) - header = re.sub('#','',line) + self.headerline = line.strip() + self.lineToProcess = self.headerline + header = re.sub('^#','',line) fields = [ x.strip() for x in header.rstrip().split('\t')] k = 0 for x in fields: - self.fieldmap[x] = k - k+=1 + self.fieldmap[x] = k + k += 1 except AttributeError: print "Cannot read the map file for database :" + dbname sys.exit(0) + def getHeaderLine(self): + return self.headerline + + def getProcessedLine(self): + return self.lineToProcess + def refillBuffer(self): i = 0 self.lines = [] - line = self.blastoutputfile.readline() - while line and i < self.SIZE: - line=self.blastoutputfile.readline() + while i < self.SIZE: + line = self.blastoutputfile.readline().strip() if not line: break + if self.seq_beg_pattern.search(line): + continue + self.lines.append(line) i += 1 - self.size = len(self.lines) - + def rewind(self): + self.i = self.i - 1 + def __iter__(self): return self @@ -501,24 +410,26 @@ def next(self): self.data['identity'] = float(fields[self.fieldmap['identity']]) self.data['ec'] = fields[self.fieldmap['ec']] self.data['product'] = re.sub(r'=',' ',fields[self.fieldmap['product']]) + self.lineToProcess = self.lines[self.i % self.SIZE] except: print "<<<<<<-------" print 'self size ' + str(self.size) print 'line ' + self.lines[self.i % self.SIZE] + print 'next line ' + self.lines[(self.i + 1) % self.SIZE] + print ' field map ' + str(self.fieldmap) print 'index ' + str(self.i) print 'data ' + str(self.data) + print 'fields ' + str(fields) print ">>>>>>-------" -# import traceback -# print traceback.print_exc() + import traceback + print traceback.print_exc() self.i = self.i + 1 - return None + sys.exit(0) self.i = self.i + 1 - try: - return self.data - except: - return None + return self.data else: + self.lineToProcess = None self.blastoutputfile.close() raise StopIteration() @@ -548,31 +459,29 @@ def isWithinCutoffs(data, cutoffs): return True -# compute the refscores -def process_parsed_blastoutput(dbname, blastoutput, cutoffs, annotation_results): - blastparser = BlastOutputTsvParser(dbname, blastoutput) - +def process_parsed_blastoutput(dbname, blastparser, cutoffs, annotation_results, pickorfs): fields = ['target', 'q_length', 'bitscore', 'bsr', 'expect', 'identity', 'ec', 'query' ] fields.append('product') - - count = 0 for data in blastparser: - if data!=None and isWithinCutoffs(data, cutoffs) : + if data!=None and isWithinCutoffs(data, cutoffs) : # 
if dbname=='refseq': # print data['query'] + '\t' + str(data['q_length']) +'\t' + str(data['bitscore']) +'\t' + str(data['expect']) +'\t' + str(data['identity']) + '\t' + str(data['bsr']) + '\t' + data['ec'] + '\t' + data['product'] annotation = {} for field in fields: if field in data: annotation[field] = data[field] + + if not data['query'] in pickorfs: + blastparser.rewind() + return None + annotation['dbname'] = dbname if not data['query'] in annotation_results: annotation_results[data['query']] = [] annotation_results[data['query']].append(annotation) - count += 1 - # if count%100==0: - # print count + return None def beginning_valid_field(line): @@ -628,7 +537,7 @@ def kegg_id(product): kegg_id=results.group(0) return kegg_id -def create_table(results, dbname_map_filename, dbname, output_dir): +def create_table(results, dbname_map_filename, dbname, output_dir, filePermType = 'w'): if not path.exists(output_dir): makedirs(output_dir) @@ -657,33 +566,33 @@ def create_table(results, dbname_map_filename, dbname, output_dir): add_counts_to_hierarchical_map(hierarchical_map, orthology_count) if dbname=='cog': - outputfile = open( output_dir +'/COG_stats_1.txt', 'w') + outputfile = open( output_dir +'/COG_stats_1.txt', filePermType) print_counts_at_level(hierarchical_map, field_to_description, 0, 0, outputfile, printKey=False,\ header="Functional Category\tGene Count") outputfile.close() - outputfile = open( output_dir +'/COG_stats_2.txt', 'w') + outputfile = open( output_dir +'/COG_stats_2.txt', filePermType) print_counts_at_level(hierarchical_map, field_to_description, 0, 1, outputfile,\ header="Function Abbr\tFunctional Category\tGene Count") outputfile.close() - outputfile = open( output_dir +'/COG_stats_3.txt', 'w') + outputfile = open( output_dir +'/COG_stats_3.txt', filePermType) print_counts_at_level(hierarchical_map, field_to_description, 0, 2, outputfile,\ header="COGID\tFunction\tGene Count") outputfile.close() if dbname=='kegg': - outputfile = open( output_dir +'/KEGG_stats_1.txt', 'w') + outputfile = open( output_dir +'/KEGG_stats_1.txt', filePermType) print_counts_at_level(hierarchical_map, field_to_description, 0, 0, outputfile, printKey=False,\ header="Function Category Level 1\tGene Count") outputfile.close() - outputfile = open( output_dir +'/KEGG_stats_2.txt', 'w') + outputfile = open( output_dir +'/KEGG_stats_2.txt', filePermType) print_counts_at_level(hierarchical_map, field_to_description, 0, 1, outputfile, printKey=False,\ header="Function Category Level 2a\tGene Count") outputfile.close() - outputfile = open( output_dir +'/KEGG_stats_3.txt', 'w') + outputfile = open( output_dir +'/KEGG_stats_3.txt', filePermType) print_counts_at_level(hierarchical_map, field_to_description, 0, 2, outputfile,\ header="ID\tFunction Category Level 3\tGene Count" ) outputfile.close() - outputfile = open( output_dir +'/KEGG_stats_4.txt', 'w') + outputfile = open( output_dir +'/KEGG_stats_4.txt', filePermType) print_counts_at_level(hierarchical_map, field_to_description, 0, 3, outputfile,\ header="KO\tFunction Category Level 4\tGene Count") outputfile.close() @@ -726,6 +635,181 @@ def add_counts_to_hierarchical_map(hierarchical_map, orthology_count): else: add_counts_to_hierarchical_map(hierarchical_map[key], orthology_count) +def get_list_of_queries(annotated_gff): + orfList = {} + gffreader = GffFileParser(annotated_gff) + count = 0 + for contig in gffreader: + for orf in gffreader.orf_dictionary[contig]: + orfList[orf['id']] = 1 + count += 1 + # if count%500000==0: + # print count + + 
return orfList.keys() + + +def Heapify(A, i, S): + while True: + l = 2*i + 1 + r = l + 1 + max = i + if l < S and A[l][1] > A[max][1]: # should be > + max = l + if r < S and A[r][1] > A[max][1]: # should be > + max = r + + if max != i and i < S: + temp = A[i] + A[i] = A[max] + A[max] = temp + else: + break + i = max + +def BuildHeap(S, A): + i = int(S/2) + while i >= 0: + Heapify(A, i, S) + i = i - 1 + +def writeParsedLines(fieldmapHeaderline, parsedLines, list, names, outputfilename): + try: + outputfile = open(outputfilename, 'w') + except OSError: + print "ERROR: Cannot create sequence file : " + faa_file + sys.exit(0) + + outputStr=fieldmapHeaderline + "\n" + i = 0 + for item in list: + outputStr += parsedLines[item[0]]+'\n' + if i > 1000: + fprintf(outputfile, "%s", outputStr) + i= 0 + outputStr="" + i += 1 + if i > 0: + fprintf(outputfile, "%s", outputStr) + outputfile.close() + +def merge_sorted_parsed_files(dbname, filenames, outputfilename, orfRanks): + linecount = 0 + readerhandles = [] + + try: + for i in range(len(filenames)): + readerhandles.append(BlastOutputTsvParser(dbname, filenames[i]) ) + except OSError: + print "ERROR: Cannot read sequence file : " + filenames[i] + sys.exit(1) + + try: + outputfile = open(outputfilename, 'w') + fieldmapHeaderLine = readerhandles[0].getHeaderLine() + fprintf(outputfile, "%s\n",fieldmapHeaderLine) + + except OSError: + print "ERROR: Cannot create sequence file : " + outputfilename + sys.exit(1) + + values = [] + for i in range(len(filenames)): + iterate = iter(readerhandles[i]) + next(iterate) + line = readerhandles[i].getProcessedLine() + fields = [ x.strip() for x in line.split('\t') ] + values.append( (i, orfRanks[fields[0]], line) ) + + S = len(filenames) + BuildHeap(S, values) + + while S>0: + try: + iterate = iter(readerhandles[values[0][0]]) + line = readerhandles[values[0][0]].getProcessedLine() + fields = [ x.strip() for x in line.split('\t') ] + fprintf(outputfile, "%s\n",line) + next(iterate) + + line = readerhandles[values[0][0]].getProcessedLine() + fields = [ x.strip() for x in line.split('\t') ] + values[0] = (values[0][0], orfRanks[fields[0]], line) + except: + #import traceback + #traceback.print_exc() + #print 'finished ' + str(S) + values[0] = values[S-1] + S = S - 1 + + if S>0: + Heapify(values, 0, S) + + #print 'line count ' + str(linecount) + outputfile.close() + + +def create_sorted_parse_blast_files(dbname, blastoutput, listOfOrfs, size = 100000): + orfRanks = {} + count = 0 + for orf in listOfOrfs: + orfRanks[orf] = count + count += 1 + + sorted_parse_file = blastoutput + ".tmp" + + try: + inputfilehandle = open(blastoutput, 'r') + except OSError: + print "ERROR: Cannot read parsed b/last results : " + blastoutput + sys.exit(1) + + + currSize = 0 + parsedLines={} + list = [] + names = {} + seqid =0 + batch = 0 + filenames = [] + + blastparser = BlastOutputTsvParser(dbname, blastoutput) + + for data in blastparser: + names[seqid] = data['query'] + parsedLines[seqid] = blastparser.getProcessedLine() + list.append( (seqid, names[seqid]) ) + + seqid +=1 + currSize += 1 + if currSize > size: + list.sort(key=lambda tup: tup[1], reverse=False) + fieldmapHeaderLine = blastparser.getHeaderLine() + writeParsedLines(fieldmapHeaderLine, parsedLines, list, names, sorted_parse_file + "." + str(batch)) + filenames.append(sorted_parse_file + "." 
+ str(batch)) + batch += 1 + currSize = 0 + sequences={} + list = [] + names = {} + seqid =0 + + if currSize > 0: + list.sort(key=lambda tup: tup[1], reverse=False) + fieldmapHeaderLine = blastparser.getHeaderLine() + writeParsedLines(fieldmapHeaderLine, parsedLines, list, names, sorted_parse_file + "." + str(batch)) + filenames.append(sorted_parse_file + "." + str(batch)) + + inputfilehandle.close() + merge_sorted_parsed_files(dbname, filenames, sorted_parse_file, orfRanks) + + for file in filenames: + remove(file) + + +import gc +import resource + # the main function def main(argv): global parser @@ -736,35 +820,129 @@ def main(argv): results_dictionary={} dbname_weight={} - #import traceback - for dbname, blastoutput in zip( opts.database_name, opts.input_blastout): - print "Processing database : " + dbname - try: - results_dictionary[dbname]={} - process_parsed_blastoutput(dbname, blastoutput, opts, results_dictionary[dbname]) - except: - import traceback - traceback.print_exc() - print "Error: " + dbname - pass - create_annotation(results_dictionary, opts.input_annotated_gff, opts.output_dir,\ - opts.ncbi_taxonomy_map, opts.lca_min_score, opts.lca_top_percent, opts.lca_min_support) +# print "memory used = %s" %(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss /1000000) + listOfOrfs = get_list_of_queries(opts.input_annotated_gff) + listOfOrfs.sort(key=lambda tup: tup, reverse=False) - # print results_dictionary['cog'] - for dbname in results_dictionary: - if dbname=='cog': - create_table(results_dictionary[dbname], opts.input_cog_maps, 'cog', opts.output_dir) +##### uncomment the following lines + for dbname, blastoutput in zip(opts.database_name, opts.input_blastout): + create_sorted_parse_blast_files(dbname, blastoutput, listOfOrfs) +##### - if dbname=='kegg': - create_table(results_dictionary[dbname], opts.input_kegg_maps, 'kegg', opts.output_dir) + # process in blocks of size _stride + output_table_file = open(opts.output_dir + '/functional_and_taxonomic_table.txt', 'w') + fprintf(output_table_file, "ORF_ID\tORF_length\tstart\tend\tContig_Name\tContig_length\tstrand\tec\ttaxonomy\tproduct\n") + output_table_file.close() - peg2subsystem = {} + lca = LCAComputation(opts.ncbi_taxonomy_map) + lca.setParameters(opts.lca_min_score, opts.lca_top_percent, opts.lca_min_support) + + blastParsers={} + for dbname, blastoutput in zip( opts.database_name, opts.input_blastout): + blastParsers[dbname] = BlastOutputTsvParser(dbname, blastoutput + '.tmp') + + #this part of the code computes the occurence of each of the taxons + # which is use in the later stage is used to evaluate the min support + # as used in the MEGAN software + + start = 0 + Length = len(listOfOrfs) + _stride = 100000 + Taxons = {} + while start < Length: + pickorfs= {} + last = min(Length, start + _stride) + for i in range(start, last): + pickorfs[listOfOrfs[i]]= 'all' + start = last + print 'min support ' + str(start) + + results_dictionary={} + for dbname, blastoutput in zip( opts.database_name, opts.input_blastout): + if dbname=='refseq': + try: + results_dictionary[dbname]={} + process_parsed_blastoutput(dbname, blastParsers[dbname], opts, results_dictionary[dbname], pickorfs) + lca.set_results_dictionary(results_dictionary) + lca.compute_min_support_tree(opts.input_annotated_gff, pickorfs ) + for key, taxon in pickorfs.iteritems(): + Taxons[key] = taxon + except: + print "Error: while training for min support tree " + dbname + import traceback + traceback.print_exc() + + blastParsers={} + for dbname, blastoutput in zip( 
opts.database_name, opts.input_blastout): + blastParsers[dbname] = BlastOutputTsvParser(dbname, blastoutput + '.tmp') + + # this loop determines the actual/final taxonomy of each of the ORFs + # taking into consideration the min support + filePermTypes= {} + orfTablefilePermType = False + start = 0 + while start < Length: + pickorfs= {} + last = min(Length, start + _stride) + for i in range(start, last): + pickorfs[listOfOrfs[i]]= True + start = last + + gc.collect() + printf("memory used = %s MB", str(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1000000)) + results_dictionary={} + for dbname, blastoutput in zip( opts.database_name, opts.input_blastout): + try: + results_dictionary[dbname]={} + printf("Processing database %s ", dbname) + process_parsed_blastoutput(dbname, blastParsers[dbname], opts, results_dictionary[dbname], pickorfs) + printf("done\n") + except: + import traceback + traceback.print_exc() + print "Error: " + dbname + pass + print dbname + ' ' + str(len(results_dictionary[dbname])) + + print 'done = ' + str(start) + + # create the annotations now + orfToContig = {} + create_annotation(results_dictionary, opts.database_name, opts.input_annotated_gff, opts.output_dir, Taxons, pickorfs, orfToContig) + + for dbname in results_dictionary: + if dbname in filePermTypes: + filePermType = 'a' + else: + filePermType = 'w' + filePermTypes[dbname] = True + + if dbname=='cog': + create_table(results_dictionary[dbname], opts.input_cog_maps, 'cog', opts.output_dir, filePermType) + if dbname=='kegg': + create_table(results_dictionary[dbname], opts.input_kegg_maps, 'kegg', opts.output_dir, filePermType) + + #peg2subsystem = {} # this feature is useless with the new refseq - if opts.subsystems2peg_file: - process_subsys2peg_file(peg2subsystem, opts.subsystems2peg_file) + #if opts.subsystems2peg_file: + # process_subsys2peg_file(peg2subsystem, opts.subsystems2peg_file) - print_orf_table(results_dictionary, opts.output_dir) + orfTableFilePermType = False + if orfTableFilePermType: + filePermType = 'a' + orfTablefilePermType = True + else: + filePermType = 'w' + + print_orf_table(results_dictionary, orfToContig, opts.output_dir, filePermType) + + # now remove the temporary files + for dbname, blastoutput in zip( opts.database_name, opts.input_blastout): + try: + remove( blastoutput + '.tmp') + except: + pass def refseq_id(product): results = re.search(r'gi\|[0-9.]*', product) @@ -773,13 +951,11 @@ def refseq_id(product): refseq_id=results.group(0) return refseq_id - def process_subsys2peg_file(subsystems2peg, subsystems2peg_file): try: orgfile = open(subsystems2peg_file,'r') except IOError: print "Cannot open " + str(org_file) - lines = orgfile.readlines() orgfile.close() for line in lines: @@ -791,16 +967,16 @@ def process_subsys2peg_file(subsystems2peg, subsystems2peg_file): except: print "Cannot close " + str(org_file) -def print_orf_table(results, output_dir): +def print_orf_table(results, orfToContig, output_dir, filePermType): if not path.exists(output_dir): makedirs(output_dir) - outputfile = open( output_dir +'/ORF_annotation_table.txt', 'w') + outputfile = open( output_dir +'/ORF_annotation_table.txt', filePermType) orf_dict = {} for dbname in results.iterkeys(): - for seqname in results[dbname]: - for orf in results[dbname][seqname]: + for orfname in results[dbname]: + for orf in results[dbname][orfname]: if not orf['query'] in orf_dict: orf_dict[orf['query']] = {} @@ -817,7 +993,7 @@ def print_orf_table(results, output_dir): orf_dict[orf['query']][dbname] = 
re.sub(r'\[.*\]','', seed).strip() - orf_dict[orf['query']]['contig'] = seqname + orf_dict[orf['query']]['contig'] = orfToContig[orfname] for orfn in orf_dict: if 'cog' in orf_dict[orfn]: @@ -853,4 +1029,3 @@ def MetaPathways_create_reports_fast(argv): if __name__ == "__main__": createParser() main(sys.argv[1:]) - diff --git a/libs/python_scripts/MetaPathways_parse_blast.py b/libs/python_scripts/MetaPathways_parse_blast.py index b20d91e..bff01b7 100755 --- a/libs/python_scripts/MetaPathways_parse_blast.py +++ b/libs/python_scripts/MetaPathways_parse_blast.py @@ -115,22 +115,30 @@ def check_arguments(opts, args): def create_query_dictionary(blastoutputfile, query_dictionary, algorithm ): seq_beg_pattern = re.compile("^#") - blastoutfh = open( blastoutputfile,'r') - - for line in blastoutfh: - if not seq_beg_pattern.search(line): - words = line.rstrip().split('\t') - if len(words) != 12: - continue - - if algorithm =='BLAST': - query_dictionary[words[0]] = 1 - - if algorithm =='LAST': - query_dictionary[words[1]]= 1 - blastoutfh.close() - - + try: + blastoutfh = open( blastoutputfile,'r') + except: + print "" + print "ERROR : cannot open B/LAST output file " + blastoutputfile + " to parse " + pass + + try: + for line in blastoutfh: + if not seq_beg_pattern.search(line): + words = line.rstrip().split('\t') + if len(words) != 12: + continue + + if algorithm =='BLAST': + query_dictionary[words[0]] = 1 + + if algorithm =='LAST': + query_dictionary[words[1]]= 1 + blastoutfh.close() + except: + print "ERROR : while reading B/LAST output file " + blastoutputfile + " to partse " + print " : make sure B/LAST ing was done for the particular database" + pass def create_dictionary(databasemapfile, annot_map, query_dictionary): # print "query size " + str(len(query_dictionary)) @@ -184,8 +192,14 @@ def __init__(self, dbname, blastoutput, database_mapfile, refscore_file, opts): try: query_dictionary = {} create_query_dictionary(self.blastoutput, query_dictionary, self.opts.algorithm) + try: + self.blastoutputfile = open(self.blastoutput,'r') + except: + print "" + print "ERROR : cannot open B/LAST output file " + blastoutput + " to parse " + print " : make sure \"B/LAST\"ing was done for the particular database" + sys.exit(0) - self.blastoutputfile = open(self.blastoutput,'r') create_refscores(refscore_file, self.refscores) # print "Going to read the dictionary\n" diff --git a/libs/python_scripts/MetaPathways_rRNA_stats_calculator.py b/libs/python_scripts/MetaPathways_rRNA_stats_calculator.py index 7809526..067caff 100755 --- a/libs/python_scripts/MetaPathways_rRNA_stats_calculator.py +++ b/libs/python_scripts/MetaPathways_rRNA_stats_calculator.py @@ -53,6 +53,16 @@ def getFormat(dbstring): format = 2 return format +#sequences with no seperate taxonomic name gets the sequences name +def parse_Format_0(line): + fields = re.split(' ', line) + if len( fields ) ==1: + name = fields[0].replace('>','') + taxonomy = name + else: + return( None, None ) + return( name.strip(), taxonomy.strip() ) + #silva def parse_Format_1(line): fields = re.split(' ', line) @@ -84,12 +94,10 @@ def parse_Format_2(line): def getName_and_Taxonomy(line, format=0): - - if format==0: - print "ERROR: Unknown format for the rRNA database files!" 
- sys.exit(3) - if format==1: + if format==0: + name, taxonomy = parse_Format_0(line) + elif format==1: name, taxonomy = parse_Format_1(line) # print name + " " + taxonomy elif format==2: @@ -244,7 +252,6 @@ def main(argv): options, args = parser.parse_args(argv) - print options if not len(options.blast_files): parser.error('At least one taxonomic BLAST output is required') diff --git a/resources/Dsignal b/resources/Dsignal new file mode 100755 index 0000000..c7ea146 --- /dev/null +++ b/resources/Dsignal @@ -0,0 +1,8 @@ +0.0000 0.0000 0.0000 1.0000 +0.5100 0.0400 0.4200 0.0200 +0.0000 0.0000 1.0000 0.0000 +0.0400 0.6400 0.0500 0.2800 +0.0500 0.2500 0.3000 0.4000 +0.0900 0.4900 0.1300 0.2900 +1.0000 0.0000 0.0000 0.0000 +0.1800 0.0100 0.7500 0.0600 diff --git a/resources/TPCsignal b/resources/TPCsignal new file mode 100755 index 0000000..4a28d41 --- /dev/null +++ b/resources/TPCsignal @@ -0,0 +1,15 @@ +0.0124 0.8100 0.0083 0.1694 +0.1618 0.3320 0.4772 0.0290 +0.0992 0.4008 0.2934 0.2066 +0.1818 0.1818 0.5083 0.1281 +0.1612 0.0289 0.7768 0.0331 +0.0000 0.0000 1.0000 0.0000 +0.0581 0.0000 0.0000 0.9419 +0.0000 0.0000 0.0000 1.0000 +0.0000 1.0000 0.0000 0.0000 +0.1818 0.0000 0.8182 0.0000 +0.9876 0.0000 0.0083 0.0041 +0.4959 0.0620 0.1694 0.2727 +0.0620 0.1859 0.0083 0.7438 +0.0000 1.0000 0.0000 0.0000 +0.0289 0.7686 0.0330 0.1694 diff --git a/template_config.txt b/template_config.txt index d5d3397..187ae31 100644 --- a/template_config.txt +++ b/template_config.txt @@ -18,6 +18,7 @@ FORMATDB_EXECUTABLE 'executables/mac/bit64/makeblastdb' BLASTP_EXECUTABLE 'executables/mac/bit64/blastp' BLASTN_EXECUTABLE 'executables/mac/bit64/blastn' EXECUTABLES_DIR 'executables/mac/bit64/' +RESOURCES_DIR 'resources/' LASTDB_EXECUTABLE 'executables/mac/bit64/lastdb' LAST_EXECUTABLE 'executables/mac/bit64/lastal' ORF_PREDICTION 'executables/mac/bit64/prodigal' diff --git a/template_param.txt b/template_param.txt index df6aac8..db06313 100644 --- a/template_param.txt +++ b/template_param.txt @@ -17,16 +17,17 @@ orf_prediction:min_length 60 # ORF annotation parameters annotation:algorithm last # e.g. blast or last +#annotation:dbs metacyc-v4-2011-07-03,COG-2013-02-05,refseq-nr-2014-01-18,kegg-pep-2011-06-18,seed-2014-01-30 annotation:dbs metacyc-v4-2011-07-03,COG-2013-02-05,refseq-nr-2014-01-18,kegg-pep-2011-06-18,seed-2014-01-30 # e.g. annotation:dbs cog,kegg,refseq,metacyc annotation:min_bsr 0 annotation:max_evalue 0.000001 -annotation:min_score 47 +annotation:min_score 50 annotation:min_length 60 annotation:max_hits 5 # rRNA annotation parameters -rRNA:refdbs SSURef_111_NR_tax_silva-2012-11-06 +rRNA:refdbs # e.g. rRNA:refdbs GREENGENES_gg16S,SSURef_111_NR_tax_silva,LSURef_111_tax_silva rRNA:max_evalue 0.000001 rRNA:min_identity 20 @@ -55,7 +56,7 @@ metapaths_steps:PARSE_BLAST skip metapaths_steps:SCAN_rRNA skip metapaths_steps:STATS_rRNA skip metapaths_steps:SCAN_tRNA skip -metapaths_steps:ANNOTATE redo +metapaths_steps:ANNOTATE skip metapaths_steps:PATHOLOGIC_INPUT skip metapaths_steps:GENBANK_FILE skip metapaths_steps:CREATE_SEQUIN_FILE skip
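
A few notes on the techniques this patch relies on, with minimal sketches for orientation. These are illustrative re-statements under stated assumptions, not code lifted from the repository.

The LCAComputation.py changes implement a MEGAN-style taxonomy assignment: refseq hits whose bit score clears `lca_min_score` and falls within `lca_top_percent` of the best hit contribute candidate taxa, the lowest common ancestor of those taxa is found by walking each one up the NCBI id-to-parent map, and the reported taxon is then pushed toward the root until it has at least `lca_min_support` supporting reads (the counts accumulated by compute_min_support_tree). The sketch below restates that logic; `parent` and `support` are stand-ins for the first and third slots of the patch's `taxid_to_ptaxid` 3-tuples, and the function names are mine, not the module's.

```python
def lowest_common_ancestor(taxids, parent):
    """LCA over an id -> parent-id map rooted at '1' (NCBI style).

    Each taxon is walked toward the root while a per-node counter is
    incremented; the first node whose counter reaches len(taxids) is the
    lowest node shared by every input taxon, mirroring get_lca()."""
    counts = {}
    limit = len(taxids)
    for tid in taxids:
        node = tid
        while node in parent and node != '1':
            counts[node] = counts.get(node, 0) + 1
            if counts[node] == limit:
                return node
            node = parent[node]
    return '1'

def supported_taxon(taxid, parent, support, min_support=5):
    """Climb from the LCA toward the root until a node has at least
    min_support reads, as get_supported_taxon() does once
    compute_min_support_tree() has filled in the per-node read counts."""
    node = taxid
    while node in parent and node != '1':
        if support.get(node, 0) >= min_support:
            return node
        node = parent[node]
    return node

# toy tree: 1 <- 2 <- {4, 5}, 1 <- 3
parent  = {'2': '1', '3': '1', '4': '2', '5': '2'}
support = {'4': 1, '5': 1, '2': 6, '3': 2}
lca = lowest_common_ancestor(['4', '5'], parent)   # '2'
print supported_taxon(lca, parent, support)        # '2' (support 6 >= 5)
```

Returning "root" instead of an empty string when no common ancestor is found (the get_lca change) presumably keeps downstream table writers from emitting blank taxonomy fields.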
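
metapaths_utils.py gains a GffFileParser that the annotation and report steps now share. Instead of reading the whole annotated GFF into one dictionary, it refills a buffer of up to 10,000 records at a time and yields contig names, so memory stays flat regardless of sample size. Below is a compressed sketch of that buffered-iterator pattern; attribute handling is reduced to a plain column split, whereas the real class also unpacks the key=value attribute string in column nine, and the class name here is illustrative only.

```python
import re

class BufferedGffReader(object):
    """Iterate over contig names while holding at most `chunk` GFF records
    in memory; orf_dictionary maps contig -> parsed records for the current
    buffer only, as in the patch's GffFileParser."""

    def __init__(self, gff_filename, chunk=10000):
        self.fh = open(gff_filename, 'r')
        self.chunk = chunk
        self.comment = re.compile(r'^#')
        self.orf_dictionary = {}
        self.contigs = []
        self.i = 0

    def __iter__(self):
        return self

    def _refill(self):
        self.orf_dictionary = {}
        count = 0
        while count < self.chunk:
            line = self.fh.readline()
            if not line:                      # end of file
                break
            if self.comment.search(line):     # skip comment/header lines
                continue
            fields = [f.strip() for f in line.split('\t')]
            if len(fields) != 9:              # not a valid GFF record
                continue
            self.orf_dictionary.setdefault(fields[0], []).append(fields)
            count += 1
        self.contigs = self.orf_dictionary.keys()
        self.i = 0

    def next(self):
        if self.i == len(self.contigs):
            self._refill()
        if not self.contigs:
            self.fh.close()
            raise StopIteration()
        self.i += 1
        return self.contigs[self.i - 1]
```

The calling code keeps a reference to the reader so it can look up `orf_dictionary[contig]` for the contig just yielded, which is how compute_min_support_tree() and create_annotation() now walk the annotated GFF.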
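
The report generator no longer loads an entire parsed B/LAST table per database into memory. create_sorted_parse_blast_files() writes the parsed hits out in ORF-sorted batches of 100,000 lines, and merge_sorted_parsed_files() merges those batch files back into one `<blastoutput>.tmp` ordered by each ORF's rank, using a hand-rolled binary heap (BuildHeap/Heapify). The sketch below shows the same external-sort idea; for brevity it leans on Python's heapq.merge rather than the patch's explicit heap, and it assumes, as merge_sorted_parsed_files() does via fields[0], that the ORF id is the first column of the parsed TSV and that every query appears in `orf_rank`.

```python
import heapq, os

def _flush_batch(batch, base, n):
    # batch holds (rank, line) pairs; write one ORF-sorted chunk to disk
    batch.sort()
    path = '%s.tmp.%d' % (base, n)
    out = open(path, 'w')
    for rank, line in batch:
        out.write(line + '\n')
    out.close()
    return path

def _ranked_lines(path, orf_rank):
    # yield (rank, line) so heapq.merge interleaves chunks in ORF order
    for line in open(path):
        line = line.rstrip('\n')
        yield (orf_rank[line.split('\t', 1)[0]], line)

def sort_parsed_blast_by_orf(parsed_tsv, orf_rank, batch_size=100000):
    """Rewrite a parsed B/LAST TSV as parsed_tsv + '.tmp', ordered by the
    rank of each ORF (orf_rank: ORF id -> position in the sorted ORF list)."""
    batch, chunks = [], []
    fh = open(parsed_tsv, 'r')
    header = fh.readline().rstrip('\n')        # '#'-prefixed field-name line
    for line in fh:
        line = line.rstrip('\n')
        if not line or line.startswith('#'):
            continue
        batch.append((orf_rank[line.split('\t', 1)[0]], line))
        if len(batch) >= batch_size:
            chunks.append(_flush_batch(batch, parsed_tsv, len(chunks)))
            batch = []
    fh.close()
    if batch:
        chunks.append(_flush_batch(batch, parsed_tsv, len(chunks)))

    out = open(parsed_tsv + '.tmp', 'w')
    out.write(header + '\n')
    for rank, line in heapq.merge(*[_ranked_lines(c, orf_rank) for c in chunks]):
        out.write(line + '\n')
    out.close()
    for c in chunks:
        os.remove(c)
    return parsed_tsv + '.tmp'
```

Because every per-database .tmp file ends up in the same ORF order as the sorted ORF list, the report step can stream all of them one window of ORFs at a time instead of holding every hit in memory, which is the bottleneck this commit addresses.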
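
Finally, the rewritten main() in MetaPathways_create_reports_fast.py processes the ORF list in windows of 100,000 (`_stride`), making one pass to accumulate taxon support counts and a second to write the functional/taxonomic, COG, KEGG and ORF tables, switching to append mode after the first window. Because the .tmp files share the ORF order, process_parsed_blastoutput() can stop at the first hit outside the current window, and BlastOutputTsvParser.rewind() pushes that hit back for the next window. The helper names below (`orf_windows`, `ResumableHits`) are hypothetical and only illustrate the windowing and push-back pattern.

```python
def orf_windows(orf_list, stride=100000):
    # yield successive sets of ORF ids, as main() builds pickorfs per window
    start = 0
    while start < len(orf_list):
        last = min(len(orf_list), start + stride)
        yield set(orf_list[start:last])
        start = last

class ResumableHits(object):
    """Stream hit lines from an ORF-sorted parsed table, stopping at the first
    query outside the current window and pushing it back for the next call --
    the same push-back idea as BlastOutputTsvParser.rewind()."""

    def __init__(self, sorted_tsv):
        self.fh = open(sorted_tsv, 'r')
        self.fh.readline()                    # skip the '#'-prefixed header
        self.pushed = None

    def take_window(self, window):
        while True:
            if self.pushed is not None:
                line, self.pushed = self.pushed, None
            else:
                line = self.fh.readline()
                if not line:
                    return                    # end of file
                line = line.rstrip('\n')
                if not line:
                    continue
            if line.split('\t', 1)[0] not in window:
                self.pushed = line            # belongs to a later window
                return
            yield line

# usage: one ResumableHits per database, driven window by window
# hits = ResumableHits('refseq.blastout.parsed.txt.tmp')   # hypothetical path
# for window in orf_windows(sorted(all_orf_ids)):
#     for line in hits.take_window(window):
#         pass   # accumulate annotations for this window only
```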