# In [1]:  (notebook cell marker; kept as a comment so the file parses as Python)
import os
from Bio import Entrez
Entrez.email = 'kivey@hsph.harvard.edu'


#CREATE FUNCTIONS


#function to remove ' from strings
def removequote(searchvariable):
    """Return *searchvariable* with every single-quote character (') removed."""
    return searchvariable.replace("'", "")

#function to tab-join output fields and remove square brackets ([ and ])
def join_removebracket(outputvariable):
    """Join the fields of *outputvariable* with tabs and strip square brackets.

    Args:
        outputvariable: an iterable of strings (e.g. a tuple of output fields).

    Returns:
        A single tab-separated string with every '[' and ']' removed.
    """
    # Join the argument itself. The previous version joined the module-level
    # name ``result_output`` and silently ignored what the caller passed in.
    joined = '\t'.join(outputvariable)
    return joined.translate({ord(c): None for c in "[]"})


#FILE OPERATIONS


#format noname and unclassified file to write to
# Column headers for the "no name / unclassified" report.
headerscorrect = ["Taxonomic rank from data","Organism name from data"]
stringheaderscorrect = '\t'.join(headerscorrect)
# Open the report exactly once. Mode 'w' already discards any file left over
# from an earlier run, and the handle stays open for the search loop.
# (Writing the header through one handle and then reopening the file with 'w'
# truncates it again, which erased the header line.)
opennonameunclass = open('taxonomy_noname_unclass.txt', 'w')
opennonameunclass.write("%s\n" % stringheaderscorrect)
# Push the header to disk now so it survives an interrupted run.
opennonameunclass.flush()

#format matches file to write to
# Column headers for the report of names that matched an NCBI taxonomy id.
headerscorrect = ["Taxonomic rank from data","Organism name from data","NCBI taxonomy id", "Matched on"]
stringheaderscorrect = '\t'.join(headerscorrect)
# Open the report exactly once. Mode 'w' already discards any file left over
# from an earlier run, and the handle stays open for the search loop.
# (Writing the header through one handle and then reopening the file with 'w'
# truncates it again, which erased the header line.)
opencorrect = open('taxonomy_match.txt', 'w')
opencorrect.write("%s\n" % stringheaderscorrect)
# Push the header to disk now so it survives an interrupted run.
opencorrect.flush()

#format non-match file to write to
# Column headers for the report of names that matched nothing.
headersincorrect = ["Taxonomic rank from data","Organism name from data"]
stringheadersincorrect = '\t'.join(headersincorrect)
# Open the report exactly once. Mode 'w' already discards any file left over
# from an earlier run, and the handle stays open for the search loop.
# (Writing the header through one handle and then reopening the file with 'w'
# truncates it again, which erased the header line.)
openincorrect = open('taxonomy_no_match.txt', 'w')
openincorrect.write("%s\n" % stringheadersincorrect)
# Push the header to disk now so it survives an interrupted run.
openincorrect.flush()

#read in name data
# Build name_dict from the comma-separated input file: on each line the first
# field becomes the dictionary key and the remaining fields become its list of
# values. Only newline characters are stripped from the fields.
name_dict = {}
with open("taxonomy_names_short.txt", "rt") as names_file:
    for raw_line in names_file:
        first_field, *other_fields = [field.strip('\n') for field in raw_line.split(',')]
        name_dict[first_field] = other_fields
    
    
#CREATE PERMANENT VARIABLES


#counting variables
# Rows written to each of the three report files.
add_to_noname_unclass = add_to_match = add_to_no_match = 0

# Tallies kept while searching: skipped names, names with no hit at all, and
# the total number of names seen.
noname = incorrect_matches = total_searched = 0

# One hit counter per search strategy (1 = first strategy tried, 7 = last).
(
    correct_matches_1,
    correct_matches_2,
    correct_matches_3,
    correct_matches_4,
    correct_matches_5,
    correct_matches_6,
    correct_matches_7,
) = (0,) * 7


# Collected id lists, one entry per matched name.
taxidlist = list()



#PERFORM SEARCH


# Search strategies for the NCBI taxonomy database, tried in this order until
# one returns ids. Each entry is (text for the "Matched on" column, query
# template). Templates are %-formatted with {"name": ..., "rank": ...}; the %r
# conversion followed by removequote() keeps the same term text as before.
_SEARCH_STRATEGIES = (
    ("scientific name and rank", "%(name)r[SCIN] AND %(rank)r[Rank] "),
    ("all names and rank", "%(name)r[ALLN] AND %(rank)r[Rank] "),
    ("all names, Viruses division", "%(name)r[ALLN] AND Viruses[TXDV] "),
    ("all names, Phages division", "%(name)r[ALLN] AND Phages[TXDV] "),
    ("all names, Unassigned division", "%(name)r[ALLN] AND Unassigned[TXDV] "),
    ("all names, Environmental samples division", "%(name)r[ALLN] AND Environmental samples[TXDV] "),
    ("all names, no filter", "%(name)r[ALLN]"),
)


def _search_taxonomy(rank, name):
    """Run the search strategies in order and return the first one with hits.

    Args:
        rank: taxonomic rank from the input data (used by the rank-filtered queries).
        name: organism name from the input data.

    Returns:
        (strategy_index, matched_on_label, id_list) for the first strategy
        whose esearch result has a non-empty 'IdList', or None if none match.
    """
    for strategy_index, (matched_on, template) in enumerate(_SEARCH_STRATEGIES):
        term = removequote(template % {"name": name, "rank": rank})
        handle = Entrez.esearch(db="taxonomy", term=term)
        try:
            record = Entrez.read(handle)
        finally:
            # Always release the connection, even if parsing fails.
            handle.close()
        if record['IdList']:
            return strategy_index, matched_on, record['IdList']
    return None


# Hits per strategy for this run; folded into correct_matches_1..7 afterwards.
_strategy_hits = [0] * len(_SEARCH_STRATEGIES)

for key in name_dict:
    for value in name_dict[key]:
        total_searched += 1

        # Names flagged as noname/unclassified are logged and never searched.
        if 'noname' in value or 'unclassified' in value:
            noname += 1
            add_to_noname_unclass += 1
            result_output = key, value
            opennonameunclass.write("%s\n" % join_removebracket(result_output))
            continue

        match = _search_taxonomy(key, value)

        if match is None:
            # No strategy returned an id: record the name as unmatched.
            incorrect_matches += 1
            add_to_no_match += 1
            result_output = key, value
            openincorrect.write("%s\n" % join_removebracket(result_output))
            print(key, value, "unmatched")
            continue

        strategy_index, matched_on, id_list = match
        _strategy_hits[strategy_index] += 1
        add_to_match += 1
        taxidlist.append(id_list)
        # Columns follow the match-file header: rank, name, taxonomy id(s),
        # matched on. The ids are joined explicitly rather than via str(list),
        # whose text depends on how the Entrez list type prints itself.
        result_output = key, value, ", ".join(str(taxon_id) for taxon_id in id_list), matched_on
        opencorrect.write("%s\n" % join_removebracket(result_output))

# Keep the per-strategy counters up to date.
correct_matches_1 += _strategy_hits[0]
correct_matches_2 += _strategy_hits[1]
correct_matches_3 += _strategy_hits[2]
correct_matches_4 += _strategy_hits[3]
correct_matches_5 += _strategy_hits[4]
correct_matches_6 += _strategy_hits[5]
correct_matches_7 += _strategy_hits[6]

    
#PRINT REPORTS

# For every matched id list, fetch the taxonomy records and write one
# comma-separated line per taxon: id, scientific name, rank, lineage, and the
# space-separated ids from the taxon itself back through its lineage.
with open("taxonomy_output.txt", "w") as text_file:
    for taxids in taxidlist:
        # retmode is the EFetch parameter that selects the return format
        # (the previous mode="text" is not an EFetch parameter).
        handles = Entrez.efetch(db="taxonomy", id=taxids, retmode="xml", rettype="xml")
        try:
            records = Entrez.read(handles)
        finally:
            # Release the connection once the response is parsed (or failed).
            handles.close()

        for taxon in records:
            taxid = taxon["TaxId"]
            name = taxon["ScientificName"]
            rank = str(taxon["Rank"])
            lineage = taxon["Lineage"]

            # The taxon's own id first, then its lineage ids in reverse of
            # the order they appear in LineageEx.
            tids = [taxid] + [t["TaxId"] for t in reversed(taxon["LineageEx"])]

            # The final '\n' element is joined with ',' too, so every line
            # ends in ",\n"; kept so the output format is unchanged.
            text_file.write(','.join([taxid, name, rank, str(lineage), " ".join(tids), '\n']))

# text_file is closed by the with-block; close the three report files.
opennonameunclass.close()
opencorrect.close()
openincorrect.close()
  


# Final summary: total names processed, then the number of rows written to
# each of the three report files.
print("JOB COMPLETE. A total of ", total_searched, "names were searched. \n")

_file_details = (
    " \n  \n FILE DETAILS \n Number of entries added to 'taxonomy_noname_unclass.txt: ",
    add_to_noname_unclass,
    "\n Number of entries added to 'taxonomy_match.txt: ",
    add_to_match,
    "\n Number of entries added to 'taxonomy_no_match.txt: ",
    add_to_no_match,
)
print(*_file_details)


# Output recorded from a run of the cell above:
#
# species Acinetobacter pittii calcoaceticus nosocomialis unmatched
# species Coprococcus sp ART55 1 unmatched
# species Lactobacillus casei paracasei unmatched
# species Streptococcus mitis oralis pneumoniae unmatched
# JOB COMPLETE. A total of  468 names were searched. 
#
#
#
#  FILE DETAILS 
#  Number of entries added to 'taxonomy_noname_unclass.txt:  65 
#  Number of entries added to 'taxonomy_match.txt:  399 
#  Number of entries added to 'taxonomy_no_match.txt:  4
