In [None]:
def search_taxa_all_gene_delay(list_of_taxa):
    #takes a list of taxa and will run blast for sequences contained in a file
    #will loop through the list and run blast for each one
    #will save each result to a separate xml file
    from Bio.Blast import NCBIWWW
        #imports the NCBIWWW module from Biopython
    import time
        #gives us the ability to delay our imputs to not spam the NCBI servers and get kicked off
    
    with open("USH_Search_seq.fasta", "r") as fasta_file:
        sequences = fasta_file.read()
        fasta_file.close()
        #reads in the file we will be searching
        
    for i in list_of_taxa:
        result_handle = NCBIWWW.qblast("blastp", 
                                       "refseq_protein", 
                                       sequences, 
                                       alignments=100, 
                                       descriptions=100, 
                                       expect=0.00001, 
                                       entrez_query=str(i))
        file_name=str("USH_Search_"+str(i)+".xml") #this creates a name for the file
        save_file=open(file_name, "w")  #we are opening a file that does not yet exist to write to it
        save_file.write(result_handle.read())  #writing the result of our blast search to local file
        save_file.close() #closing it to allow the file to actually write it
        result_handle.close() #close the results handle
        print("created "+ file_name)
        time.sleep(60)  #this gives 1 minute between writing the output and sending another request to the ncbi server
            #hopefully this will prevent spamming the server 


The output of a typical blast search against the Uniprot/swissprot database looks like this on the blast screen

RecName: Full=Unconventional myosin-VIIa

RecName: Full=Unconventional myosin-VIIb

Ok we can parse this probably.

let's run a simple search to search and see what the xml output will be, as it might include far more of the annotation




Should use for database:
swissprot

should use txid 9606 for humans

sequences should be the file where we have the best hits first search

can limit alignments to only return the top one

should have an evalue cutoff-
what of? 0.00001 again? in most cases it won't matter as the reciprocal hit will be automatically much lower than that

our output file should have 
sequence searched, sequence found, the E value or bit score, the 

In [2]:
def reciprocal_best_hit_search(sequence_list):
    #takes a file and will run blast for sequences contained in a file
    #will find best hit for each
    #will save as a single xml file
    from Bio.Blast import NCBIWWW
        #imports the NCBIWWW module from Biopython
    import time
        #gives us the ability to delay our imputs to not spam the NCBI servers and get kicked off
    
    with open(sequence_list, "r") as fasta_file:
        sequences = fasta_file.read()
        fasta_file.close()
        #reads in the file we will be searching
        

    result_handle = NCBIWWW.qblast("blastp",
                                   "swissprot", 
                                   sequences, 
                                   alignments=1, 
                                   descriptions=1, 
                                   expect=0.00001, 
                                   entrez_query="txid9606[ORGN]")
#        file_name=str("USH_Search_"+str(i)+".xml") #this creates a name for the file
    save_file=open("testing_reciprocal_hits.xml", "w")  #we are opening a file that does not yet exist to write to it
    save_file.write(result_handle.read())  #writing the result of our blast search to local file
    save_file.close() #closing it to allow the file to actually write it
    result_handle.close() #close the results handle
    print("created "+ file_name)
    time.sleep(60)  #this gives 1 minute between writing the output and sending another request to the ncbi server
        #hopefully this will prevent spamming the server 


Here is the file we are going to search against the blast database


"/home/eeb177-student/Desktop/eeb-177/project/sandbox/Best_hits/seqs_WHRN_best_hit_reduced_taxa_gi.txt"



# Testing out the blast function to make it work

In [6]:
from Bio.Blast import NCBIWWW
with open("/home/eeb177-student/Desktop/eeb-177/project/sandbox/Best_hits/seqs_WHRN_best_hit_reduced_taxa_gi.txt", "r") as fasta_file:
    sequences = fasta_file.read()
    fasta_file.close()
        
result_handle = NCBIWWW.qblast("blastp",
                                "swissprot", 
                               sequences, 
                               alignments=1, 
                               descriptions=1, 
                               expect=0.00001, 
                               entrez_query="txid9606[ORGN]")

output_file_name = "testing_reciprocal_hits.xml"
save_file=open(output_file_name, "w")  #we are opening a file that does not yet exist to write to it
save_file.write(result_handle.read())  #writing the result of our blast search to local file
save_file.close() #closing it to allow the file to actually write it
result_handle.close() #close the results handle
print("created "+ output_file_name)


created testing_reciprocal_hits.xml


# Now to parse the output

In [50]:
result_handle = open("testing_reciprocal_hits.xml", "r")

reciprocal_results = open("reciprocal_results.csv", "w")
from Bio.Blast import NCBIXML
blast_records = NCBIXML.parse(result_handle)
E_VALUE_THRESH=0.04
for blast_record in blast_records:
    for alignment in blast_record.alignments[:1]:
        score_counter=[]
        e_val_counter=[]
        for hsp in alignment.hsps:
            score=hsp.score
            score_counter.append(score)
            evalue=hsp.expect
            e_val_counter.append(evalue)
            query = hsp.query

        seq_designation = alignment.title.split("|")
        gi_number = seq_designation[1]
        ref_number = seq_designation[3]
        annotation = seq_designation[4]
        #breaking apart the annoation of the sequence name
        #otherwise we'd have way too many delimiters within delimiters
        output_line = str(ref_number + "," +
                          annotation+ "," +
                          str(min(e_val_counter))+ "\n")
        #this sets up the output for each alignment
        reciprocal_results.write(output_line)

reciprocal_results.close()       

Ok so what we want to do is take the top result of this search
add the annotation at the end to a file

Merge this file with info from the original blast summary search

namely:
gene ID of search from human sequence,
species it was found in
Annotation of it
then gene ID of top reciprocal hit
annotation of riciprocal hit
if this annotation matches what was expected (aka 1 or 0, or Y and N or TRUE and FALSE)

Ok we took the top result for each thingy



Let's walk through the first file we used to search.



In [51]:
with open("/home/eeb177-student/Desktop/eeb-177/project/sandbox/Best_hits/gene_WHRN_best_hit_reduced_taxa_summary.csv", "r") as initial_search:
    possible_homologs_sequences = initial_search.readlines()

with open("reciprocal_results.csv", "r") as reciprocal_summary:
    recip_results = reciprocal_summary.readlines()
    
for i in range(0, len(possible_homologs_sequences)):
    print(possible_homologs_sequences[i])
    print("best reciprocal hit was:")
    print(recip_results[i])

514683987,XP_004989574.1, hypothetical protein PTSG_09316 [Salpingoeca rosetta],1.00915e-47

best reciprocal hit was:
Q9P202.3,WHRN_HUMAN RecName: Full=Whirlin; AltName: Full=Autosomal recessive deafness type 31 protein,3.13045e-47

470296585,XP_004345405.1, hypothetical protein CAOG_05815 [Capsaspora owczarzaki ATCC 30864],2.61943e-09

best reciprocal hit was:
Q9NSN8.1,SNTG1_HUMAN RecName: Full=Gamma-1-syntrophin; Short=G1SYN; AltName: Full=Syntrophin-4; Short=SYN4,3.6078e-31

196008055,XP_002113893.1, hypothetical protein TRIADDRAFT_27973 [Trichoplax adhaerens],2.6604e-15

best reciprocal hit was:
O75970.2,MPDZ_HUMAN RecName: Full=Multiple PDZ domain protein; AltName: Full=Multi-PDZ domain protein 1,9.71117e-169

1133456164,XP_003385034.3, PREDICTED: titin homolog isoform X1 [Amphimedon queenslandica],4.45008e-74

best reciprocal hit was:
Q9P202.3,WHRN_HUMAN RecName: Full=Whirlin; AltName: Full=Autosomal recessive deafness type 31 protein,3.52564e-73

290746376,NP_056219.3, whirlin i

That gives us a way to interperpolate the results, let's try to format them into a single line

In [62]:
with open("/home/eeb177-student/Desktop/eeb-177/project/sandbox/Best_hits/gene_WHRN_best_hit_reduced_taxa_summary.csv", "r") as initial_search:
    possible_homologs_sequences = initial_search.readlines()

with open("reciprocal_results.csv", "r") as reciprocal_summary:
    recip_results = reciprocal_summary.readlines()
    
for i in range(0, len(possible_homologs_sequences)):
    homolog_info = possible_homologs_sequences[i].split(",")
    homolog_id = homolog_info[1]
    homolog_annotation = homolog_info[2]
    organism = homolog_annotation.split("[")[1].replace("]", "")
    predicted_gene = homolog_annotation.split("[")[0]
    
    recip_hit = recip_results[i].split(",")
    recip_id = recip_hit[0]
    recip_annotation = recip_hit[1].split(":")[1].split(";")[0].replace("Full=", "")
    
    print("for the gene:")
    print(organism + "," + predicted_gene)
    print("best reciprocal hit was:")
    print(recip_id + "," + recip_annotation )

for the gene:
Salpingoeca rosetta, hypothetical protein PTSG_09316 
best reciprocal hit was:
Q9P202.3, Whirlin
for the gene:
Capsaspora owczarzaki ATCC 30864, hypothetical protein CAOG_05815 
best reciprocal hit was:
Q9NSN8.1, Gamma-1-syntrophin
for the gene:
Trichoplax adhaerens, hypothetical protein TRIADDRAFT_27973 
best reciprocal hit was:
O75970.2, Multiple PDZ domain protein
for the gene:
Amphimedon queenslandica, PREDICTED: titin homolog isoform X1 
best reciprocal hit was:
Q9P202.3, Whirlin
for the gene:
Homo sapiens, whirlin isoform 1 
best reciprocal hit was:
Q9P202.3, Whirlin
for the gene:
Nematostella vectensis, predicted protein 
best reciprocal hit was:
Q9P202.3, Whirlin


IndexError: list index out of range