In [1]:
# The Biopython Tutorial and Cookbook is full of simple examples of working with Seq and SeqRecord objects:
# http://biopython.org/DIST/docs/tutorial/Tutorial.html

from urllib.request import urlretrieve
from Bio import SeqIO as seqio

urlretrieve("https://www.uniprot.org/uniprot/P04525.fasta", filename = "gp45.fasta")

gp45 = seqio.read("gp45.fasta", "fasta") # parse() for files with more than one sequence

print(gp45)
print(len(gp45))
print("\n\n")

print(gp45.seq)
print(len(gp45.seq))
print("\n\n")

print(gp45.format("fasta"))

ID: sp|P04525|DPA5_BPT4
Name: sp|P04525|DPA5_BPT4
Description: sp|P04525|DPA5_BPT4 DNA polymerase clamp OS=Enterobacteria phage T4 OX=10665 GN=45 PE=1 SV=3
Number of features: 0
Seq('MKLSKDTTALLKNFATINSGIMLKSGQFIMTRAVNGTTYAEANISDVIDFDVAI...HDF', SingleLetterAlphabet())
228



MKLSKDTTALLKNFATINSGIMLKSGQFIMTRAVNGTTYAEANISDVIDFDVAIYDLNGFLGILSLVNDDAEISQSEDGNIKIADARSTIFWPAADPSTVVAPNKPIPFPVASAVTEIKAEDLQQLLRVSRGLQIDTIAITVKEGKIVINGFNKVEDSALTRVKYSLTLGDYDGENTFNFIINMANMKMQPGNYKLLLWAKGKQGAAKFEGEHANYVVALEADSTHDF
228



>sp|P04525|DPA5_BPT4 DNA polymerase clamp OS=Enterobacteria phage T4 OX=10665 GN=45 PE=1 SV=3
MKLSKDTTALLKNFATINSGIMLKSGQFIMTRAVNGTTYAEANISDVIDFDVAIYDLNGF
LGILSLVNDDAEISQSEDGNIKIADARSTIFWPAADPSTVVAPNKPIPFPVASAVTEIKA
EDLQQLLRVSRGLQIDTIAITVKEGKIVINGFNKVEDSALTRVKYSLTLGDYDGENTFNF
IINMANMKMQPGNYKLLLWAKGKQGAAKFEGEHANYVVALEADSTHDF



In [2]:
from math import sqrt
keep_seqs = []


# Could directly search via NCBI:

# from Bio.Blast.NCBIWWW import qblast
# result = qblast("blastp", "nr", gp45.format("fasta"), expect = 1e-24, hitlist_size = 1000)


# Note: we will use SearchIO, which is intended to replace NCBIXML, and has the advantage of a unified interface,
# but I will leave the NCBIXML implementation here for comparison:

# from Bio.Blast import NCBIXML as xml
# blast_records = xml.parse(result)
# OR
# with open("gp45_blast.xml", 'r') as result:
#     blast_records = xml.parse(result) # Note that we're not keeping all of the records in memory
#     for record in blast_records:
#         for alignment in record.alignments:
#             for hsp in alignment.hsps: # high-scoring pair
#                 if hsp.identities != len(gp45):
#                     if hsp.align_length > sqrt(.5 * 1) * len(gp45) and \
#                     len(hsp.sbjct) < sqrt(1 * 1.5) * len(gp45):
#                         keep_seqs.append(hsp.sbjct)


from Bio import SearchIO as searchio
result = searchio.read("gp45_blast.xml", "blast-xml")
for hit in result:
    for hsp in hit:
        if hsp.ident_num != len(gp45):
            if hsp.hit_span > sqrt(.5 * 1) * len(gp45) and \
            len(hsp.hit_all) < sqrt(1 * 1.5) * len(gp45):
                keep_seqs.append(hsp.hit_all)
    

print(len(keep_seqs))
print(keep_seqs[0])
# sio.write(keep_seqs, "filtered_gp45_hits.fasta", "fasta")

# Parsers also exist for the output of other tools, such as the more sensitive HMM-based search tools from HH-suite
# and HMMER:
# https://biopython.org/DIST/docs/api/Bio.SearchIO.HHsuiteIO-module.html
# https://biopython.org/DIST/docs/api/Bio.SearchIO.HmmerIO-module.html

102
[SeqRecord(seq=Seq('MKLSKDTTALLKNFATINSGIMLKSGQFIMTRAVNGTTYAEANISDVIDFDVAI...HDF', ProteinAlphabet()), id='ref|YP_002854009.1|', name='aligned hit sequence', description='gp45 sliding clamp DNA polymerase [Enterobacteria phage RB51]', dbxrefs=[])]


In [3]:
# QualityIO https://biopython.org/DIST/docs/api/Bio.SeqIO.QualityIO-module.html