In [1]:
from Bio.SeqRecord import SeqRecord
# Print the class's built-in documentation (constructor signature and attribute
# descriptions) directly below this cell.
help(SeqRecord)

Help on class SeqRecord in module Bio.SeqRecord:

class SeqRecord(builtins.object)
 |  SeqRecord(seq: Union[ForwardRef('Seq'), ForwardRef('MutableSeq'), NoneType], id: Optional[str] = '<unknown id>', name: str = '<unknown name>', description: str = '<unknown description>', dbxrefs: Optional[list[str]] = None, features: Optional[list['SeqFeature']] = None, annotations: Optional[dict[str, Union[str, int]]] = None, letter_annotations: Optional[dict[str, collections.abc.Sequence[Any]]] = None) -> None
 |  
 |  A SeqRecord object holds a sequence and information about it.
 |  
 |  Main attributes:
 |   - id          - Identifier such as a locus tag (string)
 |   - seq         - The sequence itself (Seq object or similar)
 |  
 |  Additional attributes:
 |   - name        - Sequence name, e.g. gene name (string)
 |   - description - Additional text (string)
 |   - dbxrefs     - List of database cross references (list of strings)
 |   - features    - Any (sub)features defined (list of SeqFeat

In [2]:
# Smallest possible record: a four-letter Seq wrapped in a SeqRecord, with the
# id, name and description left at their placeholder defaults.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

simple_seq = Seq("GATC")
simple_seq_r = SeqRecord(simple_seq)

In [4]:
simple_seq_r.id

'<unknown id>'

In [5]:
simple_seq_r.id = "AC12345"

In [6]:
simple_seq_r.description = "Made up sequence I wish I could write a paper about"


In [7]:
print(simple_seq_r.description)


Made up sequence I wish I could write a paper about


In [8]:
simple_seq_r.seq


Seq('GATC')

In [9]:
# Start over with a brand-new record, this time passing the identifier to the
# constructor. Because `simple_seq_r` is rebound to a fresh object, anything
# assigned to the previous record (such as its description) is not carried over.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

simple_seq = Seq("GATC")
simple_seq_r = SeqRecord(simple_seq, id="AC12345")


In [11]:
# Record-level annotations are used like a dictionary: store a value under a
# free-form key, then print the whole mapping to confirm it was recorded.
simple_seq_r.annotations["evidence"] = "None. I just made it up."
print(simple_seq_r.annotations)

{'evidence': 'None. I just made it up.'}


In [12]:
print(simple_seq_r.annotations["evidence"])

None. I just made it up.


In [14]:
# Per-letter annotation: the list holds one quality score for each of the four
# letters of the "GATC" sequence.
simple_seq_r.letter_annotations["phred_quality"] = [40, 40, 38, 30]
print(simple_seq_r.letter_annotations)

{'phred_quality': [40, 40, 38, 30]}


In [15]:
print(simple_seq_r.letter_annotations["phred_quality"])

[40, 40, 38, 30]


SeqRecord objects from FASTA files

In [16]:
from Bio import SeqIO
# Load the FASTA file into a SeqRecord. The file name is a relative path, so it
# is resolved against the notebook's working directory.
record = SeqIO.read("NC_005816.fna", "fasta")
# Bare final expression: the notebook echoes the record's repr.
record

SeqRecord(seq=Seq('TGTAACGAACGGTGCAATAGTGATCCACACCCAACGCCTGAAATCAGATCCAGG...CTG'), id='gi|45478711|ref|NC_005816.1|', name='gi|45478711|ref|NC_005816.1|', description='gi|45478711|ref|NC_005816.1| Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence', dbxrefs=[])

In [17]:
record.id

'gi|45478711|ref|NC_005816.1|'

In [19]:
record.seq

Seq('TGTAACGAACGGTGCAATAGTGATCCACACCCAACGCCTGAAATCAGATCCAGG...CTG')

In [20]:
record.name

'gi|45478711|ref|NC_005816.1|'

In [21]:
record.description

'gi|45478711|ref|NC_005816.1| Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence'

In [22]:
# NOTE(review): only the final bare expression of a cell is echoed, so the three
# lookups before `record.features` are evaluated and discarded. The single `[]`
# recorded for this cell is the (empty) feature list alone. Wrap each line in
# print() or display() if all four values are meant to be shown.
record.dbxrefs
record.annotations
record.letter_annotations
record.features

[]

SeqRecord objects from GenBank files

In [23]:
from Bio import SeqIO
# Load the same plasmid from its GenBank file. This rebinds `record`, replacing
# the FASTA-derived record used in the cells above.
record = SeqIO.read("NC_005816.gb","genbank")
# Bare final expression: the notebook echoes the record's repr.
record

SeqRecord(seq=Seq('TGTAACGAACGGTGCAATAGTGATCCACACCCAACGCCTGAAATCAGATCCAGG...CTG'), id='NC_005816.1', name='NC_005816', description='Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence', dbxrefs=['Project:58037'])

In [24]:
record.seq

Seq('TGTAACGAACGGTGCAATAGTGATCCACACCCAACGCCTGAAATCAGATCCAGG...CTG')

In [25]:
record.id

'NC_005816.1'

In [26]:
record.name

'NC_005816'

In [27]:
record.description

'Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence'

In [28]:
len(record.annotations)

13

In [29]:
record.annotations["source"]

'Yersinia pestis biovar Microtus str. 91001'

In [30]:
record.dbxrefs

['Project:58037']

In [31]:
len(record.features)

41

In [35]:
from Bio import SeqFeature

In [36]:
# Import the position/location classes by name rather than going through the
# `SeqFeature` module alias: a later cell rebinds the name `SeqFeature` to the
# class of the same name, after which `SeqFeature.AfterPosition` would raise
# AttributeError if this cell were re-run.
from Bio.SeqFeature import AfterPosition, BetweenPosition, SimpleLocation

# Fuzzy start: somewhere after position 5.
start_pos = AfterPosition(5)
# Fuzzy end: somewhere between positions 8 and 9, with 9 as the default value.
end_pos = BetweenPosition(9, left=8, right=9)
# Location spanning the two fuzzy endpoints.
my_location = SimpleLocation(start_pos, end_pos)

In [38]:
print(my_location)

[>5:(8^9)]


In [39]:
my_location.start

AfterPosition(5)

In [40]:
my_location.end

BetweenPosition(9, left=8, right=9)

In [41]:
int(my_location.start)


5

In [42]:
int(my_location.end)


9

Location testing

In [43]:
from Bio import SeqIO
# Coordinate of the SNP to look up.
my_snp = 4350
record = SeqIO.read("NC_005816.gb", "genbank")
# Walk every annotated feature and report those whose location covers the SNP,
# printing the feature type followed by its db_xref qualifier (None if absent).
for feature in record.features:
    if my_snp not in feature:
        continue
    print(f"{feature.type} {feature.qualifiers.get('db_xref')}")

source ['taxon:229193']
gene ['GeneID:2767712']
CDS ['GI:45478716', 'GeneID:2767712']


Sequence described by a feature or location

In [44]:
from Bio.Seq import Seq
# NOTE(review): this import rebinds the name `SeqFeature` to the class; the
# earlier cell In [35] bound the same name to the Bio.SeqFeature module, so cells
# that use `SeqFeature.<attr>` as a module will not work if re-run after this one.
from Bio.SeqFeature import SeqFeature, SimpleLocation
# Example parent sequence the feature refers to.
seq = Seq("ACCGAGACGGCAAAGGCTAGCATAGGTATGAGACTTCCTTCCTGCCAGTGCTGAGGAACTGGGAGCCTAC")
# A "gene" feature on the reverse strand (strand=-1) covering positions 5 to 18.
feature = SeqFeature(SimpleLocation(5, 18, strand=-1), type="gene")

In [45]:
# Manual extraction: slice the parent sequence at the feature's start/end, then
# reverse-complement the slice because the feature was defined with strand=-1.
feature_seq = seq[feature.location.start : feature.location.end].reverse_complement()
print(feature_seq)

AGCCTTTGCCGTC


In [46]:
# Same result in one call: the printed sequence matches the manual
# slice-and-reverse-complement from the previous cell.
feature_seq = feature.extract(seq)
print(feature_seq)

AGCCTTTGCCGTC


In [47]:
# Lengths of the extracted sequence, the feature, and the feature's location,
# printed one per line so they can be compared at a glance.
print(
    len(feature_seq),
    len(feature),
    len(feature.location),
    sep="\n",
)

13
13
13


Comparison

In [48]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
# Two separately constructed records with the same sequence and the same id,
# used below to show how records can (and cannot) be compared.
record1 = SeqRecord(Seq("ACGT"), id="test")
record2 = SeqRecord(Seq("ACGT"), id="test")

In [49]:
# Comparing two SeqRecord objects with == raises NotImplementedError by design.
# Catch the exception and print it so the message is still demonstrated while
# the notebook keeps running under "Restart Kernel -> Run All".
try:
    record1 == record2
except NotImplementedError as exc:
    print(f"{type(exc).__name__}: {exc}")

NotImplementedError: SeqRecord comparison is deliberately not implemented. Explicitly compare the attributes of interest.

In [50]:
# Compare the attributes of interest explicitly. Both checks are combined into a
# single expression so that the value echoed by the cell reflects the id
# comparison as well as the sequence comparison (a bare expression that is not
# the last line of a cell is evaluated and silently discarded).
record1.id == record2.id and record1.seq == record2.seq

True

The format method

In [51]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
# Build a protein record by hand. The adjacent string literals are concatenated
# by Python into one sequence string before being passed to Seq.
record = SeqRecord(
    Seq(
        "MMYQQGCFAGGTVLRLAKDLAENNRGARVLVVCSEITAVTFRGPSETHLDSMVGQALFGD"
        "GAGAVIVGSDPDLSVERPLYELVWTGATLLPDSEGAIDGHLREVGLTFHLLKDVPGLISK"
        "NIEKSLKEAFTPLGISDWNSTFWIAHPGGPAILDQVEAKLGLKEEKMRATREVLSEYGNM"
        "SSAC"
    ),
    id="gi|14150838|gb|AAK54648.1|AF376133_1",
    description="chalcone synthase [Cucumis sativus]",
)
# Render the record as FASTA text: a ">" header line built from the id and
# description, followed by the sequence wrapped over several lines.
print(record.format("fasta"))

>gi|14150838|gb|AAK54648.1|AF376133_1 chalcone synthase [Cucumis sativus]
MMYQQGCFAGGTVLRLAKDLAENNRGARVLVVCSEITAVTFRGPSETHLDSMVGQALFGD
GAGAVIVGSDPDLSVERPLYELVWTGATLLPDSEGAIDGHLREVGLTFHLLKDVPGLISK
NIEKSLKEAFTPLGISDWNSTFWIAHPGGPAILDQVEAKLGLKEEKMRATREVLSEYGNM
SSAC



Slicing a SeqRecord

In [53]:
from Bio import SeqIO
# Reload the full GenBank record that the slicing examples below start from.
record = SeqIO.read("NC_005816.gb", "genbank")
# Total sequence length of the record; echoed as the cell's output.
len(record)


9609

In [54]:
len(record.features)

41

In [55]:
print(record.features[20])

type: gene
location: [4342:4780](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:2767712']
    Key: gene, Value: ['pim']
    Key: locus_tag, Value: ['YP_pPCP05']



In [59]:
print(record.features[21])

type: CDS
location: [4342:4780](+)
qualifiers:
    Key: codon_start, Value: ['1']
    Key: db_xref, Value: ['GI:45478716', 'GeneID:2767712']
    Key: gene, Value: ['pim']
    Key: locus_tag, Value: ['YP_pPCP05']
    Key: note, Value: ['similar to many previously sequenced pesticin immunity protein entries of Yersinia pestis plasmid pPCP, e.g. gi| 16082683|,ref|NP_395230.1| (NC_003132) , gi|1200166|emb|CAA90861.1| (Z54145 ) , gi|1488655| emb|CAA63439.1| (X92856) , gi|2996219|gb|AAC62543.1| (AF053945) , and gi|5763814|emb|CAB531 67.1| (AL109969)']
    Key: product, Value: ['pesticin immunity protein']
    Key: protein_id, Value: ['NP_995571.1']
    Key: transl_table, Value: ['11']
    Key: translation, Value: ['MGGGMISKLFCLALIFLSSSGLAEKNTYTAKDILQNLELNTFGNSLSHGIYGKQTTFKQTEFTNIKSNTKKHIALINKDNSWMISLKILGIKRDEYTVCFEDFSLIRPPTYVAIHPLLIKKVKSGNFIVVKEIKKSIPGCTVYYH']



In [60]:
# Slice the record to positions 4300-4800, a window around the pim gene/CDS
# located at [4342:4780]. As the repr below shows, the slice keeps the parent's
# id, name and description but comes back with an empty dbxrefs list.
sub_record = record[4300:4800]
sub_record

SeqRecord(seq=Seq('ATAAATAGATTATTCCAAATAATTTATTTATGTAAGAACAGGATGGGAGGGGGA...TTA'), id='NC_005816.1', name='NC_005816', description='Yersinia pestis biovar Microtus str. 91001 plasmid pPCP1, complete sequence', dbxrefs=[])

In [61]:
len(sub_record.features)

2

In [62]:
# Show both features retained by the slice, one after the other. Their
# locations are reported relative to the start of the sub-record.
print(
    sub_record.features[0],
    sub_record.features[1],
    sep="\n",
)

type: gene
location: [42:480](+)
qualifiers:
    Key: db_xref, Value: ['GeneID:2767712']
    Key: gene, Value: ['pim']
    Key: locus_tag, Value: ['YP_pPCP05']

type: CDS
location: [42:480](+)
qualifiers:
    Key: codon_start, Value: ['1']
    Key: db_xref, Value: ['GI:45478716', 'GeneID:2767712']
    Key: gene, Value: ['pim']
    Key: locus_tag, Value: ['YP_pPCP05']
    Key: note, Value: ['similar to many previously sequenced pesticin immunity protein entries of Yersinia pestis plasmid pPCP, e.g. gi| 16082683|,ref|NP_395230.1| (NC_003132) , gi|1200166|emb|CAA90861.1| (Z54145 ) , gi|1488655| emb|CAA63439.1| (X92856) , gi|2996219|gb|AAC62543.1| (AF053945) , and gi|5763814|emb|CAB531 67.1| (AL109969)']
    Key: product, Value: ['pesticin immunity protein']
    Key: protein_id, Value: ['NP_995571.1']
    Key: transl_table, Value: ['11']
    Key: translation, Value: ['MGGGMISKLFCLALIFLSSSGLAEKNTYTAKDILQNLELNTFGNSLSHGIYGKQTTFKQTEFTNIKSNTKKHIALINKDNSWMISLKILGIKRDEYTVCFEDFSLIRPPTYVAIHPLLIKKVK

Reverse-complementing SeqRecord objects

In [64]:
from Bio import SeqIO
# Load the GenBank record under a new name and print a one-line summary: id,
# sequence length, and the number of features, dbxrefs and annotations.
rec = SeqIO.read("NC_005816.gb", "genbank")
print(rec.id, len(rec), len(rec.features), len(rec.dbxrefs), len(rec.annotations))

NC_005816.1 9609 41 1 13


In [65]:
# Reverse-complement the whole record, giving the result a new id. Printing the
# same summary as for `rec` shows that the length and feature count are
# unchanged, while dbxrefs and annotations come back empty.
rc = rec.reverse_complement(id="TESTING")
print(rc.id, len(rc), len(rc.features), len(rc.dbxrefs), len(rc.annotations))

TESTING 9609 41 0 0
