# Sequence Retrieval with Biopython

In [18]:
from Bio import Entrez, SeqIO, Medline

In [19]:
# Contact address used for NCBI Entrez requests; replace the placeholder with your own.
Entrez.email = 'user@gmail.com' # put your email here

In [20]:
# List every database that can be queried through Entrez.
handle0 = Entrez.einfo()  # open a handle on the EInfo reply
rec = Entrez.read(handle0)  # parse the reply into a Python object (dictionary-like)
handle0.close()  # the reply is fully parsed, so release the connection
print(rec)

{'DbList': ['pubmed', 'protein', 'nuccore', 'ipg', 'nucleotide', 'structure', 'genome', 'annotinfo', 'assembly', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'gap', 'gapplus', 'grasp', 'dbvar', 'gene', 'gds', 'geoprofiles', 'homologene', 'medgen', 'mesh', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'popset', 'proteinclusters', 'pcassay', 'protfam', 'pccompound', 'pcsubstance', 'seqannot', 'snp', 'sra', 'taxonomy', 'biocollections', 'gtr']}


In [21]:
# Search the nucleotide database for the ZAT12 gene of Arabidopsis thaliana.
handle = Entrez.esearch(db="nucleotide", term='ZAT12[Gene Name] AND "Arabidopsis thaliana"[Organism]')
rec_list = Entrez.read(handle)
handle.close()  # the search result is parsed; release the connection
rec_list

{'Count': '5', 'RetMax': '5', 'RetStart': '0', 'IdList': ['1063742237', '240256493', '332002898', '1418336', '1418324'], 'TranslationSet': [{'From': '"Arabidopsis thaliana"[Organism]', 'To': '"Arabidopsis thaliana"[Organism]'}], 'TranslationStack': [{'Term': 'ZAT12[Gene Name]', 'Field': 'Gene Name', 'Count': '27', 'Explode': 'N'}, {'Term': '"Arabidopsis thaliana"[Organism]', 'Field': 'Organism', 'Count': '2700832', 'Explode': 'Y'}, 'AND'], 'QueryTranslation': 'ZAT12[Gene Name] AND "Arabidopsis thaliana"[Organism]'}

In [37]:
# 'RetMax' is the number of IDs returned by the search; ESearch returns at most 20 by default.
# 'Count' is the total number of records that match the query.
# Both values are shown as strings in the search result, so convert them to integers
# before comparing (as strings, '5' < '10' would be False).
# If some matches were left out, repeat the SAME query with retmax raised to 'Count'
# so that every matching record ID is retrieved.
if int(rec_list['RetMax']) < int(rec_list['Count']):
    handle = Entrez.esearch(db="nucleotide", term='ZAT12[Gene Name] AND "Arabidopsis thaliana"[Organism]',
                            retmax=rec_list['Count'])
    rec_list = Entrez.read(handle)
    handle.close()  # the search result is parsed; release the connection

In [38]:
# Show the search result again (it is replaced above only when the first search was truncated)
print(rec_list)

{'Count': '5', 'RetMax': '5', 'RetStart': '0', 'IdList': ['1063742237', '240256493', '332002898', '1418336', '1418324'], 'TranslationSet': [{'From': '"Arabidopsis thaliana"[Organism]', 'To': '"Arabidopsis thaliana"[Organism]'}], 'TranslationStack': [{'Term': 'ZAT12[Gene Name]', 'Field': 'Gene Name', 'Count': '27', 'Explode': 'N'}, {'Term': '"Arabidopsis thaliana"[Organism]', 'Field': 'Organism', 'Count': '2700832', 'Explode': 'Y'}, 'AND'], 'QueryTranslation': 'ZAT12[Gene Name] AND "Arabidopsis thaliana"[Organism]'}


In [39]:
# Keep only the list of record identifiers returned by the search
id_list = rec_list['IdList']
id_list

['1063742237', '240256493', '332002898', '1418336', '1418324']

In [40]:
# Retrieve the records for all IDs found by the search.
# For large result sets it is better to download a few records at a time
# (e.g. limit each batch with retmax) and stop once the record of interest is found.
# Here there are only 5 records, so they are all requested in one call.
hdl = Entrez.efetch(db='nucleotide', id=id_list, rettype='gb') # gb: GenBank format

In [41]:
# Parse the obtained GenBank-formatted data into a list of SeqRecord objects
recs = list(SeqIO.parse(hdl, 'gb'))
hdl.close()  # everything has been read into `recs`; release the connection

In [42]:
# Display the parsed SeqRecord objects
recs

[SeqRecord(seq=Seq('TTTACCCCTTATCTTTCCACGTATTTACGTTAATACCTACACTCTTCGCTGCCA...TAC'), id='NM_125374.3', name='NM_125374', description='Arabidopsis thaliana C2H2-type zinc finger family protein (RHL41), mRNA', dbxrefs=['BioProject:PRJNA116', 'BioSample:SAMN03081427']),
 SeqRecord(seq=Seq(None, length=26975502), id='NC_003076.8', name='NC_003076', description='Arabidopsis thaliana chromosome 5, complete sequence', dbxrefs=['BioProject:PRJNA116', 'BioSample:SAMN03081427', 'Assembly:GCF_000001735.4']),
 SeqRecord(seq=Seq('TATACCATGTACCCTCAACCTTAAAACCCTAAAACCTATACTATAAATCTTTAA...ATC'), id='CP002688.1', name='CP002688', description='Arabidopsis thaliana chromosome 5, partial sequence', dbxrefs=['BioProject:PRJNA10719', 'BioSample:SAMN03081427']),
 SeqRecord(seq=Seq('ACCAAACTCAAAAAACACAAACCACAAGAGGATCATTTCATTTTTTATTGTTTC...AAA'), id='X98673.1', name='X98673', description='A.thaliana mRNA for ZAT12 protein', dbxrefs=[]),
 SeqRecord(seq=Seq('AAGCTACGCGGTGTCGCAAATCGTGACCACATAACCCGTTTTTTCCTTCTTTTA.

In [43]:
# Concentrate on a single record: the one whose name is X98674.
# next() returns the first match, or None when nothing matches, so a missing
# record is reported instead of silently continuing with whatever `rec` held before.
target_name = 'X98674'
rec = next((candidate for candidate in recs if candidate.name == target_name), None)
if rec is None:
    raise LookupError('No record named %s among the fetched records' % target_name)
print(rec.name)
print(rec.description)

X98674
A.thaliana zat12 gene


In [30]:
# Dump the type, location and qualifiers of every feature of the selected record,
# one value per line. The loop variable stays named `feature` because the next
# cell inspects it after the loop has finished.
for feature in rec.features:
    print(feature.type, feature.location, feature.qualifiers, sep='\n')

source
[0:1000](+)
{'organism': ['Arabidopsis thaliana'], 'mol_type': ['genomic DNA'], 'db_xref': ['taxon:3702'], 'ecotype': ['Columbia']}
gene
[309:798](+)
{'gene': ['Zat12']}
CDS
[309:798](+)
{'gene': ['Zat12'], 'note': ['putative'], 'codon_start': ['1'], 'product': ['zinc finger protein'], 'protein_id': ['CAA67232.1'], 'db_xref': ['GOA:Q42410', 'InterPro:IPR007087', 'InterPro:IPR015880', 'UniProtKB/TrEMBL:Q42410'], 'translation': ['MVAISEIKSTVDVTAANCLMLLSRVGQENVDGGDQKRVFTCKTCLKQFHSFQALGGHRASHKKPNNDALSSGLMKKVKTSSHPCPICGVEFPMGQALGGHMRRHRNESGAAGGALVTRALLPEPTVTTLKKSSSGKRVACLDLSLGMVDNLNLKLELGRTVY']}


In [31]:
# List the attributes and methods available on a feature object
# (`feature` is the last item left over from the loop in the previous cell)
print(dir(feature))

['__bool__', '__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_flip', '_get_location_operator', '_get_ref', '_get_ref_db', '_get_strand', '_set_location_operator', '_set_ref', '_set_ref_db', '_set_strand', '_shift', 'extract', 'id', 'location', 'location_operator', 'qualifiers', 'ref', 'ref_db', 'strand', 'translate', 'type']


In [32]:
# Report selected details per feature: the gene name for 'gene' features,
# the coordinates and strand for 'exon' features, and a full dump for anything else.
for feature in rec.features:
    if feature.type == 'gene':
        # qualifiers carry additional key/value information about the feature
        print(feature.qualifiers['gene'])
        continue
    if feature.type == 'exon':
        loc = feature.location
        print('Exon', loc.start, loc.end, loc.strand)
        continue
    # '\n' starts a new line and '%s' is the placeholder filled with the feature's text form
    print('not processed: \n%s' % feature)

not processed: 
type: source
location: [0:1000](+)
qualifiers:
    Key: db_xref, Value: ['taxon:3702']
    Key: ecotype, Value: ['Columbia']
    Key: mol_type, Value: ['genomic DNA']
    Key: organism, Value: ['Arabidopsis thaliana']

['Zat12']
not processed: 
type: CDS
location: [309:798](+)
qualifiers:
    Key: codon_start, Value: ['1']
    Key: db_xref, Value: ['GOA:Q42410', 'InterPro:IPR007087', 'InterPro:IPR015880', 'UniProtKB/TrEMBL:Q42410']
    Key: gene, Value: ['Zat12']
    Key: note, Value: ['putative']
    Key: product, Value: ['zinc finger protein']
    Key: protein_id, Value: ['CAA67232.1']
    Key: translation, Value: ['MVAISEIKSTVDVTAANCLMLLSRVGQENVDGGDQKRVFTCKTCLKQFHSFQALGGHRASHKKPNNDALSSGLMKKVKTSSHPCPICGVEFPMGQALGGHMRRHRNESGAAGGALVTRALLPEPTVTTLKKSSSGKRVACLDLSLGMVDNLNLKLELGRTVY']



In [33]:
# Print the record-level metadata, i.e. information not tied to a sequence position.
# rec.annotations is a dictionary; .items() yields its (key, value) pairs, and each
# pair fills the two %s placeholders directly.
for pair in rec.annotations.items():
    print('%s=%s' % pair)

molecule_type=DNA
topology=linear
data_file_division=PLN
date=22-APR-1997
accessions=['X98674']
sequence_version=1
keywords=['zat gene', 'zinc finger protein']
source=Arabidopsis thaliana (thale cress)
organism=Arabidopsis thaliana
taxonomy=['Eukaryota', 'Viridiplantae', 'Streptophyta', 'Embryophyta', 'Tracheophyta', 'Spermatophyta', 'Magnoliopsida', 'eudicotyledons', 'Gunneridae', 'Pentapetalae', 'rosids', 'malvids', 'Brassicales', 'Brassicaceae', 'Camelineae', 'Arabidopsis']
references=[Reference(title='Isolation and characterisation of a diverse family of Arabidopsis two and three-fingered C2H2 zinc finger protein genes and cDNAs', ...), Reference(title='Direct Submission', ...)]


In [34]:
# Show the sequence of the selected record, followed by its length
sequence = rec.seq
print(sequence)
print(len(sequence))

AAGCTACGCGGTGTCGCAAATCGTGACCACATAACCCGTTTTTTCCTTCTTTTACCCCTTATCTTTCCACGTATTTACGTTAATACCTACACTCTTCGCTGCCACTTCCTTACCCACCAAGTCACAAGGGTATATTCGTCACTTTCCACAAGTCTTTGGTCCACACAAACTCGGTATCTATATATAGTTCCCAAACGGACACGAACACATCATCACAACTACTATCACACCAAACTCAAAAAACACAAACCACAAGAGGATCATTTCATTTTTTATTGTTTCGTTTTAATCATCATCATCAGAAGAAAAATGGTTGCGATATCGGAGATCAAGTCGACGGTGGATGTCACGGCGGCGAATTGTTTGATGCTTTTATCTAGAGTTGGACAAGAAAACGTTGACGGTGGCGATCAAAAACGCGTTTTCACATGTAAAACGTGTTTGAAGCAGTTTCATTCGTTCCAAGCCTTAGGAGGTCACCGTGCGAGTCACAAGAAGCCTAACAACGACGCTTTGTCGTCTGGATTGATGAAGAAGGTGAAAACGTCGTCGCATCCTTGTCCCATATGTGGAGTGGAGTTTCCGATGGGACAAGCTTTGGGAGGACACATGAGGAGACACAGGAACGAGAGTGGGGCTGCTGGTGGCGCGTTGGTTACACGCGCTTTGTTGCCGGAGCCCACGGTGACTACGTTGAAGAAATCTAGCAGTGGGAAGAGAGTGGCTTGTTTGGATCTGAGTCTAGGGATGGTGGACAATTTGAATCTCAAGTTGGAGCTTGGAAGAACAGTTTATTGATTTTATTTATTTTCCTTAAATTTTCTGAATATATTTGTTTCTCTCATTCTTTGAATTTTTCTTAATATTCTAGATTATACATACATCCGCAGATTTAGGAAACTTTCATAGAGTGTAATCTTTTCTTTCTGTAAAAATATATTTTACTTGTAGCATTGGAGATTTGTTATGAGATTATCTTACTTAGCATTTAGTGAATAAT

In [35]:
# Pull the literature references out of the record annotations
refs = rec.annotations['references']
print(refs)

[Reference(title='Isolation and characterisation of a diverse family of Arabidopsis two and three-fingered C2H2 zinc finger protein genes and cDNAs', ...), Reference(title='Direct Submission', ...)]


In [36]:
# Look up the PubMed entry behind each literature reference and print every MEDLINE field.
for ref in refs:
    if not ref.pubmed_id:
        # A reference without a PubMed ID cannot be looked up; skip it
        continue
    print(ref.pubmed_id)
    handle = Entrez.efetch(db='pubmed', id=[ref.pubmed_id], rettype='medline', retmode='text')
    # Consume the parsed records completely before closing the handle they are read from
    for med_rec in Medline.parse(handle):
        for k, v in med_rec.items():
            print('%s: %s' % (k, v))
    handle.close()  # release the connection before moving on to the next reference
        

9132053
PMID: 9132053
OWN: NLM
STAT: MEDLINE
DCOM: 19970428
LR: 20190822
IS: 0167-4412 (Print) 0167-4412 (Linking)
VI: 33
IP: 4
DP: 1997 Mar
TI: Isolation and characterisation of a diverse family of Arabidopsis two and three-fingered C2H2 zinc finger protein genes and cDNAs.
PG: 615-24
AB: In animal systems the C2H2 zinc finger protein (ZFP) gene family is the largest group of regulatory proteins and its members have a wide and important role in growth and development. It is likely that this family of ZFP transcription factors will also be important in plants. We have used a PCR approach employing highly degenerate oligonucleotide primers to isolate several Arabidopsis genomic and cDNA clones encoding potential ZFPs. In addition we have used the sequence information from these clones to identify two ESTs as members of this family. Five two-fingered and one three-fingered ZFPs have been identified. Outside of the zinc finger regions, there is considerable sequence diversity, including t