# Setup 

## Import packages 

In [1]:
# For working with sequence objects 
from Bio.Seq import Seq

In [2]:
# For fetching sequences from Entrez 
from Bio import Entrez
from Bio import SeqIO

In [104]:
# For extracting features 
from Bio.SeqFeature import SeqFeature, FeatureLocation

## Misc

In [3]:
Entrez.email = "kehaliwoldemichael@gmail.com"  # Always tell NCBI who you are

# Functions 

## Sequence

In [40]:
def seq_returnEntrez(sequenceID, retType):
    with Entrez.efetch(
        db="nucleotide", rettype=retType, retmode="text", id=sequenceID
    ) as handle:
        seqRecord = SeqIO.read(handle, "gb")  # using "gb" as an alias for "genbank"
        
    handle = Entrez.efetch(db="nucleotide", id=sequenceID, rettype=retType, retmode="text")
    
    return seqRecord, handle 

## Metrics 

In [9]:
# Returns GC content 
def metric_gcContent(sequence):
    return (sequence.count("G") + sequence.count("C"))/(len(sequence))

# Selecting Sequence 

In [58]:
# Defining sequence ID 
sequence_id = "XM_039093242.1" 
# Fetching sequence by ID from GenBank 
seq_record, handle = seq_returnEntrez(sequence_id, "gb")

In [59]:
len(seq_record.seq)

2510

In [60]:
# Output sequence information 
print(handle.read())

LOCUS       XM_039093242            2510 bp    mRNA    linear   ROD 21-JAN-2021
DEFINITION  PREDICTED: Rattus norvegicus Fez family zinc finger 2 (Fezf2),
            transcript variant X1, mRNA.
ACCESSION   XM_039093242
VERSION     XM_039093242.1
DBLINK      BioProject: PRJNA677964
KEYWORDS    RefSeq.
SOURCE      Rattus norvegicus (Norway rat)
  ORGANISM  Rattus norvegicus
            Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
            Mammalia; Eutheria; Euarchontoglires; Glires; Rodentia; Myomorpha;
            Muroidea; Muridae; Murinae; Rattus.
COMMENT     MODEL REFSEQ:  This record is predicted by automated computational
            analysis. This record is derived from a genomic sequence
            (NC_051350.1) annotated using gene prediction method: Gnomon,
            supported by mRNA and EST evidence.
            Also see:
                Documentation of NCBI's Annotation Process
            
            ##Genome-Annotation-Data-START##
         

In [67]:
seq_record.seq[:]

Seq('CGAGGCAGGTTCTCAGGAAGCCCTTGGAAGCCTTTCAGTTTGCCCGGGTATTCA...GAA')

In [72]:
seq_record.features[2]

SeqFeature(FeatureLocation(ExactPosition(740), ExactPosition(2108), strand=1), type='CDS')

## CDS 

In [78]:
seq_record.features 

[SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(2510), strand=1), type='source'),
 SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(2510), strand=1), type='gene'),
 SeqFeature(FeatureLocation(ExactPosition(740), ExactPosition(2108), strand=1), type='CDS')]

In [79]:
seq_record.features[2]

SeqFeature(FeatureLocation(ExactPosition(740), ExactPosition(2108), strand=1), type='CDS')

In [83]:
cdsStart = seq_record.features[2].location._start.position 
cdsStop = seq_record.features[2].location._end.position 

In [89]:
feature_cds = SeqFeature(FeatureLocation(cdsStart, cdsStop), type = "exon", strand=1)

In [90]:
cds_seq = feature_cds.extract(seq_record.seq)
print(cds_seq)

ATGGCCAGCTCAGCTTCCCTGGAGACCATGGTGCCCCCGGCCTGCCCGCGCGCTGGAGCGTCACCGGCCACTTCGAAAACACTAGCTTTCTCCATCGAGCGCATCATGGCCAAGACGTCCGAGCCCCGAGCGCCTTTTGAGCCCCGGCCTGCTGCGCTAGAGGCAGACAGCAGCCAGAGCAAGAAACTGCTCAACCTCTGCTCGCCGCTGCCCTGTATGATCCCCCTCCAGCCTCTAGGCTACGAGGTGCCGTCCAAGACACTGCTCAGTTACTCGGAGTTCTGGAAAAGCAGCCTCCGGGCGGGCGGCGGTGGAGGAGGAGGCAGCGGCGGGGGGGGCCCAGTGTGCGGCGCCAGTGGCTTGTGCAAAACCAACTGTGGCGTGTGCTGCAAGGCCGAACTGGGCCTTGCGCCTTCTGCGCTGCCCGCCGGCAGGGTCATCAAGCCGCAGGTCATCAACCAGGCTGTGGGGCTGCCGGCCAGCGGCTCTCTCTACTACTTCAACTACCTGGACTCCACCACTTACCCACCATCGGAGCTCCTCGGAGGCCACCTTTTCCCATCTGGCCTCCTCAACGCACAGGCCCCCACTTCCCTGGCTGCTCACCCCAAGCTTTTTCTGCTGGAGAATGCCAAACTGGCCAGCCTGACTGCGGACAAGTTCCCCCACCCAGCTTCCTATCCCCATAAGGAGCGCTTGCATGCGCCGCTGGAGCAGGTGCTGAAGGAGAACTCGGCCTTGACCGCTGAACGAGGGGGAGTCAAGAGCCACAGCAAACTACCGGGGGGCTCTACTGACAGCAAACCCAAAAACTTCACCTGCGAAGTGTGCGGCAAGGTGTTCAATGCTCACTATAACCTCACCCGCCACATGCCTGTCCACACCGGAGCTAGACCGTTTGTGTGCAAAGTCTGTGGCAAAGGCTTCCGCCAGGCCAGCACTCTCTGCAGACACAAAATTATCCATACCCAGGAAAAACCACATAAGTGTAACCAGTGCG

In [92]:
len(cds_seq)

1368

## Gene Info

In [107]:
geneName = 'Fezf2'
species = 'rat'
searchTerm = geneName + ' [GENE] AND ' + species + ' [ORGN]'

In [108]:
searchOutput = Entrez.esearch(db="nucleotide", retmax=10, term=searchTerm, idtype="acc")
geneInfo = Entrez.read(searchOutput)
searchOutput.close()

In [109]:
geneInfo

{'Count': '7', 'RetMax': '7', 'RetStart': '0', 'IdList': ['NM_001107251.1', 'XM_039093243.1', 'XM_039093242.1', 'NC_051350.1', 'XM_032918124.1', 'NC_046165.1', 'BC168214.1'], 'TranslationSet': [{'From': 'rat[ORGN]', 'To': '"Rattus"[Organism] OR "Rattus norvegicus"[Organism]'}], 'TranslationStack': [{'Term': 'Fezf2[GENE]', 'Field': 'GENE', 'Count': '1221', 'Explode': 'N'}, {'Term': '"Rattus"[Organism]', 'Field': 'Organism', 'Count': '2429540', 'Explode': 'Y'}, {'Term': '"Rattus norvegicus"[Organism]', 'Field': 'Organism', 'Count': '2308989', 'Explode': 'Y'}, 'OR', 'GROUP', 'AND'], 'QueryTranslation': 'Fezf2[GENE] AND ("Rattus"[Organism] OR "Rattus norvegicus"[Organism])'}

In [110]:
geneInfo['IdList']

['NM_001107251.1', 'XM_039093243.1', 'XM_039093242.1', 'NC_051350.1', 'XM_032918124.1', 'NC_046165.1', 'BC168214.1']

In [103]:
# Return avalible sequences 
geneInfo['IdList']

['NM_001107251.1', 'XM_039093243.1', 'XM_039093242.1', 'NC_051350.1', 'XM_032918124.1', 'NC_046165.1', 'BC168214.1']

## pyensembl 

In [111]:
from pyensembl import EnsemblRelease

In [124]:
data = EnsemblRelease(77, species = 'rat')
data.download()

INFO:pyensembl.download_cache:Fetching /home/user1/.cache/pyensembl/Rnor_5.0/ensembl77/Rattus_norvegicus.Rnor_5.0.77.gtf.gz from URL ftp://ftp.ensembl.org/pub/release-77/gtf/rattus_norvegicus/Rattus_norvegicus.Rnor_5.0.77.gtf.gz
INFO:datacache.download:Downloading ftp://ftp.ensembl.org/pub/release-77/gtf/rattus_norvegicus/Rattus_norvegicus.Rnor_5.0.77.gtf.gz to /home/user1/.cache/pyensembl/Rnor_5.0/ensembl77/Rattus_norvegicus.Rnor_5.0.77.gtf.gz
INFO:pyensembl.download_cache:Fetching /home/user1/.cache/pyensembl/Rnor_5.0/ensembl77/Rattus_norvegicus.Rnor_5.0.cdna.all.fa.gz from URL ftp://ftp.ensembl.org/pub/release-77/fasta/rattus_norvegicus/cdna/Rattus_norvegicus.Rnor_5.0.cdna.all.fa.gz
INFO:datacache.download:Downloading ftp://ftp.ensembl.org/pub/release-77/fasta/rattus_norvegicus/cdna/Rattus_norvegicus.Rnor_5.0.cdna.all.fa.gz to /home/user1/.cache/pyensembl/Rnor_5.0/ensembl77/Rattus_norvegicus.Rnor_5.0.cdna.all.fa.gz
INFO:pyensembl.download_cache:Fetching /home/user1/.cache/pyensembl/

In [125]:
data.index()

INFO:pyensembl.database:Creating database: /home/user1/.cache/pyensembl/Rnor_5.0/ensembl77/Rattus_norvegicus.Rnor_5.0.77.gtf.db
INFO:pyensembl.database:Reading GTF from /home/user1/.cache/pyensembl/Rnor_5.0/ensembl77/Rattus_norvegicus.Rnor_5.0.77.gtf.gz
INFO:root:Extracted GTF attributes: ['gene_id', 'gene_version', 'gene_name', 'gene_biotype', 'transcript_id', 'transcript_version', 'transcript_name', 'transcript_biotype', 'exon_number', 'exon_id', 'exon_version', 'protein_id', 'protein_version']
INFO:pyensembl.database:Skipping database index for {ccds_id}
INFO:datacache.database_helpers:Creating database /home/user1/.cache/pyensembl/Rnor_5.0/ensembl77/Rattus_norvegicus.Rnor_5.0.77.gtf.db containing: transcript, stop_codon, CDS, gene, exon, start_codon
INFO:datacache.database:Running sqlite query: "CREATE TABLE transcript (gene_version TEXT NOT NULL, strand TEXT NOT NULL, transcript_id TEXT UNIQUE PRIMARY KEY NOT NULL, start INT NOT NULL, transcript_version TEXT NOT NULL, source TEXT 

In [126]:
# get all exons associated with gene name 
exon_ids  = data.exon_ids_of_gene_name('Fezf2')

In [127]:
exon_ids

['ENSRNOE00000433687',
 'ENSRNOE00000086637',
 'ENSRNOE00000086900',
 'ENSRNOE00000087167',
 'ENSRNOE00000229690']

In [200]:
exon_id = 'ENSRNOE00000086637'
exonObj = data.exon_by_id(exon_id)

In [201]:
exonObj

Exon(exon_id='ENSRNOE00000086637', gene_id='ENSRNOG00000009206', gene_name='Fezf2', contig='15', start=16855518, end=16856407, strand='+')

In [202]:
exonStart = exonObj.start 
exonStop = exonObj.end

In [203]:
exonStop - exonStart

889

In [167]:
data.gene_by_id('ENSRNOG00000009206')

Gene(gene_id='ENSRNOG00000009206', gene_name='Fezf2', biotype='protein_coding', contig='15', start=16854926, end=16858715, strand='+', genome='Rnor_5.0')

In [159]:
data.transcript_ids_of_gene_name('Fezf2')

['ENSRNOT00000012452']

In [169]:
transcript_id = 'ENSRNOT00000012452'
transcript = data.transcript_by_id(transcript_id, start = exonObj.start, end = exonObj.end)

TypeError: transcript_by_id() got an unexpected keyword argument 'start'

In [173]:
transcript.sequence

'TAGTTGCCTCCTTTAAAGTTTGAGGGGCGGTGGCGGCGGCCGGCAGGCGCGGGGGAACGCAGGGTCCTCAACGGGAGTCGCGCCGCCGCCCATGTCATTCCACTTCAAGTGACTTCATGTGATGTCAGCTGAATGTAAAAGACAGTGATCTCACGCGGAGGGGAGGATGTTTGCCATCAAAATGTGACAGAAGAGACACGCTGCATGGCTCGGAACGCATCTCCTTGGCGGTGGGGGAAAAAGACTTAGAGGAGAGAGGCTGTGCCCTGGCCCAGCCTGGCTCAGCTTGGCGCGCCATGGCCAGCTCAGCTTCCCTGGAGACCATGGTGCCCCCGGCCTGCCCGCGCGCTGGAGCGTCACCGGCCACTTCGAAAACACTAGCTTTCTCCATCGAGCGCATCATGGCCAAGACGTCCGAGCCCCGAGCGCCTTTTGAGCCCCGGCCTGCTGCGCTAGAGGCAGACAGCAGCCAGAGCAAGAAACTGCTCAACCTCTGCTCGCCGCTGCCCTGTATGATCCCCCTCCAGCCTCTAGGCTACGAGGTGCCGTCCAAGACACTGCTCAGTTACTCGGAGTTCTGGAAAAGCAGCCTCCGGGCGGGCGGCGGTGGAGGAGGAGGCAGCGGCGGGGGGGGCCCAGTGTGCGGCGCCAGTGGCTTGTGCAAAACCAACTGTGGCGTGTGCTGCAAGGCCGAACTGGGCCTTGCGCCTTCTGCGCTGCCCGCCGGCAGGGTCATCAAGCCGCAGGTCATCAACCAGGCTGTGGGGCTGCCGGCCAGCGGCTCTCTCTACTACTTCAACTACCTGGACTCCACCACTTACCCACCATCGGAGCTCCTCGGAGGCCACCTTTTCCCATCTGGCCTCCTCAACGCACAGGCCCCCACTTCCCTGGCTGCTCACCCCAAGCTTTTTCTGCTGGAGAATGCCAAACTGGCCAGCCTGACTGCGGACAAGTTCCCCCACCCAGCTTCCTATCCCCATAAGGAGCGCTTGCATG

In [174]:
transcript 

Transcript(transcript_id='ENSRNOT00000012452', transcript_name='Fezf2-201', gene_id='ENSRNOG00000009206', biotype='protein_coding', contig='15', start=16854926, end=16858715, strand='+', genome='Rnor_5.0')

In [185]:
transcript_start = transcript.start
transcript_stop = transcript.end

In [189]:
transcript_stop - transcript_start 

3789

In [194]:
transcript_sequence = data.transcript_sequence(transcript_id)
transcript_cds = transcript.coding_sequence
transcript_fiveUTR = transcript.five_prime_utr_sequence
transcript_threeUTR = transcript.three_prime_utr_sequence


In [191]:
len(transcript_sequence)

2072

In [193]:
len(transcript_cds)

1368

In [195]:
len(transcript_fiveUTR)

296

In [196]:
len(transcript_threeUTR)

408

In [175]:
transcript.start_codon_positions 

[16855571, 16855572, 16855573]

In [177]:
transcript.stop_codon_positions

[16858305, 16858306, 16858307]

In [178]:
data.transcript_sequence(transcript_id)

'TAGTTGCCTCCTTTAAAGTTTGAGGGGCGGTGGCGGCGGCCGGCAGGCGCGGGGGAACGCAGGGTCCTCAACGGGAGTCGCGCCGCCGCCCATGTCATTCCACTTCAAGTGACTTCATGTGATGTCAGCTGAATGTAAAAGACAGTGATCTCACGCGGAGGGGAGGATGTTTGCCATCAAAATGTGACAGAAGAGACACGCTGCATGGCTCGGAACGCATCTCCTTGGCGGTGGGGGAAAAAGACTTAGAGGAGAGAGGCTGTGCCCTGGCCCAGCCTGGCTCAGCTTGGCGCGCCATGGCCAGCTCAGCTTCCCTGGAGACCATGGTGCCCCCGGCCTGCCCGCGCGCTGGAGCGTCACCGGCCACTTCGAAAACACTAGCTTTCTCCATCGAGCGCATCATGGCCAAGACGTCCGAGCCCCGAGCGCCTTTTGAGCCCCGGCCTGCTGCGCTAGAGGCAGACAGCAGCCAGAGCAAGAAACTGCTCAACCTCTGCTCGCCGCTGCCCTGTATGATCCCCCTCCAGCCTCTAGGCTACGAGGTGCCGTCCAAGACACTGCTCAGTTACTCGGAGTTCTGGAAAAGCAGCCTCCGGGCGGGCGGCGGTGGAGGAGGAGGCAGCGGCGGGGGGGGCCCAGTGTGCGGCGCCAGTGGCTTGTGCAAAACCAACTGTGGCGTGTGCTGCAAGGCCGAACTGGGCCTTGCGCCTTCTGCGCTGCCCGCCGGCAGGGTCATCAAGCCGCAGGTCATCAACCAGGCTGTGGGGCTGCCGGCCAGCGGCTCTCTCTACTACTTCAACTACCTGGACTCCACCACTTACCCACCATCGGAGCTCCTCGGAGGCCACCTTTTCCCATCTGGCCTCCTCAACGCACAGGCCCCCACTTCCCTGGCTGCTCACCCCAAGCTTTTTCTGCTGGAGAATGCCAAACTGGCCAGCCTGACTGCGGACAAGTTCCCCCACCCAGCTTCCTATCCCCATAAGGAGCGCTTGCATG

In [179]:
transcript.coding_sequence

'ATGGCCAGCTCAGCTTCCCTGGAGACCATGGTGCCCCCGGCCTGCCCGCGCGCTGGAGCGTCACCGGCCACTTCGAAAACACTAGCTTTCTCCATCGAGCGCATCATGGCCAAGACGTCCGAGCCCCGAGCGCCTTTTGAGCCCCGGCCTGCTGCGCTAGAGGCAGACAGCAGCCAGAGCAAGAAACTGCTCAACCTCTGCTCGCCGCTGCCCTGTATGATCCCCCTCCAGCCTCTAGGCTACGAGGTGCCGTCCAAGACACTGCTCAGTTACTCGGAGTTCTGGAAAAGCAGCCTCCGGGCGGGCGGCGGTGGAGGAGGAGGCAGCGGCGGGGGGGGCCCAGTGTGCGGCGCCAGTGGCTTGTGCAAAACCAACTGTGGCGTGTGCTGCAAGGCCGAACTGGGCCTTGCGCCTTCTGCGCTGCCCGCCGGCAGGGTCATCAAGCCGCAGGTCATCAACCAGGCTGTGGGGCTGCCGGCCAGCGGCTCTCTCTACTACTTCAACTACCTGGACTCCACCACTTACCCACCATCGGAGCTCCTCGGAGGCCACCTTTTCCCATCTGGCCTCCTCAACGCACAGGCCCCCACTTCCCTGGCTGCTCACCCCAAGCTTTTTCTGCTGGAGAATGCCAAACTGGCCAGCCTGACTGCGGACAAGTTCCCCCACCCAGCTTCCTATCCCCATAAGGAGCGCTTGCATGCGCCGCTGGAGCAGGTGCTGAAGGAGAACTCGGCCTTGACCGCTGAACGAGGGGGAGTCAAGAGCCACAGCAAACTACCGGGGGGCTCTACTGACAGCAAACCCAAAAACTTCACCTGCGAAGTGTGCGGCAAGGTGTTCAATGCTCACTATAACCTCACCCGCCACATGCCTGTCCACACCGGAGCTAGACCGTTTGTGTGCAAAGTCTGTGGCAAAGGCTTCCGCCAGGCCAGCACTCTCTGCAGACACAAAATTATCCATACCCAGGAAAAACCACATAAGTGTAACCAGTGC

In [180]:
transcript.five_prime_utr_sequence

'TAGTTGCCTCCTTTAAAGTTTGAGGGGCGGTGGCGGCGGCCGGCAGGCGCGGGGGAACGCAGGGTCCTCAACGGGAGTCGCGCCGCCGCCCATGTCATTCCACTTCAAGTGACTTCATGTGATGTCAGCTGAATGTAAAAGACAGTGATCTCACGCGGAGGGGAGGATGTTTGCCATCAAAATGTGACAGAAGAGACACGCTGCATGGCTCGGAACGCATCTCCTTGGCGGTGGGGGAAAAAGACTTAGAGGAGAGAGGCTGTGCCCTGGCCCAGCCTGGCTCAGCTTGGCGCGCC'

In [181]:
transcript.three_prime_utr_sequence

'GAGCTACTGCCTTGTCCTTTCTTCCTGCCCTGTACCAACCCGAGCAGATCCCACGTATAAACTTATTTCTAAAATTAAAAGGAAAAAAACTATAGCAGAGAGGCTAAAATCTATTTATCGAAACCAGCATATTTTTGGGAAATGTAAACGTGTCCTCGATGACCGGCAGCAAACGCGTGGCTCCCACCTTTGTACATTCAGGAAACGTATTTAAATCCAGTGCGCTGAAACATGTTTAATTCCAGGCCTCGGCTTCTCCTCAGGCAGCCGGCTTTTAATCCCAGCCTGTCACCATGAGCGCCCAGAAGAACGTGATGCCCCCAGCCGTCTTCACACAACCTTGTAATCTCTCCTGTACAAGCGAACACGGAATATTACACATATATGACTCAATAAACAGAACCACTA'

# Testing 

In [None]:
# create a sequence object
my_seq = Seq("CATGTAGACTAG")

# print out some details about it
print("seq %s is %i bases long" % (my_seq, len(my_seq)))
print("reverse complement is %s" % my_seq.reverse_complement())
print("protein translation is %s" % my_seq.translate())

In [None]:
# https://biopython.org/DIST/docs/tutorial/Tutorial.html#sec59
# https://biopython.org/DIST/docs/tutorial/Tutorial.html#sec%3Aefetch
# https://www.ncbi.nlm.nih.gov/books/NBK3837/
Entrez.email = "kehaliwoldemichael@gmail.com"
with Entrez.efetch(
    db="nucleotide", rettype="fasta", retmode="text", id="6273291"
) as handle:
    seq_record = SeqIO.read(handle, "fasta")
print("%s with %i features" % (seq_record.id, len(seq_record.features)))

In [None]:
# Test of fezf2 ... rat 
with Entrez.efetch(
    db="nucleotide", rettype="gb", retmode="text", id="NC_051350.1"
) as handle:
    seq_record = SeqIO.read(handle, "gb")  # using "gb" as an alias for "genbank"
print("%s with %i features" % (seq_record.id, len(seq_record.features)))

In [None]:
len(seq_record)