In [25]:
import numpy as np
from Bio import Alphabet
from Bio import Entrez
from Bio import Seq
from Bio import SeqIO
from Bio import SeqUtils

In [2]:
# Download the GenBank entry for accession AF174428.2 from the NCBI
# "nucleotide" database and parse it into `record`.
# Entrez.email is set first so the request carries a contact address.
Entrez.email = "lboat@ufl.edu"
handle = Entrez.efetch(db="nucleotide", id="AF174428.2", rettype="gb", retmode="text")
try:
    # The handle is parsed as GenBank text; SeqIO.read is the single-record reader.
    record = SeqIO.read(handle, "genbank")
finally:
    # Always release the network handle, even when parsing raises.
    handle.close()

In [3]:
# Dump the whole record summary (identifiers, annotations, abbreviated sequence).
print(record)

# Then show only the sequence, preceded by a blank line and a heading.
print("\nFull length sequence:", record.seq, sep="\n")

ID: AF174428.2
Name: AF174428
Description: Arabidopsis thaliana FH protein interacting protein FIP1 (FIP1) mRNA, complete cds.
Number of features: 3
/keywords=['']
/references=[Reference(title='Characterization of Arabidopsis formin-like protein AFH1 and its interacting proteins', ...), Reference(title='Direct Submission', ...), Reference(title='Direct Submission', ...)]
/date=06-OCT-2017
/comment=On Oct 6, 2017 this sequence version replaced gi:6503011.
/data_file_division=PLN
/organism=Arabidopsis thaliana
/sequence_version=2
/source=Arabidopsis thaliana (thale cress)
/taxonomy=['Eukaryota', 'Viridiplantae', 'Streptophyta', 'Embryophyta', 'Tracheophyta', 'Spermatophyta', 'Magnoliophyta', 'eudicotyledons', 'Gunneridae', 'Pentapetalae', 'rosids', 'malvids', 'Brassicales', 'Brassicaceae', 'Camelineae', 'Arabidopsis']
/accessions=['AF174428']
Seq('CATCAAATCATGAGTGGGCAAGAGAATCATGATCATGGCCGGATCTCTTCTACG...AAA', IUPACAmbiguousDNA())

Full length sequence:
CATCAAATCATGAGTGGGCAAGAGAATCATGATCA

In [16]:
# Report the overall GC percentage (two decimals) and the sequence length in nt.
gc_line = "The GC content of my sequence is: {0:.2f}%".format(SeqUtils.GC(record.seq))
length_line = "Length of sequence: {0}nt".format(len(record.seq))
print(gc_line, length_line, sep="\n")

The GC content of my sequence is: 46.19%
Length of sequence: 985nt


In [10]:
# Check how GC content is distributed across codon positions.
# GC123 returns a 4-tuple of percentages:
#   (overall, 1st codon position, 2nd codon position, 3rd codon position)
# NOTE(review): positions are presumably counted from the first nucleotide of
# record.seq (the whole mRNA), not from the annotated CDS start — confirm if
# per-CDS values are what is wanted.
SeqUtils.GC123(record.seq)

(46.192893401015226, 47.72036474164134, 44.51219512195122, 46.34146341463415)

In [12]:
# Calculate GC skew, (G - C) / (G + C), for consecutive 100 nt windows along
# the sequence. The windows do not overlap (this is NOT a sliding window), so
# the result is a list with one skew value per window.
SeqUtils.GC_skew(seq=record.seq, window=100)

[-0.15789473684210525,
 -0.5081967213114754,
 0.018867924528301886,
 0.12,
 -0.047619047619047616,
 0.0,
 0.0,
 0.21568627450980393,
 0.125,
 -0.29411764705882354]

In [21]:
# Print the six-frame translation report for the sequence.
# The report starts with base counts and a GC summary, then shows, block by
# block, three forward-frame translations, the two DNA strands, and three
# reverse-frame translations.
# The report is kept in `translated` so it can be reused after printing.
translated = SeqUtils.six_frame_translations(record.seq)
print(translated)

GC_Frame: a:264 t:266 g:215 c:240 
Sequence: catcaaatca ... aaaaaaaaaa, 985 nt, 46.19 %GC


1/1
  S  N  H  E  W  A  R  E  S  *  S  W  P  D  L  F  Y  A  R  R
 I  K  S  *  V  G  K  R  I  M  I  M  A  G  S  L  L  R  P  P
H  Q  I  M  S  G  Q  E  N  H  D  H  G  R  I  S  S  T  P  A
catcaaatcatgagtgggcaagagaatcatgatcatggccggatctcttctacgcccgcc   51 %
gtagtttagtactcacccgttctcttagtactagtaccggcctagagaagatgcgggcgg
M  L  D  H  T  P  L  L  I  M  I  M  A  P  D  R  R  R  G  G 
 D  F  *  S  H  A  L  S  D  H  D  H  G  S  R  K  *  A  R  R
  *  I  M  L  P  C  S  F  *  S  *  P  R  I  E  E  V  G  A  A

61/21
  R  V  G  T  F  Q  G  C  C  S  L  L  *  L  R  S  L  P  *  A
 P  R  R  N  L  P  R  L  L  L  T  P  L  T  T  L  L  T  L  S
A  A  S  E  P  S  K  A  A  A  H  S  S  D  Y  A  P  Y  P  K
gccgcgtcggaaccttccaaggctgctgctcactcctctgactacgctccttaccctaag   60 %
cggcgcagccttggaaggttccgacgacgagtgaggagactgatgcgaggaatgggattc
G  R  R  F  R  G  L  S  S  S  V  G  R  V  V  S  R  V  R  L 
 R  T  P  V  K  W  P  Q  Q  E  S  R  Q

In [31]:
# Translate a single reading frame with the module-level Bio.Seq.translate().
# Translation begins at the first nucleotide of record.seq (not at the first
# ATG). to_stop=True ends the protein at the first in-frame stop codon and
# leaves the stop symbol out of the result.
Seq.translate(record.seq, to_stop=True)



Seq('HQIMSGQENHDHGRISSTPAAASEPSKAAAHSSDYAPYPKLDPTDVTPPPPQPI...HGP', ExtendedIUPACProtein())

In [29]:
# IPython-only help lookup: the trailing "?" shows the documentation for Seq.translate.
# NOTE(review): interactive development aid — consider removing before sharing the notebook.
Seq.translate?