# Percentage of CDS, tRNAs and rRNAs in cpDNA of Sacha inchi

### by Javier C. Alvarez, EAFIT University
### January, 2020

### Funtions and libraries

In [1]:
from Bio import SeqIO

In [2]:
#please select: CDS, gene, intron, exon, tRNA, rRNA, misc_feature or repeat_region
def length_features(feature):
    lengths=[]
    for ft in record.features:
        if ft.type == feature:
            lengths.append(len(ft))
    return(lengths)

In [3]:
#GC = lambda x: (x.count('G')+x.count('C'))/len(x)
def GC(seq):
    return (seq.count("G")+seq.count("C"))/len(seq)

## Content of genes in Plukenetia volubilis - This work

In [4]:
#loading genbank file 
record = SeqIO.read("Data/Sacha_GenBank.gb", "genbank")

In [5]:
print("%GC content in "+ record.id + " is " + str(GC(record.seq)*100) + "%")

%GC content in Plukenetia is 35.79467555495975%


In [6]:
#length for genes
print("GenBank file: "+ record.id + " has " + str(len(length_features("gene")))+" features with genes in annotation, summing " +  str(sum(length_features("gene"))) + " bp in total")

GenBank file: Plukenetia has 130 features with genes in annotation, summing 106095 bp in total


In [18]:
len(record)

164111

In [17]:
#length for CDS
print("GenBank file: "+ record.id + " has " + str(len(length_features("CDS")))+" features with CDSs in annotation, summing " +  str(sum(length_features("CDS"))) + " bp in total")
print("Percentage of Coding Sequences is: " + str((sum(length_features("CDS"))/len(record))*100)+"%")

GenBank file: Plukenetia has 86 features with CDSs in annotation, summing 80816 bp in total
Percentage of Coding Sequences is: 49.24471851368891%


In [19]:
#length for tRNAs
print("GenBank file: "+ record.id + " has " + str(len(length_features("tRNA")))+" features with tRNAs in annotation, summing " +  str(sum(length_features("tRNA"))) + " bp in total")

GenBank file: Plukenetia has 36 features with tRNAs in annotation, summing 2809 bp in total


In [20]:
#length for rRNAs
print("GenBank file: "+ record.id + " has " + str(len(length_features("rRNA")))+" features with rRNAs in annotation, summing " +  str(sum(length_features("rRNA"))) + " bp in total")

GenBank file: Plukenetia has 8 features with rRNAs in annotation, summing 9058 bp in total


In [21]:
#length for introns
print("GenBank file: "+ record.id + " has " + str(len(length_features("intron")))+" features with introns in annotation, summing " +  str(sum(length_features("intron"))) + " bp in total")
print("Percentage of intron is: " + str((sum(length_features("intron"))/len(record))*100)+"%")

GenBank file: Plukenetia has 20 features with introns in annotation, summing 14936 bp in total
Percentage of intron is: 9.101157143640584%


In [22]:
#intron genes and length
for ft in record.features:
    for f in ft.qualifiers:
        if ft.type == "intron" and "locus_tag":
            print (ft.qualifiers["gene"][0] + " intron " + ft.qualifiers["number"][0] + " with a size " +str(len(ft)))

rps16 intron 1 with a size 903
rps16 intron 1 with a size 903
ycf3 intron 1 with a size 747
ycf3 intron 1 with a size 747
ycf3 intron 2 with a size 708
ycf3 intron 2 with a size 708
rpoC1 intron 1 with a size 806
rpoC1 intron 1 with a size 806
atpF intron 1 with a size 718
atpF intron 1 with a size 718
trnL-UAA intron 1 with a size 567
trnL-UAA intron 1 with a size 567
trnC-ACA intron 1 with a size 566
trnC-ACA intron 1 with a size 566
clpP intron 2 with a size 638
clpP intron 2 with a size 638
clpP intron 1 with a size 861
clpP intron 1 with a size 861
rpl2 intron 1 with a size 668
rpl2 intron 1 with a size 668
ndhB intron 1 with a size 682
ndhB intron 1 with a size 682
rps12 intron 1 with a size 537
rps12 intron 1 with a size 537
trnI-GAU intron 1 with a size 932
trnI-GAU intron 1 with a size 932
trnA-UGC intron 1 with a size 803
trnA-UGC intron 1 with a size 803
ndhA intron 1 with a size 1178
ndhA intron 1 with a size 1178
trnA-UGC intron 1 with a size 803
trnA-UGC intron 1 with a s

In [23]:
#a gene size
for ft in record.features:
    for f in ft.qualifiers:
        if ft.type == "gene" and "locus_tag":
            print (ft.qualifiers["gene"][0] + " with a size " +str(len(ft)))

rps12 with a size 909
rps12 with a size 909
rps12 with a size 909
psbA with a size 1062
psbA with a size 1062
matK with a size 1518
matK with a size 1518
rps16 with a size 1170
rps16 with a size 1170
trnQ-UUG with a size 73
trnQ-UUG with a size 73
psbK with a size 186
psbK with a size 186
psbI with a size 111
psbI with a size 111
trnS-GGA with a size 87
trnS-GGA with a size 87
ycf3 with a size 1962
ycf3 with a size 1962
psaA with a size 2253
psaA with a size 2253
psaB with a size 2205
psaB with a size 2205
rps14 with a size 303
rps14 with a size 303
trnM-CAU with a size 75
trnM-CAU with a size 75
trnG-GCC with a size 72
trnG-GCC with a size 72
psbZ with a size 189
psbZ with a size 189
trnS-UGA with a size 90
trnS-UGA with a size 90
psbC with a size 1386
psbC with a size 1386
psbD with a size 1062
psbD with a size 1062
trnT-GGU with a size 74
trnT-GGU with a size 74
trnE-UUC with a size 73
trnE-UUC with a size 73
trnY-GUA with a size 86
trnY-GUA with a size 86
trnD-GUC with a size 74
tr

## Using fasta files attached in annotation output 

In [14]:
#Estimating size for CDS from fasta files
filename = "Data/Sacha_CDS.fasta"

def long_seq(filename):
    long_sequences = []
    input_handle=open(filename,'r')
    
    for record in SeqIO.parse(input_handle, "fasta") :
            # Add this record to our list
            long_sequences.append(len(record))
 
    #print ("Found %i long sequences" % len(long_sequences)
    input_handle.close()
    return(long_sequences)

print ("Found "+str(len(long_seq(filename)))+" CDS"+" with a total length of "+ str(sum(long_seq(filename))))

Found 81 CDS with a total length of 70447


In [15]:
#Estimating size for genes
filename = "Data/Sacha_gene.fasta"

def long_seq(filename):
    long_sequences = []
    input_handle=open(filename,'r')
    
    for record in SeqIO.parse(input_handle, "fasta") :
            # Add this record to our list
            long_sequences.append(len(record))
 
    #print ("Found %i long sequences" % len(long_sequences)
    input_handle.close()
    return(long_sequences)

print ("Found "+str(len(long_seq(filename)))+" genes"+" with a total length of "+ str(sum(long_seq(filename))))

Found 85 genes with a total length of 83423


In [16]:
#Estimating size for rRNAs
filename = "Data/Sacha_rrna.fasta"

def long_seq(filename):
    long_sequences = []
    input_handle=open(filename,'r')
    
    for record in SeqIO.parse(input_handle, "fasta") :
            # Add this record to our list
            long_sequences.append(len(record))
 
    #print ("Found %i long sequences" % len(long_sequences)
    input_handle.close()
    return(long_sequences)

print ("Found "+str(len(long_seq(filename)))+" rRNAs"+" with a total length of "+ str(sum(long_seq(filename))))

Found 4 rRNAs with a total length of 4529


## Comparison with Sacha inchi China accession as a reference

In [18]:
#loading genbank file 
record = SeqIO.read("Data/ChinaSI_genbank.gb", "genbank")

In [19]:
print("%GC content in "+ record.id + " is " + str(GC(record.seq)*100) + "%")
print("Genome size: " + str(len(record))+ " bp")

%GC content in MF062253.1 is 36.22760970241076%
Genome size: 161733 bp


In [20]:
#length for genes
print("GenBank file: "+ record.id + " has " + str(len(length_features("gene")))+" features with genes in annotation, summing " +  str(sum(length_features("gene"))) + " bp in total")

GenBank file: MF062253.1 has 130 features with genes in annotation, summing 106963 bp in total


In [21]:
#length for CDS
print("GenBank file: "+ record.id + " has " + str(len(length_features("CDS")))+" features with CDSs in annotation, summing " +  str(sum(length_features("CDS"))) + " bp in total")
print("Percentage of Coding Sequences is: " + str((sum(length_features("CDS"))/len(record))*100)+"%")

GenBank file: MF062253.1 has 86 features with CDSs in annotation, summing 80193 bp in total
Percentage of Coding Sequences is: 49.583572925747994%


In [22]:
#length for tRNAs
print("GenBank file: "+ record.id + " has " + str(len(length_features("tRNA")))+" features with tRNAs in annotation, summing " +  str(sum(length_features("tRNA"))) + " bp in total")

GenBank file: MF062253.1 has 36 features with tRNAs in annotation, summing 2809 bp in total


In [23]:
#length for rRNAs
print("GenBank file: "+ record.id + " has " + str(len(length_features("rRNA")))+" features with rRNAs in annotation, summing " +  str(sum(length_features("rRNA"))) + " bp in total")

GenBank file: MF062253.1 has 8 features with rRNAs in annotation, summing 9058 bp in total


In [24]:
#length for introns
print("GenBank file: "+ record.id + " has " + str(len(length_features("intron")))+" features with introns in annotation, summing " +  str(sum(length_features("intron"))) + " bp in total")
print("Percentage of Coding Sequences is: " + str((sum(length_features("intron"))/len(record))*100)+"%")

GenBank file: MF062253.1 has 18 features with introns in annotation, summing 13553 bp in total
Percentage of Coding Sequences is: 8.379860634502544%


## Comparison with Ricinus communis chloroplast as a reference
### https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0021743

![image1.png](Images/journal.pone.0021743.g001.png)

In [25]:
#loading genbank file 
record = SeqIO.read("Data/Ricinus_genbank.gb", "genbank")

In [26]:
print("%GC content in "+ record.id + " is " + str(GC(record.seq)*100) + "%")
print("Genome size: " + str(len(record))+ " bp")

%GC content in NC_016736.1 is 35.74199716844099%
Genome size: 163161 bp


In [27]:
#length for genes
print("GenBank file: "+ record.id + " has " + str(len(length_features("gene")))+" features with genes in annotation, summing " +  str(sum(length_features("gene"))) + " bp in total")

GenBank file: NC_016736.1 has 131 features with genes in annotation, summing 107486 bp in total


In [28]:
#length for CDS
print("GenBank file: "+ record.id + " has " + str(len(length_features("CDS")))+" features with CDSs in annotation, summing " +  str(sum(length_features("CDS"))) + " bp in total")
print("Percentage of Coding Sequences is: " + str((sum(length_features("CDS"))/len(record))*100)+"%")

GenBank file: NC_016736.1 has 86 features with CDSs in annotation, summing 79494 bp in total
Percentage of Coding Sequences is: 48.721201757772995%


In [29]:
#length for tRNAs
print("GenBank file: "+ record.id + " has " + str(len(length_features("tRNA")))+" features with tRNAs in annotation, summing " +  str(sum(length_features("tRNA"))) + " bp in total")
print("Percentage of tRNAs is: " + str((sum(length_features("tRNA"))/len(record))*100)+"%")

GenBank file: NC_016736.1 has 37 features with tRNAs in annotation, summing 2802 bp in total
Percentage of tRNAs is: 1.717322154191259%


In [30]:
#length for rRNAs
print("GenBank file: "+ record.id + " has " + str(len(length_features("rRNA")))+" features with rRNAs in annotation, summing " +  str(sum(length_features("rRNA"))) + " bp in total")

GenBank file: NC_016736.1 has 8 features with rRNAs in annotation, summing 9050 bp in total


In [31]:
#length for introns
print("GenBank file: "+ record.id + " has " + str(len(length_features("intron")))+" features with introns in annotation, summing " +  str(sum(length_features("intron"))) + " bp in total")
print("Percentage of Coding Sequences is: " + str((sum(length_features("intron"))/len(record))*100)+"%")

GenBank file: NC_016736.1 has 0 features with introns in annotation, summing 0 bp in total
Percentage of Coding Sequences is: 0.0%


In [32]:
#a gene size
for ft in record.features:
    for f in ft.qualifiers:
        if ft.type == "gene" and "locus_tag":
            print (ft.qualifiers["gene"][0] + " with a size " +str(len(ft)))

rps12 with a size 354
rps12 with a size 354
rps12 with a size 354
rps12 with a size 354
trnH-GUG with a size 75
trnH-GUG with a size 75
trnH-GUG with a size 75
psbA with a size 1062
psbA with a size 1062
psbA with a size 1062
trnK-UUU with a size 2631
trnK-UUU with a size 2631
trnK-UUU with a size 2631
matK with a size 1521
matK with a size 1521
matK with a size 1521
rps16 with a size 153
rps16 with a size 153
rps16 with a size 153
trnQ-UUG with a size 72
trnQ-UUG with a size 72
trnQ-UUG with a size 72
psbK with a size 186
psbK with a size 186
psbK with a size 186
psbI with a size 111
psbI with a size 111
psbI with a size 111
trnS-GCU with a size 88
trnS-GCU with a size 88
trnS-GCU with a size 88
trnG-GCC with a size 763
trnG-GCC with a size 763
trnG-GCC with a size 763
trnR-UCU with a size 72
trnR-UCU with a size 72
trnR-UCU with a size 72
atpA with a size 1524
atpA with a size 1524
atpA with a size 1524
atpF with a size 1331
atpF with a size 1331
atpF with a size 1331
atpH with a siz

## Analysis for Passiflora edulis chloroplast as a control of funtions

In [35]:
#loading genbank file 
record = SeqIO.read("Data/Passi_genbank.gb", "genbank")

In [36]:
#length for CDS
print("GenBank file: "+ record.id + " has " + str(len(length_features("CDS")))+" features with CDSs in annotation, summing " +  str(sum(length_features("CDS"))) + " bp in total")
print("Percentage of Coding Sequences is: " + str((sum(length_features("CDS"))/len(record))*100)+"%")

GenBank file: NC_034285.1 has 76 features with CDSs in annotation, summing 56604 bp in total
Percentage of Coding Sequences is: 37.38557256647689%


In [37]:
#length for tRNAs
print("GenBank file: "+ record.id + " has " + str(len(length_features("tRNA")))+" features with tRNAs in annotation, summing " +  str(sum(length_features("tRNA"))) + " bp in total")
print("Percentage of tRNAs is: " + str((sum(length_features("tRNA"))/len(record))*100)+"%")

GenBank file: NC_034285.1 has 36 features with tRNAs in annotation, summing 2729 bp in total
Percentage of tRNAs is: 1.8024384766785992%


### GC content

In [38]:
print("%GC content in "+ record.id + " is " + str(GC(record.seq)*100) + "%")

%GC content in NC_034285.1 is 37.007780405003764%
