In [6]:
import gzip
from Bio import SeqIO
from Bio.SeqUtils import GC

In [117]:
def summary(seq):
    """Return length, an estimated tm and GC content for one sequence.

    Parameters
    ----------
    seq : sized sequence accepted by ``GC``
        Typically a ``Bio.Seq.Seq`` or a plain string of nucleotides.

    Returns
    -------
    tuple
        ``(length, tm, gc)`` where

        * ``length`` is ``len(seq)``,
        * ``gc`` is whatever ``GC(seq)`` returns (used here as a percentage),
        * ``tm`` is ``62.3 + 0.41 * gc - 500 / length`` -- presumably a
          GC-based melting-temperature estimate; confirm the constants against
          the intended salt conditions.  ``tm`` is ``nan`` for an empty
          sequence, because the length term is undefined there.
    """
    length = len(seq)
    # Scan the sequence once; GC() is linear in the sequence length and the
    # inputs here are whole chromosomes.
    gc = GC(seq)
    if length == 0:
        # An empty record would otherwise abort the whole loop with
        # ZeroDivisionError; report the value as undefined instead.
        tm = float('nan')
    else:
        tm = 62.3 + 0.41 * gc - 500.0 / length
    return length, tm, gc

In [119]:
# BDGP release 6 assembly (plain FASTA): emit one tab-separated row per record
# with the source label, record id, and the three values returned by summary().
bdgp_row = 'bdgp\t{chrom}\t{length}\t{tm:.4}\t{gc:.4}'
with open('data/na_all.dmel.RELEASE6.chr.fa', 'r') as fh:
    for seq in SeqIO.parse(fh, 'fasta'):
        # Bound at cell level: once the loop ends, `bdgp` is the last record's sequence.
        bdgp = seq.seq
        s = summary(bdgp)
        n_bases, melt_temp, gc_pct = s
        print(bdgp_row.format(chrom=seq.id, length=n_bases, tm=melt_temp, gc=gc_pct))

bdgp	chr2L	23513712	79.43	41.78
bdgp	chr2CEN	225573	73.68	27.77
bdgp	chr2R	25286936	79.77	42.6
bdgp	chr3L	28110227	79.29	41.44
bdgp	chr3CEN	744266	81.43	46.66
bdgp	chr3R	32079331	79.75	42.56
bdgp	chr4	1348131	76.57	34.81
bdgp	chrX	23542271	79.66	42.34
bdgp	chrXmm	1049845	76.47	34.56
bdgp	chrY	3667352	77.33	36.66
bdgp	chrYmm	880023	77.14	36.19
bdgp	chrXYmm	216041	77.79	37.78
bdgp	chrU	3151297	75.43	32.03
bdgp	chrM	19908	69.49	17.59
bdgp	chrrDNA	76973	74.89	30.72


In [120]:
# FlyBase r6.12 assembly (gzip-compressed FASTA, opened in text mode): emit one
# tab-separated row per record with the source label, record id, and the three
# values returned by summary().
flybase_row = 'flybase\t{chrom}\t{length}\t{tm:.4}\t{gc:.4}'
with gzip.open('data/dmel-all-chromosome-r6.12.fasta.gz', 'rt') as fh:
    for seq in SeqIO.parse(fh, 'fasta'):
        # Bound at cell level: once the loop ends, `flybase` is the last record's sequence.
        flybase = seq.seq
        s = summary(flybase)
        n_bases, melt_temp, gc_pct = s
        print(flybase_row.format(chrom=seq.id, length=n_bases, tm=melt_temp, gc=gc_pct))

flybase	2L	23513712	79.43	41.78
flybase	2R	25286936	79.77	42.6
flybase	3L	28110227	79.29	41.44
flybase	3R	32079331	79.75	42.56
flybase	4	1348131	76.57	34.81
flybase	X	23542271	79.66	42.34
flybase	Y	3667352	77.33	36.66
flybase	2Cen_mapped_Scaffold_10_D1684	19956	75.26	31.68
flybase	2Cen_mapped_Scaffold_43_D1668	44411	71.52	22.51
flybase	2R2_mapped_Scaffold_56_D1828	13157	78.14	38.72
flybase	3Cen_mapped_Scaffold_1_D1896_D1895	76224	77.7	37.58
flybase	3Cen_mapped_Scaffold_27_D1777	11983	87.25	60.96
flybase	3Cen_mapped_Scaffold_31_D1643_D1653_D1791	87365	79.65	42.32
flybase	3Cen_mapped_Scaffold_36_D1605	36913	87.36	61.16
flybase	3Cen_mapped_Scaffold_41_D1641	22604	78.57	39.74
flybase	3Cen_mapped_Scaffold_50_D1686	23238	83.99	52.96
flybase	Unmapped_Scaffold_4_D1555_D1692	86267	76.44	34.51
flybase	Unmapped_Scaffold_8_D1580_D1567	88768	78.06	38.46
flybase	Unmapped_Scaffold_11_D1754	36482	73.47	27.27
flybase	Unmapped_Scaffold_13_D1782	25537	73.2	26.63
flybase	Unmapped_Scaffold_17_D1756_D1775	6

In [121]:
# UCSC dm6 assembly (gzip-compressed FASTA, opened in text mode): emit one
# tab-separated row per record with the source label, record id, and the three
# values returned by summary().
ucsc_row = 'ucsc\t{chrom}\t{length}\t{tm:.4}\t{gc:.4}'
with gzip.open('data/dm6.fa.gz', 'rt') as fh:
    for seq in SeqIO.parse(fh, 'fasta'):
        # Bound at cell level: once the loop ends, `ucsc` is the last record's sequence.
        ucsc = seq.seq
        s = summary(ucsc)
        n_bases, melt_temp, gc_pct = s
        print(ucsc_row.format(chrom=seq.id, length=n_bases, tm=melt_temp, gc=gc_pct))

ucsc	chr2L	23513712	79.43	41.78
ucsc	chr2R	25286936	79.77	42.6
ucsc	chr3L	28110227	79.29	41.44
ucsc	chr3R	32079331	79.75	42.56
ucsc	chr4	1348131	76.57	34.81
ucsc	chrM	19524	69.56	17.78
ucsc	chrUn_DS485919v1	1021	73.58	28.7
ucsc	chrUn_DS483755v1	6936	79.09	41.12
ucsc	chrUn_DS485425v1	1143	76.0	34.47
ucsc	chrUn_DS484861v1	1395	78.22	39.71
ucsc	chrUn_DS484484v1	2020	81.7	47.92
ucsc	chrUn_DS483705v1	27456	70.73	20.6
ucsc	chrUn_DS485490v1	1127	80.81	46.23
ucsc	chrUn_DS485998v1	1003	70.14	20.34
ucsc	chrUn_DS483873v1	4222	76.33	34.51
ucsc	chrUn_DS485608v1	1097	78.36	40.29
ucsc	chrUn_DS485270v1	1185	77.1	37.13
ucsc	chrUn_DS485979v1	1008	79.86	44.05
ucsc	chrUn_DS485398v1	1148	78.33	40.16
ucsc	chrUn_DS484139v1	2820	76.75	35.67
ucsc	chrUn_DS483906v1	3924	77.87	38.28
ucsc	chrUn_DS485760v1	1058	74.0	29.68
ucsc	chrUn_DS484191v1	2665	78.93	41.01
ucsc	chrUn_DS485340v1	1163	70.82	21.84
ucsc	chrUn_DS484489v1	2008	81.98	48.61
ucsc	chrUn_DS485495v1	1126	78.5	40.59
ucsc	chrUn_DS484923v1	1343	76.95	36.63
uc