In [1]:
# Load useful extensions

# Activate the autoreload extension for easy reloading of external packages
%reload_ext autoreload
%autoreload 1

# Set up cashdir
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Trun on the water mark
%reload_ext watermark
%watermark -u -d -v

# Load ipycache extension
%reload_ext ipycache
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Add project library to path
import sys
sys.path.insert(0, '../../lib/python')

%matplotlib inline

last updated: 2016-12-02 

CPython 3.5.2
IPython 5.1.0


In [2]:
# imports
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ipycache
from IPython.display import Markdown

import gffutils

sns.set_context('notebook')

In [3]:
# Import DB
db = gffutils.FeatureDB('/data/Oliverlab/references/genomes/Dmelanogaster/dm6/annotation/FlyBase_6.09/dmel-all-r6.09.chr.gtf.db')

# Sanity Check of GTF

FlyBase GTF available types:

In [4]:
# List of feature types
list(db.featuretypes())

['3UTR',
 '5UTR',
 'CDS',
 'exon',
 'gene',
 'mRNA',
 'miRNA',
 'ncRNA',
 'pre_miRNA',
 'pseudogene',
 'rRNA',
 'snRNA',
 'snoRNA',
 'start_codon',
 'stop_codon',
 'tRNA',
 'transcript']

In the GTF there are both 'mRNA' and 'transcript'. I want to make sure I understand the differences. There are fewer mRNA compared to transcripts, which is because 'transcript' includes:

* mRNA
* pseudogenes
* rRNA
* tRNA
* snRNA
* snoRNA
* ncRNA


In [5]:
# There are two feature types that could be of interest, need to figure why they are different
mrna = db.features_of_type('mRNA')
ts = db.features_of_type('transcript')

# Make list of (id, start, end)
mrnas = [(x.id, x.start, x.end) for x in mrna]
tss = [(x.id, x.start, x.end) for x in ts]

# Print differences in length
# There are more tss than mRNAs, are mRNAs a subset?
print('mRNAs: {}\ntranscripts:{}'.format(len(mrnas), len(tss)))

# Besides mRNA there are also these 
mirnas = [(x.id, x.start, x.end) for x in db.features_of_type('miRNA')]
ncrnas = [(x.id, x.start, x.end) for x in db.features_of_type('ncRNA')]
pseudogenes = [(x.id, x.start, x.end) for x in db.features_of_type('pseudogene')]
rrnas = [(x.id, x.start, x.end) for x in db.features_of_type('rRNA')]
snrnas = [(x.id, x.start, x.end) for x in db.features_of_type('snRNA')]
snornas = [(x.id, x.start, x.end) for x in db.features_of_type('snoRNA')]
trnas = [(x.id, x.start, x.end) for x in db.features_of_type('tRNA')]

# yes mRNA are a subset of tss with everything but miRNAs
assert len(tss) == np.sum([len(ncrnas), len(pseudogenes), len(rrnas), len(snrnas), len(snornas), len(trnas), len(mrnas)])

mRNAs: 30446
transcripts:34411


# Basic Counts

In [6]:
Markdown("""\
**Number of Genes:** {:,}
""".format(len(list(db.features_of_type('gene')))))

**Number of Genes:** 17,646


## Quartile distribution of the number of {transcripts, exons, introns}

In [202]:
# Count the number of transcripts, exons, introns per Gene
gene_ts = []
gene_exon = []
gene_intron = []
for gene in db.features_of_type('gene'):
    exons = []
    introns = []
    for ts in db.children(gene, featuretype='transcript'):
        gene_ts.append(gene.id)
        ts_exons = sorted(list(db.children(ts, featuretype='exon')), key=lambda x: (x.start, x.end))
        exons.extend(ts_exons)
        for i in range(len(exons)-1):
            s = exons[i].end + 1
            e = exons[i+1].start
            l = e - s
            introns.append((s, e, l))
    gene_exon.extend([gene.id]*len(set(exons)))
    gene_intron.extend([gene.id]*len(set(introns)))

num_ts_per_gene = Counter(gene_ts)
num_exon_per_gene = Counter(gene_exon)
num_intron_per_gene = Counter(gene_intron)

# Munge counts to DataFrame
gene_cnts = pd.concat([pd.DataFrame.from_dict(num_ts_per_gene, orient='index'), 
                       pd.DataFrame.from_dict(num_exon_per_gene, orient='index'),
                       pd.DataFrame.from_dict(num_intron_per_gene, orient='index')], axis=1)

gene_cnts.columns = ['Number Transcripts Per Gene',
                     'Number Exons Per Gene',
                     'Number Introns Per Gene']

gene_cnts = gene_cnts.fillna(0).astype(int)

# Look at Quartiles
q = [0, 25, 50, 75, 100]
dd = pd.DataFrame({'Number Transcripts Per Gene': np.percentile(gene_cnts['Number Transcripts Per Gene'], q),
                   'Number Exons Per Gene': np.percentile(gene_cnts['Number Exons Per Gene'], q),
                   'Number Introns Per Gene': np.percentile(gene_cnts['Number Introns Per Gene'], q)},
           index=['Minimum', '25th Percentile', 'Median', '75th Percentile', 'Maximum']).applymap(lambda x: '{:,.0f}'.format(x))

dd[['Number Transcripts Per Gene', 'Number Exons Per Gene', 'Number Introns Per Gene']]

Unnamed: 0,Number Transcripts Per Gene,Number Exons Per Gene,Number Introns Per Gene
Minimum,1,1,0
25th Percentile,1,2,1
Median,1,4,2
75th Percentile,2,9,6
Maximum,75,1798,210


## Gene counts by number of {transcripts, exons, introns}

In [201]:
# Break into bins and count
bins = [1, 5, 10, 20, 40, 80, 160, 420, 840, 1680, 99999]
labels = [
    '1 < x ≤ 5', 
    '5 < x ≤ 10', 
    '10 < x ≤ 20', 
    '20 < x ≤ 40', 
    '40 < x ≤ 80', 
    '80 < x ≤ 160', 
    '160 < x ≤ 420', 
    '420 < x ≤ 840', 
    '840 < x ≤ 1,680', 
    '1,680 < x',
]

# transcripts
dfTS = pd.cut(gene_cnts['Number Transcripts Per Gene'], bins=bins, labels=labels)
## Add 0 and 1 group
dfTS.cat.add_categories(['0', '1'], inplace=True)
dfTS[gene_cnts['Number Transcripts Per Gene'] == 0] = '0'
dfTS[gene_cnts['Number Transcripts Per Gene'] == 1] = '1'

# exons
dfExon = pd.cut(gene_cnts['Number Exons Per Gene'], bins=bins, labels=labels)
## Add 0 and 1 group
dfExon.cat.add_categories(['0', '1'], inplace=True)
dfExon[gene_cnts['Number Exons Per Gene'] == 0] = '0'
dfExon[gene_cnts['Number Exons Per Gene'] == 1] = '1'

# introns
dfIntron = pd.cut(gene_cnts['Number Introns Per Gene'], bins=bins, labels=labels)
## Add 0 and 1 group
dfIntron.cat.add_categories(['0', '1'], inplace=True)
dfIntron[gene_cnts['Number Introns Per Gene'] == 0] = '0'
dfIntron[gene_cnts['Number Introns Per Gene'] == 1] = '1'

# munge to data frame
df3 = pd.concat([dfTS.groupby(dfTS).count().map(lambda x: '{:,}'.format(x)),
           dfExon.groupby(dfExon).count().map(lambda x: '{:,}'.format(x)),
           dfIntron.groupby(dfIntron).count().map(lambda x: '{:,}'.format(x))], axis=1)

df3.index.name = 'Number Per Gene'

df3.columns = ['Number Genes (Transcripts)',
               'Number Genes (Exons)',
               'Number Genes (Introns)']

df3.loc[['0', '1'] + labels, ['Number Genes (Transcripts)', 'Number Genes (Exons)', 'Number Genes (Introns)']]

Unnamed: 0_level_0,Number Genes (Transcripts),Number Genes (Exons),Number Genes (Introns)
Number Per Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,0,4191
1,10035,4191,2856
1 < x ≤ 5,6571,6765,5991
5 < x ≤ 10,639,2697,2547
10 < x ≤ 20,143,1759,1379
20 < x ≤ 40,17,1103,393
40 < x ≤ 80,2,545,48
80 < x ≤ 160,0,261,1
160 < x ≤ 420,0,78,1
420 < x ≤ 840,0,5,0


# Intron Length Summary

## Quartile distribution of intron length

In [203]:
# Get list of introns
# Iterate over each gene and transcript to get a list of introns
introns = set()
for ts in db.features_of_type('transcript'):
    exons = sorted(list(db.children(ts, featuretype='exon')), key=lambda x: (x.start, x.end))
    for i in range(len(exons)-1):
        s = exons[i].end + 1
        e = exons[i+1].start
        l = e - s
        introns.add((s, e, l))

# Summarize Intron length
pd.DataFrame(['{:,.0f}'.format(x) for x in np.percentile([x[2] for x in introns], [0, 25, 50, 75, 100])],
             index=['Minimum', '25th Percentile', 'Median', '75th Percentile', 'Maximum'], columns=['Intron Length (bp)'])

Unnamed: 0,Intron Length (bp)
Minimum,2
25th Percentile,62
Median,101
75th Percentile,751
Maximum,268107


## Intron counts by intron length

In [207]:
# Summarize Number of introns with various lengths
df = pd.DataFrame(list(introns), columns=['start', 'end', 'length'])

bins = [0, 100, 500, 1000, 5000, 10000, 50000, 100000, 999999]
labels = ['0 < x ≤ 100', 
          '100 < x ≤ 500', 
          '500 < x ≤ 1,000', 
          '1,000 < x ≤ 5,000', 
          '5,000 < x ≤ 10,000', 
          '10,000 < x ≤ 50,000', 
          '50,000 < x ≤ 100,000', 
          '100,000 < x',
         ]

dfIn = pd.cut(df.length, bins=bins, labels=labels)

dd = pd.DataFrame(dfIn.groupby(dfIn).count().map(lambda x: '{:,}'.format(x)))
dd.index.name = 'Intron Length (bp)'
dd.columns = ['Number of Introns']
dd

Unnamed: 0_level_0,Number of Introns
Intron Length (bp),Unnamed: 1_level_1
0 < x ≤ 100,30130
100 < x ≤ 500,12130
"500 < x ≤ 1,000",4918
"1,000 < x ≤ 5,000",8420
"5,000 < x ≤ 10,000",2226
"10,000 < x ≤ 50,000",2237
"50,000 < x ≤ 100,000",172
"100,000 < x",38


# Exon Length Summary

## Quartile distribution of exon lengths

In [210]:
# Get list of exons
exons = [(x.start, x.end, len(x)) for x in db.features_of_type('exon')]

# Summarize exon length
pd.DataFrame(['{:,.0f}'.format(x) for x in np.percentile([x[2] for x in exons], [0, 25, 50, 75, 100])],
             index=['Minimum', '25th Percentile', 'Median', '75th Percentile', 'Maximum'], columns=['Exon Length (bp)'])

Unnamed: 0,Exon Length (bp)
Minimum,1
25th Percentile,145
Median,252
75th Percentile,556
Maximum,28074


## Exon counts by exon length

In [221]:
# Summarize Number of introns with various lengths
df = pd.DataFrame(exons, columns=['start', 'end', 'length'])
bins = [0, 100, 500, 1000, 5000, 10000, 50000, 100000, 999999]
labels = ['0 < x ≤ 100', 
          '100 < x ≤ 500', 
          '500 < x ≤ 1,000', 
          '1,000 < x ≤ 5,000', 
          '5,000 < x ≤ 10,000', 
          '10,000 < x ≤ 50,000', 
          '50,000 < x ≤ 100,000', 
          '100,000 < x',
         ]

dfEx = pd.cut(df.length, bins=bins, labels=labels)

dd = pd.DataFrame(dfEx.groupby(dfEx).count())
dd.index.name = 'Exon Length (bp)'
dd.columns = ['Number of Exons']

# Make sure my counts add up to the total
assert dd.sum().values == len(df)

dd.applymap(lambda x: '{:,}'.format(x))

Unnamed: 0_level_0,Number of Exons
Exon Length (bp),Unnamed: 1_level_1
0 < x ≤ 100,21748
100 < x ≤ 500,112814
"500 < x ≤ 1,000",30432
"1,000 < x ≤ 5,000",21018
"5,000 < x ≤ 10,000",613
"10,000 < x ≤ 50,000",58
"50,000 < x ≤ 100,000",0
"100,000 < x",0
