# Non-coding regions

Brian wants a wiggle track with just the non-coding regions. My pipeline produces BedGraphs, which are just a special type of BED file. I should be able to extend the gene models slightly (say 20bp), then take the inverse of the gene models and produce a BED file. The BedGraphs are stranded, so it probably makes sense to create a separate inverse BED for each strand. Then I can use BedTools to intersect the BedGraph with the inverse BED and convert the result to wiggles.

In [1]:
# %load ../start.py
# Load useful extensions

# Activate the autoreload extension for easy reloading of external packages
%reload_ext autoreload
%autoreload 2

# Turn on the watermark
%reload_ext watermark
%watermark -u -d -g

# Load ipycache extension
%reload_ext ipycache
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Add project library to path
import sys
sys.path.insert(0, '../../lib/python')

# The usual suspects
import os
import numpy as np
import pandas as pd

# plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_context('poster')

# Turn off scientific notation
np.set_printoptions(precision=5, suppress=True)


last updated: 2017-04-20 
Git hash: 9ab597bab00e615cd19772b4adfdc95a7ccffa27


In [4]:
# Additional Imports
import pybedtools

# Import GTF
# Absolute paths to the dmel r6-11 reference files: the chromosome-sizes table
# (passed to slop as `g` further down) and the GTF annotation.
chromSizes = '/data/LCDB/lcdb-references/dmel/r6-11/fasta/dmel_r6-11.chromsizes'
GTF = '/data/LCDB/lcdb-references/dmel/r6-11/gtf/dmel_r6-11.gtf'
# Load the annotation, drop records pybedtools reports as invalid, and save the
# result so later cells can iterate over `gtf` more than once.
gtf = pybedtools.BedTool(GTF).remove_invalid().saveas()

# Subset functions for pybedtools
def featuretype_filter(feature, featuretype):
    """Return True when the feature's third field equals `featuretype`.

    Intended as a predicate for `BedTool.filter`; `feature` only needs to
    support integer indexing (index 2 is compared against `featuretype`).
    """
    return feature[2] == featuretype

def subset_featuretypes(g, featuretype):
    """Return a BedTool holding only the features of `g` with the given type.

    `g` is filtered with `featuretype_filter`, the filtered result is saved
    with `saveas()`, and a new BedTool is opened on the saved file's path.
    """
    matching = g.filter(featuretype_filter, featuretype)
    saved = matching.saveas()
    return pybedtools.BedTool(saved.fn)

def strand_filter(feature, strand):
    """Return True when the feature's fourth field equals `strand`.

    Intended as a predicate for `BedTool.filter`. Note that the comparison is
    against index 3, not the BED6 strand column (index 5); in this notebook it
    is applied to the output of a strand-specific merge.
    """
    return feature[3] == strand

def subset_strand(g, strand):
    """Return a BedTool holding only the features of `g` on the given strand.

    `g` is filtered with `strand_filter`, the filtered result is saved with
    `saveas()`, and a new BedTool is opened on the saved file's path.
    """
    matching = g.filter(strand_filter, strand)
    saved = matching.saveas()
    return pybedtools.BedTool(saved.fn)

# pull out all exons and convert them to BED6
exons = subset_featuretypes(gtf, 'exon').bed6()

# Merge overlapping regions strand specifically (sorted first, then merged with s=True)
merged = exons.sort().merge(s=True, bed=True)

# Extend features by 20bp (b=20), using the chromosome sizes file as the genome
extended = merged.slop( g=chromSizes, b=20)

# split by strand
# subset_strand compares the 4th field of each record, so this step relies on
# the merged output carrying the strand in that column.
# NOTE(review): confirm this holds for the installed bedtools version.
# Each kept record is rebuilt as six columns: the first three fields, '.'
# placeholders in columns 4 and 5, and the strand in column 6.
exons_plus = pybedtools.BedTool([x[:3] + ['.', '.', '+'] for x in subset_strand(extended, '+')])
exons_minus = pybedtools.BedTool([x[:3] + ['.', '.', '-'] for x in subset_strand(extended, '-')])

In [2]:
def add_strand(feature, strand='+'):
    """Blank the feature's name and stamp it with `strand`.

    The feature is modified in place: `name` is set to '.' and `strand` to the
    given value (default '+'). The same object is returned so the function
    can be used as a per-feature transform.
    """
    feature.name = '.'
    feature.strand = strand
    return feature

In [5]:
exons_plus.head()

chr2L	7508	8136	.	.	+
 chr2L	8172	9504	.	.	+
 chr2L	21931	22961	.	.	+
 chr2L	22977	24257	.	.	+
 chr2L	54796	55787	.	.	+
 chr2L	65978	66262	.	.	+
 chr2L	66297	66632	.	.	+
 chr2L	66655	67023	.	.	+
 chr2L	67022	67527	.	.	+
 chr2L	67548	67782	.	.	+
 

In [111]:
# Copy one sample's `.fq.first.bedgraph` file into the current directory.
# NOTE(review): the following cell loads the bedGraph from the relative
# ../../output path rather than from this local copy — confirm the copy is needed.
!cp /home/fearjm/Projects/ncbi_remap/output/alignment/raw/ERX455041/ERR489286/ERR489286.fq.first.bedgraph .

In [124]:
# Load the sample bedGraph (saved so it can be reused) and the BED of regions to keep.
bg = pybedtools.BedTool('../../output/alignment/raw/ERX455041/ERR489286/ERR489286.fq.first.bedgraph').saveas()
# NOTE(review): the code that writes inverse_exons_20bp.first.bed is not shown in
# this notebook. The intersect output further down starts at the same coordinates
# as exons_plus.head() (e.g. chr2L 7508-8136), so this file may contain the
# extended exons rather than their complement — confirm before using the track.
bed = pybedtools.BedTool('../../output/inverse_exons_20bp.first.bed')

In [126]:
# Keep only the parts of the bedGraph intervals that overlap regions in `bed`,
# then sort the result. No strand option is passed to intersect here.
bg_bed = bg.intersect(bed).sort()

In [127]:
bg_bed.head()

chr2L	7508	8136	0
 chr2L	8172	9504	0
 chr2L	21931	22340	0
 chr2L	22340	22400	1
 chr2L	22400	22961	0
 chr2L	22977	23270	0
 chr2L	23270	23330	1
 chr2L	23330	23600	0
 chr2L	23600	23660	1
 chr2L	23660	24257	0
 

In [134]:
# Collapse overlapping/adjacent bedGraph rows into one row per region, reporting
# the maximum of column 4 (the bedGraph value) across the merged rows.
bg_bed.merge(c=4, o='max').head()

chr2L	7508	8136	0
 chr2L	8172	9504	0
 chr2L	21931	22961	1
 chr2L	22977	24257	1
 chr2L	54796	55787	0
 chr2L	65978	66262	0
 chr2L	66297	66632	0
 chr2L	66655	67527	8
 chr2L	67548	67782	22
 chr2L	67871	68043	18
 