# Import

In [148]:
import os
import glob
import pandas as pd
#from multiprocessing import Pool

import bed_lib as bl
import divtrans as dt


%matplotlib inline

In [2]:
# To not have to restart the kernel everytime changes are made in modules
%load_ext autoreload
%autoreload 2

**WARNINGS**

- TRY filtering only uniquely mapping reads (MAPQ 255) and compare with filtering only primary 

**TO EXPLAIN**

There is a rationale being only using reads (and not fragments), since we are only interested in ends of - reads first and then beginnings of + reads 
> Actually might be interesting to try using only one reads of the pair (TO CHECK)

## Input

In [3]:
bams = glob.glob('../data/bams/*_coordSort_filt.bam')
bams

['../data/bams/VEUDE_A_1_coordSort_filt.bam',
 '../data/bams/Compoud_A_1_coordSort_filt.bam']

# Analyse from bams

The idea here is to detect divergent transcription directly from a sorted bam.
This would fix some limitations of the first method.

ALGO: 
Look through the coordinated sorted bam file, registering the transcribed strand for each reads.

Since we have paired strand-specific libraries (rf):
- first of the pair, reverse complemented = + strand
- second of the pair, NOT reverse complemented = + strand
- first of the pair, NOT reverse complemented = - strand
- second of the pair, reverse complemented = - strand

When a change of sign is detected, calculate the distance with the previous transcript on the opposite strand 

## Detect divergent transcription events accross sorted bam

In [4]:
# Create orientated read objects, detect divergent transcription events with them
# And create a bed file of these divergent transcription events

def get_divergent_transcription_beds(bam):
    oread_gen = dt.get_oriented_reads(bam)    
    bed_out = os.path.join('../results/from_bam', 
                           os.path.basename(bam.replace('.bam', '_divtrans.bed')))
    
    with open(bed_out, 'w') as f:
        for interval in dt.identify_divergent_transcription(oread_gen):
            f.write('\t'.join([str(x) for x in interval]) + '\n')
        
    return bl.Bed(bed_out)

In [126]:
div_trans_beds = [get_divergent_transcription_beds(bam) for bam in bams]

Merge the divergent transcription events detected:

In [127]:
div_trans_beds_merged = [bed.merge(outfolder='../results/from_bam') for bed in div_trans_beds]

# Compare to FANTOM5 CAGE enhancer prediction

One can use SLIDEBASE for enhancer http://slidebase.binf.ku.dk/human_enhancers/results

Downloading bed from SLIDEBASE (same as SLIDEBASE "Download BED"):

In [128]:
%%bash
    
wget http://slidebase.binf.ku.dk/human_enhancers/bed -O ../data/slidebase_enhancers.bed

# Convert to Ensembl type bed
sed 's/^chr//g' ../data/slidebase_enhancers.bed | bedtools sort -i - > ../data/slidebase_enhancers_clean.bed

rm ../data/slidebase_enhancers.bed

--2018-05-18 14:59:35--  http://slidebase.binf.ku.dk/human_enhancers/bed
Resolving slidebase.binf.ku.dk... 130.226.13.22
Connecting to slidebase.binf.ku.dk|130.226.13.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/plain]
Saving to: ‘../data/slidebase_enhancers.bed’

     0K .......... .......... .......... .......... ..........  857K
    50K .......... .......... .......... .......... .......... 1,47M
   100K .......... .......... .......... .......... .......... 78,4M
   150K .......... .......... .......... .......... .......... 90,5M
   200K .......... .......... .......... .......... .......... 1,74M
   250K .......... .......... .......... .......... .......... 74,6M
   300K .......... .......... .......... .......... .......... 97,3M
   350K .......... .......... .......... .......... ..........  101M
   400K .......... .......... .......... .......... .......... 2,04M
   450K .......... .......... .......... .......... .......... 78

In [210]:
slidebase_enh = bl.Bed('../data/slidebase_enhancers_clean.bed')

Get the chromosome sizes for statistical testing of interval association:

In [130]:
bl.get_chrom_sizes('/home/khourhin/data/genomes/Homo_sapiens.GRCh37.dna_sm.primary_assembly.fa', '../data/genome_chro_size.tab')

seq_names
1             249250621
10            135534747
11            135006516
12            133851895
13            115169878
14            107349540
15            102531392
16             90354753
17             81195210
18             78077248
19             59128983
2             243199373
20             63025520
21             48129895
22             51304566
3             198022430
4             191154276
5             180915260
6             171115067
7             159138663
8             146364022
9             141213431
MT                16569
X             155270560
Y              59373566
GL000192.1       547496
GL000225.1       211173
GL000194.1       191469
GL000193.1       189789
GL000200.1       187035
                ...    
GL000233.1        45941
GL000237.1        45867
GL000230.1        43691
GL000242.1        43523
GL000243.1        43341
GL000241.1        42152
GL000236.1        41934
GL000240.1        41933
GL000206.1        41001
GL000232.1        40652
GL0002

Testing if significant association between intervals detected with divergent transcription and the ones reported in fantom5 CAGE enhancer dataset:


In [131]:
[bed.fisher(slidebase_enh, '../data/genome_chro_size.tab') for bed in div_trans_beds_merged]

[{'bed1': 'VEUDE_A_1_coordSort_filt_divtrans_M',
  'bed2': 'slidebase_enhancers_clean',
  'left': 0.0,
  'ratio': 2.439,
  'right': 1.0,
  'two-tail': 0.0},
 {'bed1': 'Compoud_A_1_coordSort_filt_divtrans_M',
  'bed2': 'slidebase_enhancers_clean',
  'left': 0.0,
  'ratio': 2.687,
  'right': 1.0,
  'two-tail': 0.0}]

Evaluating the distance between enhancer intervals defined by fantom5 and the ones predicted from RNA-seq data:

In [187]:
pd.concat([bed.jaccard(slidebase_enh) for bed in div_trans_beds_merged])

Unnamed: 0,intersection,union-intersection,jaccard,n_intersections
VEUDE_A_1_coordSort_filt_divtrans_M,350683,58647364,0.00598,2779
Compoud_A_1_coordSort_filt_divtrans_M,432884,62644295,0.00691,3239


## Stats on divergent transcription intervals

In [152]:
all_beds = div_trans_beds_merged + [slidebase_enh] + inter_prediction_VS_cage
pd.DataFrame({bed.name: bed.stats(full_report=True) for bed in all_beds})

Unnamed: 0,Compoud_A_1_coordSort_filt_divtrans_M,Compoud_A_1_coordSort_filt_divtrans_M-inter-slidebase_enhancers_clean,VEUDE_A_1_coordSort_filt_divtrans_M,VEUDE_A_1_coordSort_filt_divtrans_M-inter-slidebase_enhancers_clean,slidebase_enhancers_clean
Nbases,53104738,432876,49025612,350683,9972201
Nintervals,227676,3239,213764,2779,32693
bedobj,<Bed object: Compoud_A_1_coordSort_filt_divtra...,<Bed object: Compoud_A_1_coordSort_filt_divtra...,<Bed object: VEUDE_A_1_coordSort_filt_divtrans_M>,<Bed object: VEUDE_A_1_coordSort_filt_divtrans...,<Bed object: slidebase_enhancers_clean>
len_distrib_25%,129,53.5,129,50,190
len_distrib_50%,180,109,178,105,288
len_distrib_75%,247,182,244,174,385
len_distrib_count,227676,3239,213764,2779,32693
len_distrib_max,21738,1057,22406,955,2860
len_distrib_mean,233.247,133.645,229.345,126.19,305.026
len_distrib_min,0,0,0,1,2


## Filter detected intervals based on counts

In [215]:
a = div_trans_beds_merged[0].coverage(bams[0])

CalledProcessError: Command 'bedtools coverage -a ../results/from_bam/VEUDE_A_1_coordSort_filt_divtrans_M.bed -b ../data/bams/VEUDE_A_1_coordSort_filt.bam > bed_outfolder/VEUDE_A_1_coordSort_filt_divtrans_M-coveredby-VEUDE_A_1_coordSort_filt.bam.bed' returned non-zero exit status 137.

In [None]:
a.head()

In [202]:
isinstance(slidebase_enh, Bed)

True