# Import

In [1]:
import os
import glob
import pandas as pd
#from multiprocessing import Pool

from bed_lib import Bed
import divtrans as dt


%matplotlib inline

In [2]:
# Avoid having to restart the kernel every time a change is made in the modules
%load_ext autoreload
%autoreload 2

**WARNINGS**

- TRY keeping only uniquely mapping reads (MAPQ 255) and compare with keeping only primary alignments

**TO EXPLAIN**

There is a rationale behind using only reads (and not fragments): we are only interested in the ends of `-` reads first, and then the beginnings of `+` reads.
> It might actually be interesting to try using only one read of each pair (TO CHECK)

## Input

In [3]:
# Gather the coordinate-sorted, filtered bam files to analyse
bam_pattern = '../data/bams/*_coordSort_filt.bam'
bams = glob.glob(bam_pattern)
bams

['../data/bams/VEUDE_A_1_coordSort_filt.bam',
 '../data/bams/Compoud_A_1_coordSort_filt.bam']

# Analyse from bams

The idea here is to detect divergent transcription directly from a sorted bam.
This would fix some limitations of the first method.

ALGO: 
Look through the coordinate-sorted bam file, registering the transcribed strand of each read.

Since we have paired strand-specific libraries (rf):
- first of the pair, reverse complemented = + strand
- second of the pair, NOT reverse complemented = + strand
- first of the pair, NOT reverse complemented = - strand
- second of the pair, reverse complemented = - strand

When a change of sign is detected, calculate the distance to the previous transcript on the opposite strand.

## Detect divergent transcription events across sorted bam

In [4]:
# Create oriented read objects, detect divergent transcription events from them,
# and write these events to a bed file

def get_divergent_transcription_beds(bam, outfolder='../results/from_bam'):
    """Detect divergent transcription events in a bam and save them as a bed file.

    The reads produced by ``dt.get_oriented_reads(bam)`` are handed to
    ``dt.identify_divergent_transcription``; every interval it yields is
    written as one tab-separated line of the output file.

    Parameters
    ----------
    bam : str
        Path to the input bam file (the notebook feeds coordinate-sorted bams).
    outfolder : str, optional
        Folder receiving the bed file. It is created when missing.
        Defaults to '../results/from_bam', the location that was
        previously hard-coded.

    Returns
    -------
    Bed
        ``Bed`` object built from the path of the bed file just written,
        i.e. ``<outfolder>/<bam name without .bam>_divtrans.bed``.
    """
    oread_gen = dt.get_oriented_reads(bam)

    # Build the output name from the file name only, and strip just the
    # trailing '.bam': a plain str.replace would also rewrite any other
    # '.bam' occurring inside the name (e.g. 'a.bam.sorted.bam').
    bam_name = os.path.basename(bam)
    if bam_name.endswith('.bam'):
        bam_name = bam_name[:-len('.bam')]
    # The suffix is always appended so the output can never reuse the
    # input's own file name, even when the input lacks a '.bam' extension.
    bed_out = os.path.join(outfolder, bam_name + '_divtrans.bed')

    # open(..., 'w') does not create missing folders, so make sure the
    # destination exists before writing.
    if outfolder and not os.path.isdir(outfolder):
        os.makedirs(outfolder)

    with open(bed_out, 'w') as f:
        for interval in dt.identify_divergent_transcription(oread_gen):
            f.write('\t'.join([str(x) for x in interval]) + '\n')

    return Bed(bed_out)

In [5]:
# Run the detection on every input bam, keeping one Bed object per file
div_trans_beds = list(map(get_divergent_transcription_beds, bams))

Merge the divergent transcription events detected:

In [10]:
# Call merge on each bed in turn, sending the output to the results folder
div_trans_beds_merged = []
for divtrans_bed in div_trans_beds:
    merged_bed = divtrans_bed.merge(outfolder='../results/from_bam')
    div_trans_beds_merged.append(merged_bed)

## Stats on divergent transcription intervals

In [45]:
# Optional plotting of each merged bed, currently disabled:
#[bed.plot() for bed in div_trans_beds_merged]

# One column of full-report statistics per merged bed, keyed by the bed name
stats_by_bed = {}
for merged_bed in div_trans_beds_merged:
    stats_by_bed[merged_bed.name] = merged_bed.stats(full_report=True)
pd.DataFrame(stats_by_bed)

Unnamed: 0,Compoud_A_1_coordSort_filt_divtrans_M,VEUDE_A_1_coordSort_filt_divtrans_M
Nbases,53104738,49025612
Nintervals,227676,213764
bedobj,<Bed object: Compoud_A_1_coordSort_filt_divtra...,<Bed object: VEUDE_A_1_coordSort_filt_divtrans_M>
len_distrib_25%,129,129
len_distrib_50%,180,178
len_distrib_75%,247,244
len_distrib_count,227676,213764
len_distrib_max,21738,22406
len_distrib_mean,233.247,229.345
len_distrib_min,0,0
