In [1]:
import pandas as pd

import sys
import os
import numpy as np

import scipy.sparse as sp
import scipy.io as spio

import matplotlib.pyplot as plt
import matplotlib.cm as cm

from scipy.stats import norm

from prepare_aparent_data_helpers import *

import isolearn.io as isoio


<h2>Load and Aggregate native cell type-specific data</h2>
<br/>
Load the processed APADB and Leslie data.<br/>
Aggregate isoform counts per tissue/cell type and dump them as a dataframe.<br/>

In [2]:
#Load processed leslie and apadb data

# Suffix selecting which version of the processed files is loaded.
load_suffix = '_wider_v2'

file_path = 'native_data/processed_data/leslie_apadb/final/'

# NOTE: allow_pickle=True unpickles arbitrary Python objects; only load trusted files.
index = np.load(file_path + 'leslie_apadb_index' + load_suffix + '.npy', allow_pickle=True)

df = pd.read_csv(file_path + 'leslie_apadb_data' + load_suffix + '.csv', sep=',')
gene_index = np.load(file_path + 'apadb_gene_index' + load_suffix + '.npy', allow_pickle=True)

# Cell type names, and the per-cell-type cleavage count matrices keyed by those names.
leslie_cell_type_index = np.load(file_path + 'apadb_celltype_index' + load_suffix + '.npy', allow_pickle=True)
leslie_cleavage_count_matrix_dict = spio.loadmat(file_path + 'apadb_cleavage_count' + load_suffix + '.mat')
leslie_cleavage_count_matrix_dict_wide = spio.loadmat(file_path + 'apadb_cleavage_count_wide_ext' + load_suffix + '.mat')

df['apadb_count_pooled'] = np.load(file_path + 'apadb_orig_count' + load_suffix + '.npy')
df['apadb_total_count_pooled'] = np.load(file_path + 'apadb_orig_total_count' + load_suffix + '.npy')

# Remember each row's position so the arrays/matrices above can be filtered
# in step with the dataframe later on.
# (np.int was removed in NumPy 1.24; the builtin int is the same dtype.)
df['row_index'] = np.arange(len(df), dtype=int)

In [3]:
#Rename cell type names

new_cell_type_index = []
for cell_type in leslie_cell_type_index :
    # Normalized name: dashes removed, dots replaced by underscores.
    new_cell_type = cell_type.replace('-', '').replace('.', '_')
    new_cell_type_index.append(new_cell_type)
    
    # Re-key both count-matrix dicts under the normalized name.
    leslie_cleavage_count_matrix_dict[new_cell_type] = leslie_cleavage_count_matrix_dict[cell_type]
    leslie_cleavage_count_matrix_dict_wide[new_cell_type] = leslie_cleavage_count_matrix_dict_wide[cell_type]
    
    # Blank out the old key only when the name actually changed
    # (otherwise the entry just written would be wiped).
    if new_cell_type != cell_type :
        leslie_cleavage_count_matrix_dict[cell_type] = None
        leslie_cleavage_count_matrix_dict_wide[cell_type] = None

# (np.object was removed in NumPy 1.24; the builtin object is the same dtype.)
leslie_cell_type_index = np.array(new_cell_type_index, dtype=object)

In [4]:
#Do some pre-filtering

print('Size before filtering = ' + str(len(df)))

# Keep rows with apadb_total_count_pooled >= 100, num_sites >= 2 and pas != -1.
df = df.query("apadb_total_count_pooled >= 100 and num_sites >= 2 and pas != -1").copy().reset_index(drop=True)

# 'row_index' still holds the pre-filter positions of the surviving rows;
# use it to apply the same selection to the parallel arrays and matrices.
keep_rows = df['row_index'].values

index = index[keep_rows]
gene_index = gene_index[keep_rows]

for cell_type in leslie_cell_type_index :
    leslie_cleavage_count_matrix_dict[cell_type] = leslie_cleavage_count_matrix_dict[cell_type][keep_rows, :]
    leslie_cleavage_count_matrix_dict_wide[cell_type] = leslie_cleavage_count_matrix_dict_wide[cell_type][keep_rows, :]

# Renumber rows so 'row_index' matches positions in the filtered data.
# (np.int was removed in NumPy 1.24; the builtin int is the same dtype.)
df = df.drop(columns=['row_index'])
df['row_index'] = np.arange(len(df), dtype=int)

print('Size after filtering = ' + str(len(df)))


Size before filtering = 59731
Size after filtering = 51964


In [5]:
#Add apadb tissue-specific counts to dataframe

# tissue_dict[tissue][gene_symbol] -> list of site records, one per row of the tissue file.
tissue_dict = {}

for tissue in ['kidney', 'pancreas', 'monocytes', 'all', 'pdac', 'prcc', 'full_blood', 'hlf'] :
    tissue_df = pd.read_csv('native_data/processed_data/apadb/apadb_' + tissue + '_tissue_data.csv', sep='\t')
    
    tissue_dict[tissue] = {}
    for gene, tissue_gene_df in tissue_df.groupby('gene_symbol') :
        tissue_dict[tissue][gene] = [
            {
                'reads_supporting_site' : row['reads_supporting_site'],
                'total_count' : row['total_count'],
                'start' : row['start'],
                'end' : row['end']
            }
            for _, row in tissue_gene_df.iterrows()
        ]

for tissue in tissue_dict :
    print('Aggregating counts for tissue = ' + str(tissue))
    
    tissue_counts = []
    tissue_total_counts = []
    
    # Iterate over the needed columns only. The previous `for index, row in df.iterrows()`
    # overwrote the module-level `index` array loaded earlier in the notebook.
    for gene, site_cut_start, site_cut_end in zip(df['gene'], df['cut_start'], df['cut_end']) :
        tissue_count = 0.
        tissue_total_count = 0.
        
        tissue_sites = tissue_dict[tissue].get(gene, [])
        
        # The gene-level total is read from the gene's first site record.
        if len(tissue_sites) > 0 :
            tissue_total_count = tissue_sites[0]['total_count']
        
        # Take the reads of the first tissue site with an endpoint inside the cut window.
        # NOTE(review): a tissue site that fully contains the window (start before it,
        # end after it) is not matched by this test - confirm that this is intended.
        for tissue_site in tissue_sites :
            cand_start = int(tissue_site['start'])
            cand_end = int(tissue_site['end'])
            
            if (site_cut_start <= cand_start <= site_cut_end) or (site_cut_start <= cand_end <= site_cut_end) :
                tissue_count = float(tissue_site['reads_supporting_site'])
                break
        
        tissue_counts.append(tissue_count)
        tissue_total_counts.append(tissue_total_count)
    
    df['apadb_count_' + tissue] = tissue_counts
    df['apadb_total_count_' + tissue] = tissue_total_counts


Aggregating counts for tissue = kidney
Aggregating counts for tissue = pancreas
Aggregating counts for tissue = monocytes
Aggregating counts for tissue = all
Aggregating counts for tissue = pdac
Aggregating counts for tissue = prcc
Aggregating counts for tissue = full_blood
Aggregating counts for tissue = hlf


In [6]:
#Add leslie tissue-specific counts to dataframe

# Column window [cut_start, cut_end) of the cleavage count matrices that is summed per site.
# NOTE(review): with the 186-wide matrices assumed by the original code, an exclusive
# end of 185 leaves out the last column - confirm that this is intended.
cut_start = 0#57
cut_end = 185#87

leslie_cleavage_count_matrix_pooled = sp.lil_matrix(leslie_cleavage_count_matrix_dict[leslie_cell_type_index[0]].shape)

for cell_type in leslie_cell_type_index :
    print('Aggregating counts for cell type = ' + str(cell_type))
    
    leslie_cleavage_count_matrix = leslie_cleavage_count_matrix_dict[cell_type]
    leslie_cleavage_count_matrix_pooled += sp.coo_matrix(leslie_cleavage_count_matrix)
    
    # Sparse row sums come back as an (n_rows, 1) matrix; flatten to 1D before
    # storing them as a dataframe column.
    leslie_site_counts = np.asarray(leslie_cleavage_count_matrix[:, cut_start:cut_end].sum(axis=1)).ravel()
    
    df['leslie_count_' + cell_type] = leslie_site_counts
    # Gene-level total = sum of the site counts over all rows of the same gene.
    df['leslie_total_count_' + cell_type] = df.groupby('gene')['leslie_count_' + cell_type].transform('sum')


leslie_cleavage_count_matrix_pooled = sp.csr_matrix(leslie_cleavage_count_matrix_pooled)
leslie_site_counts_pooled = np.asarray(leslie_cleavage_count_matrix_pooled[:, cut_start:cut_end].sum(axis=1)).ravel()

df['leslie_count_pooled'] = leslie_site_counts_pooled
df['leslie_total_count_pooled'] = df.groupby('gene')['leslie_count_pooled'].transform('sum')

#Add apadb cut region measures

# Dense copies of the per-cell-type matrices, so rows can be sliced cheaply in the loop below.
leslie_cleavage_count_dense_matrix_dict = {}

leslie_count_dict_apadb_region = {}
for cell_type in leslie_cell_type_index :
    leslie_cleavage_count_dense_matrix_dict[cell_type] = np.array(leslie_cleavage_count_matrix_dict[cell_type].todense())
    leslie_count_dict_apadb_region[cell_type] = []

leslie_count_dict_apadb_region['pooled'] = []

# Width of one cleavage count row (previously hard-coded as 186).
n_cut_positions = leslie_cleavage_count_dense_matrix_dict[leslie_cell_type_index[0]].shape[1]

# Loop variables are named site_* so they do not overwrite the window constants above.
site_columns = zip(df['strand'], df['cut_start'], df['cut_end'], df['pas_pos'])
for i, (strand, site_cut_start, site_cut_end, pas_pos) in enumerate(site_columns) :
    
    if i % 10000 == 0 :
        print('Processing APA site ' + str(i) + '...')
    
    # Map the site's cut_start/cut_end onto matrix columns relative to pas_pos.
    # NOTE(review): the offsets 50 ('+' strand) and 56 (other strand) are taken as-is
    # from the original code - confirm them against how the matrices were built.
    # A negative start makes the slice count from the end of the row, which normally
    # yields an empty slice (count 0) - confirm that this is acceptable.
    if strand == '+' :
        start = site_cut_start - pas_pos + 50
        end = site_cut_end - pas_pos + 50
    else :
        start = pas_pos - site_cut_end + 56
        end = pas_pos - site_cut_start + 56
    
    pooled_cuts = np.zeros(n_cut_positions)
    
    for cell_type in leslie_cell_type_index :
        cuts = leslie_cleavage_count_dense_matrix_dict[cell_type][i, :]
        pooled_cuts += cuts
        
        tissue_count = np.sum(cuts[start:end])
        leslie_count_dict_apadb_region[cell_type].append(tissue_count)
    
    pooled_count = np.sum(pooled_cuts[start:end])
    leslie_count_dict_apadb_region['pooled'].append(pooled_count)


for cell_type in leslie_cell_type_index :
    print('Aggregating counts for cell type = ' + str(cell_type))
    
    df['leslie_count_apadb_region_' + cell_type] = leslie_count_dict_apadb_region[cell_type]
    df['leslie_total_count_apadb_region_' + cell_type] = df.groupby('gene')['leslie_count_apadb_region_' + cell_type].transform('sum')
    
df['leslie_count_apadb_region_pooled'] = leslie_count_dict_apadb_region['pooled']
df['leslie_total_count_apadb_region_pooled'] = df.groupby('gene')['leslie_count_apadb_region_pooled'].transform('sum')

# Drop the reference to the dense copies so their memory can be reclaimed.
leslie_cleavage_count_dense_matrix_dict = None

Aggregating counts for cell type = hek293
Aggregating counts for cell type = mcf10a_hras2
Aggregating counts for cell type = mcf10a1
Aggregating counts for cell type = mcf10a2
Aggregating counts for cell type = mcf10a_hras1
Aggregating counts for cell type = bcells1
Aggregating counts for cell type = mcf7
Aggregating counts for cell type = bcells2
Aggregating counts for cell type = ovary
Aggregating counts for cell type = breast
Aggregating counts for cell type = brain
Aggregating counts for cell type = skmuscle
Aggregating counts for cell type = blcl
Aggregating counts for cell type = hES
Aggregating counts for cell type = testis
Aggregating counts for cell type = hela
Aggregating counts for cell type = ntera
Processing APA site 0...
Processing APA site 10000...
Processing APA site 20000...
Processing APA site 30000...
Processing APA site 40000...
Processing APA site 50000...
Aggregating counts for cell type = hek293
Aggregating counts for cell type = mcf10a_hras2
Aggregating counts f

In [7]:
#Dump APADB and Leslie data

# Report the sizes of what is about to be written.
print('Size of dataframe = ', len(df), sep='')
print('Size of wide ext tissue cuts = ', leslie_cleavage_count_matrix_dict_wide['hek293'].shape, sep='')

# The dataframe is stored under 'df'; each cell type's wide cleavage count
# matrix is stored under the cell type's own name.
data_dump_dict = { 'df' : df }
data_dump_dict.update(
    (cell_type, leslie_cleavage_count_matrix_dict_wide[cell_type])
    for cell_type in leslie_cell_type_index
)

isoio.dump(data_dump_dict, 'prepared_data/apa_leslie_apadb_data_all_cuts/apa_leslie_apadb_data_all_cuts')


Size of dataframe = 51964
Size of wide ext tissue cuts = (51964, 356)


In [8]:

# Preview the first rows only, rather than rendering the whole dataframe.
df.head(10)


Unnamed: 0,gene_id,gene,sitenum,num_sites,pas,seq,seq_ext,wide_seq,wide_seq_ext,count,...,leslie_count_apadb_region_hES,leslie_total_count_apadb_region_hES,leslie_count_apadb_region_testis,leslie_total_count_apadb_region_testis,leslie_count_apadb_region_hela,leslie_total_count_apadb_region_hela,leslie_count_apadb_region_ntera,leslie_total_count_apadb_region_ntera,leslie_count_apadb_region_pooled,leslie_total_count_apadb_region_pooled
0,ABCB10.5,ABCB10,1,5,0,GTTAAAGATTGAAGCTATTGTCAAATGACAACTTTAAAAAGGCAAT...,TGGAATATTTTAATTAATATAGCATGGCACCTCATTTTCTTTTGCC...,TCAGGTTTTGTATTTTCTTTTCTTGTGGAATATTTTAATTAATATA...,GAGTTTTAATAATTGTAACTTTTTAAATGTCTATAGCACTGAAGTT...,10,...,0.0,15.0,0.0,9.0,0.0,53.0,0.0,94.0,0.0,2039.0
1,ABCB10.4,ABCB10,2,5,0,CTATTTCATGAAAAGCATGGAATATTATATTTTATTGTTCATAATT...,CATGAACTAAGCATTTATTAGTTCCCTGATTAGACTGGAAGAAGAA...,AATGTAAATCAAATGGAAGTTTTCCCATGAACTAAGCATTTATTAG...,AAGTGCTTTTTCTCCATGGATGAGGCTAGACCCTAAGAAGTAATTA...,73,...,0.0,15.0,0.0,9.0,0.0,53.0,0.0,94.0,0.0,2039.0
2,ABCB10.3,ABCB10,3,5,2,CATAATTAATGAATAAAATTGATATGAATGAATATAGTGTTCTTTG...,AGAAGAAACCACTATTTCATGAAAAGCATGGAATATTATATTTTAT...,TTATTAGTTCCCTGATTAGACTGGAAGAAGAAACCACTATTTCATG...,GTAATTAAGTCAATGTAAATCAAATGGAAGTTTTCCCATGAACTAA...,6,...,0.0,15.0,0.0,9.0,0.0,53.0,0.0,94.0,0.0,2039.0
3,ABCB10.2,ABCB10,4,5,1,TCTGATACATGATGTTCAATTTTATCTTTAGGTAATATTTTATATC...,TGTGCCATAGAAGTATTTACGAAATTGCATTTCATTGTTATGTTTT...,GGTTGAATCTGAGGAAAATAATCCTTGTGCCATAGAAGTATTTACG...,TGTGTTCTTTATAAAGTGTGATTTTCAGAAAGCAAACAACACAATT...,467,...,15.0,15.0,9.0,9.0,53.0,53.0,94.0,94.0,2039.0,2039.0
4,ABCB10.1,ABCB10,5,5,1,TCAGGAATAAAGAAAAGACTAACATTACACATATCCAAAAACATGT...,TATAAAACTTCTCACTACATTGTTTCTTAGTAGAATTTGGCTGTGG...,ACCAAAGATGCAGTCTGTCATTTCTTATAAAACTTCTCACTACATT...,TAGTGAACTTTATCTGTGTCTGTCACTTTTTTTTTTTTTATGACCC...,8,...,0.0,15.0,0.0,9.0,0.0,53.0,0.0,94.0,0.0,2039.0
5,ABCD3.5,ABCD3,1,5,0,TTCGAGACAAGCCTGGACAAAAAGCGAGACCCGCTTCTTTAAAAAA...,ATCCCAGCACTTTGGGAGGCTGAGATGGGAGGATCGCTTGAATCCA...,CCAGGCGTGGTGGCTCATGCCTGTAATCCCAGCACTTTGGGAGGCT...,TAGGTACTTGGAAAAATTTTGTGGCATTAAAAACCAGACAAATGTA...,37,...,0.0,15.0,3.0,99.0,13.0,95.0,7.0,272.0,155.0,2694.0
6,ABCD3.4,ABCD3,2,5,0,GTATGGTTGTTTTACATATGTGTATGTGTGTATATGCATTTCAGTT...,ATTTAATATGTAATGTTATTGTTACATATTTATAACACAGCCATAT...,ATCTTATTGAAATGTAACTTTAGTCATTTAATATGTAATGTTATTG...,GTCTCTGATATTTGTGATGGCAAGAATCACTTTTAAGTTTTCTTTG...,12,...,0.0,15.0,0.0,99.0,0.0,95.0,0.0,272.0,0.0,2694.0
7,ABCD3.3,ABCD3,3,5,1,AATTTTTACCACTTCTGTTTAGCGAACTTGTATACTTATTTTCTGT...,CTGGCTACCGAAGTAAACTGATGTACTGAATTCCATAATACATAAC...,TTTGCAGAATTAACTATAACAATCACTGGCTACCGAAGTAAACTGA...,TCAGAAAGGGGCATTTTGTACTCTTGTTTTTGCATAACTGGTTTTG...,252,...,0.0,15.0,0.0,99.0,0.0,95.0,0.0,272.0,0.0,2694.0
8,ABCD3.2,ABCD3,4,5,0,GCCTTGACTTGAAAACATAGATAGTTTAATCTTGACTTGAAAAACA...,TGTTCTTTTTATTCTGGTATCTAAATACTGAGAAGTTCATTTATAA...,TCAGATATCCTATACAACCTTTGCTTGTTCTTTTTATTCTGGTATC...,CGAACTTGTATACTTATTTTCTGTTCAGATTAAAAAAAAAAAAAAA...,524,...,15.0,15.0,96.0,99.0,82.0,95.0,265.0,272.0,2539.0,2694.0
9,ABCD3.1,ABCD3,5,5,0,ATGAGAAAATAAGTATGAAACAGCAATGGTAGTTTGTTTTGCATTA...,TGCCAAGACATATCACCGTGTTCTCATAATAAGTTTTTACTTTTTA...,TTTCATCCATGAGCACCACGCTGCATGCCAAGACATATCACCGTGT...,GCAGTGGGAAATGGTAGTTTAATCCGAAGAATAAACCAAAGAATAA...,18,...,0.0,15.0,0.0,99.0,0.0,95.0,0.0,272.0,0.0,2694.0


In [9]:
#Add derti tissue-specific counts to dataframe

# Tissues with a derti file on disk; 'pooled' is accumulated across all of them.
derti_tissue_names = ['brain', 'kidney', 'liver', 'maqc-brain1', 'maqc-brain2', 'maqc-UHR1', 'maqc-UHR2', 'muscle', 'testis']

# tissue_dict[tissue] maps both gene_id -> site reads and gene -> summed reads of the gene.
tissue_dict = {'pooled' : {}}

for tissue in derti_tissue_names :
    tissue_df = pd.read_csv('native_data/processed_data/derti/derti_apadb_v2_' + tissue + '.csv', sep='\t')[['gene', 'gene_id', 'supporting_reads']]
    
    tissue_dict[tissue] = {}
    
    for gene, gene_id, supporting_reads in zip(tissue_df['gene'], tissue_df['gene_id'], tissue_df['supporting_reads']) :
        gene = str(gene)
        gene_id = str(gene_id)
        
        # Per-tissue site count is assigned (a repeated gene_id keeps its last row),
        # while the per-tissue gene total is accumulated.
        tissue_dict[tissue][gene_id] = supporting_reads
        if gene not in tissue_dict[tissue] :
            tissue_dict[tissue][gene] = 0.
        
        tissue_dict[tissue][gene] += supporting_reads
        
        # Pooled site and gene counts are both accumulated across tissues.
        if gene_id not in tissue_dict['pooled'] :
            tissue_dict['pooled'][gene_id] = 0.
        
        if gene not in tissue_dict['pooled'] :
            tissue_dict['pooled'][gene] = 0.
        
        tissue_dict['pooled'][gene_id] += supporting_reads
        tissue_dict['pooled'][gene] += supporting_reads

for tissue in derti_tissue_names + ['pooled'] :
    print('Aggregating counts for tissue = ' + str(tissue))
    
    tissue_lookup = tissue_dict[tissue]
    
    # Look up each row's site count (by gene_id) and gene total (by gene); default 0.
    # Iterating over the columns avoids the previous `for index, row in df.iterrows()`,
    # which overwrote the module-level `index` array loaded earlier in the notebook.
    tissue_counts = [tissue_lookup.get(gene_id, 0.) for gene_id in df['gene_id']]
    tissue_total_counts = [tissue_lookup.get(gene, 0.) for gene in df['gene']]
    
    df['derti_count_' + tissue] = tissue_counts
    df['derti_total_count_' + tissue] = tissue_total_counts


Aggregating counts for tissue = brain
Aggregating counts for tissue = kidney
Aggregating counts for tissue = liver
Aggregating counts for tissue = maqc-brain1
Aggregating counts for tissue = maqc-brain2
Aggregating counts for tissue = maqc-UHR1
Aggregating counts for tissue = maqc-UHR2
Aggregating counts for tissue = muscle
Aggregating counts for tissue = testis
Aggregating counts for tissue = pooled


In [10]:
#Dump APADB, Leslie and Derti data

# Report the sizes of what is about to be written.
print('Size of dataframe = ', len(df), sep='')
print('Size of wide ext tissue cuts = ', leslie_cleavage_count_matrix_dict_wide['hek293'].shape, sep='')

# Dataframe under 'df', plus one wide cleavage count matrix per cell type name.
data_dump_dict = { 'df' : df }
data_dump_dict.update(
    (cell_type, leslie_cleavage_count_matrix_dict_wide[cell_type])
    for cell_type in leslie_cell_type_index
)

isoio.dump(data_dump_dict, 'prepared_data/apa_leslie_derti_apadb_data_all_cuts/apa_leslie_derti_apadb_data_all_cuts')


Size of dataframe = 51964
Size of wide ext tissue cuts = (51964, 356)


In [12]:
#Extract isoform count matrices and tissue indexes
# (np.object was removed in NumPy 1.24; the builtin object is the same dtype.)
leslie_tissue_index = np.array(['hek293', 'mcf10a_hras2', 'mcf10a1', 'mcf10a2', 'mcf10a_hras1', 'bcells1', 'mcf7', 'bcells2', 'ovary', 'breast', 'brain', 'skmuscle', 'blcl', 'hES', 'testis', 'hela', 'ntera'], dtype=object)
derti_tissue_index = np.array(['brain', 'kidney', 'liver', 'maqc-brain1', 'maqc-brain2', 'maqc-UHR1', 'maqc-UHR2', 'muscle', 'testis'], dtype=object)
apadb_tissue_index = np.array(['kidney', 'pancreas', 'monocytes', 'all', 'pdac', 'prcc', 'full_blood', 'hlf'], dtype=object)

# One column per tissue, one row per dataframe row; column order follows the index arrays above.
leslie_isoform_count_matrix = np.concatenate([np.ravel(df['leslie_count_' + tissue]).reshape(-1, 1) for tissue in leslie_tissue_index], axis=1)
derti_isoform_count_matrix = np.concatenate([np.ravel(df['derti_count_' + tissue]).reshape(-1, 1) for tissue in derti_tissue_index], axis=1)
apadb_isoform_count_matrix = np.concatenate([np.ravel(df['apadb_count_' + tissue]).reshape(-1, 1) for tissue in apadb_tissue_index], axis=1)

print('Leslie tissues = ' + str(leslie_tissue_index))
print('Derti tissues = ' + str(derti_tissue_index))
print('APADB tissues = ' + str(apadb_tissue_index))

Leslie tissues = ['hek293' 'mcf10a_hras2' 'mcf10a1' 'mcf10a2' 'mcf10a_hras1' 'bcells1'
 'mcf7' 'bcells2' 'ovary' 'breast' 'brain' 'skmuscle' 'blcl' 'hES'
 'testis' 'hela' 'ntera']
Derti tissues = ['brain' 'kidney' 'liver' 'maqc-brain1' 'maqc-brain2' 'maqc-UHR1'
 'maqc-UHR2' 'muscle' 'testis']
APADB tissues = ['kidney' 'pancreas' 'monocytes' 'all' 'pdac' 'prcc' 'full_blood' 'hlf']


In [18]:
#Join adjacent sites into pair-wise APA df

# Partner id = same gene_id with its trailing '.<number>' decremented by one;
# the row carrying that gene_id becomes the '_dist' side of the pair.
df['gene_id_dist'] = df['gene_id'].apply(lambda x: '.'.join(x.split('.')[:-1]) + '.' + str(int(x.split('.')[-1]) - 1))

df_dist = df.copy().set_index('gene_id')

# Columns carried over from the partner row (they get the '_dist' suffix in the join).
dist_columns = [
    'sitenum',
    'pas',
    'seq',
    'wide_seq',
    'wide_seq_ext',
    'site_type',
    'pas_pos',
    'cut_start',
    'cut_end',
    'cut_mode',
    'mirna',
    'ratio',
    'row_index'
]

for cell_type in leslie_tissue_index :
    dist_columns.append('leslie_count_' + cell_type)
    dist_columns.append('leslie_count_apadb_region_' + cell_type)
dist_columns.append('leslie_count_pooled')
dist_columns.append('leslie_count_apadb_region_pooled')

for tissue in apadb_tissue_index :
    dist_columns.append('apadb_count_' + tissue)
dist_columns.append('apadb_count_pooled')

for tissue in derti_tissue_index :
    dist_columns.append('derti_count_' + tissue)
dist_columns.append('derti_count_pooled')

df_dist = df_dist[dist_columns]

# Inner join: only rows whose partner id exists are kept. Overlapping columns are
# suffixed '_prox' (left row) and '_dist' (partner row).
df_pair = df.join(df_dist, on='gene_id_dist', how='inner', lsuffix='_prox', rsuffix='_dist')


#Aggregate prox + dist total counts

for tissue in leslie_tissue_index :
    df_pair['leslie_pair_count_' + tissue] = df_pair['leslie_count_' + tissue + '_prox'] + df_pair['leslie_count_' + tissue + '_dist']
    df_pair['leslie_pair_count_apadb_region_' + tissue] = df_pair['leslie_count_apadb_region_' + tissue + '_prox'] + df_pair['leslie_count_apadb_region_' + tissue + '_dist']

df_pair['leslie_pair_count_pooled'] = df_pair['leslie_count_pooled_prox'] + df_pair['leslie_count_pooled_dist']
df_pair['leslie_pair_count_apadb_region_pooled'] = df_pair['leslie_count_apadb_region_pooled_prox'] + df_pair['leslie_count_apadb_region_pooled_dist']

for tissue in apadb_tissue_index :
    df_pair['apadb_pair_count_' + tissue] = df_pair['apadb_count_' + tissue + '_prox'] + df_pair['apadb_count_' + tissue + '_dist']
df_pair['apadb_pair_count_pooled'] = df_pair['apadb_count_pooled_prox'] + df_pair['apadb_count_pooled_dist']

for tissue in derti_tissue_index :
    df_pair['derti_pair_count_' + tissue] = df_pair['derti_count_' + tissue + '_prox'] + df_pair['derti_count_' + tissue + '_dist']
df_pair['derti_pair_count_pooled'] = df_pair['derti_count_pooled_prox'] + df_pair['derti_count_pooled_dist']

#Compute site distance
df_pair['distance'] = np.abs(df_pair['cut_start_dist'] - df_pair['cut_start_prox'])

#Filter pair dataframe
# Every clause after the first starts with " and ". The original concatenation produced
# "-1and (", a number fused to a keyword, which newer Python versions warn about.
filter_query = "(apadb_count_pooled_prox + apadb_count_pooled_dist >= 10)"
filter_query += " and pas_prox != -1 and pas_dist != -1"
filter_query += " and (site_type_prox == 'UTR3' or site_type_prox == 'Extension')"
filter_query += " and (site_type_dist == 'UTR3' or site_type_dist == 'Extension')"
filter_query += " and (cut_end_prox - cut_start_prox <= 60) and (cut_end_dist - cut_start_dist <= 60)"
filter_query += " and (distance >= 40 and distance <= 4000)"

df_pair_filtered = df_pair.query(filter_query).copy().reset_index(drop=True)
print(len(df_pair_filtered))

# (np.int was removed in NumPy 1.24; the builtin int is the same dtype.)
df_pair_filtered['row_index'] = np.arange(len(df_pair_filtered), dtype=int)


#Join cleavage measures and onto filtered pair dataframe
# Row positions (into the per-site matrices) of each pair's prox and dist site.
keep_index_prox = df_pair_filtered['row_index_prox'].tolist()
keep_index_dist = df_pair_filtered['row_index_dist'].tolist()

leslie_cleavage_dict_prox = {}
leslie_cleavage_dict_dist = {}
for cell_type in leslie_tissue_index :
    leslie_cleavage_dict_prox[cell_type] = np.array(leslie_cleavage_count_matrix_dict_wide[cell_type][keep_index_prox, :].todense())
    leslie_cleavage_dict_dist[cell_type] = np.array(leslie_cleavage_count_matrix_dict_wide[cell_type][keep_index_dist, :].todense())


24678


In [19]:
#Dump APADB, Leslie and Derti pair-wise data

print('Size of dataframe = ' + str(len(df_pair_filtered)))
print('Size of prox wide ext tissue cuts = ' + str(leslie_cleavage_dict_prox['hek293'].shape))
print('Size of dist wide ext tissue cuts = ' + str(leslie_cleavage_dict_dist['hek293'].shape))

# The pair dataframe is stored under 'df_pair'; each cell type contributes a
# '<cell_type>_prox' and a '<cell_type>_dist' sparse matrix.
# Iterate over leslie_tissue_index, the same index used to build the prox/dist
# dicts, so the keys are guaranteed to exist.
data_dump_dict = { 'df_pair' : df_pair_filtered }
for cell_type in leslie_tissue_index :
    data_dump_dict[cell_type + '_prox'] = sp.csr_matrix(leslie_cleavage_dict_prox[cell_type])
    data_dump_dict[cell_type + '_dist'] = sp.csr_matrix(leslie_cleavage_dict_dist[cell_type])

isoio.dump(data_dump_dict, 'prepared_data/apa_leslie_derti_apadb_pair_data_all_cuts/apa_leslie_derti_apadb_pair_data_all_cuts')


Size of dataframe = 24678
Size of prox wide ext tissue cuts = (24678, 356)
Size of dist wide ext tissue cuts = (24678, 356)
