In [1]:
import pybedtools as pbt
import pysam
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import os.path as op

In [3]:
import splanl.junction_scorer as jn
import splanl.merge_bcs as mbcs
import splanl.coords as coords
import splanl.plots as sp
import splanl.score_motifs as sm
import splanl.inspect_variants as iv
import splanl.post_processing as pp

Using TensorFlow backend.


In [4]:
fa_file = '/nfs/kitzman2/jacob/proj/jensplice/20220415_wt1_mpsa_trial3/jkp1053_1054_1055.fa'

In [5]:
refseq = pp.get_refseq( fa_file )

In [6]:
bam1 = ! ls /nfs/kitzman2/jacob/proj/jensplice/20220513_wt1_mpsa_trial4and3_deep_redoAligns/process_star/BB_test*/*out.wbcs.bam

In [7]:
bam2 = ! ls /nfs/kitzman2/jacob/proj/jensplice/20220513_wt1_mpsa_trial4and3_deep_redoAligns/process_star/BB_RT*/*out.wbcs.bam

In [8]:
bam1

['/nfs/kitzman2/jacob/proj/jensplice/20220513_wt1_mpsa_trial4and3_deep_redoAligns/process_star/BB_test_Cos1053_JKLab0340_MM1B/BB_test_Cos1053_JKLab0340_MM1BAligned.out.wbcs.bam',
 '/nfs/kitzman2/jacob/proj/jensplice/20220513_wt1_mpsa_trial4and3_deep_redoAligns/process_star/BB_test_Cos1053_JKLab0340_MM2B/BB_test_Cos1053_JKLab0340_MM2BAligned.out.wbcs.bam',
 '/nfs/kitzman2/jacob/proj/jensplice/20220513_wt1_mpsa_trial4and3_deep_redoAligns/process_star/BB_test_Cos1053_oligodT_MM1B/BB_test_Cos1053_oligodT_MM1BAligned.out.wbcs.bam',
 '/nfs/kitzman2/jacob/proj/jensplice/20220513_wt1_mpsa_trial4and3_deep_redoAligns/process_star/BB_test_Cos1053_oligodT_MM2B/BB_test_Cos1053_oligodT_MM2BAligned.out.wbcs.bam']

In [9]:
bam2

['/nfs/kitzman2/jacob/proj/jensplice/20220422_wt1_mpsa_trial4and3_deep/process_star/BB_RT_1_340_243_Cos1053/BB_RT_1_340_243_Cos1053Aligned.out.wbcs.bam',
 '/nfs/kitzman2/jacob/proj/jensplice/20220422_wt1_mpsa_trial4and3_deep/process_star/BB_RT_1_340_243_Hek1053/BB_RT_1_340_243_Hek1053Aligned.out.wbcs.bam',
 '/nfs/kitzman2/jacob/proj/jensplice/20220422_wt1_mpsa_trial4and3_deep/process_star/BB_RT_1_340_335_Cos1053/BB_RT_1_340_335_Cos1053Aligned.out.wbcs.bam',
 '/nfs/kitzman2/jacob/proj/jensplice/20220422_wt1_mpsa_trial4and3_deep/process_star/BB_RT_1_340_335_Hek1053/BB_RT_1_340_335_Hek1053Aligned.out.wbcs.bam',
 '/nfs/kitzman2/jacob/proj/jensplice/20220422_wt1_mpsa_trial4and3_deep/process_star/BB_RT_1b_340_243_Hek1053/BB_RT_1b_340_243_Hek1053Aligned.out.wbcs.bam',
 '/nfs/kitzman2/jacob/proj/jensplice/20220422_wt1_mpsa_trial4and3_deep/process_star/BB_RT_1b_340_335_Hek1053/BB_RT_1b_340_335_Hek1053Aligned.out.wbcs.bam',
 '/nfs/kitzman2/jacob/proj/jensplice/20220422_wt1_mpsa_trial4and3_deep/p

In [10]:
bam = bam1 + bam2

In [11]:
msamp_fn = { b.split( '/')[ -1 ].split( '.' )[ 0 ].replace( 'Aligned', '' ) : b for b in bam }

In [12]:
msamp_fn

{'BB_test_Cos1053_JKLab0340_MM1B': '/nfs/kitzman2/jacob/proj/jensplice/20220513_wt1_mpsa_trial4and3_deep_redoAligns/process_star/BB_test_Cos1053_JKLab0340_MM1B/BB_test_Cos1053_JKLab0340_MM1BAligned.out.wbcs.bam',
 'BB_test_Cos1053_JKLab0340_MM2B': '/nfs/kitzman2/jacob/proj/jensplice/20220513_wt1_mpsa_trial4and3_deep_redoAligns/process_star/BB_test_Cos1053_JKLab0340_MM2B/BB_test_Cos1053_JKLab0340_MM2BAligned.out.wbcs.bam',
 'BB_test_Cos1053_oligodT_MM1B': '/nfs/kitzman2/jacob/proj/jensplice/20220513_wt1_mpsa_trial4and3_deep_redoAligns/process_star/BB_test_Cos1053_oligodT_MM1B/BB_test_Cos1053_oligodT_MM1BAligned.out.wbcs.bam',
 'BB_test_Cos1053_oligodT_MM2B': '/nfs/kitzman2/jacob/proj/jensplice/20220513_wt1_mpsa_trial4and3_deep_redoAligns/process_star/BB_test_Cos1053_oligodT_MM2B/BB_test_Cos1053_oligodT_MM2BAligned.out.wbcs.bam',
 'BB_RT_1_340_243_Cos1053': '/nfs/kitzman2/jacob/proj/jensplice/20220422_wt1_mpsa_trial4and3_deep/process_star/BB_RT_1_340_243_Cos1053/BB_RT_1_340_243_Cos1053Al

In [13]:
msamp_rnabam = { samp: pysam.AlignmentFile( msamp_fn[ samp ], 'rb' ) for samp in msamp_fn }

In [None]:
%%time
#I made this more stringent than MSH2 since there is less skipping
#requiring 90 forward and 70 reverse matches
isos_dfs = { samp: jn.get_all_isoforms_pe( msamp_rnabam[ samp ],
                                           [ ( 649, 696 ), ( 3478, 3533 ) ],
                                            spl_tol = 3,
                                            indel_tol = 20,
                                            min_matches_for = 90,
                                            min_matches_rev = 70 )
             for samp in msamp_rnabam }

In [58]:
for samp in isos_dfs.keys():
    
    print( samp )
    
    print( isos_dfs[ samp ].head() )

BB_test_Cos1053_JKLab0340_MM1B
                 read_count
isoform                    
((1267, 1359),)     2398566
((1267, 1350),)     2199618
((1284, 1359),)       17744
((1284, 1350),)       12115
((1268, 1350),)        5905
BB_test_Cos1053_JKLab0340_MM2B
                 read_count
isoform                    
((1267, 1359),)     5042945
((1267, 1350),)     4600777
((1284, 1359),)       39501
((1284, 1350),)       23727
((1268, 1350),)       13542
BB_test_Cos1053_oligodT_MM1B
                 read_count
isoform                    
((1267, 1359),)     2188404
((1267, 1350),)     1959934
((1284, 1359),)       17534
((1284, 1350),)       11526
((1268, 1350),)        5871
BB_test_Cos1053_oligodT_MM2B
                 read_count
isoform                    
((1267, 1359),)     6167843
((1267, 1350),)     5723480
((1284, 1359),)       44962
((1284, 1350),)       30212
((1269, 1350),)       17823
BB_RT_1_340_243_Cos1053
                 read_count
isoform                    
((1267, 1359),) 

In [None]:
%%time
isogrp_df = jn.number_and_merge_isoforms( isos_dfs )

In [52]:
isogrp_df.head()

Unnamed: 0_level_0,isoform,BB_test_Cos1053_JKLab0340_MM1B_read_count,BB_test_Cos1053_JKLab0340_MM2B_read_count,BB_test_Cos1053_oligodT_MM1B_read_count,BB_test_Cos1053_oligodT_MM2B_read_count,BB_RT_1_340_243_Cos1053_read_count,BB_RT_1_340_243_Hek1053_read_count,BB_RT_1_340_335_Cos1053_read_count,BB_RT_1_340_335_Hek1053_read_count,BB_RT_1b_340_243_Hek1053_read_count,...,BB_RT_2_ODT_335_Cos1054_read_count,BB_RT_2_ODT_335_Hek1054_read_count,BB_RT_3_340_243_Cos1055_read_count,BB_RT_3_340_243_Hek1055_read_count,BB_RT_3_340_335_Cos1055_read_count,BB_RT_3_340_335_Hek1055_read_count,BB_RT_3_ODT_243_Cos1055_read_count,BB_RT_3_ODT_243_Hek1055_read_count,BB_RT_3_ODT_335_cos1055_read_count,BB_RT_3_ODT_335_Hek1055_read_count
isonum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
iso0000,"((1136, 1225), (1301, 1350))",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
iso0001,"((1267, 1290), (1320, 1350))",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
iso0002,"((1267, 1314), (1337, 1359))",1,272,0,0,1,127,64,59,140,...,0,0,0,0,0,0,0,0,0,0
iso0003,"((1267, 1282), (1328, 1359))",0,0,2,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
iso0004,"((1136, 1224), (1306, 1359))",0,0,0,0,0,1,0,0,2,...,0,0,0,0,0,0,0,0,0,0


In [53]:
isogrp_df.loc[ isogrp_df.isoform == ((1267, 1359),) ]

Unnamed: 0_level_0,isoform,BB_test_Cos1053_JKLab0340_MM1B_read_count,BB_test_Cos1053_JKLab0340_MM2B_read_count,BB_test_Cos1053_oligodT_MM1B_read_count,BB_test_Cos1053_oligodT_MM2B_read_count,BB_RT_1_340_243_Cos1053_read_count,BB_RT_1_340_243_Hek1053_read_count,BB_RT_1_340_335_Cos1053_read_count,BB_RT_1_340_335_Hek1053_read_count,BB_RT_1b_340_243_Hek1053_read_count,...,BB_RT_2_ODT_335_Cos1054_read_count,BB_RT_2_ODT_335_Hek1054_read_count,BB_RT_3_340_243_Cos1055_read_count,BB_RT_3_340_243_Hek1055_read_count,BB_RT_3_340_335_Cos1055_read_count,BB_RT_3_340_335_Hek1055_read_count,BB_RT_3_ODT_243_Cos1055_read_count,BB_RT_3_ODT_243_Hek1055_read_count,BB_RT_3_ODT_335_cos1055_read_count,BB_RT_3_ODT_335_Hek1055_read_count
isonum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
iso2293,"((1267, 1359),)",2398566,5042945,2188404,6167843,1848512,3402477,498991,1002709,3088366,...,147916,670428,933746,2231420,641615,1423772,2632227,1602454,768075,1087995


In [54]:
isogrp_df.loc[ isogrp_df.isoform == ((1267, 1350),) ]

Unnamed: 0_level_0,isoform,BB_test_Cos1053_JKLab0340_MM1B_read_count,BB_test_Cos1053_JKLab0340_MM2B_read_count,BB_test_Cos1053_oligodT_MM1B_read_count,BB_test_Cos1053_oligodT_MM2B_read_count,BB_RT_1_340_243_Cos1053_read_count,BB_RT_1_340_243_Hek1053_read_count,BB_RT_1_340_335_Cos1053_read_count,BB_RT_1_340_335_Hek1053_read_count,BB_RT_1b_340_243_Hek1053_read_count,...,BB_RT_2_ODT_335_Cos1054_read_count,BB_RT_2_ODT_335_Hek1054_read_count,BB_RT_3_340_243_Cos1055_read_count,BB_RT_3_340_243_Hek1055_read_count,BB_RT_3_340_335_Cos1055_read_count,BB_RT_3_340_335_Hek1055_read_count,BB_RT_3_ODT_243_Cos1055_read_count,BB_RT_3_ODT_243_Hek1055_read_count,BB_RT_3_ODT_335_cos1055_read_count,BB_RT_3_ODT_335_Hek1055_read_count
isonum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
iso1187,"((1267, 1350),)",2199618,4600777,1959934,5723480,1639853,3024211,456880,942554,2722208,...,131835,643764,853122,1925365,606692,1466165,2361296,1371133,728819,1187498


In [None]:
satbls = ! ls /nfs/turbo/umms-kitzmanj/oldvol2/jacob/proj/jensplice/20220426_wt1_subasm_filter_stringent/sapipe/sa/*.haps.all.txt

In [None]:
satbl_fn = { sa.split( '/')[ -1 ].split( '.' )[ 0 ]: pd.read_table( sa ).set_index( 'readgroupid' ) for sa in satbls }

In [None]:
for lib in satbl_fn.keys():
    
    satbl_fn[ lib ] = satbl_fn[ lib ][ [ col for col in satbl_fn[ lib ] if not col.startswith( 'nbp_' ) ] ].copy()

In [None]:
exonbed = pbt.BedTool( '/nfs/kitzman2/smithcat/proj/wt1_2022/refs/wt1_ex9.bed' )

In [None]:
isos = jn.make_junction_graph( exonbed )

In [55]:
isos

{'iso00': ((650, 696), (1267, 1350), (3479, 3533)),
 'iso01': ((650, 696), (1267, 1359), (3479, 3533)),
 'iso02': ((650, 696), (3479, 3533))}

In [None]:
unique_jns = list( { jn for grp,jn_tups in isos.items() for jn_tup in jn_tups for jn in jn_tup
                       if 696 < jn < 3479 } ) 

In [56]:
unique_jns

[1267, 1350, 1359]

In [None]:
msamp_rnabam = { samp : pysam.AlignmentFile( msamp_fn[ samp ], 'rb' ) for samp in msamp_fn }

In [None]:
for lib in satbl_fn:
    
    print( lib )
    
    print( satbl_fn[ lib ].head() )

In [None]:
%%time
#18 min/sample

iso_df_stats = { lib: jn.summarize_isos_by_var_bc_pe( { samp: bam for samp,bam in msamp_rnabam.items()
                                                        if lib.replace( 'JKP', '' ) in samp },
                                                      [ ( 649, 696 ), ( 3478, 3533 ) ],
                                                      satbl_fn[ lib ],
                                                      isogrp_df,
                                                      unique_jns,
                                                      [ ( ( 1266, 1350 ), ), ( ( 1266, 1359 ), ), () ],
                                                      spl_tol = 3,
                                                      indel_tol = 20,
                                                      min_matches_for = 90,
                                                      min_matches_rev = 70,
                                                      bc_tag = 'BC',
                                                     ) 
                  for lib in satbl_fn }

In [57]:
for lib in iso_df_stats:
    print( lib, iso_df_stats[ lib ].shape )

JKP1053 (6386, 117)
JKP1054 (6386, 77)
JKP1055 (6386, 77)


In [59]:
for lib in iso_df_stats:
    print( lib, iso_df_stats[ lib ].query( 'total_passfilt > 0' ).shape )

JKP1053 (231, 117)
JKP1054 (224, 77)
JKP1055 (257, 77)


In [60]:
for lib in iso_df_stats:
    print( lib, ( iso_df_stats[ lib ].query( 'total_passfilt == 0' ).total_read_count.sum() / iso_df_stats[ lib ].total_read_count.sum() )*100 )

JKP1053 0.2490599007186634
JKP1054 0.4206742678143337
JKP1055 0.4112410690525785


In [61]:
for lib in iso_df_stats:
    print( lib, ( iso_df_stats[ lib ].query( 'total_passfilt == 0' ).total_num_bcs.sum() / iso_df_stats[ lib ].total_num_bcs.sum() )*100 )

JKP1053 0.3191391206421711
JKP1054 0.36252133322385294
JKP1055 0.20012527710218403


In [62]:
iso_df_stats[ 'JKP1053' ]

Unnamed: 0_level_0,isoform,BB_test_Cos1053_JKLab0340_MM1B_read_count,BB_test_Cos1053_JKLab0340_MM2B_read_count,BB_test_Cos1053_oligodT_MM1B_read_count,BB_test_Cos1053_oligodT_MM2B_read_count,BB_RT_1_340_243_Cos1053_read_count,BB_RT_1_340_243_Hek1053_read_count,BB_RT_1_340_335_Cos1053_read_count,BB_RT_1_340_335_Hek1053_read_count,BB_RT_1b_340_243_Hek1053_read_count,...,BB_RT_1_ODT_335_Cos1053_filter,BB_RT_1_ODT_335_Hek1053_num_bcs,BB_RT_1_ODT_335_Hek1053_num_vars,BB_RT_1_ODT_335_Hek1053_max_reads_per_bc,BB_RT_1_ODT_335_Hek1053_max_bc_per_var,BB_RT_1_ODT_335_Hek1053_filter,total_read_count,total_num_bcs,total_num_vars,total_passfilt
isonum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
iso0000,"((1136, 1225), (1301, 1350))",0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,3,1.0,1.0,0
iso0001,"((1267, 1290), (1320, 1350))",0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,2,1.0,1.0,0
iso0002,"((1267, 1314), (1337, 1359))",1,272,0,0,1,127,64,59,140,...,0.0,10.0,6.0,7.0,5.0,3.0,1105,170.0,129.0,9
iso0003,"((1267, 1282), (1328, 1359))",0,0,2,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,11,2.0,2.0,0
iso0004,"((1136, 1224), (1306, 1359))",0,0,0,0,0,1,0,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,4,1.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
iso6381,"((1236, 1242), (1266, 1350))",0,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1,1.0,1.0,0
iso6382,"((1146, 1222), (1281, 1359))",0,0,0,0,0,3,0,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,12,1.0,1.0,0
iso6383,"((1284, 1348),)",0,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,3,1.0,1.0,0
iso6384,"((1267, 1354), (3245, 3249))",0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0


In [None]:
bdout = '/nfs/kitzman2/smithcat/proj/wt1_2022/ex9_data/'

In [None]:
for lib in iso_df_stats.keys():    
    iso_df_stats[ lib ].reset_index().to_csv( bdout + 'wt1_ex9_%s_isoforms_2022-0514.txt' % lib,
                                               sep = '\t',
                                               index = False )

In [None]:
iso_df_stats_all = iso_df_stats[ 'JKP1053' ].merge( iso_df_stats[ 'JKP1054' ],
                                                    left_index = True,
                                                    right_index = True,
                                                    how = 'outer',
                                                    suffixes = ( '_JKP1053', '_JKP1054' )
                                                  )

In [63]:
iso_df_stats_all

Unnamed: 0_level_0,isoform_JKP1053,BB_test_Cos1053_JKLab0340_MM1B_read_count_JKP1053,BB_test_Cos1053_JKLab0340_MM2B_read_count_JKP1053,BB_test_Cos1053_oligodT_MM1B_read_count_JKP1053,BB_test_Cos1053_oligodT_MM2B_read_count_JKP1053,BB_RT_1_340_243_Cos1053_read_count_JKP1053,BB_RT_1_340_243_Hek1053_read_count_JKP1053,BB_RT_1_340_335_Cos1053_read_count_JKP1053,BB_RT_1_340_335_Hek1053_read_count_JKP1053,BB_RT_1b_340_243_Hek1053_read_count_JKP1053,...,BB_RT_3_ODT_335_Hek1055_max_bc_per_var,BB_RT_3_ODT_335_Hek1055_filter,total_read_count_JKP1055,total_num_bcs_JKP1055,total_num_vars_JKP1055,total_passfilt_JKP1055,total_read_count,total_num_bcs,total_num_vars,total_passfilt
isonum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
iso0000,"((1136, 1225), (1301, 1350))",0,0,0,0,0,0,0,0,0,...,0.0,0.0,3,0.0,0.0,0,9,1.0,1.0,0
iso0001,"((1267, 1290), (1320, 1350))",0,0,0,0,0,0,0,0,0,...,0.0,0.0,2,0.0,0.0,0,6,1.0,1.0,0
iso0002,"((1267, 1314), (1337, 1359))",1,272,0,0,1,127,64,59,140,...,0.0,0.0,1105,0.0,0.0,0,3315,170.0,129.0,9
iso0003,"((1267, 1282), (1328, 1359))",0,0,2,0,0,0,0,1,0,...,0.0,0.0,11,0.0,0.0,0,33,2.0,2.0,0
iso0004,"((1136, 1224), (1306, 1359))",0,0,0,0,0,1,0,0,2,...,0.0,0.0,4,0.0,0.0,0,12,1.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
iso6381,"((1236, 1242), (1266, 1350))",0,0,0,0,0,0,0,0,1,...,0.0,0.0,1,0.0,0.0,0,3,1.0,1.0,0
iso6382,"((1146, 1222), (1281, 1359))",0,0,0,0,0,3,0,0,3,...,0.0,0.0,12,0.0,0.0,0,36,1.0,1.0,0
iso6383,"((1284, 1348),)",0,1,0,0,0,0,0,0,0,...,1.0,0.0,3,1.0,1.0,0,9,2.0,2.0,0
iso6384,"((1267, 1354), (3245, 3249))",0,0,0,0,0,0,0,0,0,...,0.0,0.0,1,1.0,1.0,0,3,1.0,1.0,0


In [None]:
iso_df_stats_all = iso_df_stats_all.merge( iso_df_stats[ 'JKP1055' ].rename( columns = { col: col + '_JKP1055' 
                                                                                         for col in iso_df_stats[ 'JKP1055' ] if col.startswith( 'total_' ) } ),
                                            left_index = True,
                                            right_index = True,
                                            how = 'outer',
                                            suffixes = ( '', '_JKP1055' )
                                        )

In [64]:
total_cols = [ 'total_read_count', 'total_num_bcs', 'total_num_vars', 'total_passfilt' ]

for col in total_cols:
    
    iso_df_stats_all[ col ] = iso_df_stats_all[ [ lcol for lcol in iso_df_stats_all if col + '_' in lcol ] ].sum( axis = 1 )

In [65]:
len( bam )

32

In [66]:
iso_df_stats_all.shape

(6386, 275)

In [67]:
iso_df_stats_all.query( 'total_passfilt > 0' ).shape

(431, 275)

In [68]:
iso_df_stats_all.query( 'total_passfilt >= 16' ).shape

(66, 275)

In [69]:
( iso_df_stats_all.query( 'total_passfilt == 0' ).total_read_count.sum() / iso_df_stats_all.total_read_count.sum() )*100

0.11385525052641854

In [70]:
( iso_df_stats_all.query( 'total_passfilt >= 16' ).total_read_count.sum() / iso_df_stats_all.total_read_count.sum() )*100

99.47733909593774

In [71]:
( iso_df_stats_all.query( 'total_passfilt == 0' ).total_num_bcs.sum() / iso_df_stats_all.total_num_bcs.sum() )*100

0.2220691058447506

In [72]:
( iso_df_stats_all.query( 'total_passfilt >= 16' ).total_num_bcs.sum() / iso_df_stats_all.total_num_bcs.sum() )*100

99.06743856722191

In [73]:
iso_df_stats_all.total_num_bcs.isnull().sum()

0

In [None]:
iso_df_stats_all.reset_index().to_csv( bdout + 'wt1_ex9_isoforms_2022-0514.txt',
                                               sep = '\t',
                                               index = False )