In [1]:
import pybedtools as pbt
import pysam
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import os.path as op

In [3]:
import splanl.junction_scorer as jn
import splanl.merge_bcs as mbcs
import splanl.coords as coords
import splanl.plots as sp
import splanl.score_motifs as sm
import splanl.inspect_variants as iv
import splanl.post_processing as pp

Using TensorFlow backend.


In [4]:
# Reference FASTA (file name suggests constructs jkp1053/1054/1055); read by pp.get_refseq below.
# NOTE(review): absolute cluster path - the notebook only runs where this mount is available.
fa_file = '/nfs/kitzman2/jacob/proj/jensplice/20220415_wt1_mpsa_trial3/jkp1053_1054_1055.fa'

In [5]:
# Load the reference via splanl.post_processing.get_refseq (return type is not visible here).
# NOTE(review): refseq is not referenced again in the visible cells - confirm it is still needed.
refseq = pp.get_refseq( fa_file )

In [6]:
# Alignment (BAM) of the single sample analysed in this notebook.
# '.wbcs' presumably means the reads carry barcode tags - TODO confirm.
bam = '/nfs/kitzman2/jacob/proj/jensplice/20220513_wt1_mpsa_trial4and3_deep_redoAligns/process_star/BB_test_Cos1053_JKLab0340_MM1B/BB_test_Cos1053_JKLab0340_MM1BAligned.out.wbcs.bam'

In [7]:
# Key the BAM path by sample name: the file name up to its first '.', with 'Aligned' removed.
msamp_fn = { bam.rsplit( '/', 1 )[ -1 ].partition( '.' )[ 0 ].replace( 'Aligned', '' ): bam }

In [8]:
# Show the sample name -> BAM path mapping (a single sample here).
msamp_fn

{'BB_test_Cos1053_JKLab0340_MM1B': '/nfs/kitzman2/jacob/proj/jensplice/20220513_wt1_mpsa_trial4and3_deep_redoAligns/process_star/BB_test_Cos1053_JKLab0340_MM1B/BB_test_Cos1053_JKLab0340_MM1BAligned.out.wbcs.bam'}

In [9]:
# Open every sample's BAM for reading ('rb'), keyed by sample name.
msamp_rnabam = { samp: pysam.AlignmentFile( bam_fn, 'rb' )
                 for samp, bam_fn in msamp_fn.items() }

In [10]:
%%time
# Lenient ('_s') isoform call: requires 70 forward and 50 reverse matches.
# The stricter 90 forward / 70 reverse thresholds are used for isos_dfs_l further down.
# NOTE(review): the original comment here quoted 90/70 (copied from the strict cell) and said the
# thresholds were made more stringent than MSH2 because there is less skipping - confirm that the
# MSH2 remark also applies to this 70/50 setting.
isos_dfs_s = { samp: jn.get_all_isoforms_pe( msamp_rnabam[ samp ],
                                           [ ( 649, 696 ), ( 3478, 3533 ) ],
                                            spl_tol = 3,
                                            indel_tol = 20,
                                            min_matches_for = 70,
                                            min_matches_rev = 50 )
             for samp in msamp_rnabam }

CPU times: user 1min 9s, sys: 786 ms, total: 1min 10s
Wall time: 1min 10s


In [11]:
# Print the first rows of the lenient (70/50) isoform count table for each sample.
for samp, iso_counts in isos_dfs_s.items():

    print( samp )

    print( iso_counts.head() )

BB_test_Cos1053_JKLab0340_MM1B
                 read_count
isoform                    
((1267, 1359),)     2400975
((1267, 1350),)     2202889
()                  1143438
bad_ends              75036
bad_starts            29333


In [12]:
# Re-open the BAMs before the next pass.
# NOTE(review): presumably needed because the previous call consumed the read iterator - confirm.
msamp_rnabam = { samp: pysam.AlignmentFile( bam_fn, 'rb' )
                 for samp, bam_fn in msamp_fn.items() }

In [13]:
%%time
# Strict ('_l') isoform call. Author's note: made more stringent than MSH2 since there is
# less skipping - requires 90 forward and 70 reverse matches.
isos_dfs_l = { samp: jn.get_all_isoforms_pe( rnabam,
                                             [ ( 649, 696 ), ( 3478, 3533 ) ],
                                             spl_tol = 3,
                                             indel_tol = 20,
                                             min_matches_for = 90,
                                             min_matches_rev = 70 )
               for samp, rnabam in msamp_rnabam.items() }

CPU times: user 1min 5s, sys: 629 ms, total: 1min 5s
Wall time: 1min 5s


In [14]:
# Print the first rows of the strict (90/70) isoform count table for each sample.
for samp, iso_counts in isos_dfs_l.items():

    print( samp )

    print( iso_counts.head() )

BB_test_Cos1053_JKLab0340_MM1B
                 read_count
isoform                    
((1267, 1359),)     2398566
((1267, 1350),)     2199618
soft_clipped        1185451
bad_ends              50905
bad_starts            21891


In [15]:
# Lenient (70/50) preview again, for side-by-side comparison with the strict run.
# NOTE(review): repeats an earlier cell verbatim - consider removing one of them.
for samp, iso_counts in isos_dfs_s.items():

    print( samp )

    print( iso_counts.head() )

BB_test_Cos1053_JKLab0340_MM1B
                 read_count
isoform                    
((1267, 1359),)     2400975
((1267, 1350),)     2202889
()                  1143438
bad_ends              75036
bad_starts            29333


In [16]:
%%time
# Merge the per-sample isoform counts of the lenient (70/50) run into one table;
# the displayed result is indexed by 'isonum' ids (iso000, iso001, ...).
isogrp_df_s = jn.number_and_merge_isoforms( isos_dfs_s )

BB_test_Cos1053_JKLab0340_MM1B
CPU times: user 93.5 ms, sys: 12 µs, total: 93.5 ms
Wall time: 92.9 ms


In [17]:
%%time
# Same merge for the strict (90/70) run.
isogrp_df_l = jn.number_and_merge_isoforms( isos_dfs_l )

BB_test_Cos1053_JKLab0340_MM1B
CPU times: user 76 ms, sys: 1e+03 µs, total: 77 ms
Wall time: 75.9 ms


In [18]:
# First rows of the lenient merged isoform table.
isogrp_df_s.head()

Unnamed: 0_level_0,isoform,BB_test_Cos1053_JKLab0340_MM1B_read_count
isonum,Unnamed: 1_level_1,Unnamed: 2_level_1
iso000,"((1267, 1278), (1305, 1350))",902
iso001,"((1136, 1359),)",72
iso002,"((1190, 1226), (1260, 1359))",90
iso003,"((1267, 1353), (2559, 2565))",2
iso004,"((1265, 1359),)",112


In [19]:
# Whole lenient table (pandas truncates the display); the last isonum shows how many isoforms were called.
isogrp_df_s

Unnamed: 0_level_0,isoform,BB_test_Cos1053_JKLab0340_MM1B_read_count
isonum,Unnamed: 1_level_1,Unnamed: 2_level_1
iso000,"((1267, 1278), (1305, 1350))",902
iso001,"((1136, 1359),)",72
iso002,"((1190, 1226), (1260, 1359))",90
iso003,"((1267, 1353), (2559, 2565))",2
iso004,"((1265, 1359),)",112
...,...,...
iso700,"((1260, 1359),)",8
iso701,"((1267, 1279), (1322, 1359))",668
iso702,"((1267, 1347), (2222, 2227))",1
iso703,"((1267, 1319), (2091, 2097), (2181, 2186))",1


In [20]:
# First rows of the strict merged isoform table.
isogrp_df_l.head()

Unnamed: 0_level_0,isoform,BB_test_Cos1053_JKLab0340_MM1B_read_count
isonum,Unnamed: 1_level_1,Unnamed: 2_level_1
iso000,"((1267, 1278), (1305, 1350))",901
iso001,"((1136, 1359),)",72
iso002,"((1190, 1226), (1260, 1359))",90
iso003,"((1267, 1353), (2559, 2565))",2
iso004,"((1265, 1359),)",112


In [21]:
# Whole strict table (pandas truncates the display); compare the last isonum with the lenient table.
isogrp_df_l

Unnamed: 0_level_0,isoform,BB_test_Cos1053_JKLab0340_MM1B_read_count
isonum,Unnamed: 1_level_1,Unnamed: 2_level_1
iso000,"((1267, 1278), (1305, 1350))",901
iso001,"((1136, 1359),)",72
iso002,"((1190, 1226), (1260, 1359))",90
iso003,"((1267, 1353), (2559, 2565))",2
iso004,"((1265, 1359),)",112
...,...,...
iso598,"((1260, 1359),)",8
iso599,"((1267, 1279), (1322, 1359))",666
iso600,"((1267, 1347), (2222, 2227))",1
iso601,"((1267, 1319), (2091, 2097), (2181, 2186))",1


In [22]:
# Read count labelled 'bad_ends' in the lenient (70/50) run.
isogrp_df_s.loc[ isogrp_df_s[ 'isoform' ] == 'bad_ends' ]

Unnamed: 0_level_0,isoform,BB_test_Cos1053_JKLab0340_MM1B_read_count
isonum,Unnamed: 1_level_1,Unnamed: 2_level_1
iso355,bad_ends,75036


In [23]:
# Read count labelled 'bad_ends' in the strict (90/70) run.
isogrp_df_l.loc[ isogrp_df_l[ 'isoform' ] == 'bad_ends' ]

Unnamed: 0_level_0,isoform,BB_test_Cos1053_JKLab0340_MM1B_read_count
isonum,Unnamed: 1_level_1,Unnamed: 2_level_1
iso310,bad_ends,50905


In [24]:
# Read count labelled 'soft_clipped' in the lenient (70/50) run.
isogrp_df_s.loc[ isogrp_df_s[ 'isoform' ] == 'soft_clipped' ]

Unnamed: 0_level_0,isoform,BB_test_Cos1053_JKLab0340_MM1B_read_count
isonum,Unnamed: 1_level_1,Unnamed: 2_level_1
iso695,soft_clipped,4437


In [25]:
# Read count labelled 'soft_clipped' in the strict (90/70) run - the category that changes most between the two runs.
isogrp_df_l.loc[ isogrp_df_l[ 'isoform' ] == 'soft_clipped' ]

Unnamed: 0_level_0,isoform,BB_test_Cos1053_JKLab0340_MM1B_read_count
isonum,Unnamed: 1_level_1,Unnamed: 2_level_1
iso595,soft_clipped,1185451


In [26]:
# Row for the single-exon isoform ((1267, 1359),) in the lenient run.
# NOTE(review): relies on pandas comparing the tuple as one scalar; this may differ across pandas versions.
isogrp_df_s.loc[ isogrp_df_s.isoform == ((1267, 1359),) ]

Unnamed: 0_level_0,isoform,BB_test_Cos1053_JKLab0340_MM1B_read_count
isonum,Unnamed: 1_level_1,Unnamed: 2_level_1
iso544,"((1267, 1359),)",2400975


In [27]:
# Row for the single-exon isoform ((1267, 1359),) in the strict run.
# NOTE(review): relies on pandas comparing the tuple as one scalar; this may differ across pandas versions.
isogrp_df_l.loc[ isogrp_df_l.isoform == ((1267, 1359),) ]

Unnamed: 0_level_0,isoform,BB_test_Cos1053_JKLab0340_MM1B_read_count
isonum,Unnamed: 1_level_1,Unnamed: 2_level_1
iso471,"((1267, 1359),)",2398566


In [28]:
# Row for the single-exon isoform ((1267, 1350),) in the lenient run.
# NOTE(review): relies on pandas comparing the tuple as one scalar; this may differ across pandas versions.
isogrp_df_s.loc[ isogrp_df_s.isoform == ((1267, 1350),) ]

Unnamed: 0_level_0,isoform,BB_test_Cos1053_JKLab0340_MM1B_read_count
isonum,Unnamed: 1_level_1,Unnamed: 2_level_1
iso658,"((1267, 1350),)",2202889


In [29]:
# Row for the single-exon isoform ((1267, 1350),) in the strict run.
# NOTE(review): relies on pandas comparing the tuple as one scalar; this may differ across pandas versions.
isogrp_df_l.loc[ isogrp_df_l.isoform == ((1267, 1350),) ]

Unnamed: 0_level_0,isoform,BB_test_Cos1053_JKLab0340_MM1B_read_count
isonum,Unnamed: 1_level_1,Unnamed: 2_level_1
iso568,"((1267, 1350),)",2199618


In [30]:
# List the subassembly haplotype tables ('*.haps.all.txt') through the shell; IPython captures the ls output as a list of lines.
# NOTE(review): if nothing matches, the list may contain ls's error text instead of paths - consider glob.glob.
satbls = ! ls /nfs/turbo/umms-kitzmanj/oldvol2/jacob/proj/jensplice/20220426_wt1_subasm_filter_stringent/sapipe/sa/*.haps.all.txt

In [31]:
# Library name (file name up to its first '.') -> haplotype table indexed by 'readgroupid'.
# NOTE(review): despite the '_fn' suffix the values are DataFrames, not file names.
satbl_fn = { sa_path.rsplit( '/', 1 )[ -1 ].partition( '.' )[ 0 ]:
                 pd.read_table( sa_path ).set_index( 'readgroupid' )
             for sa_path in satbls }

In [32]:
# Replace each library's table with a copy that omits every column whose name starts with 'nbp_'.
for lib, satbl in list( satbl_fn.items() ):

    keep_cols = [ col for col in satbl if not col.startswith( 'nbp_' ) ]

    satbl_fn[ lib ] = satbl[ keep_cols ].copy()

In [33]:
# BED file of exon intervals (WT1 exon 9 construct, per the file name) wrapped as a pybedtools BedTool.
exonbed = pbt.BedTool( '/nfs/kitzman2/smithcat/proj/wt1_2022/refs/wt1_ex9.bed' )

In [34]:
# Build the expected isoforms from the exon BED; the result (shown below) maps an isoform id
# to a tuple of (start, end) exon coordinate pairs.
isos = jn.make_junction_graph( exonbed )

In [35]:
# Show the expected isoforms.
isos

{'iso00': ((650, 696), (1267, 1350), (3479, 3533)),
 'iso01': ((650, 696), (1267, 1359), (3479, 3533)),
 'iso02': ((650, 696), (3479, 3533))}

In [36]:
# Distinct exon boundaries lying strictly between 696 and 3479, i.e. inside the span
# bracketed by the first and last exon shared by every isoform in `isos`.
unique_jns = list( { coord
                     for exon_tups in isos.values()
                     for exon in exon_tups
                     for coord in exon
                     if 696 < coord < 3479 } )

In [37]:
# Show the internal junction coordinates.
unique_jns

[1267, 1350, 1359]

In [38]:
# Fresh BAM handles for the per-barcode summary below.
# NOTE(review): presumably needed because the earlier passes consumed the read iterator - confirm.
msamp_rnabam = { samp: pysam.AlignmentFile( bam_fn, 'rb' )
                 for samp, bam_fn in msamp_fn.items() }

In [39]:
# Print the first rows of each library's haplotype table.
for lib, satbl in satbl_fn.items():

    print( lib )

    print( satbl.head() )

JKP1053
                         passes                              status  \
readgroupid                                                           
AAAAAAAATCACGACCCTCCTGG   False  possible_chimeric_no_major_variant   
AAAAAAAGGGCTTCCGGGTATGG    True                   no_variants_input   
AAAAAAAGGGTTCCAGACTGTGG   False                     toomanymajorvar   
AAAAAAAGTAGTCTGGTGTGTGG    True                                pass   
AAAAAAATACCCCGGATGATTGG   False  possible_chimeric_no_major_variant   

                         n_variants_passing  \
readgroupid                                   
AAAAAAAATCACGACCCTCCTGG                   6   
AAAAAAAGGGCTTCCGGGTATGG                   0   
AAAAAAAGGGTTCCAGACTGTGG                   2   
AAAAAAAGTAGTCTGGTGTGTGG                   1   
AAAAAAATACCCCGGATGATTGG                   1   

                                                              variant_list  
readgroupid                                                                 
AAAAAAAATC

In [40]:
# Per library: number of rows (one per readgroupid), then how many have no variant_list.
for lib, satbl in satbl_fn.items():

    print( lib )

    print( len( satbl ) )

    print( satbl[ 'variant_list' ].isnull().sum() )

JKP1053
413274
154756
JKP1054
311344
88664
JKP1055
630120
177766


In [41]:
%%time
#1 min/sample

# Per-library isoform / variant / barcode summary using the strict (90/70) thresholds.
# NOTE(review): msamp_rnabam holds a single '...1053...' sample, so JKP1054 and JKP1055
# are summarised with an empty BAM dict here.
# NOTE(review): the canonical isoforms are passed as (1266, ...) while the observed calls and
# unique_jns use 1267 - presumably absorbed by spl_tol; confirm the coordinate convention.
iso_df_stats_l = {}

for lib, satbl in satbl_fn.items():

    # keep only the BAM handles whose sample name contains the library number
    lib_rnabams = { samp: rnabam
                    for samp, rnabam in msamp_rnabam.items()
                    if lib.replace( 'JKP', '' ) in samp }

    iso_df_stats_l[ lib ] = jn.summarize_isos_by_var_bc_pe( lib_rnabams,
                                                            [ ( 649, 696 ), ( 3478, 3533 ) ],
                                                            satbl,
                                                            isogrp_df_l,
                                                            unique_jns,
                                                            [ ( ( 1266, 1350 ), ), ( ( 1266, 1359 ), ), () ],
                                                            spl_tol = 3,
                                                            indel_tol = 20,
                                                            min_matches_for = 90,
                                                            min_matches_rev = 70,
                                                            bc_tag = 'BC' )

BB_test_Cos1053_JKLab0340_MM1B
Barcodes processed: 1000
Reads processed: 101205
Barcodes processed: 2000
Reads processed: 205641
Barcodes processed: 3000
Reads processed: 299930
Barcodes processed: 4000
Reads processed: 396153
Barcodes processed: 5000
Reads processed: 488837
Barcodes processed: 6000
Reads processed: 586554
Barcodes processed: 7000
Reads processed: 680654
Barcodes processed: 8000
Reads processed: 777247
Barcodes processed: 9000
Reads processed: 874904
Barcodes processed: 10000
Reads processed: 976876
Barcodes processed: 11000
Reads processed: 1095926
Barcodes processed: 12000
Reads processed: 1198768
Barcodes processed: 13000
Reads processed: 1300087
Barcodes processed: 14000
Reads processed: 1390618
Barcodes processed: 15000
Reads processed: 1510207
Barcodes processed: 16000
Reads processed: 1611438
Barcodes processed: 17000
Reads processed: 1707264
Barcodes processed: 18000
Reads processed: 1802124
Barcodes processed: 19000
Reads processed: 1896784
Barcodes processed:

For isoform: ((1284, 1350),)
The variants with the top 5 number of barcodes are:
[('jkp815:1228:A:C', 9), ('jkp815:1228:A:T', 9), ('jkp815:1227:C:T', 8), ('jkp815:1227:C:A', 8), ('jkp815:1364:G:T', 7)]
For isoform: ((1236, 1252), (1304, 1350))
The variants with the top 5 number of barcodes are:
[('jkp815:1321:A:T', 1), ('jkp815:1318:A:G', 1)]
For isoform: ((1000, 1350),)
The variants with the top 5 number of barcodes are:
[('jkp815:1228:A:T', 2), ('jkp815:1228:A:G,jkp815:1267:G:C,jkp815:1347:A:G', 1), ('jkp815:1273:AACCATTCCAGTGTAAAACTTGT:AT,jkp815:1316:C:T', 1), ('jkp815:1312:G:C', 1), ('jkp815:1232:T:G,jkp815:1385:T:G', 1)]
For isoform: ((1267, 1298), (1321, 1350))
The variants with the top 5 number of barcodes are:
[('jkp815:1247:G:A', 2), ('jkp815:1275:C:A', 1), ('jkp815:1235:G:T', 1), ('jkp815:1226:ACA:AA,jkp815:1347:A:C', 1), ('jkp815:1247:G:T', 1)]
For isoform: ((1267, 1293), (1334, 1359))
The variants with the top 5 number of barcodes are:
[('jkp815:1293:TGTCAGCGAAAGTTCTCCCGGTC

For isoform: ()
The variants with the top 5 number of barcodes are:
[('jkp815:1225:CAC:CC,jkp815:1392:T:A', 1), ('jkp815:1281:C:G', 1), ('jkp815:1319:C:A,jkp815:1344:C:T', 1), ('jkp815:1244:C:A,jkp815:1396:T:G', 1), ('jkp815:1259:C:G', 1)]
For isoform: ((1267, 1293), (1334, 1350))
The variants with the top 5 number of barcodes are:
[('jkp815:1378:A:G', 1), ('jkp815:1253:C:G', 1), ('jkp815:1320:C:T', 1), ('jkp815:1345:A:C', 1), ('jkp815:1250:C:G', 1)]
For isoform: ((1267, 1325), (1353, 1359))
The variants with the top 5 number of barcodes are:
[('jkp815:1362:G:T', 2), ('jkp815:1227:CAT:CT,jkp815:1236:G:A,jkp815:1325:GAAGACCCACACCAGGACTCATACAGGTA:GA,jkp815:1379:CATT:CT', 1), ('jkp815:1271:G:C,jkp815:1317:G:A', 1), ('jkp815:1294:G:C,jkp815:1384:A:G', 1), ('jkp815:1288:A:G', 1)]
For isoform: ((1291, 1350),)
The variants with the top 5 number of barcodes are:
[('jkp815:1243:GCT:GT,jkp815:1382:T:A', 1), ('jkp815:1352:T:C', 1)]
For isoform: ((1267, 1358),)
The variants with the top 5 number o

For isoform: ((1267, 1972), (3271, 3279))
The variants with the top 5 number of barcodes are:
[('jkp815:1308:T:G,jkp815:1313:G:T,jkp815:1352:T:C', 1)]
For isoform: ((1236, 1241), (1285, 1359))
The variants with the top 5 number of barcodes are:
[('jkp815:1253:C:A', 1), ('jkp815:1260:C:A,jkp815:1312:GGTCCGACCACCTGAAGACCCACACCAGGACTCATACAGGTAAA:GAGGTAAA,jkp815:1312:GGTCCGACCACCTGAAGACCCACACCAGGACTCATACAGGTAAA:GGTCCGACCACCTGAAGACCCACACCAGA', 1), ('jkp815:1311:C:T', 1), ('jkp815:1335:A:G', 1), ('jkp815:1285:G:A', 1)]
For isoform: ((1237, 1252), (1292, 1344), (2670, 2677))
The variants with the top 5 number of barcodes are:
[('jkp815:1327:A:G', 1)]
For isoform: ((1267, 1326), (1327, 1359))
The variants with the top 5 number of barcodes are:
[('jkp815:1225:CAC:CC,jkp815:1383:T:A', 1)]
For isoform: ((1267, 1340), (1964, 1969))
The variants with the top 5 number of barcodes are:
[('jkp815:1277:A:G', 1)]
For isoform: ((1136, 1350),)
The variants with the top 5 number of barcodes are:
[('jkp815:

9536968 total reads in the bam file
162845 (68.82%) barcodes had no variant in the subassembly
3576291 (37.50%) reads were associated with barcodes without variants in the subassembly
From the remaining barcodes/reads in the subassembly...
1462 (1.98%) barcodes failed the min_bc_max_reads filter
18179 (0.30%) reads failed the min_bc_max_reads filter
218 (0.30%) barcodes did not fulfill any filter
3810 (0.06%) reads did not fulfill any filter
0 (0.00%) reads were unmapped
0 (0.00%) reads were unpaired
21156 (0.35%) reads were secondary alignments
215246 (3.61%) reads were soft clipped
9878 (0.17%) reads were bad starts
18650 (0.31%) reads were bad ends
CPU times: user 1min 1s, sys: 381 ms, total: 1min 1s
Wall time: 1min 1s


In [42]:
# Fresh BAM handles for the lenient per-barcode summary below.
# NOTE(review): presumably needed because the previous summary consumed the read iterator - confirm.
msamp_rnabam = { samp: pysam.AlignmentFile( bam_fn, 'rb' )
                 for samp, bam_fn in msamp_fn.items() }

In [43]:
# Re-display the lenient merged isoform table before it is passed to the summary below.
isogrp_df_s

Unnamed: 0_level_0,isoform,BB_test_Cos1053_JKLab0340_MM1B_read_count
isonum,Unnamed: 1_level_1,Unnamed: 2_level_1
iso000,"((1267, 1278), (1305, 1350))",902
iso001,"((1136, 1359),)",72
iso002,"((1190, 1226), (1260, 1359))",90
iso003,"((1267, 1353), (2559, 2565))",2
iso004,"((1265, 1359),)",112
...,...,...
iso700,"((1260, 1359),)",8
iso701,"((1267, 1279), (1322, 1359))",668
iso702,"((1267, 1347), (2222, 2227))",1
iso703,"((1267, 1319), (2091, 2097), (2181, 2186))",1


In [45]:
%%time
#1 min/sample

# Per-library isoform / variant / barcode summary using the lenient (70/50) thresholds.
# NOTE(review): msamp_rnabam holds a single '...1053...' sample, so JKP1054 and JKP1055
# are summarised with an empty BAM dict here.
# NOTE(review): the canonical isoforms are passed as (1266, ...) while the observed calls and
# unique_jns use 1267 - presumably absorbed by spl_tol; confirm the coordinate convention.
iso_df_stats_s = {}

for lib, satbl in satbl_fn.items():

    # keep only the BAM handles whose sample name contains the library number
    lib_rnabams = { samp: rnabam
                    for samp, rnabam in msamp_rnabam.items()
                    if lib.replace( 'JKP', '' ) in samp }

    iso_df_stats_s[ lib ] = jn.summarize_isos_by_var_bc_pe( lib_rnabams,
                                                            [ ( 649, 696 ), ( 3478, 3533 ) ],
                                                            satbl,
                                                            isogrp_df_s,
                                                            unique_jns,
                                                            [ ( ( 1266, 1350 ), ), ( ( 1266, 1359 ), ), () ],
                                                            spl_tol = 3,
                                                            indel_tol = 20,
                                                            min_matches_for = 70,
                                                            min_matches_rev = 50,
                                                            bc_tag = 'BC' )

BB_test_Cos1053_JKLab0340_MM1B
Barcodes processed: 1000
Reads processed: 101205
Barcodes processed: 2000
Reads processed: 205641
Barcodes processed: 3000
Reads processed: 299930
Barcodes processed: 4000
Reads processed: 396153
Barcodes processed: 5000
Reads processed: 488837
Barcodes processed: 6000
Reads processed: 586554
Barcodes processed: 7000
Reads processed: 680654
Barcodes processed: 8000
Reads processed: 777247
Barcodes processed: 9000
Reads processed: 874904
Barcodes processed: 10000
Reads processed: 976876
Barcodes processed: 11000
Reads processed: 1095926
Barcodes processed: 12000
Reads processed: 1198768
Barcodes processed: 13000
Reads processed: 1300087
Barcodes processed: 14000
Reads processed: 1390618
Barcodes processed: 15000
Reads processed: 1510207
Barcodes processed: 16000
Reads processed: 1611438
Barcodes processed: 17000
Reads processed: 1707264
Barcodes processed: 18000
Reads processed: 1802124
Barcodes processed: 19000
Reads processed: 1896784
Barcodes processed:

For isoform: ((1273, 1350),)
The variants with the top 5 number of barcodes are:
[('jkp815:1384:A:T', 2), ('jkp815:1229:TT:AG,jkp815:1231:G:T,jkp815:1234:AGGGCCGA:AGGCCGA,jkp815:1249:C:G,jkp815:1315:C:G,jkp815:1397:T:C', 1), ('jkp815:1253:C:T', 1), ('jkp815:1261:A:T', 1), ('jkp815:1321:A:C', 1)]
For isoform: ((1268, 1359),)
The variants with the top 5 number of barcodes are:
[('jkp815:1227:C:T', 3), ('jkp815:1317:G:T', 2), ('jkp815:1265:AGGTGTGA:AGTGTGA', 2), ('jkp815:1387:T:A', 2), ('jkp815:1265:AGGTGTGA:AGTGTGA,jkp815:1352:T:G', 2)]
For isoform: ((1267, 1289), (1335, 1359))
The variants with the top 5 number of barcodes are:
[('jkp815:1289:AACTTGTCAGCGAAAGTTCTCCCGGTCCGACCACCTGAAGACCCACA:AA', 2), ('jkp815:1245:T:A', 1), ('jkp815:1267:GTG:GG,jkp815:1297:A:T,jkp815:1309:C:G,jkp815:1366:A:G', 1), ('jkp815:1332:C:G', 1), ('jkp815:1222:GCCCACAT:GCCACAT', 1)]
For isoform: ((1299, 1350),)
The variants with the top 5 number of barcodes are:
[('jkp815:1364:G:T', 2), ('jkp815:1228:A:C', 2), ('j

For isoform: ((1248, 1350),)
The variants with the top 5 number of barcodes are:
[('jkp815:1350:G:C', 1), ('jkp815:1256:T:C,jkp815:1299:C:G', 1), ('jkp815:1226:ACA:AA', 1), ('jkp815:1227:C:T,jkp815:1398:T:C', 1), ('jkp815:1240:G:C,jkp815:1283:G:T', 1)]
For isoform: ((1267, 1351), (1518, 1526))
The variants with the top 5 number of barcodes are:
[('jkp815:1352:TAAAACAAG:TAAAAACAAG', 1)]
For isoform: ((1267, 1355), (2041, 2045))
The variants with the top 5 number of barcodes are:
[('jkp815:1352:TAAAACAAG:TAAAAACAAG', 1)]
For isoform: ((1267, 1284), (1318, 1350))
The variants with the top 5 number of barcodes are:
[('jkp815:1228:A:C', 2), ('jkp815:1228:A:G', 2), ('jkp815:1366:A:C', 2), ('jkp815:1318:A:T', 2), ('jkp815:1345:A:G', 2)]
For isoform: ((1267, 1355), (2222, 2227))
The variants with the top 5 number of barcodes are:
[('jkp815:1352:TAAAACAAG:TAAAAACAAG', 1), ('jkp815:1298:G:T,jkp815:1308:TCCCGGT:TCCGGT,jkp815:1352:TAAAACAAG:TAAAAACAAG', 1)]
For isoform: ((1273, 1359),)
The variant

For isoform: ((1267, 1303), (1326, 1359))
The variants with the top 5 number of barcodes are:
[('jkp815:1322:C:T', 1), ('jkp815:1277:ATTCC:ATCC', 1)]
For isoform: ((1267, 1303), (1323, 1350))
The variants with the top 5 number of barcodes are:
[('jkp815:1231:G:T', 1), ('jkp815:1294:G:T', 1), ('jkp815:1235:G:A', 1), ('jkp815:1364:G:T', 1), ('jkp815:1366:A:T', 1)]
For isoform: ((1161, 1228), (1314, 1350))
The variants with the top 5 number of barcodes are:
[('jkp815:1231:G:C,jkp815:1372:T:G', 1), ('jkp815:1340:G:A', 1), ('jkp815:1298:G:A,jkp815:1385:T:G', 1), ('jkp815:1223:C:T,jkp815:1309:C:T', 1)]
For isoform: ((1267, 1344), (2594, 2601))
The variants with the top 5 number of barcodes are:
[('jkp815:1300:G:A,jkp815:1346:TACAGGTAA:TA', 1)]
For isoform: ((1236, 1256), (1300, 1350))
The variants with the top 5 number of barcodes are:
[('jkp815:1228:A:C', 1), ('jkp815:1260:C:G,jkp815:1396:T:C', 1)]
For isoform: ((879, 884), (1275, 1350))
The variants with the top 5 number of barcodes are:
[

For isoform: ((1267, 1319), (3023, 3031))
The variants with the top 5 number of barcodes are:
[('jkp815:1321:ACCTGAAGACCCACACCAGGA:AA', 1)]
For isoform: ((1267, 1321), (1655, 1664))
The variants with the top 5 number of barcodes are:
[('jkp815:1321:ACCTGAAGACCCACACCAGGA:AA', 1)]
For isoform: ((1267, 2234),)
The variants with the top 5 number of barcodes are:
[('jkp815:1390:C:T', 1), ('jkp815:1309:C:G,jkp815:1351:G:T', 1)]
For isoform: ((985, 1226), (1333, 1350))
The variants with the top 5 number of barcodes are:
[('jkp815:1272:A:G,jkp815:1297:A:G', 1), ('jkp815:1289:A:G', 1)]
For isoform: ((1267, 1349), (1567, 2601))
The variants with the top 5 number of barcodes are:
[('jkp815:1268:T:C', 1)]
For isoform: ((1267, 2227),)
The variants with the top 5 number of barcodes are:
[('jkp815:1226:ACA:AA,jkp815:1276:C:A', 1), ('jkp815:1288:A:C,jkp815:1336:C:A,jkp815:1343:TCATACAGGTAA:TA', 1), ('jkp815:1292:T:A', 1), ('jkp815:1288:A:C,jkp815:1306:T:C,jkp815:1336:C:A,jkp815:1343:TCATACAGGTAA:TA', 

CPU times: user 1min 2s, sys: 221 ms, total: 1min 2s
Wall time: 1min 2s


In [46]:
# Strict (90/70) summary for library JKP1053 - the only library with a matching BAM sample.
iso_df_stats_l[ 'JKP1053' ]

Unnamed: 0_level_0,isoform,BB_test_Cos1053_JKLab0340_MM1B_read_count,BB_test_Cos1053_JKLab0340_MM1B_num_bcs,BB_test_Cos1053_JKLab0340_MM1B_num_vars,BB_test_Cos1053_JKLab0340_MM1B_max_reads_per_bc,BB_test_Cos1053_JKLab0340_MM1B_max_bc_per_var,BB_test_Cos1053_JKLab0340_MM1B_filter,total_read_count,total_num_bcs,total_num_vars,total_passfilt
isonum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
iso000,"((1267, 1278), (1305, 1350))",901,55.0,54.0,3.0,2.0,5.0,901,55.0,54.0,1
iso001,"((1136, 1359),)",72,6.0,6.0,1.0,1.0,0.0,72,6.0,6.0,0
iso002,"((1190, 1226), (1260, 1359))",90,7.0,7.0,79.0,1.0,0.0,90,7.0,7.0,0
iso003,"((1267, 1353), (2559, 2565))",2,1.0,1.0,1.0,1.0,0.0,2,1.0,1.0,0
iso004,"((1265, 1359),)",112,8.0,8.0,62.0,1.0,5.0,112,8.0,8.0,1
...,...,...,...,...,...,...,...,...,...,...,...
iso598,"((1260, 1359),)",8,2.0,2.0,1.0,1.0,0.0,8,2.0,2.0,0
iso599,"((1267, 1279), (1322, 1359))",666,38.0,38.0,1.0,1.0,0.0,666,38.0,38.0,0
iso600,"((1267, 1347), (2222, 2227))",1,1.0,1.0,1.0,1.0,0.0,1,1.0,1.0,0
iso601,"((1267, 1319), (2091, 2097), (2181, 2186))",1,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0


In [47]:
# Lenient (70/50) summary for library JKP1053 - the only library with a matching BAM sample.
iso_df_stats_s[ 'JKP1053' ]

Unnamed: 0_level_0,isoform,BB_test_Cos1053_JKLab0340_MM1B_read_count,BB_test_Cos1053_JKLab0340_MM1B_num_bcs,BB_test_Cos1053_JKLab0340_MM1B_num_vars,BB_test_Cos1053_JKLab0340_MM1B_max_reads_per_bc,BB_test_Cos1053_JKLab0340_MM1B_max_bc_per_var,BB_test_Cos1053_JKLab0340_MM1B_filter,total_read_count,total_num_bcs,total_num_vars,total_passfilt
isonum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
iso000,"((1267, 1278), (1305, 1350))",902,55.0,54.0,3.0,2.0,5.0,902,55.0,54.0,1
iso001,"((1136, 1359),)",72,6.0,6.0,1.0,1.0,0.0,72,6.0,6.0,0
iso002,"((1190, 1226), (1260, 1359))",90,7.0,7.0,79.0,1.0,0.0,90,7.0,7.0,0
iso003,"((1267, 1353), (2559, 2565))",2,1.0,1.0,1.0,1.0,0.0,2,1.0,1.0,0
iso004,"((1265, 1359),)",112,8.0,8.0,62.0,1.0,5.0,112,8.0,8.0,1
...,...,...,...,...,...,...,...,...,...,...,...
iso700,"((1260, 1359),)",8,2.0,2.0,1.0,1.0,0.0,8,2.0,2.0,0
iso701,"((1267, 1279), (1322, 1359))",668,38.0,38.0,1.0,1.0,0.0,668,38.0,38.0,0
iso702,"((1267, 1347), (2222, 2227))",1,1.0,1.0,1.0,1.0,0.0,1,1.0,1.0,0
iso703,"((1267, 1319), (2091, 2097), (2181, 2186))",1,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0


In [48]:
# Barcode / variant statistics of the 'soft_clipped' category in the strict JKP1053 summary.
iso_df_stats_l[ 'JKP1053' ].loc[ iso_df_stats_l[ 'JKP1053' ].isoform == 'soft_clipped' ]

Unnamed: 0_level_0,isoform,BB_test_Cos1053_JKLab0340_MM1B_read_count,BB_test_Cos1053_JKLab0340_MM1B_num_bcs,BB_test_Cos1053_JKLab0340_MM1B_num_vars,BB_test_Cos1053_JKLab0340_MM1B_max_reads_per_bc,BB_test_Cos1053_JKLab0340_MM1B_max_bc_per_var,BB_test_Cos1053_JKLab0340_MM1B_filter,total_read_count,total_num_bcs,total_num_vars,total_passfilt
isonum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
iso595,soft_clipped,1185451,18821.0,5752.0,4405.0,198.0,0.0,1185451,18821.0,5752.0,0


In [49]:
# Table dimensions per library for the strict summaries.
for lib, stats_l in iso_df_stats_l.items():
    print( lib, stats_l.shape )

JKP1053 (603, 11)
JKP1054 (603, 6)
JKP1055 (603, 6)


In [50]:
# Table dimensions per library for the lenient summaries.
for lib, stats_s in iso_df_stats_s.items():
    print( lib, stats_s.shape )

JKP1053 (705, 11)
JKP1054 (705, 6)
JKP1055 (705, 6)


In [51]:
# How many isoforms pass at least one filter (total_passfilt > 0) per library.
for lib, stats_s in iso_df_stats_s.items():
    passing = stats_s.query( 'total_passfilt > 0' )
    print( lib, passing.shape )

JKP1053 (68, 11)
JKP1054 (0, 6)
JKP1055 (0, 6)


In [52]:
# Percentage of reads that belong to isoforms failing every filter (total_passfilt == 0).
for lib, stats_s in iso_df_stats_s.items():
    failing_reads = stats_s.query( 'total_passfilt == 0' ).total_read_count.sum()
    all_reads = stats_s.total_read_count.sum()
    print( lib, ( failing_reads / all_reads )*100 )

JKP1053 2.2095141206275732
JKP1054 100.0
JKP1055 100.0


In [53]:
# Percentage of barcodes that belong to isoforms failing every filter (total_passfilt == 0).
# Libraries without any barcodes give nan because the denominator is zero.
for lib, stats_s in iso_df_stats_s.items():
    failing_bcs = stats_s.query( 'total_passfilt == 0' ).total_num_bcs.sum()
    all_bcs = stats_s.total_num_bcs.sum()
    print( lib, ( failing_bcs / all_bcs )*100 )

JKP1053 12.071627452791597
JKP1054 nan
JKP1055 nan


  
  


In [54]:
# Final look at the lenient JKP1053 summary before it is written to disk.
iso_df_stats_s[ 'JKP1053' ]

Unnamed: 0_level_0,isoform,BB_test_Cos1053_JKLab0340_MM1B_read_count,BB_test_Cos1053_JKLab0340_MM1B_num_bcs,BB_test_Cos1053_JKLab0340_MM1B_num_vars,BB_test_Cos1053_JKLab0340_MM1B_max_reads_per_bc,BB_test_Cos1053_JKLab0340_MM1B_max_bc_per_var,BB_test_Cos1053_JKLab0340_MM1B_filter,total_read_count,total_num_bcs,total_num_vars,total_passfilt
isonum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
iso000,"((1267, 1278), (1305, 1350))",902,55.0,54.0,3.0,2.0,5.0,902,55.0,54.0,1
iso001,"((1136, 1359),)",72,6.0,6.0,1.0,1.0,0.0,72,6.0,6.0,0
iso002,"((1190, 1226), (1260, 1359))",90,7.0,7.0,79.0,1.0,0.0,90,7.0,7.0,0
iso003,"((1267, 1353), (2559, 2565))",2,1.0,1.0,1.0,1.0,0.0,2,1.0,1.0,0
iso004,"((1265, 1359),)",112,8.0,8.0,62.0,1.0,5.0,112,8.0,8.0,1
...,...,...,...,...,...,...,...,...,...,...,...
iso700,"((1260, 1359),)",8,2.0,2.0,1.0,1.0,0.0,8,2.0,2.0,0
iso701,"((1267, 1279), (1322, 1359))",668,38.0,38.0,1.0,1.0,0.0,668,38.0,38.0,0
iso702,"((1267, 1347), (2222, 2227))",1,1.0,1.0,1.0,1.0,0.0,1,1.0,1.0,0
iso703,"((1267, 1319), (2091, 2097), (2181, 2186))",1,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0


In [55]:
# Output directory; the trailing '/' matters because file names are appended by plain string concatenation.
bdout = '/nfs/kitzman2/smithcat/proj/wt1_2022/ex9_data/'

In [56]:
# Write the lenient JKP1053 summary as a tab-separated file, with 'isonum' as a regular column.
onesamp_tbl = iso_df_stats_s[ 'JKP1053' ].reset_index()

onesamp_tbl.to_csv( bdout + 'wt1_ex9_isoforms_onesamp_2022-0517.txt',
                    sep = '\t',
                    index = False )