In [1]:
%load_ext autoreload
%autoreload 2

import os, sys, shutil, bz2, copy
from pathlib import Path
import pandas as pd
pd.set_option('display.max_columns', 50)
import numpy as np

### Plotting imports ###
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.colors as mcolors
import matplotlib as mpl
from matplotlib.patches import StepPatch
import matplotlib.ticker as ticker
import matplotlib.gridspec as gridspec
import logomaker as lm
# Names of matplotlib's Tableau colors, kept as a plain list:
palette = list(mcolors.TABLEAU_COLORS.keys())
# Global seaborn styling: "ticks" style with the "muted" palette,
# using the "talk" plotting context:
sns.set_theme(style="ticks", palette="muted")
sns.set_context("talk")
%matplotlib inline

  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)


In [2]:
# Remember the notebook folder the first time this cell runs, so that
# re-running it after the working dir was changed brings us back here:
if 'NBdir' not in globals():
    NBdir = os.getcwd()
print('Notebook is in: {}'.format(NBdir))
os.chdir(NBdir)  # If you changed the current working dir, this will take you back to the notebook dir.

# Define the path to the repo folder (two levels above the notebook folder).
# Change if necessary.
# os.path.dirname is used instead of splitting on '/' so that the
# result does not depend on the path separator.
homedir = os.path.dirname(os.path.dirname(NBdir))
print('Repo is in: {}'.format(homedir))
# Make the repo folder importable. The check avoids piling up
# duplicate sys.path entries when this cell is re-run:
if homedir not in sys.path:
    sys.path.insert(1, homedir)
from src.misc import index_to_sample_df, downsample_raw_input, read_tRNAdb_info, sample_df_to_dict
from src.read_processing import AR_merge, BC_split, Kmer_analysis, BC_analysis, UMI_trim
from src.alignment import SWIPE_align
from src.stats_collection import STATS_collection
from src.plotting import TRNA_plot
from src.transcript_mutations import TM_analysis

# Default folder names for the data and the raw fastq files,
# given relative to the folder holding this notebook:
data_dir = 'data'
seq_dir = 'raw_fastq'
seq_dir_noDS = seq_dir # Not downsampled

# Folder names that the later processing steps
# dump their data into. Best to not change:
AdapterRemoval_dir = 'AdapterRemoval'
BC_dir = 'BC_split'
UMI_dir = 'UMI_trimmed'
align_dir = 'SWalign'
stats_dir = 'stats_collection'
TM_dir = 'transcript_mutations'
plotting_dir = 'plotting'

# Reference tRNA sequences, keyed by species (plain and masked versions):
tRNA_database = {
    'human': '{}/tRNA_database/human/hg38-tRNAs.fa'.format(homedir),
}
tRNA_database_masked = {
    'human': '{}/tRNA_database_masked/human/human-tRNAs.fa'.format(homedir),
}
# Read information (length, codon etc) of tRNAs into dictionary:
tRNA_data = read_tRNAdb_info(tRNA_database)

# Score matrices for the alignment:
SWIPE_score_mat = '{}/utils/nuc_score-matrix.txt'.format(homedir)
SWIPE_score_mat2 = '{}/utils/nuc_score-matrix_2.txt'.format(homedir) # For masked reference sequences

# tRNA sequencing yields many duplicated reads.
# Adding these commonly seen sequences to a list prevents duplicated alignment:
common_seqs = '{}/utils/common-seqs.fasta.bz2'.format(homedir)

# The minimum read length is the sum of the minimum insert size,
# the UMI length and the maximum barcode length:
MIN_INSERT_LEN = 10
UMI_LEN = 10
BC_MAX_LEN = 19
MIN_READ_LEN = sum((MIN_INSERT_LEN, UMI_LEN, BC_MAX_LEN))
print('Using minimum read length: {} (after merge)'.format(MIN_READ_LEN))

# Load the index information from the Excel sheet in the repo's utils folder:
index_list_fnam = 'index_list.xlsx'
index_list_path = '{}/utils/{}'.format(homedir, index_list_fnam)
index_df = pd.read_excel(index_list_path)

Notebook is in: /home/sulab/tRNA-charge-seq/collab/Alicia_2022-12-28
Repo is in: /home/sulab/tRNA-charge-seq
Using minimum read length: 39 (after merge)


In [3]:
### Input settings ###
sample_list_fnam = 'sample_list.xlsx'
sample_df = pd.read_excel('{}/{}'.format(NBdir, sample_list_fnam))
# Add barcode sequences:
sample_df = index_to_sample_df(sample_df, index_df)
# Read elementary info (replicate, barcode, species)
# for each unique sample name into a dictionary:
sample_dict = sample_df_to_dict(sample_df)
# Get filenames from the sample information:
# one row per unique combination of fastq file pair and P5/P7 index.
# drop_duplicates() already returns a new frame, so no explicit copy is needed.
inp_file_cols = ['fastq_mate1_filename', 'fastq_mate2_filename',
                 'P5_index', 'P7_index',
                 'P5_index_seq', 'P7_index_seq']
inp_file_df = sample_df[inp_file_cols].drop_duplicates().reset_index(drop=True)

# Downsample:
# Set DOWNSAMPLE to True to enable downsampling of the raw input.
DOWNSAMPLE = False
DOWNSAMPLE_ABSOLUTE = 1e4 # Passed on as `downsample_absolute`
if DOWNSAMPLE:
    sample_df, inp_file_df, seq_dir = downsample_raw_input(sample_df, inp_file_df, NBdir, data_dir, seq_dir_noDS, downsample_absolute=DOWNSAMPLE_ABSOLUTE)

# Make a dictionary with paths used for data processing:
dir_dict = dict(NBdir = NBdir,
                data_dir = data_dir,
                seq_dir = seq_dir,
                AdapterRemoval_dir = AdapterRemoval_dir,
                BC_dir = BC_dir,
                UMI_dir = UMI_dir,
                align_dir = align_dir,
                stats_dir = stats_dir,
                TM_dir = TM_dir,
                plotting_dir = plotting_dir)

In [4]:
### Run AdapterRemoval ###
# Set up the step without overwriting its folder and without input check,
# then run it over the input files with two parallel jobs:
AR_obj = AR_merge(
    dir_dict, inp_file_df, MIN_READ_LEN,
    overwrite_dir=False, check_input=False,
)
inp_file_df = AR_obj.run_parallel(n_jobs=2, overwrite=False)

Using existing folder because overwrite set to false: /home/sulab/tRNA-charge-seq/collab/Alicia_2022-12-28/data/AdapterRemoval


In [5]:
### Split files based on barcodes ###
# With load_previous=True, results of an earlier run are requested
# instead of a fresh barcode split:
BCsplit_obj = BC_split(
    dir_dict, sample_df, inp_file_df,
    overwrite_dir=False,
)
sample_df, inp_file_df = BCsplit_obj.run_parallel(
    n_jobs=2, load_previous=True,
)

Using existing folder because overwrite set to false: /home/sulab/tRNA-charge-seq/collab/Alicia_2022-12-28/data/BC_split
Loaded results from previous run... Not running barcode split.


In [6]:
### Generate UMI stats and write final trimmed tRNA sequences ###
UMItrim_obj = UMI_trim(
    dir_dict, sample_df,
    overwrite_dir=False, downsample_absolute=2e6,
)
sample_df = UMItrim_obj.run_parallel(n_jobs=4, load_previous=True)
# Show the first rows of the updated sample table:
sample_df.head(3)

Using existing folder because overwrite set to false: /home/sulab/tRNA-charge-seq/collab/Alicia_2022-12-28/data/UMI_trimmed
Loaded results from previous run... Not running UMI trimming.


Unnamed: 0,sample_name_unique,sample_name,replicate,fastq_mate1_filename,fastq_mate2_filename,P5_index,P7_index,barcode,species,plot_group,hue_name,hue_value,hue_order,P5_index_seq,P7_index_seq,barcode_seq,N_total,N_CC,N_CCA,N_CCA+CC,CCA+CC_percent_total,percent_CCA,N_after_trim,N_UMI_observed,N_UMI_expected,percent_seqs_after_UMI_trim,percent_UMI_obs-vs-exp,N_after_downsample
0,L1,L1,1,2022-12-28/P8_R1.fastq.bz2,2022-12-28/P8_R2.fastq.bz2,D503,D702,l1Sp,human,293t-NoLeu,Sample name,L1,1,AGGATAGG,TCCGGAGA,GGCTGCCATGCGACTA,6499165,314837,6142668,6457505,99.358995,95.124479,6385292,520600,524285.306593,98.247883,99.29708,2000000
1,L2,L2,1,2023-02-28/P6_R1.fastq.bz2,2023-02-28/P6_R2.fastq.bz2,D502,D703,l1Sp,human,293t-NoLeu,Sample name,L2,2,GCCTCTAT,CGCTCATT,GGCTGCCATGCGACTA,5002866,336642,4622921,4959563,99.134436,93.212265,4910419,516459,524243.123847,98.152119,98.515169,2000000
2,L3,L3,1,2022-12-28/P8_R1.fastq.bz2,2022-12-28/P8_R2.fastq.bz2,D503,D702,l2Sp,human,293t-NoLeu,Sample name,L3,3,AGGATAGG,TCCGGAGA,GGCTGCCATGCTGTCACG,4093393,358779,3698162,4056941,99.109492,91.156416,4021035,508194,524043.241134,98.232322,96.975585,2000000


In [7]:
### Align reads to database of reference tRNAs ###
# The masked reference database is used together with the
# score matrix meant for masked reference sequences.
# NOTE(review): unlike the other steps in this notebook, this one
# overwrites its folder and does not load previous results, while the
# stats collection step below loads previous results -- confirm
# that this combination is intended.
align_obj = SWIPE_align(
    dir_dict, tRNA_database_masked, sample_df, SWIPE_score_mat2,
    gap_penalty=6, extension_penalty=3, min_score_align=15,
    common_seqs=common_seqs, overwrite_dir=True,
)
sample_df = align_obj.run_parallel(
    n_jobs=4, overwrite=True, load_previous=False,
)
# Show the first rows of the updated sample table:
sample_df.head(3)

Using common sequences to prevent duplicated alignment.
Running Swipe on:  L1  L4  L3  L2  L5  L7  L8  L6  L10  L9  L11  L12  R1  R2  R3  R4  R5  R8  R6  R7  R9  R10  R12  R11  2Q3  2Q1  2Q2  2Q4  2Q6  2Q5  GQ1  GQ2  GQ3  GQ4  GQ5  GQ6  H1  HL1  HL2  HR1  HR2  HQ1  HQ2  HG1  HGL1  HGL2  HGR1  HGR2  HGQ1  common-seqs  HGQ2
Collecting alignment statistics, from sample:  L4  L2  L3  L1  L5  L7  L6  L8  L9  L10  L12  L11  R1  R2  R3  R4  R8  R5  R7  R6  R9  R10  R12  R11  2Q3  2Q1  2Q2  2Q4  2Q6  2Q5  GQ1  GQ2  GQ3  GQ4  GQ5  GQ6  HL2  HL1  H1  HR1  HR2  HQ1  HQ2  HG1  HGL1  HGL2  HGR2  HGR1  HGQ1  HGQ2  common-seqs

Unnamed: 0,sample_name_unique,sample_name,replicate,fastq_mate1_filename,fastq_mate2_filename,P5_index,P7_index,barcode,species,plot_group,hue_name,hue_value,hue_order,P5_index_seq,P7_index_seq,barcode_seq,N_total,N_CC,N_CCA,N_CCA+CC,CCA+CC_percent_total,percent_CCA,N_after_trim,N_UMI_observed,N_UMI_expected,percent_seqs_after_UMI_trim,percent_UMI_obs-vs-exp,N_after_downsample,N_mapped,percent_single_annotation,percent_multiple_annotation,percent_multiple_codons,Mapping_percent
0,L1,L1,1,2022-12-28/P8_R1.fastq.bz2,2022-12-28/P8_R2.fastq.bz2,D503,D702,l1Sp,human,293t-NoLeu,Sample name,L1,1,AGGATAGG,TCCGGAGA,GGCTGCCATGCGACTA,6499165,314837,6142668,6457505,99.358995,95.124479,6385292,520600,524285.306593,98.247883,99.29708,2000000,1986745.0,81.428165,18.571835,2.586291,99.33725
1,L2,L2,1,2023-02-28/P6_R1.fastq.bz2,2023-02-28/P6_R2.fastq.bz2,D502,D703,l1Sp,human,293t-NoLeu,Sample name,L2,2,GCCTCTAT,CGCTCATT,GGCTGCCATGCGACTA,5002866,336642,4622921,4959563,99.134436,93.212265,4910419,516459,524243.123847,98.152119,98.515169,2000000,1986072.0,81.314978,18.685022,2.417536,99.3036
2,L3,L3,1,2022-12-28/P8_R1.fastq.bz2,2022-12-28/P8_R2.fastq.bz2,D503,D702,l2Sp,human,293t-NoLeu,Sample name,L3,3,AGGATAGG,TCCGGAGA,GGCTGCCATGCTGTCACG,4093393,358779,3698162,4056941,99.109492,91.156416,4021035,508194,524043.241134,98.232322,96.975585,2000000,1975947.0,80.374423,19.625577,2.643796,98.79735


In [8]:
### Collect alignment statistics ###
stats_obj = STATS_collection(
    dir_dict, tRNA_data, sample_df,
    common_seqs=common_seqs, overwrite_dir=False,
)
stats_df = stats_obj.run_parallel(n_jobs=4, load_previous=True)
# Show the first rows of the collected statistics:
stats_df.head(3)

Using existing folder because overwrite set to false: /home/sulab/tRNA-charge-seq/projects/Alicia_2022-12-28/data/stats_collection
Loaded results from previous run... Not running stats collection.


Unnamed: 0,sample_name_unique,sample_name,replicate,barcode,species,tRNA_annotation,tRNA_annotation_len,unique_annotation,5p_cover,align_3p_nt,codon,anticodon,amino_acid,align_gap,fmax_score>0.9,count,UMIcount,UMI_percent_exp
0,L1,L1,1,l1Sp,human,Escherichia_coli_str_K_12_substr_MG1655_tRNA-e...,76,True,False,A,AAA,TTT,eColiLys,False,False,1,1,100.0
1,L1,L1,1,l1Sp,human,Escherichia_coli_str_K_12_substr_MG1655_tRNA-e...,76,True,False,A,AAA,TTT,eColiLys,False,True,3,3,100.000191
2,L1,L1,1,l1Sp,human,Escherichia_coli_str_K_12_substr_MG1655_tRNA-e...,76,True,False,C,AAA,TTT,eColiLys,False,False,62,62,100.005818


In [9]:
### Generate standard tRNAseq data plots ###
# The plotting object created here is reused by all plotting cells below:
plot_obj = TRNA_plot(
    dir_dict, sample_df,
    overwrite_dir=False, pull_default=False,
)

Folder exists and overwrite set to false... Doing nothing.


In [10]:
# Ecoli control:
# Plot the E. coli control through the plotting object.
# NOTE(review): min_obs is presumably a minimum observation count -- confirm in src.plotting.
plot_obj.plot_Ecoli_ctr(min_obs=100)

In [11]:
# Settings shared by both codon barcharts:
codon_plot_kwargs = dict(plot_type='codon', group=True, min_obs=500)

# Codon abundance barchart:
plot_obj.plot_abundance(plot_name='codon_abundance_grp', **codon_plot_kwargs)

# Codon charge barchart:
plot_obj.plot_abundance(plot_name='codon_charge_grp', charge_plot=True, **codon_plot_kwargs)


Now plotting sample/group:  293t-NoArg  293t-NoGln  293t-NoLeu  HCT116-Depletion
Now plotting sample/group:  293t-NoArg  293t-NoGln  293t-NoLeu  HCT116-Depletion

In [None]:
# Abundance correlation:
# NOTE(review): the two lists are presumably paired by position
# (first entry of each list forms one comparison, and so on) -- confirm.
corr_sample_pairs = [['L1', 'L1', 'R1'],
                     ['R1', 'L6', 'R6']]
plot_obj.plot_abundance_corr(
    sample_unique_pairs=corr_sample_pairs,
    plot_type='transcript', plot_name='tr_abundance_corr_L-R',
    min_obs=500, charge_plot=False, log=True,
)

In [12]:
# Coverage plots for cyto/mito transcripts.
# Both plots share every setting except compartment and plot name:
coverage_kwargs = dict(plot_type='behrens', y_norm=True, n_jobs=4)

plot_obj.plot_coverage(compartment='cyto', plot_name='cov_plot_cyto_behrens_norm', **coverage_kwargs)

plot_obj.plot_coverage(compartment='mito', plot_name='cov_plot_mito_behrens_norm', **coverage_kwargs)


Now collecting data for sample:  L2  L1  L3  L4  L7  L5  L6  L8  L9  L10  L12  L11  R1  R2  R4  R3  R5  R7  R6  R8  R10  R9  R11  R12  2Q4  2Q1  2Q3  2Q2  2Q6  2Q5  GQ1  GQ2  GQ3  GQ4  GQ5  GQ6  HL1  H1  HL2  HR1  HQ1  HR2  HQ2  HG1  HGL1  HGL2  HGR1  HGR2  HGQ1  HGQ2
Now plotting sample:  L1  L2  L3  L4  L5  L6  L7  L8  L9  L10  L11  L12  R1  R2  R3  R4  R5  R6  R7  R8  R9  R10  R11  R12  2Q1  2Q2  2Q3  2Q4  2Q5  2Q6  GQ1  GQ2  GQ3  GQ4  GQ5  GQ6  H1  HL1  HL2  HR1  HR2  HQ1  HQ2  HG1  HGL1  HGL2  HGR1  HGR2  HGQ1  HGQ2
Now collecting data for sample:  L1  L2  L3  L4  L5  L6  L7  L8  L9  L10  L11  L12  R1  R2  R3  R4  R5  R8  R7  R6  R9  R10  R11  R12  2Q4  2Q1  2Q2  2Q3  2Q5  2Q6  GQ1  GQ2  GQ3  GQ5  GQ4  GQ6  H1  HL1  HL2  HR1  HR2  HQ1  HQ2  HG1  HGL1  HGR1  HGL2  HGR2  HGQ1  HGQ2
Now plotting sample:  L1  L2  L3  L4  L5  L6  L7  L8  L9  L10  L11  L12  R1  R2  R3  R4  R5  R6  R7  R8  R9  R10  R11  R12  2Q1  2Q2  2Q3  2Q4  2Q5  2Q6  GQ1  GQ2  GQ3  GQ4  GQ5  GQ6  H1  HL1  HL2  HR1  

In [13]:
# UMI logo:
# Make the UMI logo plot through the plotting object, requesting n_jobs=4.
plot_obj.plot_UMI_logo(n_jobs=4)


Now collecting data for sample:  L1  L2  L3  L4  L5  L7  L6  L8  L9  L10  L12  L11  R1  R3  R2  R4  R6  R8  R5  R7  R9  R10  R11  R12  2Q1  2Q4  2Q2  2Q3  2Q5  2Q6  GQ1  GQ2  GQ3  GQ4  GQ6  GQ5  H1  HL1  HL2  HR1  HR2  HQ1  HQ2  HG1  HGL1  HGL2  HGR1  HGR2  HGQ1  HGQ2
Now plotting logo plot.

In [14]:
# Non-template nucleotides:
# Logo for the 5p end:
plot_obj.plot_non_temp(
    end='5p', plot_name='_5p-non-template_logo',
    seq_len_percentile=99, n_jobs=4,
)

# Logo for the 3p end:
plot_obj.plot_non_temp(
    end='3p', plot_name='_3p-non-template_logo',
    seq_len_percentile=99.9, n_jobs=4,
)

# Logo for the 3p end, with the _3p_cover option switched on:
plot_obj.plot_non_temp(
    end='3p', plot_name='_3p-non-template_3p-cover_logo',
    seq_len_percentile=99.9, _3p_cover=True, n_jobs=4,
)


Now collecting data for sample:  L1  L2  L3  L4  L5  L7  L6  L8  L9  L11  L12  L10  R1  R3  R2  R4  R8  R5  R6  R7  R9  R10  R11  R12  2Q4  2Q1  2Q2  2Q3  2Q5  2Q6  GQ1  GQ2  GQ3  GQ4  GQ5  GQ6  H1  HL1  HL2  HR1  HR2  HQ1  HQ2  HG1  HGL1  HGL2  HGR1  HGR2  HGQ1  HGQ2
Now plotting logo plot.
Now collecting data for sample:  L1  L2  L3  L4  L5  L6  L7  L8  L9  L10  L12  L11  R1  R2  R3  R4  R8  R5  R7  R6  R9  R10  R12  R11  2Q3  2Q1  2Q2  2Q4  2Q6  2Q5  GQ1  GQ2  GQ4  GQ3  GQ5  GQ6  H1  HL1  HL2  HR1  HR2  HQ2  HQ1  HG1  HGL1  HGL2  HGR2  HGR1  HGQ1  HGQ2
Now plotting logo plot.
Now collecting data for sample:  L1  L2  L3  L4  L5  L6  L7  L8  L9  L10  L12  L11  R2  R1  R3  R4  R8  R5  R6  R7  R9  R10  R11  R12  2Q4  2Q1  2Q2  2Q3  2Q6  2Q5  GQ1  GQ2  GQ4  GQ3  GQ5  GQ6  H1  HL1  HL2  HR1  HR2  HQ1  HQ2  HG1  HGL1  HGR1  HGL2  HGR2  HGQ1  HGQ2
Now plotting logo plot.

In [12]:
# Export data:
# One charge table is written per `df_type`. The date tag is defined
# once so that all exported filenames stay in sync.
charge_export_date = '2022-12-28'
for df_type in ['aa', 'codon', 'transcript']:
    plot_obj.write_charge_df(df_type=df_type, fnam='charge-df_{}_{}'.format(df_type, charge_export_date))

In [13]:
### Perform transcript mutation analysis ###
# Note that the unmasked tRNA database is passed here:
TM_obj = TM_analysis(
    dir_dict, sample_df, tRNA_database,
    pull_default=False,
    common_seqs=common_seqs, ignore_common_count=False,
    overwrite_dir=False,
)

Using common sequences...
Folder exists and overwrite set to false... Doing nothing.


In [14]:
# Find mutations and save them.
# Set RECOMPUTE_MUTS to True to search for mutations again and write the
# result to the pickle; otherwise the previously written pickle is read.
RECOMPUTE_MUTS = False
# Single definition of the filename so that write and read always agree:
muts_pickle_name = 'saved_muts_unique-anno.pickle'
if RECOMPUTE_MUTS:
    TM_obj.find_muts(n_jobs=4, unique_anno=True)
    TM_obj.pickle_muts_write(pickle_name=muts_pickle_name)
else:
    # NOTE(review): presumably this unpickles the file -- only read
    # pickles that were written by this pipeline.
    TM_obj.pickle_muts_read(pickle_name=muts_pickle_name)

In [23]:
# Compare mutations/gaps/RT stops for Leu/Arg depleted samples.
# The same comparisons are plotted for each data type, so the sample
# pairs are defined once and the call is repeated in a loop.
# NOTE(review): the two lists are presumably paired by position -- confirm.
leu_arg_sample_pairs = [['L1', 'L8',  'L1', 'L6',
                         'R1', 'R8',  'R1', 'R6'],
                        ['L6', 'L12', 'L8', 'L12',
                         'R6', 'R12', 'R8', 'R12']]
for data_type in ['mut', 'gap', 'RTstops']:
    TM_obj.plot_transcript_mut_compare(
        species='human',
        plot_name='NoLeu-NoArg_tr-{}_matrix_comp_top10-max-diff'.format(data_type),
        no_plot_return=True,
        mito=False,
        min_count_show=500,
        data_type=data_type,
        # Fresh copies per call, so each call gets its own lists:
        sample_unique_pairs=[list(names) for names in leu_arg_sample_pairs],
        topN=10, topN_select='max_diff',
    )

In [24]:
# Compare mutations/gaps/RT stops for Gln depleted samples.
# The same comparisons are plotted for each data type, so the sample
# pairs are defined once and the call is repeated in a loop.
# NOTE(review): the two lists are presumably paired by position -- confirm.
gln_sample_pairs = [['2Q1', '2Q1', '2Q1', 'GQ1',
                     'GQ1', 'GQ1', '2Q1', '2Q6'],
                    ['2Q3', '2Q4', '2Q6', 'GQ3',
                     'GQ4', 'GQ6', 'GQ1', 'GQ6']]
for data_type in ['mut', 'gap', 'RTstops']:
    TM_obj.plot_transcript_mut_compare(
        species='human',
        plot_name='NoGln_tr-{}_matrix_comp_top10-max-diff'.format(data_type),
        no_plot_return=True,
        mito=False,
        min_count_show=500,
        data_type=data_type,
        # Fresh copies per call, so each call gets its own lists:
        sample_unique_pairs=[list(names) for names in gln_sample_pairs],
        topN=10, topN_select='max_diff',
    )

In [25]:
# Compare mutations/gaps/RT stops for Leu/Arg/Gln depleted samples (HCT116).
# The same comparisons are plotted for each data type, so the sample
# pairs are defined once and the call is repeated in a loop.
# NOTE(review): the two lists are presumably paired by position -- confirm.
hct_sample_pairs = [['H1',   'H1',   'H1',
                     'HG1',  'HG1',  'HG1'],
                    ['HL2',  'HR1',  'HQ2',
                     'HGL2', 'HGR2', 'HGQ2']]
for data_type in ['mut', 'gap', 'RTstops']:
    TM_obj.plot_transcript_mut_compare(
        species='human',
        plot_name='HCT_tr-{}_matrix_comp_top10-max-diff'.format(data_type),
        no_plot_return=True,
        mito=False,
        min_count_show=500,
        data_type=data_type,
        # Fresh copies per call, so each call gets its own lists:
        sample_unique_pairs=[list(names) for names in hct_sample_pairs],
        topN=10, topN_select='max_diff',
    )

In [28]:
# Export the mutation/gap/RT stops data:
# One file is written per data type; the csv name is derived from the
# data type so the two cannot get out of sync.
for data_type in ['mut', 'gap', 'RTstops']:
    TM_obj.write_transcript_mut(data_type=data_type, csv_name='tr-{}_matrix'.format(data_type))