In [1]:
%load_ext autoreload
%autoreload 2

import os, sys, shutil, bz2, copy
from pathlib import Path
import pandas as pd
pd.set_option('display.max_columns', 50)
import numpy as np

### Plotting imports ###
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.colors as mcolors
import matplotlib as mpl
from matplotlib.patches import StepPatch
import matplotlib.ticker as ticker
import matplotlib.gridspec as gridspec
import logomaker as lm
palette = list(mcolors.TABLEAU_COLORS.keys())
sns.set_theme(style="ticks", palette="muted")
sns.set_context("talk")
%matplotlib inline

  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)


In [2]:
# Remember the notebook directory the first time this cell runs, so that
# re-running the cell (after something changed the working directory)
# always navigates back to the notebook folder.
if 'NBdir' not in globals():
    NBdir = os.getcwd()
print('Notebook is in: {}'.format(NBdir))
os.chdir(NBdir)  # If you changed the current working dir, this will take you back to the notebook dir.

# Define the path to the repo folder: two directory levels above the notebook folder.
# Change if necessary.
homedir = os.path.dirname(os.path.dirname(NBdir))
print('Repo is in: {}'.format(homedir))
# Make the repo importable (needed for the `src` imports that follow),
# without adding a duplicate entry every time this cell is re-run.
if homedir not in sys.path:
    sys.path.insert(1, homedir)
from src.misc import index_to_sample_df, downsample_raw_input, read_tRNAdb_info, sample_df_to_dict
from src.read_processing import AR_merge, BC_split, Kmer_analysis, BC_analysis, UMI_trim
from src.alignment import SWIPE_align
from src.stats_collection import STATS_collection
from src.plotting import TRNA_plot
from src.transcript_mutations import TM_analysis

# Default folder names for data and raw fastq files,
# relative to the folder containing this notebook:
data_dir = 'data'
seq_dir = 'raw_fastq'
seq_dir_noDS = seq_dir  # Not downsampled

# Folder names used by the subsequent processing steps
# to dump data. Best to not change:
AdapterRemoval_dir = 'AdapterRemoval'
BC_dir = 'BC_split'
UMI_dir = 'UMI_trimmed'
align_dir = 'SWalign'
stats_dir = 'stats_collection'
TM_dir = 'transcript_mutations'
plotting_dir = 'plotting'

# Reference tRNA fasta files, keyed by species:
tRNA_database = {
    'human': f'{homedir}/tRNA_database/human/hg38-tRNAs.fa',
    'mouse': f'{homedir}/tRNA_database/mouse/mm10-tRNAs.fa',
}
# Masked reference sequences (only a human entry is defined here):
tRNA_database_masked = {
    'human': f'{homedir}/tRNA_database_masked/human/human-tRNAs.fa',
}
# Read information (length, codon etc) of tRNAs into dictionary:
tRNA_data = read_tRNAdb_info(tRNA_database)

# Score matrices for the alignment step:
SWIPE_score_mat = f'{homedir}/utils/nuc_score-matrix.txt'
SWIPE_score_mat2 = f'{homedir}/utils/nuc_score-matrix_2.txt'  # For masked reference sequences

# tRNA sequencing yields many duplicated reads.
# Adding these commonly seen sequences to a list prevents duplicated alignment:
common_seqs = f'{homedir}/utils/common-seqs.fasta.bz2'

# Minimum read length = minimum insert size + UMI length + longest barcode:
MIN_INSERT_LEN = 10
UMI_LEN = 10
BC_MAX_LEN = 19
MIN_READ_LEN = MIN_INSERT_LEN + UMI_LEN + BC_MAX_LEN
print(f'Using minimum read length: {MIN_READ_LEN} (after merge)')

# Read index information:
index_list_fnam = 'index_list.xlsx'
index_df = pd.read_excel(f'{homedir}/utils/{index_list_fnam}')

Notebook is in: /home/sulab/tRNA-charge-seq/projects/RT-comp
Repo is in: /home/sulab/tRNA-charge-seq
Using minimum read length: 39 (after merge)


### Settings

In [3]:
# Read the sample sheet that lives next to this notebook.
sample_list_fnam = 'sample_list.xlsx'
sample_df = pd.read_excel('{}/{}'.format(NBdir, sample_list_fnam))
# Add barcode sequences:
sample_df = index_to_sample_df(sample_df, index_df)
# Read elementary info (replicate, barcode, species)
# for each unique sample name into a dictionary:
sample_dict = sample_df_to_dict(sample_df)
# Get filenames from the sample information
# (one row per unique fastq pair / index combination):
inp_file_cols = ['fastq_mate1_filename', 'fastq_mate2_filename',
                 'P5_index', 'P7_index', 'P5_index_seq', 'P7_index_seq']
inp_file_df = sample_df[inp_file_cols].drop_duplicates().reset_index(drop=True)

# Downsampling settings. Set DOWNSAMPLE to True to work on a downsampled
# copy of the raw input (seq_dir is then replaced by the value returned
# from downsample_raw_input).
DOWNSAMPLE = False
DOWNSAMPLE_N_READS = 1e4  # Passed on as `downsample_absolute`
if DOWNSAMPLE:
    sample_df, inp_file_df, seq_dir = downsample_raw_input(
        sample_df, inp_file_df, NBdir, data_dir, seq_dir_noDS,
        downsample_absolute=DOWNSAMPLE_N_READS)

# Make a dictionary with paths used for data processing:
dir_dict = dict(NBdir = NBdir,
                data_dir = data_dir,
                seq_dir = seq_dir,
                AdapterRemoval_dir = AdapterRemoval_dir,
                BC_dir = BC_dir,
                UMI_dir = UMI_dir,
                align_dir = align_dir,
                stats_dir = stats_dir,
                TM_dir = TM_dir,
                plotting_dir = plotting_dir)

In [4]:
# Run AdapterRemoval on the input files.
# overwrite_dir=False reuses an already existing output folder.
AR_obj = AR_merge(
    dir_dict,
    inp_file_df,
    MIN_READ_LEN,
    overwrite_dir=False,
    AR_threads=6,
)
inp_file_df = AR_obj.run_parallel(n_jobs=2, overwrite=False)

Using existing folder because overwrite set to false: /home/sulab/tRNA-charge-seq/projects/RT-comp/data/AdapterRemoval


In [5]:
# Split files based on barcodes.
# load_previous=True loads the results of a previous run instead of
# re-running the split (see the message printed below).
BCsplit_obj = BC_split(
    dir_dict,
    sample_df,
    inp_file_df,
    overwrite_dir=False,
)
sample_df, inp_file_df = BCsplit_obj.run_parallel(
    n_jobs=12,
    load_previous=True,
)

Using existing folder because overwrite set to false: /home/sulab/tRNA-charge-seq/projects/RT-comp/data/BC_split
Loaded results from previous run... Not running barcode split.


In [6]:
### Generate UMI stats and write final trimmed tRNA sequences ###
# Note: the cDNA input amount is so large that sequencing the same
# PCR amplified DNA twice is very unlikely. Therefore, this processing
# step does not attempt to merge possible UMI duplicates.
UMItrim_obj = UMI_trim(
    dir_dict,
    sample_df,
    overwrite_dir=False,
)
sample_df = UMItrim_obj.run_parallel(
    n_jobs=12,
    load_previous=True,
)
sample_df.head(3)

Using existing folder because overwrite set to false: /home/sulab/tRNA-charge-seq/projects/RT-comp/data/UMI_trimmed
Loaded results from previous run... Not running UMI trimming.


Unnamed: 0,sample_name_unique,sample_name,replicate,fastq_mate1_filename,fastq_mate2_filename,P5_index,P7_index,barcode,species,plot_group,hue_name,hue_value,hue_order,P5_index_seq,P7_index_seq,barcode_seq,N_total,N_CC,N_CCA,N_CCA+CC,CCA+CC_percent_total,percent_CCA,N_after_trim,N_UMI_observed,N_UMI_expected,percent_seqs_after_UMI_trim,percent_UMI_obs-vs-exp
0,A70,A70,1,2023-06-06/P12_R1.fastq.bz2,2023-06-06/P12_R2.fastq.bz2,D503,D704,l1Sp,human,P12,Percent charge,A70,1,AGGATAGG,GAGATTCC,GGCTGCCATGCGACTA,3641259,363050,3247474,3610524,99.155924,89.944673,3565268,503533,523704.186898,97.913057,96.148362
1,A71,A71,1,2023-06-06/P12_R1.fastq.bz2,2023-06-06/P12_R2.fastq.bz2,D503,D704,l2Sp,human,P12,Percent charge,A71,1,AGGATAGG,GAGATTCC,GGCTGCCATGCTGTCACG,3438153,229058,3179347,3408405,99.134768,93.279613,3365422,498838,523433.294249,97.884591,95.30116
2,A72,A72,1,2023-06-06/P12_R1.fastq.bz2,2023-06-06/P12_R2.fastq.bz2,D503,D704,l3Sp,human,P12,Percent charge,A72,1,AGGATAGG,GAGATTCC,GGCTGCCATGCTGCGA,4065762,297576,3734553,4032129,99.172775,92.619879,3979460,507732,524023.041926,97.877347,96.891159


In [7]:
### Align reads to database of reference tRNAs ###
# The alignment uses the masked reference database together with the
# second score matrix.
align_obj = SWIPE_align(
    dir_dict,
    tRNA_database_masked,
    sample_df,
    SWIPE_score_mat2,
    gap_penalty=6,
    extension_penalty=3,
    min_score_align=15,
    common_seqs=common_seqs,
    overwrite_dir=False,
)
sample_df = align_obj.run_parallel(
    n_jobs=4,
    overwrite=False,
    load_previous=True,
)
sample_df.head(3)

Using common sequences to prevent duplicated alignment.
Using existing folder because overwrite set to false: /home/sulab/tRNA-charge-seq/projects/RT-comp/data/SWalign
Loaded results from previous run... Not running alignment.


Unnamed: 0,sample_name_unique,sample_name,replicate,fastq_mate1_filename,fastq_mate2_filename,P5_index,P7_index,barcode,species,plot_group,hue_name,hue_value,hue_order,P5_index_seq,P7_index_seq,barcode_seq,N_total,N_CC,N_CCA,N_CCA+CC,CCA+CC_percent_total,percent_CCA,N_after_trim,N_UMI_observed,N_UMI_expected,percent_seqs_after_UMI_trim,percent_UMI_obs-vs-exp,N_mapped,percent_single_annotation,percent_multiple_annotation,percent_multiple_codons,Mapping_percent
0,A70,A70,1,2023-06-06/P12_R1.fastq.bz2,2023-06-06/P12_R2.fastq.bz2,D503,D704,l1Sp,human,P12,Percent charge,A70,1,AGGATAGG,GAGATTCC,GGCTGCCATGCGACTA,3641259,363050,3247474,3610524,99.155924,89.944673,3565268,503533,523704.186898,97.913057,96.148362,3394452,67.116518,32.883482,4.57473,95.208888
1,A71,A71,1,2023-06-06/P12_R1.fastq.bz2,2023-06-06/P12_R2.fastq.bz2,D503,D704,l2Sp,human,P12,Percent charge,A71,1,AGGATAGG,GAGATTCC,GGCTGCCATGCTGTCACG,3438153,229058,3179347,3408405,99.134768,93.279613,3365422,498838,523433.294249,97.884591,95.30116,3094055,66.318925,33.681075,4.50076,91.936613
2,A72,A72,1,2023-06-06/P12_R1.fastq.bz2,2023-06-06/P12_R2.fastq.bz2,D503,D704,l3Sp,human,P12,Percent charge,A72,1,AGGATAGG,GAGATTCC,GGCTGCCATGCTGCGA,4065762,297576,3734553,4032129,99.172775,92.619879,3979460,507732,524023.041926,97.877347,96.891159,3789861,67.002616,32.997384,4.402563,95.23556


In [8]:
### Collect alignment statistics ###
stats_obj = STATS_collection(
    dir_dict,
    tRNA_data,
    sample_df,
    common_seqs=common_seqs,
    overwrite_dir=False,
)
stats_df = stats_obj.run_parallel(n_jobs=6, load_previous=True)
# The dataframe returned is the "ALL_stats_aggregate_filtered.csv",
# i.e. the aggregated data filtered to contain only the most
# relevant columns, requiring that the 3' end is covered and
# has no 3' non-template bases.
# The CSV file output "ALL_stats_aggregate.csv" is the data
# aggregated over rows where all values are identical except
# readID, 5p_UMI and 3p_BC. I.e. all information, except
# the UMI sequence, is maintained in the aggregated CSV.
stats_df.head(3)

Using existing folder because overwrite set to false: /home/sulab/tRNA-charge-seq/projects/RT-comp/data/stats_collection
Loaded results from previous run... Not running stats collection.


Unnamed: 0,sample_name_unique,sample_name,replicate,barcode,tRNA_annotation,tRNA_annotation_len,unique_annotation,5p_cover,align_3p_nt,codon,anticodon,amino_acid,count
0,A70,A70,1,l1Sp,Escherichia_coli_str_K_12_substr_MG1655_tRNA-e...,76,True,False,A,AAA,TTT,eColiLys,10
1,A70,A70,1,l1Sp,Escherichia_coli_str_K_12_substr_MG1655_tRNA-e...,76,True,False,C,AAA,TTT,eColiLys,1954
2,A70,A70,1,l1Sp,Escherichia_coli_str_K_12_substr_MG1655_tRNA-e...,76,True,True,A,AAA,TTT,eColiLys,4


In [9]:
### Generate standard tRNAseq data plots ###
# Create the plotting object used by the plotting cells further down.
# NOTE: the plotting calls below are commented out in this cell;
# active versions of several of them appear in later cells.
plot_obj = TRNA_plot(dir_dict, sample_df, overwrite_dir=True, pull_default=False)

# Plot UMI nucleotide content logo per sample:
#plot_obj.plot_UMI_logo()

# Plot the 5p non-template nucleotide logo per sample,
# using only those at, or below, the length determined as a percentile:
#plot_obj.plot_non_temp('5p', '_5p-non-template_logo', seq_len_percentile=99)

# 3p non-template nucleotides are more rare, therefore 99.9 percentile:
#plot_obj.plot_non_temp('3p', '_3p-non-template_logo', seq_len_percentile=99.9)

# 3p non-template nucleotides, but require 3p coverage.
# This is more likely to reflect either:
# 1) Erroneous CCA addition, or 2) additional bases in the adapter.
#plot_obj.plot_non_temp('3p', '_3p-non-template_3p-cover_logo', seq_len_percentile=99.9, _3p_cover=True)

# Plot the 5p to 3p coverage for each amino acid:
#plot_obj.plot_coverage(compartment='cyto', plot_type='needle', aa_norm=False, plot_name='cov_plot_cyto_needle')
#plot_obj.plot_coverage(compartment='mito', plot_type='needle', aa_norm=False, plot_name='cov_plot_mito_needle')
#plot_obj.plot_coverage(compartment='cyto', plot_type='behrens', aa_norm=False, plot_name='cov_plot_cyto_behrens')
#plot_obj.plot_coverage(compartment='mito', plot_type='behrens', aa_norm=False, plot_name='cov_plot_mito_behrens')
#plot_obj.plot_coverage(compartment='cyto', plot_type='needle', aa_norm=False, #plot_name='cov_plot_cyto_needle_100p', sample_list=['100p1', '100p2', '100p3', '100p4'])
#plot_obj.plot_coverage(compartment='cyto', plot_type='behrens', y_norm=True, plot_name='cov_plot_cyto_behrens_norm')



In [12]:
# Coverage plot for the 'cyto' compartment ('behrens' plot type, y_norm=True).
plot_obj.plot_coverage(
    compartment='cyto',
    plot_type='behrens',
    y_norm=True,
    plot_name='cov_plot_cyto_behrens_norm',
    n_jobs=12,
)


Now collecting data for sample:  A01  A02  A03  A07  A08  A09  A10  A06  A13  A12  A11  A05  A16  A17  A23  A24  A22  A21  A26  A25  A20  A19  A15  A14  A18  A29  A32  A28  A27  A31  A30  A33  A34  A35  A36  A37  A46  A40  A38  A39  A41  A48  A45  A43  A44  A42  A47  A50  A53  A49  A52  A55  A51  A54  A58  A56  A59  A57  A63  A60  A62  A65  A61  A64  A68  A70  A66  A67  A69  A72  A71  A71_T  A70_T  A73_T  A72_T  A75  A74  A73  A76  A74_T  A76_T  A75_T
Now plotting sample:  A01  A02  A03  A05  A06  A07  A08  A09  A10  A11  A12  A13  A14  A15  A16  A17  A18  A19  A20  A21  A22  A23  A24  A25  A26  A27  A28  A29  A30  A31  A32  A33  A34  A35  A36  A37  A38  A39  A40  A41  A42  A43  A44  A45  A46  A47  A48  A49  A50  A51  A52  A53  A54  A55  A56  A57  A58  A59  A60  A61  A62  A63  A64  A65  A66  A67  A68  A69  A70  A71  A72  A73  A74  A75  A76  A70_T  A71_T  A72_T  A73_T  A74_T  A75_T  A76_T

In [13]:
# Codon-level charge plot, grouped (group=True), with min_obs=500.
plot_obj.plot_abundance(
    plot_type='codon',
    plot_name='codon_charge_grp',
    group=True,
    min_obs=500,
    charge_plot=True,
)


Now plotting sample/group:  P03  P04  P05  P06  P07  P08  P09  P10  P11  P12  P12_T

In [10]:
# Plot the E. coli control with default settings.
# NOTE(review): presumably this uses the E. coli tRNA annotations seen in the
# stats table above (amino_acid 'eColiLys') — confirm against TRNA_plot.
plot_obj.plot_Ecoli_ctr()

In [None]:
# P3, P6, P7, P10, P11, P12_T

In [10]:
# UMI nucleotide content logo per sample:
plot_obj.plot_UMI_logo(n_jobs=8)

# 5p non-template nucleotide logo (sequence length cut at the 99th percentile):
plot_obj.plot_non_temp(
    end='5p',
    plot_name='_5p-non-template_logo',
    seq_len_percentile=99,
    n_jobs=8,
)

# 3p non-template nucleotide logo (99.9th percentile):
plot_obj.plot_non_temp(
    end='3p',
    plot_name='_3p-non-template_logo',
    seq_len_percentile=99.9,
    n_jobs=8,
)

# Same as above, but additionally requiring 3p coverage:
plot_obj.plot_non_temp(
    end='3p',
    plot_name='_3p-non-template_3p-cover_logo',
    seq_len_percentile=99.9,
    _3p_cover=True,
    n_jobs=8,
)

# Coverage plots for the 'cyto' and 'mito' compartments:
plot_obj.plot_coverage(
    compartment='cyto',
    plot_type='behrens',
    y_norm=True,
    plot_name='cov_plot_cyto_behrens_norm',
    n_jobs=12,
)

plot_obj.plot_coverage(
    compartment='mito',
    plot_type='behrens',
    y_norm=True,
    plot_name='cov_plot_mito_behrens_norm',
    n_jobs=12,
)




Now collecting data for sample:  100p1_v1  100p3_v2  0m_p1_v2  0m_p2_v2  0m_p3_v2  0m_p4_v2  85p1_v1  85p2_v1  85p1_v2  85p2_v2  70p2_v2  85p4_v2  85p3_v2  85p4_v1  70p1_v1  70p1_v2  70p4_v2  70p5_v2  70p3_v2  55p1_v1  55p3_v1  55p2_v2  55p1_v2  55p3_v2  55p4_v2  40p4_v1  40p2_v1  55p5_v2  40p1_v1  40p1_v2  40p2_v2  40p3_v2  25p1_v1  40p4_v2  25p1_v2  25p3_v2  25p2_v2  25p4_v2  10p1_v1  25p6_v2  10p1_v2  25p5_v2  10p2_v2  10p3_v2  10p3_v1  10p5_v2  10p4_v2  0p2_v1  0p1_v1  0p4_v1  0p1_v2  0p3_v2  0p_90m3_v1  0p_90m4_v1  0p2_v2  0p_90m2_v1  0p4_v2  0p_90m1_v1  1h_1  0m_1  4m_1  8m_1  16m_1  32m_1  2h_1  8h_1  40h_1  4h_1  0m_2  40h_NoOx_1  8m_2  16h_1  4m_2  1h_2  16m_2  32m_2  2h_2  4h_2  40h_2  8h_2  16h_2  0m_3  40h_NoOx_2  4m_3  8m_3  32m_3  16m_3  4h_3  1h_3  8h_3  2h_3  16h_3  40h_NoOx_3  40h_3  0m_4  4m_4  8m_4  16m_4  2h_4  1h_4  32m_4  4h_4  8h_4  16h_4  40h_NoOx_4  40h_4  8h_p1  8h_p2  8h_p3  8h_p8  8h_p7  8h_p4  8h_p9  8h_p5  8h_p6  L1  L2  L4  L3  L5  L7  L6  L8  L12  L10  

In [11]:
# Grouped abundance plots, per codon and per amino acid:
plot_obj.plot_abundance(
    plot_type='codon', plot_name='codon_abundance_grp',
    group=True, min_obs=500,
)

plot_obj.plot_abundance(
    plot_type='aa', plot_name='aa_abundance_grp',
    group=True, min_obs=500,
)

# Grouped charge plots, per codon and per amino acid:
plot_obj.plot_abundance(
    plot_type='codon', plot_name='codon_charge_grp',
    group=True, min_obs=500, charge_plot=True,
)

plot_obj.plot_abundance(
    plot_type='aa', plot_name='aa_charge_grp',
    group=True, min_obs=500, charge_plot=True,
)

# Abundance correlation between sample names. The two inner lists are
# paired element-wise, e.g. (100p - 85p), (100p - 0p), ...
plot_obj.plot_abundance_corr(
    sample_pairs=[['100p', '100p', '85p', '85p', '40p', '40h'],
                  ['85p', '0p', '40p', '0p', '0p', '40h_NoOx']],
    plot_type='codon', plot_name='codon_abundance_corr',
    min_obs=500, charge_plot=False,
)

plot_obj.plot_abundance_corr(
    sample_pairs=[['100p', '100p', '85p', '85p', '40p', '40h'],
                  ['85p', '0p', '40p', '0p', '0p', '40h_NoOx']],
    plot_type='transcript', plot_name='tr_abundance_corr',
    min_obs=500, charge_plot=False,
)

# Abundance correlation between unique sample names:
plot_obj.plot_abundance_corr(
    sample_unique_pairs=[['L1', 'L1', 'R1'], ['R1', 'L6', 'R6']],
    plot_type='codon', plot_name='codon_abundance_corr_L-R',
    min_obs=500, charge_plot=False,
)

plot_obj.plot_abundance_corr(
    sample_unique_pairs=[['L1', 'L1', 'R1'], ['R1', 'L6', 'R6']],
    plot_type='transcript', plot_name='tr_abundance_corr_L-R',
    min_obs=500, charge_plot=False,
)

# Charge correlation between unique sample names.
# NOTE(review): the recorded output for this call lists only the
# (40h_1 - 40h_2) and (40h_NoOx_1 - 40h_NoOx_2) pairs; the remaining names
# (e.g. '100p1') may not match any sample_name_unique value — verify.
plot_obj.plot_abundance_corr(
    sample_unique_pairs=[['100p1', '85p1', '55p1', '40p1', '0p1', '40h_1', '40h_NoOx_1'],
                         ['100p3', '85p4', '55p3', '40p2', '0p2', '40h_2', '40h_NoOx_2']],
    plot_type='codon', plot_name='codon_charge_corr',
    min_obs=500, charge_plot=True,
)


Now plotting sample/group:  Charge-titration  Acylation-half-life  Barcode test  293t-NoLeu  293t-NoArg  293t-NoGln  HCT116-Depletion
Now plotting sample/group:  Charge-titration  Acylation-half-life  Barcode test  293t-NoLeu  293t-NoArg  293t-NoGln  HCT116-Depletion
Now plotting sample/group:  Charge-titration  Acylation-half-life  Barcode test  293t-NoLeu  293t-NoArg  293t-NoGln  HCT116-Depletion
Now plotting sample/group:  Charge-titration  Acylation-half-life  Barcode test  293t-NoLeu  293t-NoArg  293t-NoGln  HCT116-Depletion
Now plotting sample pairs:  (100p - 85p)  (100p - 0p)  (85p - 40p)  (85p - 0p)  (40p - 0p)  (40h - 40h_NoOx)
Now plotting sample pairs:  (100p - 85p)  (100p - 0p)  (85p - 40p)  (85p - 0p)  (40p - 0p)  (40h - 40h_NoOx)
Now plotting sample pairs:  (L1 - R1)  (L1 - L6)  (R1 - R6)
Now plotting sample pairs:  (L1 - R1)  (L1 - L6)  (R1 - R6)
Now plotting sample pairs:  (40h_1 - 40h_2)  (40h_NoOx_1 - 40h_NoOx_2)

In [7]:
# Read the list of sample names from the first column of the spreadsheet
# (no header row). The path is anchored to the notebook folder, like the
# sample sheet, so the cell does not depend on the current working directory.
asam = pd.read_excel('{}/{}'.format(NBdir, 'Alicias_samples.xlsx'), header=None)
# Unique sample names, used for membership filtering and sample selection:
alicia_samples = set(asam[0].values)

In [36]:
# Export the 'aa' charge table restricted to the samples in alicia_samples.
amask_aa = plot_obj.charge_filt['aa']['sample_name_unique'].isin(alicia_samples)
(
    plot_obj.charge_filt['aa']
    .loc[amask_aa]
    .reset_index(drop=True)
    .to_csv('alicia_res_AA.csv', index=False)
)

In [37]:
# Export the 'codon' charge table restricted to the samples in alicia_samples.
amask_cd = plot_obj.charge_filt['codon']['sample_name_unique'].isin(alicia_samples)
(
    plot_obj.charge_filt['codon']
    .loc[amask_cd]
    .reset_index(drop=True)
    .to_csv('alicia_res_codon.csv', index=False)
)

In [38]:
# Export the 'tr' charge table restricted to the samples in alicia_samples.
amask_tr = plot_obj.charge_filt['tr']['sample_name_unique'].isin(alicia_samples)
(
    plot_obj.charge_filt['tr']
    .loc[amask_tr]
    .reset_index(drop=True)
    .to_csv('alicia_res_tr.csv', index=False)
)

In [18]:
# Select the unique sample names whose sample_name is one of the
# listed values (85p down to 0p).
mtm = sample_df['sample_name'].isin(
    ['85p', '70p', '55p', '40p', '25p', '10p', '0p']
)
scomp = sample_df.loc[mtm, 'sample_name_unique'].values

In [19]:
# Display the selected unique sample names.
scomp

array(['85p1_v1', '85p2_v1', '85p4_v1', '85p1_v2', '85p2_v2', '85p3_v2',
       '85p4_v2', '70p1_v1', '70p1_v2', '70p2_v2', '70p3_v2', '70p4_v2',
       '70p5_v2', '55p1_v1', '55p3_v1', '55p1_v2', '55p2_v2', '55p3_v2',
       '55p4_v2', '55p5_v2', '40p1_v1', '40p2_v1', '40p4_v1', '40p1_v2',
       '40p2_v2', '40p3_v2', '40p4_v2', '25p1_v1', '25p1_v2', '25p2_v2',
       '25p3_v2', '25p4_v2', '25p5_v2', '25p6_v2', '10p1_v1', '10p3_v1',
       '10p1_v2', '10p2_v2', '10p3_v2', '10p4_v2', '10p5_v2', '0p1_v1',
       '0p2_v1', '0p4_v1', '0p1_v2', '0p2_v2', '0p3_v2', '0p4_v2'],
      dtype=object)

In [None]:
# Use the TM_analysis to find the most mutated positions in the tRNA transcripts.
# Then generate a new version of the tRNA transcripts database with these positions masked
# Then, using the masked sequences, re-run the alignment, stats collected and plotting

# Re-run using gap_open = -3 and gap_extension = -2 to reflect how gaps are more tolerated

### The masked tRNA database will only be used for alignment
# All other steps will use the old unmasked database

In [8]:
### Perform transcript mutation analysis ###
# Note that the unmasked tRNA database is passed here.
TM_obj = TM_analysis(
    dir_dict,
    sample_df,
    tRNA_database,
    pull_default=False,
    common_seqs=common_seqs,
    ignore_common_count=False,
    overwrite_dir=True,
)

# The calls below are commented out in this cell:

#TM_obj.find_muts(n_jobs=12, unique_anno=True)
#TM_obj.find_muts(n_jobs=4)

#TM_obj.plot_transcript_logo()

# tr_mut_out = TM_obj.plot_transcript_mut(topN=30, no_plot_return=False, mito=False, gap_only=False, plot_name='test_heat', min_count_show=10)
#tr_mut_out = TM_obj.plot_transcript_mut(topN=30, no_plot_return=False, mito=False, gap_only=False, plot_name='test_heat2')

#tr_cov_out = TM_obj.plot_transcript_cov(topN=40, no_plot_return=False, mito=False, plot_name='test_heat_cov', sort_rows=True)
#tr_mut_out = TM_obj.plot_transcript_mut(topN=40, no_plot_return=False, mito=False, gap_only=False, plot_name='test_heat2', sort_rows=tr_cov_out[2])

#TM_obj.mask_tRNA_database(min_mut_freq=0.5, min_pos_count=50, min_tr_count=100, frac_max_score=0.90)

#tRNA_database_masked = TM_obj.write_masked_tRNA_database(out_dir='tRNA_database_masked')

Using common sequences...


In [5]:
#TM_obj.find_muts(n_jobs=10, unique_anno=True, fix_end=True, \
#                 sample_list=['100p1', '100p3', '0p1', '0p2', '0p4', \
#                              '0m_1', '40h_NoOx_1', '40h_NoOx_2', '0m_3',
#                              'L1', 'L3', 'L6', 'L8', 'L12', \
#                              'R1', 'R3', 'R6', 'R8', 'R12'])

In [20]:
# Collect mutation stats for the samples selected in scomp.
TM_obj.find_muts(
    n_jobs=7,
    unique_anno=False,
    fix_end=True,
    sample_list=list(scomp),
)

Collecting stats from:  85p1_v1  85p2_v1  85p4_v1  85p1_v2  85p2_v2  85p3_v2  85p4_v2  70p1_v2  70p3_v2  55p1_v1  70p4_v2  70p1_v1  70p5_v2  70p2_v2  55p1_v2  55p3_v2  55p4_v2  55p3_v1  55p2_v2  55p5_v2  40p1_v1  40p4_v1  40p2_v1  40p2_v2  25p1_v1  40p1_v2  40p3_v2  25p2_v2  25p1_v2  40p4_v2  25p4_v2  25p3_v2  10p1_v1  25p5_v2  10p1_v2  10p3_v1  25p6_v2  10p4_v2  10p2_v2  10p3_v2  0p2_v1  0p4_v1  0p1_v1  10p5_v2  0p2_v2  0p1_v2  0p4_v2  0p3_v2

In [21]:
# Compare '10p' against '0p' for human transcripts, first for
# all mutations (gap_only=False) ...
TM_obj.plot_transcript_mut_compare(
    species='human',
    plot_name='10p-0p_tr-mut_matrix_comp_top20-max-diff',
    no_plot_return=True,
    mito=False,
    gap_only=False,
    min_count_show=1000,
    sample_pairs=[['10p'], ['0p']],
    freq_avg_weighted=True,
    topN=20,
    topN_select='max_diff',
)

# ... and then restricted to gaps (gap_only=True):
TM_obj.plot_transcript_mut_compare(
    species='human',
    plot_name='10p-0p_tr-gap_matrix_comp_top20-max-diff',
    no_plot_return=True,
    mito=False,
    gap_only=True,
    min_count_show=1000,
    sample_pairs=[['10p'], ['0p']],
    freq_avg_weighted=True,
    topN=20,
    topN_select='max_diff',
)

In [23]:
# Write transcript mutation data for the scomp samples to CSV:
# once with default settings and once with gap_only=True.
TM_obj.write_transcript_mut(
    list(scomp),
    csv_name='mut-matrix',
)
TM_obj.write_transcript_mut(
    list(scomp),
    csv_name='gap-matrix',
    gap_only=True,
)

In [8]:
# Collect mutation stats for the samples listed in alicia_samples.
#alicia_samples.remove('R4')
TM_obj.find_muts(
    n_jobs=10,
    unique_anno=False,
    fix_end=True,
    sample_list=list(alicia_samples),
)

Collecting stats from:  L1  L2  L4  L3  L5  L6  L7  L10  L8  L9  R1  R4  R6  R2  L12  R8  R3  R10  L11  R5  R7  R11  2Q1  2Q3  2Q2  R12  2Q6  GQ3  R9  2Q5  GQ5  GQ6  2Q4  GQ1  GQ4  GQ2  HR2  HQ1  HR1  HQ2  H1  HL1  HL2  HGL1  HG1  HGR1  HGQ2  HGR2  HGQ1  HGL2

In [13]:
# Three comparisons over the same L/R unique-sample pairs:
# 1) the top 20 selected by 'max_diff':
TM_obj.plot_transcript_mut_compare(
    species='human',
    plot_name='NoLeu-NoArg_tr-mut_matrix_comp_top20-max-diff',
    no_plot_return=True,
    mito=False,
    gap_only=False,
    min_count_show=1000,
    sample_unique_pairs=[['L1', 'L8',  'L1', 'L6',  'R1', 'R8',  'R1', 'R6'],
                         ['L6', 'L12', 'L8', 'L12', 'R6', 'R12', 'R8', 'R12']],
    freq_avg_weighted=True,
    topN=20,
    topN_select='max_diff',
)

# 2) selection via anno_substring_compare='Arg':
TM_obj.plot_transcript_mut_compare(
    species='human',
    plot_name='NoLeu-NoArg_tr-mut_matrix_comp_AA-Arg',
    no_plot_return=True,
    mito=False,
    gap_only=False,
    min_count_show=1000,
    sample_unique_pairs=[['L1', 'L8',  'L1', 'L6',  'R1', 'R8',  'R1', 'R6'],
                         ['L6', 'L12', 'L8', 'L12', 'R6', 'R12', 'R8', 'R12']],
    freq_avg_weighted=True,
    anno_substring_compare='Arg',
)

# 3) selection via anno_substring_compare='Leu':
TM_obj.plot_transcript_mut_compare(
    species='human',
    plot_name='NoLeu-NoArg_tr-mut_matrix_comp_AA-Leu',
    no_plot_return=True,
    mito=False,
    gap_only=False,
    min_count_show=1000,
    sample_unique_pairs=[['L1', 'L8',  'L1', 'L6',  'R1', 'R8',  'R1', 'R6'],
                         ['L6', 'L12', 'L8', 'L12', 'R6', 'R12', 'R8', 'R12']],
    freq_avg_weighted=True,
    anno_substring_compare='Leu',
)

In [14]:
# Comparison over the L/R unique-sample pairs with
# anno_substring_compare='Asn' and min_count_show=400.
TM_obj.plot_transcript_mut_compare(
    species='human',
    plot_name='NoLeu-NoArg_tr-mut_matrix_comp_AA-Asn',
    no_plot_return=True,
    mito=False,
    gap_only=False,
    min_count_show=400,
    sample_unique_pairs=[['L1', 'L8',  'L1', 'L6',  'R1', 'R8',  'R1', 'R6'],
                         ['L6', 'L12', 'L8', 'L12', 'R6', 'R12', 'R8', 'R12']],
    freq_avg_weighted=True,
    anno_substring_compare='Asn',
)

In [15]:
# Comparison over the L/R unique-sample pairs with
# anno_substring_compare='Ala' and min_count_show=400.
TM_obj.plot_transcript_mut_compare(
    species='human',
    plot_name='NoLeu-NoArg_tr-mut_matrix_comp_AA-Ala',
    no_plot_return=True,
    mito=False,
    gap_only=False,
    min_count_show=400,
    sample_unique_pairs=[['L1', 'L8',  'L1', 'L6',  'R1', 'R8',  'R1', 'R6'],
                         ['L6', 'L12', 'L8', 'L12', 'R6', 'R12', 'R8', 'R12']],
    freq_avg_weighted=True,
    anno_substring_compare='Ala',
)

In [16]:
# Transcript logo plot for a fixed selection of samples.
TM_obj.plot_transcript_logo(
    sample_list=['100p1', '100p3', '0p1', '0p2', '0p4',
                 '0m_1', '40h_NoOx_1', '40h_NoOx_2', '0m_3',
                 'L1', 'L3', 'L6', 'R1', 'R3', 'R6'],
    plot_name='tr-muts_logo',
)

In [17]:
# Transcript coverage matrix (topN=40, rows sorted) for a fixed selection of samples.
tr_cov_out = TM_obj.plot_transcript_cov(
    topN=40,
    no_plot_return=True,
    mito=False,
    plot_name='tr-cov_matrix',
    sort_rows=True,
    sample_list=['100p1', '100p3', '0p1', '0p2', '0p4',
                 '0m_1', '40h_NoOx_1', '40h_NoOx_2', '0m_3',
                 'L1', 'L3', 'L6', 'R1', 'R3', 'R6'],
)

In [18]:
# Transcript mutation matrix (topN=40, min_count_show=1000) for a fixed selection of samples.
tr_mut_out = TM_obj.plot_transcript_mut(
    topN=40,
    no_plot_return=True,
    mito=False,
    gap_only=False,
    min_count_show=1000,
    plot_name='tr-mut_matrix',
    sample_list=['100p1', '100p3', '0p1', '0p2', '0p4',
                 '0m_1', '40h_NoOx_1', '40h_NoOx_2', '0m_3',
                 'L1', 'L3', 'L6', 'R1', 'R3', 'R6'],
)

In [9]:
# Write transcript mutation data for the alicia_samples to CSV.
# NOTE(review): right_align is not passed here, so the file name presumably
# relies on the method's default alignment — TODO confirm.
# NOTE(review): alicia_samples is a set, so list() yields the names in
# arbitrary order — confirm the output does not depend on sample order.
TM_obj.write_transcript_mut(
    list(alicia_samples),
    csv_name='mut-matrix_right-aligned',
)

In [11]:
# Write transcript mutation data for the alicia_samples to CSV,
# this time with right_align=False.
# NOTE(review): alicia_samples is a set, so list() yields the names in
# arbitrary order — confirm the output does not depend on sample order.
TM_obj.write_transcript_mut(
    list(alicia_samples),
    csv_name='mut-matrix_left-aligned',
    right_align=False,
)