In [1]:
%load_ext autoreload
%autoreload 2

import os, sys, shutil, bz2, copy
from pathlib import Path
import pandas as pd
pd.set_option('display.max_columns', 50)
import numpy as np

### Plotting imports ###
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.colors as mcolors
import matplotlib as mpl
from matplotlib.patches import StepPatch
import matplotlib.ticker as ticker
import matplotlib.gridspec as gridspec
import logomaker as lm
palette = list(mcolors.TABLEAU_COLORS.keys())
sns.set_theme(style="ticks", palette="muted")
sns.set_context("talk")
%matplotlib inline

  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)


In [2]:
# Remember the notebook folder the first time this cell is run, so that
# re-running it after a change of working directory navigates back:
if 'NBdir' not in globals():
    NBdir = os.getcwd()
print('Notebook is in: {}'.format(NBdir))
os.chdir(NBdir)  # If you changed the current working dir, this will take you back to the notebook dir.

# Define the path to the repo folder (two levels above the notebook folder).
# Change if necessary.
# os.path is used instead of splitting on '/' so the separator is not hard-coded.
homedir = os.path.dirname(os.path.dirname(NBdir))
print('Repo is in: {}'.format(homedir))
# Only register the repo folder once; re-running this cell
# should not keep adding duplicate entries to sys.path:
if homedir not in sys.path:
    sys.path.insert(1, homedir)
from src.misc import index_to_sample_df, downsample_raw_input, read_tRNAdb_info, sample_df_to_dict
from src.read_processing import AR_merge, BC_split, Kmer_analysis, BC_analysis, UMI_trim
from src.alignment import SWIPE_align
from src.stats_collection import STATS_collection
from src.plotting import TRNA_plot
from src.transcript_mutations import TM_analysis

# These are default folder names for data and raw fastq files
# relative to the folder in which this notebook is in:
data_dir = 'data'
seq_dir = 'raw_fastq'
seq_dir_noDS = seq_dir # Not downsampled

# These folder names are used in subsequent processing steps
# to dump data. Best to not change:
AdapterRemoval_dir = 'AdapterRemoval'
BC_dir = 'BC_split'
UMI_dir = 'UMI_trimmed'
align_dir = 'SWalign'
stats_dir = 'stats_collection'
TM_dir = 'transcript_mutations'
plotting_dir = 'plotting'

# Reference tRNA fasta files, keyed by species:
tRNA_database = dict()
tRNA_database['human'] = '{}/tRNA_database/human/hg38-tRNAs.fa'.format(homedir)
tRNA_database['mouse'] = '{}/tRNA_database/mouse/mm10-tRNAs.fa'.format(homedir)
# Masked reference sequences (only a human entry is defined here):
tRNA_database_masked = dict()
tRNA_database_masked['human'] = '{}/tRNA_database_masked/human/human-tRNAs.fa'.format(homedir)
# Read information (length, codon etc) of tRNAs into dictionary:
tRNA_data = read_tRNAdb_info(tRNA_database)
SWIPE_score_mat = '{}/utils/nuc_score-matrix.txt'.format(homedir)
SWIPE_score_mat2 = '{}/utils/nuc_score-matrix_2.txt'.format(homedir) # For masked reference sequences
# tRNA sequencing yields many duplicated reads.
# Adding these commonly seen sequences to a list prevents duplicated alignment:
common_seqs = '{}/utils/common-seqs.fasta.bz2'.format(homedir)

# Define minimum read length based on minimum insert size:
MIN_INSERT_LEN = 10
UMI_LEN = 10
BC_MAX_LEN = 19
MIN_READ_LEN = MIN_INSERT_LEN + UMI_LEN + BC_MAX_LEN
print('Using minimum read length: {} (after merge)'.format(MIN_READ_LEN))

# Read index information:
index_list_fnam = 'index_list.xlsx'
index_df = pd.read_excel('{}/utils/{}'.format(homedir, index_list_fnam))

Notebook is in: /home/sulab/tRNA-charge-seq/projects/Alicia-samples
Repo is in: /home/sulab/tRNA-charge-seq
Using minimum read length: 39 (after merge)


### Settings

In [3]:
sample_list_fnam = 'sample_list.xlsx'
sample_df = pd.read_excel('{}/{}'.format(NBdir, sample_list_fnam))
# Add barcode sequences:
sample_df = index_to_sample_df(sample_df, index_df)
# Read elementary info (replicate, barcode, species)
# for each unique sample name into a dictionary:
sample_dict = sample_df_to_dict(sample_df)
# Get filenames from the sample information (one row per unique input file/index combination):
inp_file_cols = ['fastq_mate1_filename', 'fastq_mate2_filename',
                 'P5_index', 'P7_index', 'P5_index_seq', 'P7_index_seq']
inp_file_df = sample_df[inp_file_cols].drop_duplicates().reset_index(drop=True)

# Downsampling settings.
# Set DOWNSAMPLE to True to work on downsampled input files.
DOWNSAMPLE = False
# Value passed as downsample_absolute
# (presumably the number of reads to keep per file -- TODO confirm).
DOWNSAMPLE_ABSOLUTE = 1e4

# Always start from the non-downsampled folder, so that re-running this cell
# with DOWNSAMPLE switched off does not keep pointing at a downsampled folder
# left over from an earlier run:
seq_dir = seq_dir_noDS
if DOWNSAMPLE:
    sample_df, inp_file_df, seq_dir = downsample_raw_input(sample_df, inp_file_df, NBdir, data_dir, seq_dir_noDS, downsample_absolute=DOWNSAMPLE_ABSOLUTE)

# Make a dictionary with paths used for data processing:
dir_dict = dict(NBdir = NBdir,
                data_dir = data_dir,
                seq_dir = seq_dir,
                AdapterRemoval_dir = AdapterRemoval_dir,
                BC_dir = BC_dir,
                UMI_dir = UMI_dir,
                align_dir = align_dir,
                stats_dir = stats_dir,
                TM_dir = TM_dir,
                plotting_dir = plotting_dir)

In [4]:
# Run AdapterRemoval on the input files.
# With overwriting switched off, an existing output folder is reused.
AR_obj = AR_merge(
    dir_dict,
    inp_file_df,
    MIN_READ_LEN,
    overwrite_dir=False,
)
inp_file_df = AR_obj.run_parallel(
    n_jobs=4,
    overwrite=False,
)

Using existing folder because overwrite set to false: /home/sulab/tRNA-charge-seq/projects/Alicia-samples/data/AdapterRemoval


In [5]:
# Split files based on barcodes.
# Results from a previous run are loaded instead of repeating the split.
BCsplit_obj = BC_split(
    dir_dict,
    sample_df,
    inp_file_df,
    overwrite_dir=False,
)
sample_df, inp_file_df = BCsplit_obj.run_parallel(
    n_jobs=12,
    load_previous=True,
)

Using existing folder because overwrite set to false: /home/sulab/tRNA-charge-seq/projects/Alicia-samples/data/BC_split
Loaded results from previous run... Not running barcode split.


In [6]:
### Generate UMI stats and write final trimmed tRNA sequences ###
# Note: no attempt is made to merge possible UMI duplicates in this step.
# The cDNA input amount is so large that sequencing the same PCR amplified
# DNA twice is very unlikely.
UMItrim_obj = UMI_trim(
    dir_dict,
    sample_df,
    overwrite_dir=False,
)
sample_df = UMItrim_obj.run_parallel(
    n_jobs=12,
    load_previous=True,
)
sample_df.head(3)

Using existing folder because overwrite set to false: /home/sulab/tRNA-charge-seq/projects/Alicia-samples/data/UMI_trimmed
Loaded results from previous run... Not running UMI trimming.


Unnamed: 0,sample_name_unique,sample_name,replicate,fastq_mate1_filename,fastq_mate2_filename,P5_index,P7_index,barcode,species,plot_group,hue_name,hue_value,hue_order,P5_index_seq,P7_index_seq,barcode_seq,N_total,N_CC,N_CCA,N_CCA+CC,CCA+CC_percent_total,percent_CCA,N_after_trim,N_UMI_observed,N_UMI_expected,percent_seqs_after_UMI_trim,percent_UMI_obs-vs-exp
0,A01,A01,1,2023-06-06/P03_R1.fastq.bz2,2023-06-06/P03_R2.fastq.bz2,D501,D703,l1Sp,human,293T-cys @&@ 293T-sergly @&@ 293T-met @&@ 293T...,Genotype-Time-Drug,WT-0,1,AGGCTATA,CGCTCATT,GGCTGCCATGCGACTA,5598086,308346,5237383,5545729,99.064734,94.439937,5478089,509270,524272.802045,97.856464,97.13836
1,A02,A02,1,2023-06-06/P03_R1.fastq.bz2,2023-06-06/P03_R2.fastq.bz2,D501,D703,l2Sp,human,293T-cys,Genotype-Time-Drug,WT-6,2,AGGCTATA,CGCTCATT,GGCTGCCATGCTGTCACG,5362565,281781,5026667,5308448,98.990837,94.691838,5244422,506781,524264.267492,97.796894,96.66518
2,A03,A03,1,2023-06-06/P03_R1.fastq.bz2,2023-06-06/P03_R2.fastq.bz2,D501,D703,l3Sp,human,293T-cys,Genotype-Time-Drug,WT-48,3,AGGCTATA,CGCTCATT,GGCTGCCATGCTGCGA,416891,97926,304409,402335,96.50844,75.660581,407214,179506,283156.76116,97.67877,63.394566


In [7]:
### Align reads to database of reference tRNAs ###
# The masked reference database is used together with the
# score matrix intended for masked reference sequences.
align_obj = SWIPE_align(
    dir_dict,
    tRNA_database_masked,
    sample_df,
    SWIPE_score_mat2,
    gap_penalty=6,
    extension_penalty=2,
    min_score_align=15,
    common_seqs=common_seqs,
    overwrite_dir=False,
)
sample_df = align_obj.run_parallel(
    n_jobs=4,
    overwrite=False,
    load_previous=True,
)
sample_df.head(3)

Using common sequences to prevent duplicated alignment.
Using existing folder because overwrite set to false: /home/sulab/tRNA-charge-seq/projects/Alicia-samples/data/SWalign
Loaded results from previous run... Not running alignment.


Unnamed: 0,sample_name_unique,sample_name,replicate,fastq_mate1_filename,fastq_mate2_filename,P5_index,P7_index,barcode,species,plot_group,hue_name,hue_value,hue_order,P5_index_seq,P7_index_seq,barcode_seq,N_total,N_CC,N_CCA,N_CCA+CC,CCA+CC_percent_total,percent_CCA,N_after_trim,N_UMI_observed,N_UMI_expected,percent_seqs_after_UMI_trim,percent_UMI_obs-vs-exp,N_mapped,percent_single_annotation,percent_multiple_annotation,percent_multiple_codons,Mapping_percent
0,A01,A01,1,2023-06-06/P03_R1.fastq.bz2,2023-06-06/P03_R2.fastq.bz2,D501,D703,l1Sp,human,293T-cys @&@ 293T-sergly @&@ 293T-met @&@ 293T...,Genotype-Time-Drug,WT-0,1,AGGCTATA,CGCTCATT,GGCTGCCATGCGACTA,5598086,308346,5237383,5545729,99.064734,94.439937,5478089,509270,524272.802045,97.856464,97.13836,5210288,65.751663,34.248337,4.458564,95.111416
1,A02,A02,1,2023-06-06/P03_R1.fastq.bz2,2023-06-06/P03_R2.fastq.bz2,D501,D703,l2Sp,human,293T-cys,Genotype-Time-Drug,WT-6,2,AGGCTATA,CGCTCATT,GGCTGCCATGCTGTCACG,5362565,281781,5026667,5308448,98.990837,94.691838,5244422,506781,524264.267492,97.796894,96.66518,4814273,65.548381,34.451619,4.396261,91.797971
2,A03,A03,1,2023-06-06/P03_R1.fastq.bz2,2023-06-06/P03_R2.fastq.bz2,D501,D703,l3Sp,human,293T-cys,Genotype-Time-Drug,WT-48,3,AGGCTATA,CGCTCATT,GGCTGCCATGCTGCGA,416891,97926,304409,402335,96.50844,75.660581,407214,179506,283156.76116,97.67877,63.394566,383222,76.933996,23.066004,11.045034,94.108258


In [8]:
### Collect alignment statistics ###
stats_obj = STATS_collection(
    dir_dict,
    tRNA_data,
    sample_df,
    common_seqs=common_seqs,
    overwrite_dir=False,
)
stats_df = stats_obj.run_parallel(
    n_jobs=6,
    load_previous=True,
)
# The dataframe returned is the "ALL_stats_aggregate_filtered.csv":
# the aggregated data, filtered to contain only the most relevant
# columns and requiring that the 3' end is covered and has no
# 3' non-template bases.
# The CSV file output "ALL_stats_aggregate.csv" is the data aggregated
# on all values being identical except readID, 5p_UMI and 3p_BC.
# I.e. all information, except the UMI sequence, is maintained
# in the aggregated CSV.
stats_df.head(3)

Using existing folder because overwrite set to false: /home/sulab/tRNA-charge-seq/projects/Alicia-samples/data/stats_collection
Loaded results from previous run... Not running stats collection.


Unnamed: 0,sample_name_unique,sample_name,replicate,barcode,species,tRNA_annotation,tRNA_annotation_len,unique_annotation,5p_cover,align_3p_nt,codon,anticodon,amino_acid,count
0,A01,A01,1,l1Sp,human,Escherichia_coli_str_K_12_substr_MG1655_tRNA-e...,76,True,False,A,AAA,TTT,eColiLys,175
1,A01,A01,1,l1Sp,human,Escherichia_coli_str_K_12_substr_MG1655_tRNA-e...,76,True,False,C,AAA,TTT,eColiLys,4403
2,A01,A01,1,l1Sp,human,Escherichia_coli_str_K_12_substr_MG1655_tRNA-e...,76,True,True,A,AAA,TTT,eColiLys,66


In [9]:
### Generate standard tRNAseq data plots ###
# Plotting object used by the plotting cells that follow.
plot_obj = TRNA_plot(
    dir_dict,
    sample_df,
    overwrite_dir=False,
    pull_default=False,
)

In [10]:
# Coverage plots, one per compartment (cyto first, then mito):
for compartment in ('cyto', 'mito'):
    plot_obj.plot_coverage(
        compartment=compartment,
        plot_type='behrens',
        y_norm=True,
        plot_name='cov_plot_{}_norm'.format(compartment),
        n_jobs=12,
    )

# Abundance plots: charge plots first, then RPM plots.
# Within each, the codon level is plotted before the amino acid level.
for is_charge, suffix in ((True, 'charge'), (False, 'rpm')):
    for level in ('codon', 'aa'):
        plot_obj.plot_abundance(
            plot_type=level,
            plot_name='{}_{}'.format(level, suffix),
            group=True,
            min_obs=500,
            charge_plot=is_charge,
        )

# Ecoli control plots (charge, then RPM).
# Kept as plain calls so the last statement of the cell is unchanged.
plot_obj.plot_Ecoli_ctr(
    plot_name='ecoli-ctr_charge',
    charge_plot=True,
)
plot_obj.plot_Ecoli_ctr(
    plot_name='ecoli-ctr_rpm',
    charge_plot=False,
)



Now collecting data for sample:  A01  A02  A03  A07  A09  A08  A10  A11  A12  A13  A06  A05  A16  A23  A24  A17  A22  A21  A25  A26  A20  A19  A15  A14  A18  A28  A33  A27  A29  A30  A31  A32  A34  A35  A36  A37  A39  A40  A38  A46  A42  A48  A45  A43  A44  A41  A47  A50  A53  A49  A52  A51  A58  A55  A54  A56  A57  A63  A59  A60  A62  A67  A64  A61  A66  A65  A72  A69  A68  A71  A70  A75  A76  A73  A74
Now plotting sample:  A01  A02  A03  A05  A06  A07  A08  A09  A10  A11  A12  A13  A14  A15  A16  A17  A18  A19  A20  A21  A22  A23  A24  A25  A26  A27  A28  A29  A30  A31  A32  A33  A34  A35  A36  A37  A38  A39  A40  A41  A42  A43  A44  A45  A46  A47  A48  A49  A50  A51  A52  A53  A54  A55  A56  A57  A58  A59  A60  A61  A62  A63  A64  A65  A66  A67  A68  A69  A70  A71  A72  A73  A74  A75  A76
Now collecting data for sample:  A01  A02  A03  A05  A06  A07  A08  A09  A10  A11  A12  A13  A16  A17  A24  A23  A22  A21  A25  A26  A20  A19  A15  A14  A18  A28  A33  A27  A29  A31  A30  A32  A34

In [11]:
# Logo plots of 5/3 prime non-template sequence.
# 5' end:
plot_obj.plot_non_temp(
    end='5p',
    plot_name='_5p-non-template_logo',
    seq_len_percentile=99,
    n_jobs=8,
)
# 3' end:
plot_obj.plot_non_temp(
    end='3p',
    plot_name='_3p-non-template_logo',
    seq_len_percentile=99.9,
    n_jobs=8,
)
# 3' end again, this time with the _3p_cover option switched on:
plot_obj.plot_non_temp(
    end='3p',
    plot_name='_3p-non-template_3p-cover_logo',
    seq_len_percentile=99.9,
    _3p_cover=True,
    n_jobs=8,
)


Now collecting data for sample:  A01  A02  A03  A05  A06  A07  A08  A09  A12  A13  A17  A18  A15  A14  A16  A11  A10  A19  A22  A23  A21  A20  A24  A25  A26  A27  A29  A28  A30  A32  A34  A31  A33  A35  A40  A36  A37  A38  A39  A45  A41  A42  A43  A44  A49  A48  A47  A46  A52  A54  A51  A50  A53  A55  A56  A57  A58  A63  A59  A62  A61  A60  A66  A68  A64  A65  A72  A67  A73  A70  A69  A76  A75  A71  A74
Now plotting logo plot.
Now plotting logo plot.
Now collecting data for sample:  A01  A07  A02  A03  A05  A06  A08  A09  A12  A13  A18  A17  A15  A11  A14  A16  A10  A19  A21  A23  A22  A20  A25  A24  A26  A27  A28  A33  A32  A34  A31  A29  A30  A35  A39  A37  A36  A38  A46  A41  A43  A45  A40  A42  A44  A47  A48  A51  A54  A50  A49  A53  A52  A55  A56  A60  A57  A61  A58  A62  A63  A59  A64  A68  A71  A66  A65  A67  A70  A72  A69  A74  A73  A75  A76
Now plotting logo plot.
Now plotting logo plot.
Now collecting data for sample:  A02  A01  A03  A05  A06  A07  A08  A09  A12  A16  A13  A

In [13]:
# Write charge/rpm data to csv files, one per aggregation level.
# The output folder is built from the notebook path settings instead of a
# hard-coded relative 'data/plotting' path, so the files end up in the
# plotting folder regardless of the current working directory.
csv_out_dir = os.path.join(NBdir, data_dir, plotting_dir)
# (key in plot_obj.charge_filt, suffix used in the csv filename):
for level, fnam_suffix in (('aa', 'AA'), ('codon', 'codon'), ('tr', 'tr')):
    fnam = os.path.join(csv_out_dir, 'alicia_res_{}.csv'.format(fnam_suffix))
    plot_obj.charge_filt[level].reset_index(drop=True).to_csv(fnam, index=False)

In [9]:
### Perform transcript mutation analysis ###
# NOTE(review): the alignment cell was given tRNA_database_masked, whereas
# the unmasked tRNA_database is passed here -- confirm this is intended.
TM_obj = TM_analysis(
    dir_dict,
    sample_df,
    tRNA_database,
    pull_default=False,
    common_seqs=common_seqs,
    ignore_common_count=False,
    overwrite_dir=False,
)

Using common sequences...


In [10]:
# Find mutations for the samples using 6 parallel jobs:
TM_obj.find_muts(
    n_jobs=6,
    unique_anno=False,
    fix_end=True,
)

Collecting stats from:  A01  A02  A03  A05  A06  A07  A10  A11  A09  A14  A12  A13  A08  A15  A16  A17  A18  A19  A20  A21  A22  A23  A24  A25  A26  A28  A27  A29  A30  A31  A32  A33  A35  A36  A37  A34  A38  A39  A41  A43  A40  A42  A46  A47  A44  A45  A48  A49  A50  A52  A53  A51  A55  A54  A57  A56  A58  A60  A59  A61  A64  A62  A63  A65  A67  A66  A70  A68  A71  A69  A73  A72  A74  A75  A76

In [16]:
# Plot mutation/gap difference matrix for methionine depleted samples.
# Settings shared by the two comparison plots below:
met_compare_kwargs = dict(
    species='human',
    no_plot_return=True,
    mito=False,
    min_count_show=500,
    sample_pairs=[
        ['A01', 'A01', 'A20', 'A20', 'A39', 'A39', 'A58', 'A58'],
        ['A10', 'A11', 'A29', 'A30', 'A48', 'A49', 'A67', 'A68'],
    ],
    freq_avg_weighted=True,
    topN=10,
    topN_select='max_diff',
)

# Each call gets its own deep copy of the shared settings,
# so both calls receive independent sample pair lists.
# Mutations and gaps:
TM_obj.plot_transcript_mut_compare(
    plot_name='Met_tr-mut_matrix_comp_top10-max-diff',
    gap_only=False,
    **copy.deepcopy(met_compare_kwargs)
)
# Gaps only:
TM_obj.plot_transcript_mut_compare(
    plot_name='Met_tr-gap_matrix_comp_top10-max-diff',
    gap_only=True,
    **copy.deepcopy(met_compare_kwargs)
)

In [18]:
# Write transcript mutations/gaps to csv file.
# All unique sample names from the sample sheet are included:
all_sample_names = list(sample_df['sample_name_unique'].values)

# Mutation matrix (each call is handed its own list of sample names):
TM_obj.write_transcript_mut(
    csv_name='mut-matrix_right-aligned',
    sample_list=list(all_sample_names),
)
# Gap matrix:
TM_obj.write_transcript_mut(
    csv_name='gap-matrix_right-aligned',
    gap_only=True,
    sample_list=list(all_sample_names),
)