In [1]:
%load_ext autoreload
%autoreload 2

import os, sys, shutil, bz2, copy
from pathlib import Path
import pandas as pd
pd.set_option('display.max_columns', 50)
import numpy as np

### Plotting imports ###
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.colors as mcolors
import matplotlib as mpl
from matplotlib.patches import StepPatch
import matplotlib.ticker as ticker
import matplotlib.gridspec as gridspec
import logomaker as lm
# Global seaborn look used by every figure produced from this notebook:
sns.set_theme(style="ticks", palette="muted")
sns.set_context("talk")
# Keys of matplotlib's TABLEAU_COLORS mapping, kept as a list so colours
# can be picked by position:
palette = list(mcolors.TABLEAU_COLORS)
%matplotlib inline

  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)


In [2]:
# Navigate back to NBdir in case of re-running a code block.
# NBdir is only captured the first time this cell runs in a kernel, so a
# later os.chdir() elsewhere does not change what "notebook dir" means:
if 'NBdir' not in globals():
    NBdir = os.getcwd()
print('Notebook is in: {}'.format(NBdir))
os.chdir(NBdir)  # If you changed the current working dir, this will take you back to the notebook dir.

# Define the path to the repo folder (two levels above the notebook folder).
# Change if necessary.
# os.path.dirname is used instead of splitting on '/' so that the result
# does not depend on the path separator and never collapses to an empty
# string when the notebook sits close to the filesystem root.
homedir = os.path.dirname(os.path.dirname(NBdir))
print('Repo is in: {}'.format(homedir))
# Make the repo importable (needed for the `src.*` imports that follow):
sys.path.insert(1, homedir)
from src.misc import index_to_sample_df, downsample_raw_input, read_tRNAdb_info, sample_df_to_dict
from src.read_processing import AR_merge, BC_split, Kmer_analysis, BC_analysis, UMI_trim
from src.alignment import SWIPE_align
from src.stats_collection import STATS_collection
from src.plotting import TRNA_plot
from src.transcript_mutations import TM_analysis

# Default names of the data folder and of the folder holding the raw
# fastq files; both are relative to the folder this notebook is in.
data_dir = 'data'
seq_dir_noDS = 'raw_fastq'  # Not downsampled
seq_dir = seq_dir_noDS      # May later be re-pointed if downsampling is enabled

# Folder names that the subsequent processing steps use to dump
# their data. Best to not change:
AdapterRemoval_dir = 'AdapterRemoval'
BC_dir = 'BC_split'
UMI_dir = 'UMI_trimmed'
align_dir = 'SWalign'
stats_dir = 'stats_collection'
TM_dir = 'transcript_mutations'
plotting_dir = 'plotting'
# Reference tRNA fasta files inside the repo, keyed by species:
tRNA_database = {
    'human': '{}/tRNA_database/human/hg38-tRNAs.fa'.format(homedir),
    'mouse': '{}/tRNA_database/mouse/mm10-tRNAs.fa'.format(homedir),
}
# Masked reference sequences; only a human entry is defined here:
tRNA_database_masked = {
    'human': '{}/tRNA_database_masked/human/human-tRNAs.fa'.format(homedir),
}
# Read information (length, codon etc) of tRNAs into dictionary:
tRNA_data = read_tRNAdb_info(tRNA_database)
# Nucleotide score matrices handed to the aligner:
SWIPE_score_mat = '{}/utils/nuc_score-matrix.txt'.format(homedir)
SWIPE_score_mat2 = '{}/utils/nuc_score-matrix_2.txt'.format(homedir) # For masked reference sequences
# tRNA sequencing yields many duplicated reads.
# Adding these commonly seen sequences to a list prevents duplicated alignment:
common_seqs = '{}/utils/common-seqs.fasta.bz2'.format(homedir)

# Minimum read length = minimum insert size + UMI length + maximum
# barcode length; used as the length cut-off after merging:
MIN_INSERT_LEN, UMI_LEN, BC_MAX_LEN = 10, 10, 19
MIN_READ_LEN = sum((MIN_INSERT_LEN, UMI_LEN, BC_MAX_LEN))
print('Using minimum read length: {} (after merge)'.format(MIN_READ_LEN))

# Read index information from the spreadsheet in the repo's utils folder:
index_list_fnam = 'index_list.xlsx'
index_list_path = '{}/utils/{}'.format(homedir, index_list_fnam)
index_df = pd.read_excel(index_list_path)

Notebook is in: /home/sulab/tRNA-charge-seq/projects/Kugel-samples
Repo is in: /home/sulab/tRNA-charge-seq
Using minimum read length: 39 (after merge)


### Settings

In [3]:
# The sample sheet lives next to this notebook:
sample_list_fnam = 'sample_list.xlsx'
sample_list_path = '{}/{}'.format(NBdir, sample_list_fnam)
# Load the sample sheet and add barcode sequences from the index list:
sample_df = index_to_sample_df(pd.read_excel(sample_list_path), index_df)
# Read elementary info (replicate, barcode, species)
# for each unique sample name into a dictionary:
sample_dict = sample_df_to_dict(sample_df)
# Get filenames from the sample information; one row per distinct
# combination of fastq file pair and P5/P7 index:
input_file_cols = ['fastq_mate1_filename', 'fastq_mate2_filename',
                   'P5_index', 'P7_index', 'P5_index_seq', 'P7_index_seq']
inp_file_df = (
    sample_df[input_file_cols]
    .copy()
    .drop_duplicates()
    .reset_index(drop=True)
)

# Downsample:
# Set DOWNSAMPLE to True to re-point sample_df, inp_file_df and seq_dir at
# whatever downsample_raw_input() returns; the default (False) leaves the
# full, non-downsampled input in place.
DOWNSAMPLE = False
# Value forwarded as `downsample_absolute` when downsampling is enabled:
DOWNSAMPLE_ABSOLUTE = 1e4
if DOWNSAMPLE:
    sample_df, inp_file_df, seq_dir = downsample_raw_input(sample_df, inp_file_df, NBdir, data_dir, seq_dir_noDS, downsample_absolute=DOWNSAMPLE_ABSOLUTE)

# Collect every path / folder name used for data processing in one
# dictionary; it is handed to each processing object created below:
dir_dict = {
    'NBdir': NBdir,
    'data_dir': data_dir,
    'seq_dir': seq_dir,
    'AdapterRemoval_dir': AdapterRemoval_dir,
    'BC_dir': BC_dir,
    'UMI_dir': UMI_dir,
    'align_dir': align_dir,
    'stats_dir': stats_dir,
    'TM_dir': TM_dir,
    'plotting_dir': plotting_dir,
}

In [5]:
# Run AdapterRemoval:
# MIN_READ_LEN (computed above as insert + UMI + barcode length) is passed
# as the length cut-off. inp_file_df is replaced by the frame returned from
# run_parallel(), so later cells see the updated version.
# NOTE(review): n_jobs=2 with AR_threads=6 presumably means two concurrent
# jobs with six threads each -- confirm against src.read_processing.
AR_obj = AR_merge(dir_dict, inp_file_df, MIN_READ_LEN, overwrite_dir=False, AR_threads=6)
inp_file_df = AR_obj.run_parallel(n_jobs=2, overwrite=False)

In [6]:
# Split files based on barcodes:
# Both sample_df and inp_file_df are replaced by the frames returned from
# run_parallel().
# NOTE(review): load_previous=True presumably reuses results from an earlier
# run instead of recomputing -- confirm against src.read_processing.
BCsplit_obj = BC_split(dir_dict, sample_df, inp_file_df, overwrite_dir=False)
sample_df, inp_file_df = BCsplit_obj.run_parallel(n_jobs=12, load_previous=True)

In [7]:
### Generate UMI stats and write final trimmed tRNA sequences ###
# Note, the cDNA input amount is so large that it is very unlikely to sequence
# the same PCR amplified DNA twice. Therefore, this processing step does not
# attempt to merge possible UMI duplicates.
# sample_df is replaced by the frame returned from run_parallel(); the
# first three rows are displayed as a quick sanity check.
UMItrim_obj = UMI_trim(dir_dict, sample_df, overwrite_dir=False)
sample_df = UMItrim_obj.run_parallel(n_jobs=12, load_previous=True)
sample_df.head(3)

Unnamed: 0,sample_name_unique,sample_name,replicate,fastq_mate1_filename,fastq_mate2_filename,P5_index,P7_index,barcode,species,plot_group,hue_name,hue_value,hue_order,P5_index_seq,P7_index_seq,barcode_seq,N_total,N_CC,N_CCA,N_CCA+CC,CCA+CC_percent_total,percent_CCA,N_after_trim,N_UMI_observed,N_UMI_expected,percent_seqs_after_UMI_trim,percent_UMI_obs-vs-exp
0,S01,S01,1,2023-06-06/P01_R1.fastq.bz2,2023-06-06/P01_R2.fastq.bz2,D501,D701,l1Sp,human,Direct Trizol,Cell line,Panc3.27 siCtrl,1,AGGCTATA,ATTACTCG,GGCTGCCATGCGACTA,2864349,123367,2719039,2842406,99.233927,95.659769,2806474,486471,521805.883311,97.979471,93.228347
1,S02,S02,2,2023-06-06/P01_R1.fastq.bz2,2023-06-06/P01_R2.fastq.bz2,D501,D701,l2Sp,human,Direct Trizol,Cell line,Panc3.27 siCtrl,1,AGGCTATA,ATTACTCG,GGCTGCCATGCTGTCACG,2751104,117505,2610253,2727758,99.151395,95.69225,2694732,482073,521216.264062,97.950932,92.490015
2,S03,S03,3,2023-06-06/P01_R1.fastq.bz2,2023-06-06/P01_R2.fastq.bz2,D501,D701,l3Sp,human,Direct Trizol,Cell line,Panc3.27 siCtrl,1,AGGCTATA,ATTACTCG,GGCTGCCATGCTGCGA,2640449,119777,2500386,2620163,99.231722,95.428643,2586063,478562,520508.79859,97.940275,91.941193


In [8]:
### Align reads to database of reference tRNAs ###
# The alignment uses the masked reference database together with
# SWIPE_score_mat2, the score matrix defined for masked reference sequences.
# common_seqs is passed so that commonly seen sequences are not aligned
# repeatedly.
# NOTE(review): tRNA_database_masked only defines a 'human' entry; samples
# from another species presumably cannot be aligned here -- confirm.
align_obj = SWIPE_align(dir_dict, tRNA_database_masked, sample_df, SWIPE_score_mat2, \
                        gap_penalty=6, extension_penalty=3, min_score_align=15, \
                        common_seqs=common_seqs, overwrite_dir=False)
sample_df = align_obj.run_parallel(n_jobs=4, overwrite=False, load_previous=True)
sample_df.head(3)

Using common sequences to prevent duplicated alignment.
Running Swipe on:  S03  S02  S04  S01  S07  S06  S05  S08  S10  S09  S11  S12  S13  S16  S15  S14  S17  common-seqs  S18
Collecting alignment statistics, from sample:  S02  S01  S03  S04  S06  S07  S08  S05  S09  S11  S10  S12  S13  S16  S15  S14  common-seqs  S17  S18

Unnamed: 0,sample_name_unique,sample_name,replicate,fastq_mate1_filename,fastq_mate2_filename,P5_index,P7_index,barcode,species,plot_group,hue_name,hue_value,hue_order,P5_index_seq,P7_index_seq,barcode_seq,N_total,N_CC,N_CCA,N_CCA+CC,CCA+CC_percent_total,percent_CCA,N_after_trim,N_UMI_observed,N_UMI_expected,percent_seqs_after_UMI_trim,percent_UMI_obs-vs-exp,N_mapped,percent_single_annotation,percent_multiple_annotation,percent_multiple_codons,Mapping_percent
0,S01,S01,1,2023-06-06/P01_R1.fastq.bz2,2023-06-06/P01_R2.fastq.bz2,D501,D701,l1Sp,human,Direct Trizol,Cell line,Panc3.27 siCtrl,1,AGGCTATA,ATTACTCG,GGCTGCCATGCGACTA,2864349,123367,2719039,2842406,99.233927,95.659769,2806474,486471,521805.883311,97.979471,93.228347,2690827.0,69.182374,30.817626,4.060945,95.879278
1,S02,S02,2,2023-06-06/P01_R1.fastq.bz2,2023-06-06/P01_R2.fastq.bz2,D501,D701,l2Sp,human,Direct Trizol,Cell line,Panc3.27 siCtrl,1,AGGCTATA,ATTACTCG,GGCTGCCATGCTGTCACG,2751104,117505,2610253,2727758,99.151395,95.69225,2694732,482073,521216.264062,97.950932,92.490015,2498162.0,69.98041,30.01959,4.199327,92.705397
2,S03,S03,3,2023-06-06/P01_R1.fastq.bz2,2023-06-06/P01_R2.fastq.bz2,D501,D701,l3Sp,human,Direct Trizol,Cell line,Panc3.27 siCtrl,1,AGGCTATA,ATTACTCG,GGCTGCCATGCTGCGA,2640449,119777,2500386,2620163,99.231722,95.428643,2586063,478562,520508.79859,97.940275,91.941193,2479627.0,69.329016,30.670984,4.293307,95.884246


In [56]:
### Collect alignment statistics ###
# tRNA_data is the per-tRNA information dictionary read from the
# (unmasked) tRNA_database earlier in the notebook.
stats_obj = STATS_collection(dir_dict, tRNA_data, sample_df, common_seqs=common_seqs, \
                             overwrite_dir=False)
stats_df = stats_obj.run_parallel(n_jobs=6, load_previous=True)
# The dataframe returned is the "ALL_stats_aggregate_filtered.csv",
# which is the aggregated data filtered to contain only the
# most relevant columns and requiring that the 3' end is covered
# and has no 3' non-template bases.
# The CSV file output "ALL_stats_aggregate.csv" is the data
# aggregated based on all values being identical except
# readID, 5p_UMI and 3p_BC. I.e. all information, except
# the UMI sequence, is maintained in the aggregated CSV.
stats_df.head(3)

Using common sequences...
Collecting stats from:  S01  S02  S03  S04  S05  S06  S10  S12  S08  S09  S11  S07  S15  S13  S14  S17  S16  S18

Unnamed: 0,sample_name_unique,sample_name,replicate,barcode,species,tRNA_annotation,tRNA_annotation_len,unique_annotation,5p_cover,align_3p_nt,codon,anticodon,amino_acid,count
0,S01,Pa-Ctr,1,l1Sp,human,Escherichia_coli_str_K_12_substr_MG1655_tRNA-e...,76,True,False,A,AAA,TTT,eColiLys,1
1,S01,Pa-Ctr,1,l1Sp,human,Escherichia_coli_str_K_12_substr_MG1655_tRNA-e...,76,True,False,C,AAA,TTT,eColiLys,688
2,S01,Pa-Ctr,1,l1Sp,human,Escherichia_coli_str_K_12_substr_MG1655_tRNA-e...,76,True,True,A,AAA,TTT,eColiLys,2


In [78]:
### Generate standard tRNAseq data plots ###
# plot_obj is reused by all plotting cells below.
# NOTE(review): with overwrite_dir=False an existing output folder is kept
# (the constructor reports "Doing nothing"); the meaning of pull_default is
# not visible here -- see src.plotting.
plot_obj = TRNA_plot(dir_dict, sample_df, overwrite_dir=False, pull_default=False)

Folder exists and overwrite set to false... Doing nothing.


In [79]:
# Shared settings for every plot in this cell. They are defined once so
# that the charge/RPM variants of each plot cannot drift apart.
MIN_OBS = 500  # forwarded as min_obs to all abundance plots below
# (charge_plot flag, tag used in the plot name): charge first, then RPM.
CHARGE_MODES = ((True, 'charge'), (False, 'rpm'))
# Replicate comparisons: element i of the first tuple is paired with
# element i of the second tuple (S01-S02, S01-S03, S04-S05, ...).
REPLICATE_PAIRS = (
    ('S01', 'S01', 'S04', 'S04', 'S07', 'S07',
     'S10', 'S10', 'S13', 'S13', 'S16', 'S16'),
    ('S02', 'S03', 'S05', 'S06', 'S08', 'S09',
     'S11', 'S12', 'S14', 'S15', 'S17', 'S18'),
)
# Sample-group comparisons (control vs. HMGA), paired element-wise.
# NOTE(review): these names must exist as sample names in sample_df; the
# sample_df output stored earlier in this notebook shows S01, S02, ...
# instead, so re-run from the top to confirm they match.
GROUP_PAIRS = (
    ('Pa-Ctr',  'KP4-Ctr',  'KP4-Ctr-SF'),
    ('Pa-HMGA', 'KP4-HMGA', 'KP4-HMGA-SF'),
)

# Coverage plots (cytosolic first, then mitochondrial):
for compartment in ('cyto', 'mito'):
    plot_obj.plot_coverage(compartment=compartment, plot_type='behrens', y_norm=True,
                           plot_name='cov_plot_{}_norm'.format(compartment), n_jobs=12)

# Charge plots, then RPM plots (codon level before amino-acid level):
for charge_plot, value_tag in CHARGE_MODES:
    for plot_type in ('codon', 'aa'):
        plot_obj.plot_abundance(plot_type=plot_type,
                                plot_name='{}_{}'.format(plot_type, value_tag),
                                group=True, min_obs=MIN_OBS, charge_plot=charge_plot)

# Ecoli control plots:
for charge_plot, value_tag in CHARGE_MODES:
    plot_obj.plot_Ecoli_ctr(plot_name='ecoli-ctr_{}'.format(value_tag),
                            charge_plot=charge_plot)

# Sample replicability plots.
# A fresh list-of-lists is built for every call so each call receives its
# own argument object, exactly as with separate list literals:
for charge_plot, value_tag in CHARGE_MODES:
    plot_obj.plot_abundance_corr(sample_unique_pairs=[list(side) for side in REPLICATE_PAIRS],
                                 plot_type='codon',
                                 plot_name='codon_{}_replicability'.format(value_tag),
                                 min_obs=MIN_OBS, charge_plot=charge_plot)

# Charge/RPM correlation between sample groups, at codon level and then
# at transcript level (the transcript plots use the 'tr' name prefix):
for plot_type, name_prefix in (('codon', 'codon'), ('transcript', 'tr')):
    for charge_plot, value_tag in CHARGE_MODES:
        plot_obj.plot_abundance_corr(sample_pairs=[list(side) for side in GROUP_PAIRS],
                                     plot_type=plot_type,
                                     plot_name='{}_{}_diff'.format(name_prefix, value_tag),
                                     min_obs=MIN_OBS, charge_plot=charge_plot)


Now collecting data for sample:  S01  S02  S03  S04  S05  S06  S07  S08  S10  S09  S11  S12  S14  S16  S15  S13  S18  S17
Now plotting sample:  S01  S02  S03  S04  S05  S06  S07  S08  S09  S10  S11  S12  S13  S14  S15  S16  S17  S18
Now collecting data for sample:  S01  S02  S03  S04  S05  S06  S07  S09  S08  S11  S10  S12  S14  S15  S18  S16  S13  S17
Now plotting sample:  S01  S02  S03  S04  S05  S06  S07  S08  S09  S10  S11  S12  S13  S14  S15  S16  S17  S18
Now plotting sample/group:  Direct Trizol  Snap frozen
Now plotting sample/group:  Direct Trizol  Snap frozen
Now plotting sample/group:  Direct Trizol  Snap frozen
Now plotting sample/group:  Direct Trizol  Snap frozen
Now plotting sample pairs:  (S01 - S02)  (S01 - S03)  (S04 - S05)  (S04 - S06)  (S07 - S08)  (S07 - S09)  (S10 - S11)  (S10 - S12)  (S13 - S14)  (S13 - S15)  (S16 - S17)  (S16 - S18)
Now plotting sample pairs:  (S01 - S02)  (S01 - S03)  (S04 - S05)  (S04 - S06)  (S07 - S08)  (S07 - S09)  (S10 - S11)  (S10 - S12)

In [25]:
# Logo plots of 5/3 prime non-template sequence:
# Three plots are made: the 5' end, the 3' end, and the 3' end with
# _3p_cover=True. Note that the 5' plot uses seq_len_percentile=99 while
# both 3' plots use 99.9.
# NOTE(review): seq_len_percentile presumably caps the sequence length
# shown at that percentile of observed lengths -- confirm in src.plotting.
plot_obj.plot_non_temp(end='5p', plot_name='_5p-non-template_logo', \
                       seq_len_percentile=99, n_jobs=4)
plot_obj.plot_non_temp(end='3p', plot_name='_3p-non-template_logo', \
                       seq_len_percentile=99.9, n_jobs=4)
plot_obj.plot_non_temp(end='3p', plot_name='_3p-non-template_3p-cover_logo', \
                       seq_len_percentile=99.9, _3p_cover=True, n_jobs=4)


Now collecting data for sample:  S02  S01  S03  S04  S06  S07  S08  S05  S09  S10  S11  S12  S16  S14  S13  S15  S17  S18
Now plotting logo plot.
Now plotting logo plot.
Now collecting data for sample:  S02  S03  S01  S04  S07  S06  S05  S08  S10  S09  S12  S11  S13  S15  S14  S16  S17  S18
Now plotting logo plot.
Now plotting logo plot.

In [60]:
# Write charge/rpm data to csv file:
# One CSV per entry of plot_obj.charge_filt ('aa', 'codon', 'tr'). The
# target folder is built from NBdir and the configured data_dir /
# plotting_dir instead of a hard-coded relative 'data/plotting' path, so
# the files land in the same place regardless of the current working
# directory and follow any change made to the folder settings.
for level in ('aa', 'codon', 'tr'):
    fnam = '{}/{}/{}/{}_charge.csv'.format(NBdir, data_dir, plotting_dir, level)
    plot_obj.charge_filt[level].reset_index(drop=True).to_csv(fnam, index=False)

In [61]:
### Perform transcript mutation analysis ###
# Note that the unmasked tRNA_database is passed here, whereas the
# alignment step used tRNA_database_masked.
# NOTE(review): the meaning of ignore_common_count and pull_default is not
# visible in this notebook -- see src.transcript_mutations.
TM_obj = TM_analysis(dir_dict, sample_df, tRNA_database, pull_default=False, \
                     common_seqs=common_seqs, ignore_common_count=False, \
                     overwrite_dir=False)

Using common sequences...
Folder exists and overwrite set to false... Doing nothing.


In [62]:
# Collect mutation/gap data for all samples; must run before the
# plotting and CSV-writing cells below, which read from TM_obj.
# NOTE(review): unique_anno=True presumably restricts the analysis to reads
# with a single annotation, and fix_end=True presumably anchors the
# alignment end -- confirm in src.transcript_mutations.
TM_obj.find_muts(n_jobs=12, unique_anno=True, fix_end=True)

Collecting stats from:  S01  S04  S02  S03  S05  S06  S07  S09  S10  S08  S11  S12  S15  S14  S16  S18  S13  S17

In [71]:
# Plot transcript mutations/gaps:
# All three comparisons share the same settings and only differ in the
# plot name, in whether only gaps are considered, and (for the last one)
# in an annotation substring filter. The shared settings are written once
# so the comparisons cannot silently diverge.
# Sample groups to compare, paired element-wise (control vs. HMGA):
TM_SAMPLE_PAIRS = (
    ('Pa-Ctr',  'KP4-Ctr',  'KP4-Ctr-SF'),
    ('Pa-HMGA', 'KP4-HMGA', 'KP4-HMGA-SF'),
)
# (plot_name, gap_only, extra keyword arguments), in plotting order:
TM_COMPARISONS = (
    ('tr-mut_matrix_comp_top5-max-diff', False, {}),
    ('tr-gap_matrix_comp_top5-max-diff', True, {}),
    ('tr-gap_matrix_comp_Cys-max-diff', True, {'anno_substring_compare': 'Cys'}),
)
for tm_plot_name, tm_gap_only, tm_extra_kwargs in TM_COMPARISONS:
    # A fresh list-of-lists is built per call so every call gets its own
    # sample_pairs object, as with separate list literals.
    TM_obj.plot_transcript_mut_compare(species='human',
                                       plot_name=tm_plot_name,
                                       no_plot_return=True,
                                       mito=False, gap_only=tm_gap_only,
                                       min_count_show=400,
                                       sample_pairs=[list(side) for side in TM_SAMPLE_PAIRS],
                                       freq_avg_weighted=True,
                                       topN=5, topN_select='max_diff',
                                       **tm_extra_kwargs)

In [77]:
# Write transcript mutations/gaps to csv file:
# First the full mutation matrix, then the gap-only matrix; both are
# written for every unique sample name in sample_df.
for tm_csv_name, tm_csv_kwargs in (('mut-matrix_right-aligned', {}),
                                   ('gap-matrix_right-aligned', {'gap_only': True})):
    TM_obj.write_transcript_mut(csv_name=tm_csv_name,
                                sample_list=list(sample_df['sample_name_unique'].values),
                                **tm_csv_kwargs)