In [1]:
%load_ext autoreload
%autoreload 2

import os, sys, shutil, bz2, copy, warnings
from pathlib import Path
import pandas as pd
pd.set_option('display.max_columns', 50)
# Numpy will complain about subnormals if python
# is compiled with the -ffast-math compiler flag:
# https://github.com/clearlinux/distribution/issues/2809
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", message="The value of the smallest subnormal for")
    import numpy as np

In [2]:
# Navigate back to NBdir in case of re-running a code block.
# NBdir is only captured the first time this cell runs in a kernel,
# so it keeps pointing at the notebook folder on later re-runs:
if 'NBdir' not in globals():
    NBdir = os.getcwd()
print('Notebook is in: {}'.format(NBdir))
os.chdir(NBdir)  # If you changed the current working dir, this will take you back to the notebook dir.

# Define the path to the repo folder, i.e. two levels above the notebook folder.
# Change if necessary.
# os.path.dirname is used instead of splitting on '/' so the platform's
# path separator is respected and the path root is never dropped:
homedir = os.path.dirname(os.path.dirname(NBdir))
print('Repo is in: {}'.format(homedir))
# Make the repo importable so the "src" package below can be found:
sys.path.insert(1, homedir)
from src.misc import index_to_sample_df, downsample_raw_input, read_tRNAdb_info, sample_df_to_dict
from src.read_processing import AR_merge, BC_split, Kmer_analysis, BC_analysis, UMI_trim
from src.alignment import SWIPE_align
from src.stats_collection import STATS_collection
from src.plotting import TRNA_plot
from src.transcript_mutations import TM_analysis

# Default folder names for data and raw fastq files,
# given relative to the folder holding this notebook:
data_dir, seq_dir = 'data', 'raw_fastq'
seq_dir_noDS = seq_dir # Not downsampled

# Folder names used by the subsequent processing steps
# to dump their data. Best to not change:
AdapterRemoval_dir = 'AdapterRemoval'
BC_dir = 'BC_split'
UMI_dir = 'UMI_trimmed'
align_dir = 'SWalign'
stats_dir = 'stats_collection'
TM_dir = 'transcript_mutations'
plotting_dir = 'plotting'

# Reference tRNA sequences, one fasta file per species:
tRNA_database = {
    'human': '{}/tRNA_database/human/hg38-tRNAs.fa'.format(homedir),
    'mouse': '{}/tRNA_database/mouse/mm10-tRNAs.fa'.format(homedir),
}
# Read information (length, codon etc) of tRNAs into dictionary:
tRNA_data = read_tRNAdb_info(tRNA_database)

# Score matrices for the alignment:
SWIPE_score_mat = '{}/utils/nuc_score-matrix.txt'.format(homedir)
SWIPE_score_mat2 = '{}/utils/nuc_score-matrix_2.txt'.format(homedir) # For masked reference sequences

# tRNA sequencing yields many duplicated reads.
# Adding these commonly seen sequences to a list prevents duplicated alignment:
common_seqs = '{}/utils/common-seqs.fasta.bz2'.format(homedir)

# Building blocks of a merged read; their combined length
# defines the shortest read that is kept:
UMI_LEN = 10
BC_MAX_LEN = 19
MIN_INSERT_LEN = 10
MIN_READ_LEN = sum((MIN_INSERT_LEN, UMI_LEN, BC_MAX_LEN))
print('Using minimum read length: {} (after merge)'.format(MIN_READ_LEN))

# Minimum alignment score. Deliberately set relatively low,
# since additional filtering can be applied later.
MIN_SCORE_ALIGN = 15
print('Using minimum alignemnt score: {}'.format(MIN_SCORE_ALIGN))

# Index information, stored in the repo's utils folder:
index_list_fnam = 'index_list.xlsx'
index_df = pd.read_excel('{}/utils/{}'.format(homedir, index_list_fnam))

# Sample list, stored next to this notebook.
# Barcode sequences are added from the index information:
sample_list_fnam = 'sample_list_alignment-opti.xlsx'
sample_df = index_to_sample_df(
    pd.read_excel('{}/{}'.format(NBdir, sample_list_fnam)),
    index_df,
)
# Read elementary info (replicate, barcode, species)
# for each unique sample name into a dictionary:
sample_dict = sample_df_to_dict(sample_df)
# One row per unique combination of fastq files and indices,
# taken from the sample information:
inp_file_df = (
    sample_df[['fastq_mate1_filename', 'fastq_mate2_filename',
               'P5_index', 'P7_index', 'P5_index_seq', 'P7_index_seq']]
    .copy()
    .drop_duplicates()
    .reset_index(drop=True)
)

# Collect the paths used for data processing in one dictionary,
# keyed by the name of the variable that holds each path:
dir_dict = {
    'NBdir': NBdir,
    'data_dir': data_dir,
    'seq_dir': seq_dir,
    'AdapterRemoval_dir': AdapterRemoval_dir,
    'BC_dir': BC_dir,
    'UMI_dir': UMI_dir,
    'align_dir': align_dir,
    'stats_dir': stats_dir,
    'TM_dir': TM_dir,
    'plotting_dir': plotting_dir,
}

Notebook is in: /home/sulab/tRNA-charge-seq/projects/alignment-opti
Repo is in: /home/sulab/tRNA-charge-seq


  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)


Using minimum read length: 39 (after merge)
Using minimum alignemnt score: 15


# Alignment optimization grid search

tRNAs are heavily modified post-transcriptionally and some of these modifications induce skipping or wrong base incorporation during the reverse transcription step.
At the level of alignment, RT skipping will appear as gaps and wrong base incorporation will appear as mismatches, both leading to lower alignment scores.
The non-random nature of such modifications is problematic because it can lead to non-random misannotation by flipping the alignment score in favor of a closely related, but wrong, annotation.

There are three strategies to increase alignment specificity: 
1. Increase alignment length, i.e. decrease RT fall-off
2. Remove tRNA modifications, i.e. demodify tRNA before the RT step
3. Adjust the alignment to accommodate RT artifacts

Here, we are exploring the third option by applying "N" masking of positions in the reference sequences that are highly modified.
N masking causes the masked positions to have no contribution to the alignment score.
Therefore it is a very unspecific approach, as opposed to a position specific scoring matrix (PSSM) or a hidden Markov model (HMM) that would score the types of mismatches differently.
The advantage of the unspecific approach is that fewer assumptions are made.
Using a PSSM or HMM would require inferring these models on a dataset of tRNA reads, with the implicit assumption that each read is drawn randomly from the distribution of all possible reads.
That assumption is quite obviously wrong, given that the penetrance of a tRNA modification can change as a response to a biological event.

In the following, reference sequence masking is made using the mismatches observed in the alignment i.e. if the tRNA reads have a high degree of mismatches on a certain position in the reference, then this position in the reference is masked.
The masking will alter the alignment and thus iterations can be performed using the new alignment to make a new masked reference etc.
The masking has four tuning parameters:
1. "min_mut_freq": The minimum mismatch frequency to trigger masking
2. "unique_anno": Use only uniquely annotated reads for masking (True/False)
3. "frac_max_score": The minimum fraction of the maximum alignment score between two reference sequences to expand the masked positions in one reference to another
4. "iteration": The number of iterations performed

The purpose of the grid search is to find the best set of tuning parameters and use these in future alignments.
A first pass alignment has to be generated, then the grid search is performed and finally the best performing masked reference sequences are extracted for future use.

### First pass alignment

In [3]:
# Switch for the first pass alignment.
# If restarting after shutdown there is no need to re-run it:
# set to False to load the previous results from disk instead.
RUN_FIRST_PASS = True

if RUN_FIRST_PASS:
    # Run AdapterRemoval:
    AR_obj = AR_merge(dir_dict, inp_file_df, MIN_READ_LEN, overwrite_dir=True)
    inp_file_df = AR_obj.run_parallel(n_jobs=4)

    # Split files based on barcodes:
    BCsplit_obj = BC_split(dir_dict, sample_df, inp_file_df, overwrite_dir=True)
    sample_df, inp_file_df = BCsplit_obj.run_parallel(n_jobs=12)

    # Trim UMI:
    UMItrim_obj = UMI_trim(dir_dict, sample_df, overwrite_dir=True)
    sample_df = UMItrim_obj.run_parallel(n_jobs=12)

    # First pass alignment:
    align_obj = SWIPE_align(dir_dict, tRNA_database, sample_df, SWIPE_score_mat,
                            gap_penalty=6, extension_penalty=1, min_score_align=MIN_SCORE_ALIGN,
                            common_seqs=common_seqs, overwrite_dir=True, verbose=False)
    sample_df = align_obj.run_parallel(n_jobs=12, verbose=False)

    # Collect alignment statistics:
    stats_obj = STATS_collection(dir_dict, tRNA_data, sample_df, common_seqs=common_seqs,
                                 overwrite_dir=True)
    stats_df = stats_obj.run_parallel(n_jobs=12, verbose=False)

    # Write first alignment stats.
    # The masking parameters are left empty for the first pass rows.
    # .assign() returns a new dataframe, so sample_df is left untouched
    # and no columns are set on a slice of it:
    align_res = sample_df.loc[:, ['sample_name_unique', 'Mapping_percent',
                                  'percent_single_annotation', 'percent_multiple_codons']].assign(
        unique_anno=None,
        frac_max_score=None,
        min_mut_freq=None,
        iteration=None,
    )
    # This (re)creates the results file, including the header row:
    align_res.to_csv('align-opti_res.csv', index=False)

else:
    # Same steps as above, but without overwriting the folders
    # and with the previous results requested from each step.

    # Run AdapterRemoval:
    AR_obj = AR_merge(dir_dict, inp_file_df, MIN_READ_LEN, overwrite_dir=False)
    inp_file_df = AR_obj.run_parallel(n_jobs=4, overwrite=False)

    # Split files based on barcodes:
    BCsplit_obj = BC_split(dir_dict, sample_df, inp_file_df, overwrite_dir=False)
    sample_df, inp_file_df = BCsplit_obj.run_parallel(n_jobs=12, load_previous=True)

    # Trim UMI:
    UMItrim_obj = UMI_trim(dir_dict, sample_df, overwrite_dir=False)
    sample_df = UMItrim_obj.run_parallel(n_jobs=12, load_previous=True)

    # First pass alignment:
    align_obj = SWIPE_align(dir_dict, tRNA_database, sample_df, SWIPE_score_mat,
                            gap_penalty=6, extension_penalty=1, min_score_align=MIN_SCORE_ALIGN,
                            common_seqs=common_seqs, overwrite_dir=False, verbose=False)
    sample_df = align_obj.run_parallel(n_jobs=12, verbose=False, load_previous=True)

    # Collect alignment statistics:
    stats_obj = STATS_collection(dir_dict, tRNA_data, sample_df, common_seqs=common_seqs,
                                 overwrite_dir=False)
    stats_df = stats_obj.run_parallel(n_jobs=12, verbose=False, load_previous=True)

Downsampling UMI trimmed sequences to maximum 2000000 reads.


### Searching a grid of alignment parameters

In [4]:
# Independent copy of the path dictionary in which the alignment and
# stats folders are swapped for their "masked" counterparts:
dir_dict_masked = copy.deepcopy(dir_dict)
dir_dict_masked.update(align_dir='SWalign_masked',
                       stats_dir='stats_collection_masked')

In [12]:
# Values to test for each masking parameter:
N_iterations = 3
unique_anno_grid = [True, False]
frac_max_score_grid = [0.90, 0.95, 1]
min_mut_freq_grid = [
    0.5, 0.6, 0.63, 0.65, 0.68,
    0.7, 0.72, 0.75, 0.8, 0.9,
]
# Number of runs needed to cover every combination
# of the values above, counting each iteration as one run:
total_combi = (
    N_iterations
    * len(unique_anno_grid)
    * len(frac_max_score_grid)
    * len(min_mut_freq_grid)
)

In [13]:
def _combi_key(unique_anno, frac_max_score, min_mut_freq, iteration):
    """Return a type-normalized, hashable key identifying one grid search run.

    Values coming from the parameter grid and values read back from the
    results file can differ in type (e.g. the integer 1 in the grid versus
    1.0 in a float column of the file). Comparing their string formatting
    would then fail ('1' != '1.0'), so every value is converted to a fixed
    type before comparison.

    Parameters
    ----------
    unique_anno : bool or str
        Compared through its string form ('True'/'False').
    frac_max_score, min_mut_freq : int or float
        Compared as floats rounded to 6 decimals.
    iteration : int or float
        Compared as an integer.

    Returns
    -------
    tuple
        (str, float, float, int)
    """
    return (str(unique_anno),
            round(float(frac_max_score), 6),
            round(float(min_mut_freq), 6),
            int(iteration))


if True: # Switch on to do the search
    # Find the runs already recorded in the results file, so that an
    # interrupted search can be resumed. The file is read once up front;
    # the set is kept up to date as new results are appended.
    # Rows with empty parameter columns (written by the first pass
    # alignment) are not grid search runs and are dropped:
    key_cols = ['unique_anno', 'frac_max_score', 'min_mut_freq', 'iteration']
    align_res_df = pd.read_csv('align-opti_res.csv').dropna(subset=key_cols)
    combi_key_set = {_combi_key(*row) for row in
                     align_res_df[key_cols].itertuples(index=False, name=None)}

    combi_counter = 0
    # Run all the nested combinations:
    for unique_anno in unique_anno_grid:
        for frac_max_score in frac_max_score_grid:
            for min_mut_freq in min_mut_freq_grid:
                iter_keys = [_combi_key(unique_anno, frac_max_score, min_mut_freq, i)
                             for i in range(1, N_iterations+1)]
                # Skip a parameter set only when ALL its iterations are recorded.
                # A partially finished set is re-run from iteration 1, because
                # each iteration uses the masked alignment left on disk by the
                # previous one, and that folder may have been overwritten by
                # another parameter set (or by an interrupted run) in the meantime:
                if all(key in combi_key_set for key in iter_keys):
                    combi_counter += N_iterations
                    continue

                # "iteration" must be the innermost nest,
                # because the result depends on the previous run:
                for iteration, combi_key in enumerate(iter_keys, start=1):
                    combi_counter += 1
                    print('Running combi {} of {}'.format(combi_counter, total_combi))

                    # At first iteration use first pass alignment,
                    # after this use the masked alignments:
                    if iteration == 1:
                        dir_dict_iter = dir_dict
                    else:
                        dir_dict_iter = dir_dict_masked
                    # Perform transcript mutation analysis:
                    TM_obj = TM_analysis(dir_dict_iter, sample_df, tRNA_database,
                                         common_seqs=common_seqs, overwrite_dir=True, verbose=False)
                    TM_obj.find_muts(n_jobs=12, unique_anno=unique_anno, verbose=False)
                    TM_obj.mask_tRNA_database(min_mut_freq=min_mut_freq, frac_max_score=frac_max_score,
                                              min_pos_count=100, min_tr_count=200)
                    tRNA_database_masked = TM_obj.write_masked_tRNA_database(out_dir='tRNA_database_masked')

                    # Run alignment:
                    align_obj = SWIPE_align(dir_dict_masked, tRNA_database_masked, sample_df, SWIPE_score_mat2,
                                            gap_penalty=6, extension_penalty=3, min_score_align=MIN_SCORE_ALIGN,
                                            common_seqs=common_seqs, overwrite_dir=True, verbose=False)
                    sample_df = align_obj.run_parallel(n_jobs=5, verbose=False)

                    # Collect alignment statistics:
                    stats_obj = STATS_collection(dir_dict_masked, tRNA_data, sample_df,
                                                 common_seqs=common_seqs, overwrite_dir=True)
                    stats_df = stats_obj.run_parallel(n_jobs=12, verbose=False)

                    # An iteration that was only re-run to rebuild the masked
                    # alignment is already in the file; do not append it twice:
                    if combi_key in combi_key_set:
                        continue

                    # Collect alignment stats and append to file.
                    # .assign() returns a new dataframe, so no columns
                    # are set on a slice of sample_df:
                    col_sele = ['sample_name_unique', 'Mapping_percent',
                                'percent_single_annotation', 'percent_multiple_codons']
                    align_res = sample_df.loc[:, col_sele].assign(
                        unique_anno=unique_anno,
                        frac_max_score=frac_max_score,
                        min_mut_freq=min_mut_freq,
                        iteration=iteration,
                    )
                    align_res.to_csv('align-opti_res.csv', mode='a', index=False, header=False)
                    combi_key_set.add(combi_key)

Running combi 7 of 180
Running combi 8 of 180
Running combi 9 of 180
Running combi 13 of 180
Running combi 14 of 180
Running combi 15 of 180
Running combi 19 of 180
Running combi 20 of 180
Running combi 21 of 180
Running combi 37 of 180
Running combi 38 of 180
Running combi 39 of 180
Running combi 43 of 180
Running combi 44 of 180
Running combi 45 of 180
Running combi 49 of 180
Running combi 50 of 180
Running combi 51 of 180
Running combi 61 of 180
Running combi 62 of 180
Running combi 63 of 180
Running combi 64 of 180
Running combi 65 of 180
Running combi 66 of 180
Running combi 67 of 180
Running combi 68 of 180
Running combi 69 of 180
Running combi 70 of 180
Running combi 71 of 180
Running combi 72 of 180
Running combi 73 of 180
Running combi 74 of 180
Running combi 75 of 180
Running combi 76 of 180
Running combi 77 of 180
Running combi 78 of 180
Running combi 79 of 180
Running combi 80 of 180
Running combi 81 of 180
Running combi 82 of 180
Running combi 83 of 180
Running combi 84 of

### Make final masked database from best alignment parameters
The best parameters were found in the data processing and plotting notebook: `alignment-opti_plotting.ipynb`.
The resulting folder of masked reference sequences is moved out of this project folder and into the main repo.
Future alignments can then use these masked references for alignment directly.

In [14]:
# Best parameters, written as single-element grids so that
# they can be looped over in the same way as a full grid:
N_iterations = 3
unique_anno_grid = [False]
frac_max_score_grid = [0.95]
min_mut_freq_grid = [0.68]

In [15]:
# Repeat of the grid search loop, restricted to the grids currently defined.
# The loop is repeated in full because all iterations have to be
# performed: each iteration builds on the masked alignment
# written by the previous one.
# Run all the nested combinations:
for unique_anno in unique_anno_grid:
    for frac_max_score in frac_max_score_grid:
        for min_mut_freq in min_mut_freq_grid:
            # "iteration" must be the innermost nest,
            # because the result depends on the previous run:
            for iteration in range(1, N_iterations+1):
                # At first iteration use first pass alignment,
                # after this use the masked alignments:
                if iteration == 1:
                    dir_dict_iter = dir_dict
                else:
                    dir_dict_iter = dir_dict_masked
                # Perform transcript mutation analysis:
                TM_obj = TM_analysis(dir_dict_iter, sample_df, tRNA_database,
                                     common_seqs=common_seqs, overwrite_dir=True, verbose=False)
                TM_obj.find_muts(n_jobs=12, unique_anno=unique_anno, verbose=False)
                TM_obj.mask_tRNA_database(min_mut_freq=min_mut_freq, frac_max_score=frac_max_score,
                                          min_pos_count=100, min_tr_count=200)
                tRNA_database_masked = TM_obj.write_masked_tRNA_database(out_dir='tRNA_database_masked')

                # Run alignment:
                align_obj = SWIPE_align(dir_dict_masked, tRNA_database_masked, sample_df, SWIPE_score_mat2,
                                        gap_penalty=6, extension_penalty=3, min_score_align=MIN_SCORE_ALIGN,
                                        common_seqs=common_seqs, overwrite_dir=True, verbose=False)
                sample_df = align_obj.run_parallel(n_jobs=12, verbose=False)

                # Collect alignment statistics:
                stats_obj = STATS_collection(dir_dict_masked, tRNA_data, sample_df,
                                             common_seqs=common_seqs, overwrite_dir=True)
                stats_df = stats_obj.run_parallel(n_jobs=12, verbose=False)

                # Collect alignment stats. Nothing is written to file here;
                # align_res is overwritten on every pass, so after the loop
                # it holds the stats of the last iteration.
                # .assign() returns a new dataframe, so no columns
                # are set on a slice of sample_df:
                col_sele = ['sample_name_unique', 'Mapping_percent',
                            'percent_single_annotation', 'percent_multiple_codons']
                align_res = sample_df.loc[:, col_sele].assign(
                    unique_anno=unique_anno,
                    frac_max_score=frac_max_score,
                    min_mut_freq=min_mut_freq,
                    iteration=iteration,
                )

In [16]:
# Display the alignment stats collected in the last pass of the loop above
# (one row per sample, together with the masking parameters used):
align_res

Unnamed: 0,sample_name_unique,Mapping_percent,percent_single_annotation,percent_multiple_codons,unique_anno,frac_max_score,min_mut_freq,iteration
0,0m_1,99.270539,74.027656,4.440943,False,0.95,0.68,3
1,8m_1,99.318155,74.301386,4.457906,False,0.95,0.68,3
2,32m_1,98.68007,72.137738,5.911871,False,0.95,0.68,3
3,1h_1,99.291669,74.222362,4.060808,False,0.95,0.68,3
4,4h_1,98.823837,74.138942,4.857488,False,0.95,0.68,3
5,8h_1,98.795492,74.132849,4.076637,False,0.95,0.68,3
6,16h_1,99.091935,73.952814,6.724284,False,0.95,0.68,3
7,40h_1,98.561703,71.608583,6.213271,False,0.95,0.68,3


In [17]:
# Mean of each alignment statistic across all samples:
align_res[['Mapping_percent', 'percent_single_annotation', 'percent_multiple_codons']].mean()

Mapping_percent              98.979175
percent_single_annotation    73.565291
percent_multiple_codons       5.092901
dtype: float64