In [1]:
%load_ext autoreload
%autoreload 2

import os, sys, shutil, bz2, copy
from pathlib import Path
import pandas as pd
pd.set_option('display.max_columns', 50)
import numpy as np

### Plotting imports ###
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.colors as mcolors
import matplotlib as mpl
from matplotlib.patches import StepPatch
import matplotlib.ticker as ticker
import matplotlib.gridspec as gridspec
import logomaker as lm
# Names of matplotlib's Tableau colors, kept in a list for later use.
# Note that this variable is separate from the seaborn palette set below.
palette = list(mcolors.TABLEAU_COLORS.keys())
# Set the global seaborn theme ("ticks" style, "muted" palette)
# and the "talk" plotting context:
sns.set_theme(style="ticks", palette="muted")
sns.set_context("talk")
%matplotlib inline

  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)


In [2]:
# Navigate back to NBdir in case of re-running a code block.
# NBdir is only set the first time this cell is run in a kernel,
# so later changes of the working directory do not alter it:
if 'NBdir' not in globals():
    NBdir = os.getcwd()
print('Notebook is in: {}'.format(NBdir))
os.chdir(NBdir)  # If you changed the current working dir, this will take you back to the notebook dir.

# Define the path to the repo folder.
# Change if necessary.
# The repo root is taken to be two folder levels above the notebook folder.
# os.path.dirname is used instead of splitting on '/' so the result
# does not depend on the path separator of the operating system:
homedir = os.path.dirname(os.path.dirname(NBdir))
print('Repo is in: {}'.format(homedir))
# Make the "src" package of the repo importable:
sys.path.insert(1, homedir)
from src.misc import index_to_sample_df, downsample_raw_input, read_tRNAdb_info, sample_df_to_dict
from src.read_processing import AR_merge, BC_split, Kmer_analysis, BC_analysis, UMI_trim
from src.alignment import SWIPE_align
from src.stats_collection import STATS_collection
from src.plotting import TRNA_plot
from src.transcript_mutations import TM_analysis

# These are default folder names for data and raw fastq files
# relative to the folder in which this notebook is in:
data_dir = 'data'
seq_dir = 'raw_fastq'
seq_dir_noDS = seq_dir # Not downsampled

# These folder names are used in subsequent processing steps
# to dump data. Best to not change:
AdapterRemoval_dir = 'AdapterRemoval'
BC_dir = 'BC_split'
UMI_dir = 'UMI_trimmed'
align_dir = 'SWalign'
stats_dir = 'stats_collection'
TM_dir = 'transcript_mutations'
plotting_dir = 'plotting'
# Path to the tRNA reference fasta file, keyed by species:
tRNA_database = dict()
tRNA_database['human'] = '{}/tRNA_database/human/hg38-tRNAs.fa'.format(homedir)
tRNA_database['mouse'] = '{}/tRNA_database/mouse/mm10-tRNAs.fa'.format(homedir)
# Read information (length, codon etc) of tRNAs into dictionary:
tRNA_data = read_tRNAdb_info(tRNA_database)
SWIPE_score_mat = '{}/utils/nuc_score-matrix.txt'.format(homedir)
SWIPE_score_mat2 = '{}/utils/nuc_score-matrix_2.txt'.format(homedir) # For masked reference sequences
# tRNA sequencing yields many duplicated reads.
# Adding these commonly seen sequences to a list prevents duplicated alignment:
common_seqs = '{}/utils/common-seqs.fasta.bz2'.format(homedir)

# Define minimum read length based on minimum insert size.
# The minimum read length is the sum of the three lengths below:
MIN_INSERT_LEN = 10
UMI_LEN = 10
BC_MAX_LEN = 19
MIN_READ_LEN = MIN_INSERT_LEN + UMI_LEN + BC_MAX_LEN
print('Using minimum read length: {} (after merge)'.format(MIN_READ_LEN))

# The minimum alignment score.
# Better to set relatively low, since additional filtering can
# be applied later.
MIN_SCORE_ALIGN = 15
print('Using minimum alignemnt score: {}'.format(MIN_SCORE_ALIGN))

# Read index information from the spreadsheet in the repo "utils" folder:
index_list_fnam = 'index_list.xlsx'
index_df = pd.read_excel('{}/utils/{}'.format(homedir, index_list_fnam))

Notebook is in: /home/sulab/tRNA-charge-seq/projects/alignment-opti
Repo is in: /home/sulab/tRNA-charge-seq
Using minimum read length: 39 (after merge)
Using minimum alignemnt score: 15


### Settings

In [3]:
# The sample sheet is expected in the same folder as this notebook:
sample_list_fnam = 'sample_list_alignment-opti.xlsx'
sample_df = pd.read_excel('/'.join([NBdir, sample_list_fnam]))
# Add barcode sequences:
sample_df = index_to_sample_df(sample_df, index_df)
# Read elementary info (replicate, barcode, species)
# for each unique sample name into a dictionary:
sample_dict = sample_df_to_dict(sample_df)
# Get filenames from the sample information.
# Several samples can share the same pair of fastq files,
# so only the unique rows of the file/index columns are kept:
inp_file_cols = ['fastq_mate1_filename', 'fastq_mate2_filename',
                 'P5_index', 'P7_index',
                 'P5_index_seq', 'P7_index_seq']
inp_file_df = sample_df[inp_file_cols].copy()
inp_file_df = inp_file_df.drop_duplicates().reset_index(drop=True)

# Make a dictionary with paths used for data processing:
dir_dict = {
    'NBdir': NBdir,
    'data_dir': data_dir,
    'seq_dir': seq_dir,
    'AdapterRemoval_dir': AdapterRemoval_dir,
    'BC_dir': BC_dir,
    'UMI_dir': UMI_dir,
    'align_dir': align_dir,
    'stats_dir': stats_dir,
    'TM_dir': TM_dir,
    'plotting_dir': plotting_dir,
}

### First pass alignment

In [4]:
# If restarting after shutdown, there is no need to re-run
# the first pass alignment: set this flag to False and the
# pipeline below is called with the "load previous" arguments instead.
rerun_first_pass = True

# The same pipeline is used in both modes; only these arguments differ.
# Fresh run: output folders are overwritten and run_parallel() is called
# with its default arguments.
# Reload: output folders are kept and overwrite=False / load_previous=True
# are passed to the run_parallel() calls.
overwrite_dir = rerun_first_pass
AR_run_kwargs = dict() if rerun_first_pass else dict(overwrite=False)
load_kwargs = dict() if rerun_first_pass else dict(load_previous=True)

# Run AdapterRemoval:
AR_obj = AR_merge(dir_dict, inp_file_df, MIN_READ_LEN, overwrite_dir=overwrite_dir)
inp_file_df = AR_obj.run_parallel(n_jobs=4, **AR_run_kwargs)

# Split files based on barcodes:
BCsplit_obj = BC_split(dir_dict, sample_df, inp_file_df, overwrite_dir=overwrite_dir)
sample_df, inp_file_df = BCsplit_obj.run_parallel(n_jobs=12, **load_kwargs)

# Trim UMI:
UMItrim_obj = UMI_trim(dir_dict, sample_df, overwrite_dir=overwrite_dir)
sample_df = UMItrim_obj.run_parallel(n_jobs=12, **load_kwargs)

# First pass alignment:
align_obj = SWIPE_align(dir_dict, tRNA_database, sample_df, SWIPE_score_mat, \
                        gap_penalty=6, extension_penalty=1, min_score_align=MIN_SCORE_ALIGN, \
                        common_seqs=common_seqs, overwrite_dir=overwrite_dir, verbose=False)
sample_df = align_obj.run_parallel(n_jobs=12, verbose=False, **load_kwargs)

# Collect alignment statistics:
stats_obj = STATS_collection(dir_dict, tRNA_data, sample_df, common_seqs=common_seqs, \
                             overwrite_dir=overwrite_dir)
stats_df = stats_obj.run_parallel(n_jobs=12, verbose=False, **load_kwargs)

if rerun_first_pass:
    # Write first alignment stats.
    # This (re)creates the results file that the grid search appends to.
    # The grid parameter columns are left empty for the first pass rows.
    # An explicit copy is taken so that adding columns can never
    # write back into (or warn about) sample_df:
    align_res = sample_df.loc[:, ['sample_name_unique', 'Mapping_percent', \
                                  'percent_single_annotation', 'percent_multiple_codons']].copy()
    align_res = align_res.assign(unique_anno=None,
                                 frac_max_score=None,
                                 min_mut_freq=None,
                                 iteration=None)
    with open('align-opti_res.csv', 'w') as fh_res:
        align_res.to_csv(fh_res, index=False)

### Searching a grid of alignment parameters

In [5]:
# Folder settings for the alignments against the masked reference.
# Identical to dir_dict, except that alignment and stats
# results go to their own folders:
dir_dict_masked = copy.deepcopy(dir_dict)
dir_dict_masked.update(align_dir='SWalign_masked',
                       stats_dir='stats_collection_masked')

In [6]:
# Parameter values to test; every combination of the
# three grids is run for N_iterations iterations.
unique_anno_grid = [True, False]
frac_max_score_grid = [0.90, 0.95, 1]
min_mut_freq_grid = [
    0.5, 0.7, 0.75,
    0.78, 0.8, 0.82,
    0.84, 0.86, 0.9,
]
N_iterations = 3
# Total number of runs, used for progress reporting:
total_combi = (
    N_iterations
    * len(unique_anno_grid)
    * len(frac_max_score_grid)
    * len(min_mut_freq_grid)
)

In [7]:
def _combi_key(unique_anno, frac_max_score, min_mut_freq, iteration):
    """Return the string identifying one parameter combination.

    All numeric parameters are cast to float before formatting. The first
    pass rows in the results file leave these columns empty, so pandas
    reads the columns back as floats (e.g. 1.0), whereas the grids can
    contain ints (e.g. frac_max_score = 1). Without the cast, the key of
    such a combination never matches its rows in the file and the
    combination is re-run, and appended again, every time the search
    is resumed.
    """
    return '{}-{}-{}-{}'.format(unique_anno, float(frac_max_score),
                                float(min_mut_freq), float(iteration))


# Results file written by the first pass alignment, appended to here:
res_fnam = 'align-opti_res.csv'
# Columns of sample_df that are stored for each run:
col_sele = ['sample_name_unique', 'Mapping_percent', \
            'percent_single_annotation', 'percent_multiple_codons']

combi_counter = 0
# Run all the nested combinations:
for unique_anno in unique_anno_grid:
    for frac_max_score in frac_max_score_grid:
        for min_mut_freq in min_mut_freq_grid:
            # "iteration" must be the innermost nest,
            # because the result depends on the previous run:
            for iteration in range(1, N_iterations+1):
                combi_counter += 1
                combi_key = _combi_key(unique_anno, frac_max_score, min_mut_freq, iteration)
                # Skip combinations already present in the results file,
                # so an interrupted search can be resumed by re-running this cell.
                # The file is re-read every time because it is appended to below.
                # NOTE(review): resuming in the middle of a combination relies on
                # the masked alignment folder still holding the previous iteration
                # of that same combination -- confirm before trusting such results.
                align_res_df = pd.read_csv(res_fnam)
                combi_key_set = {_combi_key(u, f, m, i) for u, f, m, i in zip(align_res_df['unique_anno'], \
                                                                              align_res_df['frac_max_score'], \
                                                                              align_res_df['min_mut_freq'], \
                                                                              align_res_df['iteration'])}
                if combi_key in combi_key_set:
                    continue
                print('Running combi {} of {}'.format(combi_counter, total_combi))

                # At first iteration use first pass alignment,
                # after this use the masked alignments:
                if iteration == 1:
                    dir_dict_iter = dir_dict
                else:
                    dir_dict_iter = dir_dict_masked
                # Perform transcript mutation analysis
                # and write a masked version of the tRNA database:
                TM_obj = TM_analysis(dir_dict_iter, sample_df, tRNA_database, \
                                     common_seqs=common_seqs, overwrite_dir=True, verbose=False)
                TM_obj.find_muts(n_jobs=12, unique_anno=unique_anno, verbose=False)
                TM_obj.mask_tRNA_database(min_mut_freq=min_mut_freq, frac_max_score=frac_max_score, \
                                          min_pos_count=100, min_tr_count=200)
                tRNA_database_masked = TM_obj.write_masked_tRNA_database(out_dir='tRNA_database_masked')

                # Run alignment against the masked database.
                # Results always go to the masked folders, overwriting the previous run:
                align_obj = SWIPE_align(dir_dict_masked, tRNA_database_masked, sample_df, SWIPE_score_mat2, \
                                        gap_penalty=6, extension_penalty=3, min_score_align=MIN_SCORE_ALIGN, \
                                        common_seqs=common_seqs, overwrite_dir=True, verbose=False)
                sample_df = align_obj.run_parallel(n_jobs=12, verbose=False)

                # Collect alignment statistics:
                stats_obj = STATS_collection(dir_dict_masked, tRNA_data, sample_df, \
                                             common_seqs=common_seqs, overwrite_dir=True)
                stats_df = stats_obj.run_parallel(n_jobs=12, verbose=False)

                # Collect alignment stats and append to file.
                # An explicit copy is taken so that adding the parameter
                # columns can never write back into (or warn about) sample_df:
                align_res = sample_df.loc[:, col_sele].copy()
                align_res['unique_anno'] = unique_anno
                align_res['frac_max_score'] = frac_max_score
                align_res['min_mut_freq'] = min_mut_freq
                align_res['iteration'] = iteration
                with open(res_fnam, 'a') as fh_res:
                    align_res.to_csv(fh_res, index=False, header=False)

Running combi 1 of 162
Running combi 2 of 162
Running combi 3 of 162
Running combi 4 of 162
Running combi 5 of 162
Running combi 6 of 162
Running combi 7 of 162
Running combi 8 of 162
Running combi 9 of 162
Running combi 10 of 162
Running combi 11 of 162
Running combi 12 of 162
Running combi 13 of 162
Running combi 14 of 162
Running combi 15 of 162
Running combi 16 of 162
Running combi 17 of 162
Running combi 18 of 162
Running combi 19 of 162
Running combi 20 of 162
Running combi 21 of 162
Running combi 22 of 162
Running combi 23 of 162
Running combi 24 of 162
Running combi 25 of 162
Running combi 26 of 162
Running combi 27 of 162
Running combi 28 of 162
Running combi 29 of 162
Running combi 30 of 162
Running combi 31 of 162
Running combi 32 of 162
Running combi 33 of 162
Running combi 34 of 162
Running combi 35 of 162
Running combi 36 of 162
Running combi 37 of 162
Running combi 38 of 162
Running combi 39 of 162
Running combi 40 of 162
Running combi 41 of 162
Running combi 42 of 162
R