In [33]:
%load_ext autoreload
%autoreload 2

import os, sys, shutil, bz2, copy
from pathlib import Path
import pandas as pd
pd.set_option('display.max_columns', 50)
import numpy as np

### Plotting imports ###
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.colors as mcolors
import matplotlib as mpl
from matplotlib.patches import StepPatch
import matplotlib.ticker as ticker
import matplotlib.gridspec as gridspec
import logomaker as lm
# Names of matplotlib's Tableau colors, collected into a plain list:
palette = [color_name for color_name in mcolors.TABLEAU_COLORS]
# Seaborn defaults for every figure in this notebook: "ticks" style with the
# "muted" color palette. The context is set afterwards, as in the original order.
sns.set_theme(palette="muted", style="ticks")
sns.set_context("talk")
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
# Navigate back to NBdir in case of re-running a code block.
# NBdir is only captured the first time this cell runs, so that a later
# chdir does not change what is considered the notebook folder:
if 'NBdir' not in globals():
    NBdir = os.getcwd()
print('Notebook is in: {}'.format(NBdir))
os.chdir(NBdir)  # If you changed the current working dir, this will take you back to the notebook dir.

# Define the path to the repo folder (two levels above the notebook folder).
# Change if necessary.
# os.path.dirname is used instead of splitting on '/' so that the
# platform's own path separator is respected.
homedir = os.path.dirname(os.path.dirname(NBdir))
print('Repo is in: {}'.format(homedir))
# Make the repo importable. The guard keeps sys.path from accumulating
# a duplicate entry every time this cell is re-run:
if homedir not in sys.path:
    sys.path.insert(1, homedir)
from src.misc import index_to_sample_df, downsample_raw_input, read_tRNAdb_info, sample_df_to_dict
from src.read_processing import AR_merge, BC_split, Kmer_analysis, BC_analysis, UMI_trim
from src.alignment import SWIPE_align
from src.stats_collection import STATS_collection
from src.plotting import TRNA_plot
from src.transcript_mutations import TM_analysis

# These are default folder names for data and raw fastq files
# relative to the folder in which this notebook is in:
data_dir = 'data'
seq_dir = 'raw_fastq'
seq_dir_noDS = seq_dir # Not downsampled

# These folder names are used in subsequent processing steps
# to dump data. Best to not change:
AdapterRemoval_dir = 'AdapterRemoval'
BC_dir = 'BC_split'
UMI_dir = 'UMI_trimmed'
align_dir = 'SWalign'
stats_dir = 'stats_collection'
TM_dir = 'transcript_mutations'
plotting_dir = 'plotting'

# Reference tRNA fasta files, keyed by species name:
tRNA_database = dict()
tRNA_database['human'] = '{}/tRNA_database/human/hg38-tRNAs.fa'.format(homedir)
tRNA_database['mouse'] = '{}/tRNA_database/mouse/mm10-tRNAs.fa'.format(homedir)
# Read information (length, codon etc) of tRNAs into dictionary:
tRNA_data = read_tRNAdb_info(tRNA_database)
SWIPE_score_mat = '{}/utils/nuc_score-matrix.txt'.format(homedir)
SWIPE_score_mat2 = '{}/utils/nuc_score-matrix_2.txt'.format(homedir) # For masked reference sequences
# tRNA sequencing yields many duplicated reads.
# Adding these commonly seen sequences to a list prevents duplicated alignment.
# The full list (<homedir>/utils/common-seqs.fasta.bz2) is too big,
# so the top 1k sequences are used instead:
common_seqs = '{}/utils/common-seqs_1k.fasta.bz2'.format(homedir)

# Define minimum read length based on minimum insert size.
# A merged read has to hold the insert, the UMI and the longest barcode:
MIN_INSERT_LEN = 10
UMI_LEN = 10
BC_MAX_LEN = 19
MIN_READ_LEN = MIN_INSERT_LEN + UMI_LEN + BC_MAX_LEN
print('Using minimum read length: {} (after merge)'.format(MIN_READ_LEN))

# The minimum alignment score.
# Better to set relatively low, since additional filtering can
# be applied later.
MIN_SCORE_ALIGN = 15
print('Using minimum alignment score: {}'.format(MIN_SCORE_ALIGN))

# Read index information:
index_list_fnam = 'index_list.xlsx'
index_df = pd.read_excel('{}/utils/{}'.format(homedir, index_list_fnam))

Notebook is in: /Users/krdav/Google Drive/MCB/Sullivan_lab/tRNA-charge-seq/projects/alignment-opti
Repo is in: /Users/krdav/Google Drive/MCB/Sullivan_lab/tRNA-charge-seq
Using minimum read length: 39 (after merge)
Using minimum alignemnt score: 15


### Settings

In [12]:
# Sample sheet to process. The commented-out line is the alternative
# sheet without the "_DS" suffix:
# sample_list_fnam = 'sample_list_alignment-opti.xlsx'
sample_list_fnam = 'sample_list_alignment-opti_DS.xlsx'

# Load the sample sheet from the notebook folder and
# add the barcode sequences from the index table:
sample_df = index_to_sample_df(
    pd.read_excel('{}/{}'.format(NBdir, sample_list_fnam)),
    index_df,
)

# Elementary info (replicate, barcode, species)
# for each unique sample name, as a dictionary:
sample_dict = sample_df_to_dict(sample_df)

# One row per distinct input file pair / index combination,
# taken from the sample information:
inp_file_df = (
    sample_df[[
        'fastq_mate1_filename', 'fastq_mate2_filename',
        'P5_index', 'P7_index',
        'P5_index_seq', 'P7_index_seq',
    ]]
    .copy()
    .drop_duplicates()
    .reset_index(drop=True)
)

# All paths used for data processing, gathered in one dictionary
# that is handed to each processing step:
dir_dict = {
    'NBdir': NBdir,
    'data_dir': data_dir,
    'seq_dir': seq_dir,
    'AdapterRemoval_dir': AdapterRemoval_dir,
    'BC_dir': BC_dir,
    'UMI_dir': UMI_dir,
    'align_dir': align_dir,
    'stats_dir': stats_dir,
    'TM_dir': TM_dir,
    'plotting_dir': plotting_dir,
}

### First pass alignment

In [13]:
# First pass of the processing pipeline. Each step takes the path
# dictionary plus the sample/file tables, and returns updated tables
# that are fed to the next step.
# NOTE(review): sample_df and inp_file_df are overwritten in place, so
# this cell is not idempotent; re-run the settings cell first if needed.

# Run AdapterRemoval:
AR_obj = AR_merge(dir_dict, inp_file_df, MIN_READ_LEN, overwrite_dir=True)
inp_file_df = AR_obj.run_parallel(n_jobs=4)

# Split files based on barcodes:
BCsplit_obj = BC_split(dir_dict, sample_df, inp_file_df, overwrite_dir=True)
sample_df, inp_file_df = BCsplit_obj.run_parallel(n_jobs=4)

# Trim UMI:
UMItrim_obj = UMI_trim(dir_dict, sample_df, overwrite_dir=True)
sample_df = UMItrim_obj.run_parallel(n_jobs=4)

# First pass alignment (unmasked reference, first score matrix):
align_obj = SWIPE_align(dir_dict, tRNA_database, sample_df, SWIPE_score_mat,
                        gap_penalty=6, extension_penalty=1, min_score_align=MIN_SCORE_ALIGN,
                        common_seqs=common_seqs, overwrite_dir=True)
sample_df = align_obj.run_parallel(n_jobs=4, verbose=False)

# Collect alignment statistics:
stats_obj = STATS_collection(dir_dict, tRNA_data, sample_df, common_seqs=common_seqs, overwrite_dir=True)
stats_df = stats_obj.run_parallel(n_jobs=4, verbose=False)

# Write first alignment stats.
# The grid-search columns are left empty (None) for this baseline run, so
# that the file has the same columns as the rows appended later.
# .assign() builds a new frame, so sample_df is never modified through a
# selection of it (no chained-assignment ambiguity).
align_res = sample_df.loc[:, ['sample_name_unique', 'Mapping_percent', 'percent_single_annotation']].assign(
    unique_anno=None,
    frac_max_score=None,
    min_mut_freq=None,
    iteration=None,
)
# Passing the path lets pandas open the file itself (write mode, which
# truncates any previous results) with the newline handling it expects:
align_res.to_csv('align-opti_res.csv', index=False)

Using common sequences to prevent duplicated alignment.
Running Swipe on:  1h_1  32m_1  8m_1  0m_1  40h_1  8h_1  40h_NoOx_1  16h_1  4h_1  common-seqs
Collecting alignment statistics, from sample:  8m_1  0m_1  32m_1  1h_1  40h_1  8h_1  40h_NoOx_1  16h_1  4h_1  common-seqsUsing common sequences...
Collecting stats from:  0m_1  32m_1  8m_1  1h_1  40h_1  8h_1  40h_NoOx_1  16h_1  4h_1

In [34]:
# Path settings for the runs against the masked reference: an independent
# copy of dir_dict in which only the alignment and stats folder names differ.
dir_dict_masked = copy.deepcopy(dir_dict)
dir_dict_masked.update(align_dir='SWalign_masked',
                       stats_dir='stats_collection_masked')

In [35]:
# Combination of variables to test:
min_mut_freq_grid = [0.4, 0.6, 0.7, 0.75, 0.8, \
                     0.83, 0.86, 0.89, 0.95]
frac_max_score_grid = [0.90, 0.95, 1]
unique_anno_grid = [True, False]
N_iterations = 3
total_combi = len(min_mut_freq_grid) * len(frac_max_score_grid) * len(unique_anno_grid) * N_iterations

In [37]:
# Grid search over the masking parameters. For every parameter combination
# the reference is masked based on observed mutations, the reads are
# re-aligned against the masked reference and the resulting alignment
# stats are appended to 'align-opti_res.csv'.
# NOTE(review): rows are appended, so re-running this cell (e.g. after an
# interrupt) adds duplicate rows unless the first pass cell is re-run
# first to recreate the file.
# NOTE(review): sample_df is overwritten by every alignment and carried
# over into the next parameter combination, including its first
# iteration - confirm that TM_analysis does not depend on
# alignment-specific columns of sample_df.
combi_counter = 0
# Columns of sample_df reported for every run (loop-invariant):
col_sele = ['sample_name_unique', 'Mapping_percent', 'percent_single_annotation']
# Run all the nested combinations:
for unique_anno in unique_anno_grid:
    for frac_max_score in frac_max_score_grid:
        for min_mut_freq in min_mut_freq_grid:
            # "iteration" must be the innermost nest,
            # because the result depends on the previous run:
            for iteration in range(1, N_iterations+1):
                combi_counter += 1
                print('Running combi {} of {}'.format(combi_counter, total_combi))

                # At first iteration use first pass alignment,
                # after this use the masked alignments:
                dir_dict_iter = dir_dict if iteration == 1 else dir_dict_masked
                # Perform transcript mutation analysis:
                TM_obj = TM_analysis(dir_dict_iter, sample_df, tRNA_database,
                                     common_seqs=common_seqs, overwrite_dir=True, verbose=False)
                TM_obj.find_muts(n_jobs=4, unique_anno=unique_anno, verbose=False)
                TM_obj.mask_tRNA_database(min_mut_freq=min_mut_freq, frac_max_score=frac_max_score,
                                          min_pos_count=100, min_tr_count=200)
                tRNA_database_masked = TM_obj.write_masked_tRNA_database(out_dir='tRNA_database_masked')

                # Run alignment against the masked reference. Output always goes
                # to the masked folders, so the first pass results are kept:
                align_obj = SWIPE_align(dir_dict_masked, tRNA_database_masked, sample_df, SWIPE_score_mat2,
                                        gap_penalty=6, extension_penalty=3, min_score_align=MIN_SCORE_ALIGN,
                                        common_seqs=common_seqs, overwrite_dir=True, verbose=False)
                sample_df = align_obj.run_parallel(n_jobs=4, verbose=False)

                # Collect alignment statistics:
                stats_obj = STATS_collection(dir_dict_masked, tRNA_data, sample_df,
                                             common_seqs=common_seqs, overwrite_dir=True)
                stats_df = stats_obj.run_parallel(n_jobs=4, verbose=False)

                # Collect alignment stats and append to file.
                # .assign() returns a new frame, so the parameter columns are
                # never written through a selection of sample_df:
                align_res = sample_df.loc[:, col_sele].assign(
                    unique_anno=unique_anno,
                    frac_max_score=frac_max_score,
                    min_mut_freq=min_mut_freq,
                    iteration=iteration,
                )
                # No header: the header was written by the first pass cell.
                align_res.to_csv('align-opti_res.csv', mode='a', index=False, header=False)

Running combi 1 of 162
Using common sequences...
Using common sequences to prevent duplicated alignment.


KeyboardInterrupt: 