In [1]:
# Reload edited modules automatically, so changes to the
# project source are picked up without restarting the kernel:
%load_ext autoreload
%autoreload 2

import os, sys, shutil, bz2, copy, warnings
import pandas as pd
# Show up to 50 columns when displaying wide dataframes:
pd.set_option('display.max_columns', 50)
# Numpy will complain about subnormals if python
# is compiled with the -ffast-math compiler flag:
# https://github.com/clearlinux/distribution/issues/2809
# The filter below only matches that one message and, because it is
# installed inside catch_warnings(), it is removed again after the import:
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", message="The value of the smallest subnormal for")
    import numpy as np

In [2]:
# Remember the notebook folder the first time this cell runs, so that
# re-running the cell after a later os.chdir() still resolves the same paths:
if 'NBdir' not in globals():
    NBdir = os.getcwd()
print('Notebook is in: {}'.format(NBdir))
os.chdir(NBdir)  # If you changed the current working dir, this will take you back to the notebook dir.

# Define the path to the repo folder (two levels above the notebook folder).
# Change if necessary.
homedir = os.path.dirname(os.path.dirname(NBdir))
print('Repo is in: {}'.format(homedir))
# Make the repo importable. The guard avoids stacking up
# duplicate sys.path entries when this cell is re-run:
if homedir not in sys.path:
    sys.path.insert(1, homedir)
from src.misc import index_to_sample_df, read_tRNAdb_info, sample_df_to_dict
from src.read_processing import AR_merge, BC_split, UMI_trim
from src.alignment import SWIPE_align
from src.stats_collection import STATS_collection

# These are default folder names for data and raw fastq files
# relative to the folder in which this notebook is located:
data_dir = 'data'
seq_dir = 'raw_fastq'

# These folder names are used in subsequent processing steps
# to dump data. Best to not change:
AdapterRemoval_dir = 'AdapterRemoval'
BC_dir = 'BC_split'
UMI_dir = 'UMI_trimmed'
align_dir = 'SWalign'
stats_dir = 'stats_collection'

# Reference tRNA sequences, keyed by species name.
# One dictionary for the unmasked and one for the masked reference:
tRNA_database = dict()
tRNA_database['human'] = os.path.join(homedir, 'tRNA_database', 'human', 'hg38-tRNAs.fa')
tRNA_database_masked = dict()
tRNA_database_masked['human'] = os.path.join(homedir, 'tRNA_database_masked', 'human', 'human-tRNAs.fa')
# Read information (length, codon etc) of tRNAs into dictionary.
# Note that this is read from the unmasked database:
tRNA_data = read_tRNAdb_info(tRNA_database)
SWIPE_score_mat = os.path.join(homedir, 'utils', 'nuc_score-matrix.txt')
SWIPE_score_mat2 = os.path.join(homedir, 'utils', 'nuc_score-matrix_2.txt') # For masked reference sequences
# tRNA sequencing yields many duplicated reads.
# Adding these commonly seen sequences to a list prevents duplicated alignment:
common_seqs = os.path.join(homedir, 'utils', 'common-seqs.fasta.bz2')

# Define minimum read length based on minimum insert size.
# The merged read must hold the insert plus the UMI and the longest barcode:
MIN_INSERT_LEN = 10
UMI_LEN = 10
BC_MAX_LEN = 19
MIN_READ_LEN = MIN_INSERT_LEN + UMI_LEN + BC_MAX_LEN
print('Using minimum read length: {} (after merge)'.format(MIN_READ_LEN))

# Read index information (shared across projects, stored in the repo):
index_list_fnam = 'index_list.xlsx'
index_df = pd.read_excel(os.path.join(homedir, 'utils', index_list_fnam))

# Read sample list (specific to this project, stored next to the notebook):
sample_list_fnam = 'sample_list.xlsx'
sample_df = pd.read_excel(os.path.join(NBdir, sample_list_fnam))
# Add barcode sequences:
sample_df = index_to_sample_df(sample_df, index_df)
# Read elementary info (replicate, barcode, species)
# for each unique sample name into a dictionary:
sample_dict = sample_df_to_dict(sample_df)
# Get filenames from the sample information.
# Several samples can share one pair of fastq files,
# so keep only one row per unique file/index combination:
inp_file_cols = ['fastq_mate1_filename', 'fastq_mate2_filename',
                 'P5_index', 'P7_index',
                 'P5_index_seq', 'P7_index_seq']
inp_file_df = sample_df[inp_file_cols].copy().drop_duplicates().reset_index(drop=True)
# Make a dictionary with paths used for data processing:
dir_dict = dict(NBdir = NBdir,
                data_dir = data_dir,
                seq_dir = seq_dir,
                AdapterRemoval_dir = AdapterRemoval_dir,
                BC_dir = BC_dir,
                UMI_dir = UMI_dir,
                align_dir = align_dir,
                stats_dir = stats_dir)

Notebook is in: /home/sulab/tRNA-charge-seq/projects/masked-align_analysis
Repo is in: /home/sulab/tRNA-charge-seq
Using minimum read length: 39 (after merge)


# Alignment effect on reads mapping to transcripts/codons
Process a set of samples covering all adapter barcodes with/without using a masked reference for alignment.
Gather alignment statistics from each case with the purpose of showing the effect of the alignment method on the mapping to transcripts/codons.

In [3]:
# Read processing in three consecutive steps. Each step hands back
# updated dataframe(s) which are fed into the next step.

# Step 1: run AdapterRemoval on the input files.
AR_obj = AR_merge(
    dir_dict,
    inp_file_df,
    MIN_READ_LEN,
    overwrite_dir=False,
    check_input=False,
    AR_threads=12,
)
inp_file_df = AR_obj.run_parallel(n_jobs=4, overwrite=False)

# Step 2: split files based on barcodes.
BCsplit_obj = BC_split(
    dir_dict,
    sample_df,
    inp_file_df,
    overwrite_dir=False,
)
sample_df, inp_file_df = BCsplit_obj.run_parallel(n_jobs=9, load_previous=True)

# Step 3: trim UMIs.
UMItrim_obj = UMI_trim(
    dir_dict,
    sample_df,
    overwrite_dir=False,
)
sample_df = UMItrim_obj.run_parallel(n_jobs=9, load_previous=True)

# Preview the per-sample statistics gathered so far:
sample_df.head(3)

Using existing folder because overwrite set to false: /home/sulab/tRNA-charge-seq/projects/masked-align_analysis/data/AdapterRemoval
Downsampling UMI trimmed sequences to maximum 2000000 reads.


Unnamed: 0,sample_name_unique,sample_name,replicate,fastq_mate1_filename,fastq_mate2_filename,P5_index,P7_index,barcode,species,plot_group,hue_name,hue_value,hue_order,P5_index_seq,P7_index_seq,barcode_seq,N_total,N_CC,N_CCA,N_CCA+CC,CCA+CC_percent_total,percent_CCA,N_after_trim,N_UMI_observed,N_UMI_expected,percent_seqs_after_UMI_trim,percent_UMI_obs-vs-exp,N_after_downsample
0,8h_p1,8h_p1,1,2023-02-28/P7_R1.fastq.bz2,2023-02-28/P7_R2.fastq.bz2,D503,D701,l1Sp,human,Barcode test,Barcode,l1Sp,1,AGGATAGG,ATTACTCG,GGCTGCCATGCGACTA,3789831,1854972,1889882,3744854,98.813219,50.466106,3701708,502544,523837.957508,97.674751,95.935011,2000000
1,8h_p2,8h_p2,2,2023-02-28/P7_R1.fastq.bz2,2023-02-28/P7_R2.fastq.bz2,D503,D701,l2Sp,human,Barcode test,Barcode,l2Sp,2,AGGATAGG,ATTACTCG,GGCTGCCATGCTGTCACG,3330318,1694269,1594075,3288344,98.73964,48.476528,3254876,491336,523232.671659,97.734691,93.903922,2000000
2,8h_p3,8h_p3,3,2023-02-28/P7_R1.fastq.bz2,2023-02-28/P7_R2.fastq.bz2,D503,D701,l3Sp,human,Barcode test,Barcode,l3Sp,3,AGGATAGG,ATTACTCG,GGCTGCCATGCTGCGA,3623618,1789206,1790632,3579838,98.791815,50.019917,3537475,497838,523672.403392,97.622735,95.066686,2000000


In [4]:
# Align reads to unmasked reference tRNAs.
# NOTE(review): SWIPE_score_mat2 is described in the configuration cell as
# being "For masked reference sequences", yet it is used here with the
# unmasked reference. Presumably this keeps the scoring identical between
# the two runs being compared - confirm that this is intended.
align_obj = SWIPE_align(
    dir_dict,
    tRNA_database,
    sample_df,
    SWIPE_score_mat2,
    gap_penalty=6,
    extension_penalty=3,
    min_score_align=15,
    common_seqs=common_seqs,
    overwrite_dir=True,
)
sample_df = align_obj.run_parallel(n_jobs=12, overwrite=False, load_previous=False)

# Collect alignment statistics:
stats_obj = STATS_collection(
    dir_dict,
    tRNA_data,
    sample_df,
    common_seqs=common_seqs,
    overwrite_dir=True,
)
stats_df = stats_obj.run_parallel(n_jobs=12, load_previous=False)

# Preview the sample table returned by the alignment step:
sample_df.head(3)

Using common sequences to prevent duplicated alignment.
Running Swipe on:  common-seqs  8h_p6  8h_p1  8h_p4  8h_p2  8h_p7  8h_p9  8h_p3  8h_p5  8h_p8
Collecting alignment statistics, from sample:  common-seqs  8h_p3  8h_p7  8h_p2  8h_p1  8h_p5  8h_p6  8h_p8  8h_p9  8h_p4Using common sequences...
Collecting stats from:  8h_p1  8h_p2  8h_p3  8h_p6  8h_p7  8h_p8  8h_p9  8h_p4  8h_p5

Unnamed: 0,sample_name_unique,sample_name,replicate,fastq_mate1_filename,fastq_mate2_filename,P5_index,P7_index,barcode,species,plot_group,hue_name,hue_value,hue_order,P5_index_seq,P7_index_seq,barcode_seq,N_total,N_CC,N_CCA,N_CCA+CC,CCA+CC_percent_total,percent_CCA,N_after_trim,N_UMI_observed,N_UMI_expected,percent_seqs_after_UMI_trim,percent_UMI_obs-vs-exp,N_after_downsample,N_mapped,percent_single_annotation,percent_multiple_annotation,percent_multiple_codons,Mapping_percent
0,8h_p1,8h_p1,1,2023-02-28/P7_R1.fastq.bz2,2023-02-28/P7_R2.fastq.bz2,D503,D701,l1Sp,human,Barcode test,Barcode,l1Sp,1,AGGATAGG,ATTACTCG,GGCTGCCATGCGACTA,3789831,1854972,1889882,3744854,98.813219,50.466106,3701708,502544,523837.957508,97.674751,95.935011,2000000,1976609.0,68.947728,31.052272,7.087188,98.83045
1,8h_p2,8h_p2,2,2023-02-28/P7_R1.fastq.bz2,2023-02-28/P7_R2.fastq.bz2,D503,D701,l2Sp,human,Barcode test,Barcode,l2Sp,2,AGGATAGG,ATTACTCG,GGCTGCCATGCTGTCACG,3330318,1694269,1594075,3288344,98.73964,48.476528,3254876,491336,523232.671659,97.734691,93.903922,2000000,1969357.0,68.129648,31.870352,7.179298,98.46785
2,8h_p3,8h_p3,3,2023-02-28/P7_R1.fastq.bz2,2023-02-28/P7_R2.fastq.bz2,D503,D701,l3Sp,human,Barcode test,Barcode,l3Sp,3,AGGATAGG,ATTACTCG,GGCTGCCATGCTGCGA,3623618,1789206,1790632,3579838,98.791815,50.019917,3537475,497838,523672.403392,97.622735,95.066686,2000000,1976978.0,68.889942,31.110058,7.191076,98.8489


In [5]:
# Make new folders for the masked alignment/stats.
# Work on a copy so the folder names used for the
# unmasked run in dir_dict are left untouched:
dir_dict_masked = copy.deepcopy(dir_dict)
dir_dict_masked.update(
    align_dir='SWalign_masked',
    stats_dir='stats_collection_masked',
)

# Align reads to masked reference tRNAs.
# NOTE(review): sample_df, align_obj, stats_obj and stats_df are re-bound
# in this cell, so the values from the unmasked run are no longer
# available under these names after the cell has been executed.
align_obj = SWIPE_align(
    dir_dict_masked,
    tRNA_database_masked,
    sample_df,
    SWIPE_score_mat2,
    gap_penalty=6,
    extension_penalty=3,
    min_score_align=15,
    common_seqs=common_seqs,
    overwrite_dir=True,
)
sample_df = align_obj.run_parallel(n_jobs=12, overwrite=False, load_previous=False)

# Collect alignment statistics:
stats_obj = STATS_collection(
    dir_dict_masked,
    tRNA_data,
    sample_df,
    common_seqs=common_seqs,
    overwrite_dir=True,
)
stats_df = stats_obj.run_parallel(n_jobs=12, load_previous=False)

# Preview the sample table returned by the masked alignment step:
sample_df.head(3)

Using common sequences to prevent duplicated alignment.
Running Swipe on:  common-seqs  8h_p9  8h_p5  8h_p7  8h_p3  8h_p4  8h_p1  8h_p2  8h_p6  8h_p8
Collecting alignment statistics, from sample:  common-seqs  8h_p9  8h_p7  8h_p3  8h_p1  8h_p2  8h_p6  8h_p8  8h_p4  8h_p5Using common sequences...
Collecting stats from:  8h_p1  8h_p3  8h_p2  8h_p6  8h_p7  8h_p8  8h_p9  8h_p4  8h_p5

Unnamed: 0,sample_name_unique,sample_name,replicate,fastq_mate1_filename,fastq_mate2_filename,P5_index,P7_index,barcode,species,plot_group,hue_name,hue_value,hue_order,P5_index_seq,P7_index_seq,barcode_seq,N_total,N_CC,N_CCA,N_CCA+CC,CCA+CC_percent_total,percent_CCA,N_after_trim,N_UMI_observed,N_UMI_expected,percent_seqs_after_UMI_trim,percent_UMI_obs-vs-exp,N_after_downsample,N_mapped,percent_single_annotation,percent_multiple_annotation,percent_multiple_codons,Mapping_percent
0,8h_p1,8h_p1,1,2023-02-28/P7_R1.fastq.bz2,2023-02-28/P7_R2.fastq.bz2,D503,D701,l1Sp,human,Barcode test,Barcode,l1Sp,1,AGGATAGG,ATTACTCG,GGCTGCCATGCGACTA,3789831,1854972,1889882,3744854,98.813219,50.466106,3701708,502544,523837.957508,97.674751,95.935011,2000000,1978910.0,75.396102,24.603898,2.779358,98.9455
1,8h_p2,8h_p2,2,2023-02-28/P7_R1.fastq.bz2,2023-02-28/P7_R2.fastq.bz2,D503,D701,l2Sp,human,Barcode test,Barcode,l2Sp,2,AGGATAGG,ATTACTCG,GGCTGCCATGCTGTCACG,3330318,1694269,1594075,3288344,98.73964,48.476528,3254876,491336,523232.671659,97.734691,93.903922,2000000,1972529.0,74.093917,25.906083,2.865864,98.62645
2,8h_p3,8h_p3,3,2023-02-28/P7_R1.fastq.bz2,2023-02-28/P7_R2.fastq.bz2,D503,D701,l3Sp,human,Barcode test,Barcode,l3Sp,3,AGGATAGG,ATTACTCG,GGCTGCCATGCTGCGA,3623618,1789206,1790632,3579838,98.791815,50.019917,3537475,497838,523672.403392,97.622735,95.066686,2000000,1979241.0,75.18281,24.81719,2.826336,98.96205
