In [1]:
# Enable the IPython autoreload extension in mode 2: imported modules are
# reloaded before each cell is executed, so edits to the imported helper
# code take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os, sys, shutil, bz2, random, resource, warnings
from subprocess import Popen, PIPE, STDOUT
import pandas as pd
pd.set_option('display.max_columns', 50)
import numpy as np
from Bio import SeqIO, bgzf
from Bio.SeqIO.QualityIO import FastqGeneralIterator
from mpire import WorkerPool
import jellyfish

In [3]:
# Remember the notebook dir the first time this cell runs, so that re-running
# the cell after the working dir was changed still navigates back to it:
if 'NBdir' not in globals():
    NBdir = os.getcwd()
print('Notebook is in: {}'.format(NBdir))
os.chdir(NBdir)  # If you changed the current working dir, this will take you back to the notebook dir.

# Define the path to the repo folder, i.e. two levels above the notebook dir.
# Change if necessary.
# os.path is used instead of splitting on '/' so the result does not depend
# on the path separator and never collapses to an empty string.
homedir = os.path.dirname(os.path.dirname(NBdir))
print('Repo is in: {}'.format(homedir))
# Make the repo importable. The insert is skipped on re-runs of this cell
# so that sys.path does not accumulate duplicate entries:
if homedir not in sys.path:
    sys.path.insert(1, homedir)
from utils.functions import (
    indices, index_to_sample_df, downsample_raw_input, AR_merge,
    BC_split, Kmer_analysis, BC_analysis, UMI_trim,
)

# Default folder names, relative to the folder holding this notebook,
# for the data and for the raw fastq files:
data_folder = 'data'
seq_folder_noDS = 'raw_fastq'  # Not downsampled
# Starts out identical to the non-downsampled folder; it is reassigned
# further down when the raw input is downsampled.
seq_folder = seq_folder_noDS

# Folder names used by the subsequent processing steps
# to dump their data. Best to not change:
AdapterRemoval_dir, BC_dir, UMI_dir = 'AdapterRemoval', 'BC_split', 'UMI_trimmed'
# Per-species path to the tRNA fasta file located inside the repo folder:
tRNA_database = {
    'human': '{}/utils/tRNA_database/human/hg38-tRNAs.fa'.format(homedir),
    'mouse': '{}/utils/tRNA_database/mouse/mm10-tRNAs.fa'.format(homedir),
}


# The minimum read length (after merging) is the sum of three parts:
# the minimum insert size, the UMI length and the maximum barcode length.
MIN_INSERT = 15
UMI_LEN = 10
BC_MAX_LEN = 19
MIN_READ_LEN = sum((MIN_INSERT, UMI_LEN, BC_MAX_LEN))
print('Using minimum read length: {} (after merge)'.format(MIN_READ_LEN))


# Load the index table from the Excel sheet stored in the
# utils folder of the repo:
index_list_fnam = 'index_list.xlsx'
index_df = pd.read_excel(
    '{}/utils/{}'.format(homedir, index_list_fnam)
)

Notebook is in: /Users/krdav/Google Drive/MCB/Sullivan_lab/tRNA-charge-seq/projects/tRNAseq_third-gen
Repo is in: /Users/krdav/Google Drive/MCB/Sullivan_lab/tRNA-charge-seq
Using minimum read length: 44 (after merge)


### Settings

In [4]:
# Downsampling settings. Set DOWNSAMPLE to False to process the full,
# non-downsampled input in seq_folder instead.
DOWNSAMPLE = True
# Passed on as downsample_absolute; presumably the absolute number of reads
# to keep per input file -- confirm against downsample_raw_input.
DOWNSAMPLE_ABSOLUTE = 1e4

# Read the sample sheet located next to this notebook:
sample_list_fnam = 'sample_list_P1-2-3.xlsx'
sample_df = pd.read_excel('{}/{}'.format(NBdir, sample_list_fnam))
# Add barcode sequences:
sample_df = index_to_sample_df(sample_df, index_df)
# Get filenames from the sample information, keeping one row per unique
# combination of input files and indices. Both the column selection and
# drop_duplicates() return new frames, so no explicit copy is needed:
inp_file_cols = ['fastq_mate1_filename', 'fastq_mate2_filename',
                 'P5_index', 'P7_index', 'P5_index_seq', 'P7_index_seq']
inp_file_df = sample_df[inp_file_cols].drop_duplicates().reset_index(drop=True)

# Downsample; this replaces both tables and points seq_folder
# at the value returned by downsample_raw_input:
if DOWNSAMPLE:
    sample_df, inp_file_df, seq_folder = downsample_raw_input(
        sample_df, inp_file_df, NBdir, data_folder, seq_folder_noDS,
        downsample_absolute=DOWNSAMPLE_ABSOLUTE)

In [5]:
# Run AdapterRemoval:
# The run is configured from the input file table, the notebook/data/sequence
# folder names, the output folder name and the minimum read length.
AR_obj = AR_merge(inp_file_df, NBdir, data_folder, seq_folder, AdapterRemoval_dir, MIN_READ_LEN)
# The value returned by make_dir is kept because the barcode split step
# takes it as input. NOTE(review): overwrite=True presumably replaces an
# existing output folder from a previous run -- confirm in utils.functions.
AdapterRemoval_dir_abs = AR_obj.make_dir(overwrite=True)
# Serial alternative to the parallel run below:
# inp_file_df = AR_obj.run_serial()
# The input file table is replaced by the one returned from the run:
inp_file_df = AR_obj.run_parallel()

In [8]:
# Split files based on barcodes:
# Takes the sample table, the input file table and the folder returned by
# the AdapterRemoval step as input.
BCsplit_obj = BC_split(sample_df, inp_file_df, NBdir, data_folder, AdapterRemoval_dir_abs, BC_dir)
# The value returned by make_dir is kept because the downstream analysis
# and UMI trimming steps take it as input. NOTE(review): overwrite=True
# presumably replaces an existing output folder -- confirm in utils.functions.
BC_dir_abs = BCsplit_obj.make_dir(overwrite=True)
# Serial alternative to the parallel run below:
# sample_df, inp_file_df = BCsplit_obj.run_serial()
# Both tables are replaced by the ones returned from the run:
sample_df, inp_file_df = BCsplit_obj.run_parallel()

In [10]:
### Perform Kmer analysis on unmapped reads ###
# Optional step, switched off by default because the barcode analysis
# is typically sufficient. Set the flag to True to run it.
RUN_KMER_ANALYSIS = False
if RUN_KMER_ANALYSIS:
    kmer_obj = Kmer_analysis(inp_file_df, index_df, BC_dir_abs)
    # Add a filter to avoid Kmers from the end of tRNA sequences:
    kmer_obj.filter_3p_fasta(tRNA_database['human'])
    # Add to this filter the constant region of the adapters:
    kmer_obj.filter_window_BC(filter_window=(0, 11))
    # Search for Kmers:
    all_kmer = kmer_obj.search_unmapped(search_size=13)

In [11]:
### Perform barcode analysis on unmapped reads ###
# Recall that adapters look like this:
# GGCTGCCATGC    GACTA
# GGCTGCCATGCA   AGTGC
# GGCTGCCATGCTG  TCACG
# GGCTGCCATGCAAC CTGAT
# With the barcode as the rightmost 5 nt.
# Search these 5 nt. barcodes by specifying BC_size_3p=5
# The analysis takes the input file table, the index table and the folder
# returned by the barcode split step as input.
bc_analysis_obj = BC_analysis(inp_file_df, index_df, BC_dir_abs, BC_size_3p=5)
# Search for barcodes in the unmapped reads.
# For the summary output, filter by a max distance
# to any barcode of 1 (group_dist=1), then group by barcode name:
bc_analysis_df = bc_analysis_obj.search_unmapped(group_dist=1)
# Bare last expression, so the summary table is rendered as the cell output:
bc_analysis_df

Unnamed: 0,Name,Count
0,l5Sp,168
1,l12Sp,107
2,l4Sp,67
3,l10Sp,52
4,l6Sp,50
5,l9Sp,48
6,l3Sp,43
7,l8Sp,41
8,l1Sp,38
9,l2Sp,34


In [12]:
### Generate UMI stats and write final trimmed tRNA sequences ###
# Note, the cDNA input amount is so large that it is very unlikely to sequence
# the same PCR amplified DNA twice. Therefore, this processing step does not
# attempt to merge possible UMI duplicates.
# Takes the sample table and the folder returned by the barcode split step
# as input.
UMItrim_obj = UMI_trim(sample_df, NBdir, data_folder, BC_dir_abs, UMI_dir)
# NOTE(review): overwrite=True presumably replaces an existing output folder
# from a previous run -- confirm in utils.functions.
UMI_dir_abs = UMItrim_obj.make_dir(overwrite=True)
# Serial alternative to the parallel run below:
# sample_df = UMItrim_obj.run_serial()
# The sample table is replaced by the one returned from the run:
sample_df = UMItrim_obj.run_parallel()
# Bare last expression, so the updated sample table is rendered as the cell output:
sample_df

Unnamed: 0,sample_name_unique,sample_name,replicate,P5_index,P7_index,barcode,species,plot_group,hue_name,hue_value,hue_order,P5_index_seq,P7_index_seq,barcode_seq,fastq_mate1_filename,fastq_mate2_filename,N_total,N_CC,N_CCA,N_CCA+CC,CCA+CC_percent_total,percent_CCA,N_UMI_observed,N_UMI_expected,percent_seqs_after_UMI_trim,percent_UMI_obs-vs-exp
0,100p1,100p,1,D501,D701,l1Sp,human,Charge-titration,Percent charge,100p,1,AGGCTATA,ATTACTCG,GGCTGCCATGCGACTA,P1_R1_DSA-10k.fastq.bz2,P1_R2_DSA-10k.fastq.bz2,966,38,925,963,99.689441,96.053998,885,950.138923,98.447205,93.144274
1,100p2,100p,2,D501,D702,l2Sp,human,Charge-titration,Percent charge,100p,1,AGGCTATA,TCCGGAGA,GGCTGCCATGCAAGTGC,P2_R1_DSA-10k.fastq.bz2,P2_R2_DSA-10k.fastq.bz2,680,14,656,670,98.529412,97.910448,630,670.571439,98.676471,93.949722
2,100p3,100p,3,D501,D703,l3Sp,human,Charge-titration,Percent charge,100p,1,AGGCTATA,CGCTCATT,GGCTGCCATGCTGTCACG,P3_R1_DSA-10k.fastq.bz2,P3_R2_DSA-10k.fastq.bz2,723,35,679,714,98.755187,95.098039,645,705.52554,97.648686,91.421212
3,100p4,100p,4,D501,D701,l4Sp,human,Charge-titration,Percent charge,100p,1,AGGCTATA,ATTACTCG,GGCTGCCATGCAACCTGAT,P1_R1_DSA-10k.fastq.bz2,P1_R2_DSA-10k.fastq.bz2,615,63,547,610,99.186992,89.672131,558,599.65738,97.560976,93.053136
4,85p1,85p,1,D501,D702,l5Sp,human,Charge-titration,Percent charge,85p,2,AGGCTATA,TCCGGAGA,GGCTGCCATGCTGCGA,P2_R1_DSA-10k.fastq.bz2,P2_R2_DSA-10k.fastq.bz2,752,198,545,743,98.803191,73.351279,709,742.474481,98.803191,95.491497
5,85p2,85p,2,D501,D703,l6Sp,human,Charge-titration,Percent charge,85p,2,AGGCTATA,CGCTCATT,GGCTGCCATGCAAGCTG,P3_R1_DSA-10k.fastq.bz2,P3_R2_DSA-10k.fastq.bz2,739,172,556,728,98.511502,76.373626,667,728.494107,98.64682,91.558736
6,85p3,85p,3,D501,D701,l7Sp,human,Charge-titration,Percent charge,85p,2,AGGCTATA,ATTACTCG,GGCTGCCATGCTGGTGAC,P1_R1_DSA-10k.fastq.bz2,P1_R2_DSA-10k.fastq.bz2,199,37,162,199,100.0,81.407035,175,194.963927,97.98995,89.760194
7,85p4,85p,4,D501,D702,l8Sp,human,Charge-titration,Percent charge,85p,2,AGGCTATA,TCCGGAGA,GGCTGCCATGCAACGCATC,P2_R1_DSA-10k.fastq.bz2,P2_R2_DSA-10k.fastq.bz2,771,177,592,769,99.740597,76.983095,720,761.447249,98.832685,94.55678
8,70p1,70p,1,D501,D703,l9Sp,human,Charge-titration,Percent charge,70p,3,AGGCTATA,CGCTCATT,GGCTGCCATGCTACAG,P3_R1_DSA-10k.fastq.bz2,P3_R2_DSA-10k.fastq.bz2,905,399,498,897,99.116022,55.518395,816,891.242475,98.563536,91.557575
9,70p2,70p,2,D501,D701,l10Sp,human,Charge-titration,Percent charge,70p,3,AGGCTATA,ATTACTCG,GGCTGCCATGCACATGA,P1_R1_DSA-10k.fastq.bz2,P1_R2_DSA-10k.fastq.bz2,1914,1190,702,1892,98.850575,37.103594,1743,1884.606457,98.641588,92.486152
