In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os, sys, shutil, bz2, random, resource, warnings
from subprocess import Popen, PIPE, STDOUT
import pandas as pd
pd.set_option('display.max_columns', 50)
import numpy as np
from Bio import SeqIO, bgzf
from Bio.SeqIO.QualityIO import FastqGeneralIterator
from mpire import WorkerPool
import jellyfish

In [3]:
# Remember the notebook folder the first time this cell is executed, so that
# re-running it after a chdir still resolves to the same location:
if 'NBdir' not in globals():
    NBdir = os.getcwd()
print(f'Notebook is in: {NBdir}')
os.chdir(NBdir)  # Return to the notebook dir if the working dir was changed.

# The repo folder is taken to be two levels above the notebook folder.
# Change if necessary.
homedir = '/'.join(NBdir.split('/')[:-2])
print(f'Repo is in: {homedir}')
# Make the repo importable before pulling in the project helpers:
sys.path.insert(1, homedir)
from utils.functions import (
    indices,
    index_to_sample_df,
    downsample_raw_input,
    AR_merge,
    BC_split,
    Kmer_analysis,
    BC_analysis,
)

# Default folder names for data and raw fastq files,
# relative to the folder this notebook lives in:
data_folder = 'data'
seq_folder_noDS = 'raw_fastq' # Not downsampled
seq_folder = 'raw_fastq'

# Folder names used by the subsequent processing steps
# to dump data. Best to not change:
AdapterRemoval_dir = 'AdapterRemoval'
BC_dir = 'BC_split'
UMI_dir = 'UMI_trimmed'
# Paths to the tRNA reference fasta files, keyed by species:
tRNA_database = {
    'human': f'{homedir}/utils/tRNA_database/human/hg38-tRNAs.fa',
    'mouse': f'{homedir}/utils/tRNA_database/mouse/mm10-tRNAs.fa',
}


# The minimum read length is the minimum insert size plus the UMI length
# plus the maximum barcode length:
MIN_INSERT = 15
UMI_LEN = 10
BC_MAX_LEN = 19
MIN_READ_LEN = sum((MIN_INSERT, UMI_LEN, BC_MAX_LEN))
print(f'Using minimum read length: {MIN_READ_LEN} (after merge)')


# Read index information from the spreadsheet shipped in the repo's utils folder:
index_list_fnam = 'index_list.xlsx'
index_df = pd.read_excel(f'{homedir}/utils/{index_list_fnam}')

Notebook is in: /Users/krdav/Google Drive/MCB/Sullivan_lab/tRNA-charge-seq/projects/tRNAseq_third-gen
Repo is in: /Users/krdav/Google Drive/MCB/Sullivan_lab/tRNA-charge-seq
Using minimum read length: 44 (after merge)


### Settings

In [4]:
# Read the sample sheet that sits next to this notebook:
sample_list_fnam = 'sample_list_P1-2-3.xlsx'
sample_df = pd.read_excel('{}/{}'.format(NBdir, sample_list_fnam))
# Add barcode sequences:
sample_df = index_to_sample_df(sample_df, index_df)
# Get filenames from the sample information
# (one row per unique mate1/mate2/index combination):
inp_file_df = sample_df[['fastq_mate1_filename', 'fastq_mate2_filename', 'P5_index', 'P7_index', 'P5_index_seq', 'P7_index_seq']].copy().drop_duplicates().reset_index(drop=True)

# Downsample:
if True:
    # NOTE(review): downsample_fold is not forwarded to downsample_raw_input
    # in this cell; only the absolute read count below is used.
    downsample_fold = 100
    # The setting below is now the single source of truth for the call.
    # Previously the variable said 1e5 while the call hard-coded 1e4; the
    # value that actually ran (1e4) is kept -- confirm which was intended.
    downsample_absolute = 1e4 # Takes precedence, False if fold should be used
    # The input is always the non-downsampled folder (seq_folder_noDS), so
    # re-running this cell does not downsample already downsampled data.
    sample_df, inp_file_df, seq_folder = downsample_raw_input(sample_df, inp_file_df, NBdir, data_folder, seq_folder_noDS, downsample_absolute=downsample_absolute)

In [5]:
# Run AdapterRemoval:
# AR_merge is given the input file table, the folder layout and MIN_READ_LEN.
# Presumably it wraps the AdapterRemoval tool to trim adapters and merge the
# read mates -- TODO confirm in utils.functions.
AR_obj = AR_merge(inp_file_df, NBdir, data_folder, seq_folder, AdapterRemoval_dir, MIN_READ_LEN)
# make_dir returns the value kept in AdapterRemoval_dir_abs; overwrite=True
# presumably replaces output from an earlier run -- verify before relying on it.
AdapterRemoval_dir_abs = AR_obj.make_dir(overwrite=True)
# Serial alternative to the parallel run below (kept for debugging):
# inp_file_df = AR_obj.run_serial()
# Note that inp_file_df is rebound to whatever run_parallel returns.
inp_file_df = AR_obj.run_parallel()

In [6]:
# Split files based on barcodes:
# BC_split is given the sample table, the input file table and the
# AdapterRemoval output folder (AdapterRemoval_dir_abs) together with BC_dir.
BCsplit_obj = BC_split(sample_df, inp_file_df, NBdir, data_folder, AdapterRemoval_dir_abs, BC_dir)
# make_dir returns the value kept in BC_dir_abs; overwrite=True presumably
# replaces output from an earlier run -- verify before relying on it.
BC_dir_abs = BCsplit_obj.make_dir(overwrite=True)
# Serial alternative to the parallel run below (kept for debugging):
# sample_df, inp_file_df = BCsplit_obj.run_serial()
# Note that both sample_df and inp_file_df are rebound to the returned values.
sample_df, inp_file_df = BCsplit_obj.run_parallel()

In [17]:
### Perform Kmer analysis on unmapped reads ###
# Disabled by default; change the guard to True to run it.
# NOTE(review): kmer_obj is only assigned when this guard is True, so it does
# not exist on a fresh kernel while the guard is False.
if False: # barcode analysis is typically sufficient
    kmer_obj = Kmer_analysis(inp_file_df, index_df, BC_dir_abs)
    # Add a filter to avoid Kmers from the end of tRNA sequences:
    kmer_obj.filter_3p_fasta(tRNA_database['human'])
    # Add to this filter the constant region of the adapters:
    kmer_obj.filter_window_BC(filter_window=(0, 11))
    # Search for Kmers:
    all_kmer = kmer_obj.search_unmapped(search_size=13)

In [None]:
### Perform barcode analysis on unmapped reads ###
# Recall that adapters look like this:
# GGCTGCCATGC    GACTA
# GGCTGCCATGCA   AGTGC
# GGCTGCCATGCTG  TCACG
# GGCTGCCATGCAAC CTGAT
# With the barcode as the rightmost 5 nt.
# Search these 5 nt. barcodes by specifying BC_size_3p=5
bc_analysis_obj = BC_analysis(inp_file_df, index_df, BC_dir_abs, BC_size_3p=5)
# Search for barcodes in the unmapped reads:
# NOTE(review): group_dist=1 presumably groups barcodes within an edit
# distance of 1 -- confirm against BC_analysis.search_unmapped.
bc_analysis_df = bc_analysis_obj.search_unmapped(group_dist=1)
# Leave the resulting table as the last expression so the notebook displays it:
bc_analysis_df

In [13]:
# Inspect the Kmer filter built by the optional Kmer analysis.
# kmer_obj only exists when that analysis has been run in this kernel, so
# guard the lookup: on a fresh kernel with the analysis skipped, this cell
# shows nothing instead of raising a NameError.
kmer_obj.filter_dict if 'kmer_obj' in globals() else None

{'GGCTG': 1,
 'GCTGC': 1,
 'CTGCC': 1,
 'TGCCA': 1,
 'GCCAT': 1,
 'CCATG': 1,
 'CATGC': 1,
 'CCCAC': 1,
 'CCACC': 1,
 'CACCA': 1,
 'CTTAC': 1,
 'TTACC': 1,
 'TACCA': 1,
 'CCAAC': 1,
 'CAACC': 1,
 'AACCA': 1,
 'CTAGC': 1,
 'TAGCC': 1,
 'AGCCA': 1,
 'GCTTC': 1,
 'CTTCC': 1,
 'TTCCA': 1,
 'AATAC': 1,
 'ATACC': 1,
 'AGTAC': 1,
 'GTACC': 1,
 'TACCC': 1,
 'ACCCC': 1,
 'CCCCA': 1,
 'TCTAC': 1,
 'CTACC': 1,
 'AACAC': 1,
 'ACACC': 1,
 'GTGAC': 1,
 'TGACC': 1,
 'GACCA': 1,
 'ACTAC': 1,
 'CTGAC': 1,
 'CTCAC': 1,
 'TCACC': 1,
 'TTTGC': 1,
 'TTGCC': 1,
 'GACAC': 1,
 'TCTGC': 1,
 'ACCAC': 1,
 'TCCAC': 1,
 'CTCGC': 1,
 'TCGCC': 1,
 'CGCCA': 1,
 'GTCGC': 1,
 'GGTAC': 1,
 'GGTGC': 1,
 'GTGCC': 1,
 'GTTAC': 1,
 'GTTGC': 1,
 'GATGC': 1,
 'ATGCC': 1,
 'GACGC': 1,
 'ACGCC': 1,
 'GGCGC': 1,
 'GCGCC': 1,
 'AACGC': 1,
 'GGAGC': 1,
 'GAGCC': 1,
 'CCCTC': 1,
 'CCTCC': 1,
 'CTCCA': 1,
 'CCCCC': 1,
 'CCTTC': 1,
 'ACCTC': 1,
 'GGAAC': 1,
 'GAACC': 1,
 'GAAAC': 1,
 'AAACC': 1,
 'TGCAC': 1,
 'GCACC': 1,
 'CGCAC': 1,

In [44]:
# Scratch value for checking negative slicing; not part of the pipeline.
s = 'ABCDE'

In [46]:
# Scratch check: taking the last 5 characters of a 5-character string
# returns the whole string.
s[-5:]

'ABCDE'

In [None]:
%%bash
# Debug helper: print the working directory seen by the shell.
pwd

In [9]:
### Find Kmers for unmapped reads ###
# NOTE(review): add_kmers, k_size, filter_dict, find_min_dist_bc and
# index_dict are not defined in the cells shown here -- confirm they are
# available before running this cell.

#os.chdir(sample_fastq_dir)

# Collect Kmers from the last 13 nt. of every unmapped read that is
# at least as long as the minimum cutoff:
search_size = 13
k_dict = dict()
with bz2.open('no-barcode_untrimmed.fastq.bz2', "rt") as unmapped_fh:
    for title, seq, qual in FastqGeneralIterator(unmapped_fh):
        if len(seq) < MIN_READ_LEN:
            continue  # Too short to be considered
        k_dict = add_kmers(k_dict, seq[-search_size:], k_size, filter_dict)

# Rank Kmers by occurrence (most frequent first) and annotate each with its
# closest barcode, but only when that barcode is less than 2 away:
ranked_kmers = sorted(k_dict.items(), key=lambda kv: kv[1], reverse=True)
kmer_df_dat = list()
for kmer_seq, count in ranked_kmers:
    bc_min_dist, dist_min = find_min_dist_bc(kmer_seq, index_dict)
    barcode_hit = [dist_min, bc_min_dist] if dist_min < 2 else [None, None]
    kmer_df_dat.append([kmer_seq, count] + barcode_hit)
kmer_df = pd.DataFrame(kmer_df_dat, columns=['Kmer', 'Count', 'Barcode distance', 'Barcode'])
kmer_df.to_excel('no-barcode_Kmer-analysis.xlsx')

#os.chdir('..')

In [10]:
### Generate UMI stats and write final trimmed tRNA sequences ###
# Note, the cDNA input amount is so large that it is very unlikely to sequence
# the same PCR amplified DNA twice. Therefore, this processing step does not
# attempt to merge possible UMI duplicates.
#
# NOTE(review): umi_dir, sample_fastq_dir and stats_dir are not defined in the
# cells shown in this notebook (the settings cell defines UMI_dir and BC_dir)
# -- confirm they exist before running this cell on a fresh kernel.

# From: https://stats.stackexchange.com/questions/296005/the-expected-number-of-unique-elements-drawn-with-replacement
# I get the expected number of unique UMIs:
# E_X = n*(1-((n-1) / n)**k)
# Where k = is the number of sequences (draws)
# and n = to the number of possible UMIs (bins)
# The last UMI position must be T or C (2 options), while the remaining
# UMI_LEN - 1 positions are free (4 options each):
n_bins = 4**(UMI_LEN - 1) * 2 # number of UMI bins (n)


# Create a fresh folder for the output files, replacing the one from any
# previous run. An explicit existence check is used instead of a bare
# `except:` so unrelated errors (e.g. permissions) are not masked:
if os.path.isdir(umi_dir):
    shutil.rmtree(umi_dir)
os.mkdir(umi_dir)
os.chdir(umi_dir)

N_umi_obs = list()
N_umi_exp = list()
N_seq_list = list()
try:
    # Dump all the sequences where no UMI was found. The file is opened in a
    # with-block so the compressed stream is flushed and closed properly:
    with bz2.open('no-UMI_untrimmed.fastq.bz2', "wt") as unmapped_fh:
        # Trim UMIs off sequences:
        for index, row in sample_df.iterrows(): # Process each sample individually
            fastq_name = '../{}/{}.fastq.bz2'.format(sample_fastq_dir, row['sample_name_unique'])
            UMIs = set()
            Nseqs = 0
            with bz2.open('{}_UMI-trimmed.fastq.bz2'.format(row['sample_name_unique']), "wt") as output_fh, \
                 bz2.open(fastq_name, "rt") as input_fh:
                for title, seq, qual in FastqGeneralIterator(input_fh):
                    umi = seq[0:UMI_LEN]
                    # UMI sequence requirement: full length and ending in T or C.
                    # The length check also protects against empty/short reads:
                    if len(umi) == UMI_LEN and umi[-1] in ('T', 'C'):
                        UMIs.add(umi)
                        Nseqs += 1
                        # Add UMI sequence to title:
                        title = title + ':' + umi
                        # Write the trimmed sequence (UMI_LEN nt. removed from the 5' end):
                        output_fh.write("@{}\n{}\n+\n{}\n".format(title, seq[UMI_LEN:], qual[UMI_LEN:]))
                    else:
                        # Write the untrimmed sequence if UMI was not found:
                        unmapped_fh.write("@{}\n{}\n+\n{}\n".format(title, seq, qual))
            # Calculate the observed and expected UMI count:
            N_seq_list.append(Nseqs)
            k_draws = Nseqs
            N_umi_obs.append(len(UMIs))
            E_X = n_bins*(1-((n_bins-1) / n_bins)**k_draws)
            N_umi_exp.append(round(E_X))

    # Collect UMI stats:
    sample_df['N_UMI_observed'] = N_umi_obs
    sample_df['N_UMI_expected'] = N_umi_exp
    sample_df['percent_seqs_after_UMI_trim'] = np.array(N_seq_list) / sample_df['N_total'].values * 100
    sample_df['percent_UMI_obs-vs-exp'] = sample_df['N_UMI_observed'].values / sample_df['N_UMI_expected'].values * 100
    sample_df.to_excel('sample_UMI_stats.xlsx')
finally:
    # Always step back out of the UMI folder, also when something failed,
    # so a re-run does not start from the wrong working directory:
    os.chdir('..')

# Move stats files to project folder:
shutil.copy2(umi_dir + '/sample_UMI_stats.xlsx', stats_dir)

'../../projects/tRNAseq_third-gen/read_processing_stats/sample_UMI_stats.xlsx'