In [15]:
import os, sys, shutil, bz2
from subprocess import Popen, PIPE, STDOUT
import pandas as pd
import numpy as np
from Bio import SeqIO, bgzf
from Bio.SeqIO.QualityIO import FastqGeneralIterator

# Navigate back to workbookDir in case of re-running a code block:
if not 'workbookDir' in globals():
    workbookDir = os.getcwd()
print('workbookDir: ' + workbookDir)
os.chdir(workbookDir)  # If you changed the current working dir, this will take you back to the workbook dir.

workbookDir: /Users/krdav/Google Drive/MCB/Sullivan_lab/tRNA-charge-seq/1-fastq_processing


### Requirements
gggg
ssss

In [None]:
##

### Need a method to validate adapter barcode uniqueness

##

In [16]:
# These folder/files change depending on dataset:
#data_folder = 'data/pilot_exp'
#project_folder = 'projects/pilot_exp'
data_folder = 'data/pilot_exp_v3'
project_folder = 'projects/pilot_exp_v3'
seq_folder = 'raw_fastq'
index_list = 'index_list.xlsx'
sample_list = 'sample_list.xlsx'

# Define minimum read length based on minimum insert size:
MIN_INSERT = 15
UMI_LEN = 10
BC_MAX_LEN = 12
MIN_READ_LEN = MIN_INSERT + UMI_LEN + BC_MAX_LEN

In [17]:
# These folder names are used in subsequent steps, so do not change:
AdapterRemoval_dir = 'AdapterRemoval'
sample_fastq_dir = 'processed_fastq'
umi_dir = 'UMI_trimmed'

In [18]:
# Read sample/index information:
index_df = pd.read_excel(index_list)
sample_df = pd.read_excel('../' + project_folder + '/' + sample_list)

# Make folder structure for data and stats:
os.chdir('../' + data_folder)
stats_dir = '../../' + project_folder + '/read_processing_stats'
try:
    os.mkdir(stats_dir)
except:
    shutil.rmtree(stats_dir)
    os.mkdir(stats_dir)

In [19]:
# Read index sequences into dict:
index_dict = dict()
for t, i, s in zip(index_df['type'].values, index_df['id'].values, index_df['sequence'].values):
    if t not in index_dict:
        index_dict[t] = dict()
    index_dict[t][i] = s

# Add index sequences to dataframe:
sample_df['P5_index_seq'] = [index_dict['P5_index'][i] for i in sample_df['P5_index'].values]
sample_df['P7_index_seq'] = [index_dict['P7_index'][i] for i in sample_df['P7_index'].values]
sample_df['barcode_seq'] = [index_dict['barcode'][i] for i in sample_df['barcode'].values]

# Add seq_folder to filename:
sample_df['fastq_mate1_filename'] = [seq_folder + '/' + fn for fn in sample_df['fastq_mate1_filename'].values]
sample_df['fastq_mate2_filename'] = [seq_folder + '/' + fn for fn in sample_df['fastq_mate2_filename'].values]

In [20]:
### AdapterRomoval and paired end read merging ###

adapter1_tmp = 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC<P7_index>ATCTCGTATGCCGTCTTCTGCTTG'
adapter2_tmp = 'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT<P5_index>GTGTAGATCTCGGTGGTCGCCGTATCATT'
AR_cmd_tmp = ["AdapterRemoval", "--bzip2", "--preserve5p", "--collapse", "--minalignmentlength", "10", "--threads", "12"]

# Create folder for files:
try:
    os.mkdir(AdapterRemoval_dir)
except:
    shutil.rmtree(AdapterRemoval_dir)
    os.mkdir(AdapterRemoval_dir)
os.chdir(AdapterRemoval_dir)

# Generate list of files to merge:
AR_file_df = sample_df[['fastq_mate1_filename', 'fastq_mate2_filename', 'P5_index', 'P7_index', 'P5_index_seq', 'P7_index_seq']].drop_duplicates()

# Check files exists before starting:
for index, row in AR_file_df.iterrows():
    basename = '{}-{}'.format(row['P5_index'], row['P7_index'])
    assert(os.path.exists('../{}'.format(row['fastq_mate1_filename'])))
    assert(os.path.exists('../{}'.format(row['fastq_mate2_filename'])))

# Merge files:
N_pairs = list()
N_merged = list()
for index, row in AR_file_df.iterrows():
    AR_cmd = AR_cmd_tmp.copy()
    basename = '{}-{}'.format(row['P5_index'], row['P7_index'])
    adapter1 = adapter1_tmp.replace('<P7_index>', row['P7_index_seq'])
    adapter2 = adapter2_tmp.replace('<P5_index>', row['P5_index_seq'])

    AR_cmd.extend(['--adapter1', adapter1])
    AR_cmd.extend(['--adapter2', adapter2])
    AR_cmd.extend(['--basename', basename])
    AR_cmd.extend(['--file1', '../{}'.format(row['fastq_mate1_filename'])])
    AR_cmd.extend(['--file2', '../{}'.format(row['fastq_mate2_filename'])])

    with Popen(AR_cmd, stdout=PIPE, stderr=STDOUT, bufsize=1) as p, open('logfile.txt', 'a') as file:
        file.write('Starting subprocess with command:')
        file.write(str(AR_cmd))
        file.write('\n')
        for line in p.stdout: # b'\n'-separated lines
            #sys.stdout.write(line) # pass bytes as is
            file.write(line.decode('utf-8'))
        file.write('\n****** DONE ******\n\n\n')

    with open('{}.settings'.format(basename), 'r') as fh:
        for line in fh:
            if 'Total number of read pairs:' in line:
                N_pairs.append(int(line.split(':')[1][1:]))
            if 'Number of full-length collapsed pairs:' in line:
                N_merged.append(int(line.split(':')[1][1:]))

# Write stats:
AR_file_df['N_pairs'] = N_pairs
AR_file_df['N_merged'] = N_merged
AR_file_df['percent_successfully_merged'] = AR_file_df['N_merged'].values / AR_file_df['N_pairs'].values *100
AR_file_df.to_excel('merge_stats.xlsx')

os.chdir('..')
# Move stats files to project folder:
shutil.copy2(AdapterRemoval_dir + '/merge_stats.xlsx', stats_dir)

  self.stdout = io.open(c2pread, 'rb', bufsize)
  self.stdout = io.open(c2pread, 'rb', bufsize)


'../../projects/pilot_exp_v3/read_processing_stats/merge_stats.xlsx'

In [21]:
### Splitting into files based on barcode ###

# Create folder for files:
try:
    os.mkdir(sample_fastq_dir)
except:
    shutil.rmtree(sample_fastq_dir)
    os.mkdir(sample_fastq_dir)
os.chdir(sample_fastq_dir)


# Map barcode sequences to reads:
Nmapped = list()
Nunmapped = list()
Ncc = {k:0 for k in sample_df['sample_name_unique'].values}
Ncca = {k:0 for k in sample_df['sample_name_unique'].values}
Ntot = {k:0 for k in sample_df['sample_name_unique'].values}
# Dump all the sequences where no barcode was found:
unmapped_fh = bz2.open('no-barcode_untrimmed.fastq.bz2', "wt")

for index, row in AR_file_df.iterrows(): # Pull out each merged fastq file
    basename = '{}-{}'.format(row['P5_index'], row['P7_index'])
    merged_fastq_fn = '../{}/{}.collapsed.bz2'.format(AdapterRemoval_dir, basename)
    
    # List the barcodes and associated sample names:
    mask = (sample_df['P5_index'] == row['P5_index']) & (sample_df['P7_index'] == row['P7_index'])
    bc_fh = [(k, v, bz2.open('{}.fastq.bz2'.format(v), "wt")) for k, v in zip(sample_df[mask]['barcode_seq'].values, sample_df[mask]['sample_name_unique'].values)]
    
    # Iterate over each record in the fastq file:
    with bz2.open(merged_fastq_fn, "rt") as input_fh:
        Nmapped.append(0)
        Nunmapped.append(0)
        for title, seq, qual in FastqGeneralIterator(input_fh):
            # Search for barcodes and write to barcode specific file:
            found = False
            for bc, sample_name, fh in bc_fh:
                if all(l1==l2 for l1, l2 in zip(seq[-len(bc):], bc) if l2 != 'N'):
                    found = True
                    # Add barcode sequence to title:
                    title = title + ':' + seq[-len(bc):]
                    fh.write("@{}\n{}\n+\n{}\n".format(title, seq[:-len(bc)], qual[:-len(bc)]))
                    Nmapped[-1] += 1
                    Ntot[sample_name] += 1
                    # Count if CC, CCA or not:
                    if seq[-(len(bc)+2):-len(bc)] == 'CC':
                        Ncc[sample_name] += 1
                    elif seq[-(len(bc)+3):-len(bc)] == 'CCA':
                        Ncca[sample_name] += 1
                    break
            if not found:
                Nunmapped[-1] += 1
                unmapped_fh.write("@{}\n{}\n+\n{}\n".format(title, seq, qual))
    for bc, sample_name, fh in bc_fh:
        fh.close()
unmapped_fh.close()

# Collect stats:
AR_file_df['N_BC-mapped'] = Nmapped
AR_file_df['N_BC-unmapped'] = Nunmapped
AR_file_df['N_sum-check'] = AR_file_df['N_BC-mapped'] + AR_file_df['N_BC-unmapped']
AR_file_df['percent_BC-mapped'] = AR_file_df['N_BC-mapped'].values / AR_file_df['N_merged'].values *100

sample_df['N_total'] = [Ntot[sn] for sn in sample_df['sample_name_unique']]
sample_df['N_CC'] = [Ncc[sn] for sn in sample_df['sample_name_unique']]
sample_df['N_CCA'] = [Ncca[sn] for sn in sample_df['sample_name_unique']]
sample_df['N_CCA+CC'] = sample_df['N_CCA'].values + sample_df['N_CC'].values
sample_df['CCA+CC_percent_total'] = sample_df['N_CCA+CC'].values / sample_df['N_total'].values *100
sample_df['percent_CCA'] = sample_df['N_CCA'].values / sample_df['N_CCA+CC'].values *100

AR_file_df.to_excel('index-pair_stats.xlsx')
sample_df.to_excel('sample_stats.xlsx')

os.chdir('..')
# Move stats files to project folder:
shutil.copy2(sample_fastq_dir + '/index-pair_stats.xlsx', stats_dir)

'../../projects/pilot_exp_v3/read_processing_stats/index-pair_stats.xlsx'

In [22]:
### Find Kmers for unmapped reads ###

os.chdir(sample_fastq_dir)

def add_kmers(k_dict, seq, k_size, filter_dict=None):
    '''Find Kmers in input sequence and add to dictionary if not filtered.'''
    for i in range(len(seq) - k_size + 1):
        kmer = seq[i:(i+k_size)]
        if filter_dict is None or kmer not in filter_dict:
            try:
                k_dict[kmer] += 1
            except KeyError:
                k_dict[kmer] = 1
    return(k_dict)

def find_min_dist_bc(kmer_seq, index_dict):
    '''Search for the Kmers in the adapter sequences.'''
    import jellyfish
    dist_min = 999
    bc_min_dist = ''
    for bc, bc_seq in index_dict['barcode'].items():
        for i in range(len(bc_seq) - len(kmer_seq) + 1):
            window = bc_seq[i:(i+len(kmer_seq))]
            dist = jellyfish.hamming_distance(window, kmer_seq)
            if dist < dist_min:
                dist_min = dist
                bc_min_dist = bc
    return(bc_min_dist, dist_min)

# Search for Kmers #
k_size = 5 # Size of Kmer

# First generate a filter composed of the Kmers contained
# in the last 7 nt. of human tRNA seqeunces.
filter_search_size = 7
filter_dict = dict()
tRNA_database = '../../../2-align_reads/tRNA_database/human/hg38-tRNAs.fa'
with open(tRNA_database, "r") as tRNA_fh:
    for tRNA in SeqIO.parse(tRNA_fh, "fasta"):
        filter_dict = add_kmers(filter_dict, str(tRNA.seq)[-filter_search_size:], k_size)

# Then search for Kmers in the last 13 nt. in reads
# longer than the minimum cutoff:
search_size = 13
k_dict = dict()
with bz2.open('no-barcode_untrimmed.fastq.bz2', "rt") as unmapped_fh:
    for title, seq, qual in FastqGeneralIterator(unmapped_fh):
        if len(seq) >= MIN_READ_LEN:
            k_dict = add_kmers(k_dict, seq[-search_size:], k_size, filter_dict)

# Rank Kmers by occurence and find closely related adapters: 
kmer_df_dat = list()
for kmer_seq, count in sorted(k_dict.items(), key=lambda x:x[1], reverse=True):
    bc_min_dist, dist_min = find_min_dist_bc(kmer_seq, index_dict)
    if dist_min < 2:
        kmer_df_dat.append([kmer_seq, count, dist_min, bc_min_dist])
    else:
        kmer_df_dat.append([kmer_seq, count, None, None])
kmer_df = pd.DataFrame(kmer_df_dat, columns=['Kmer', 'Count', 'Barcode distance', 'Barcode'])
kmer_df.to_excel('no-barcode_Kmer-analysis.xlsx')

os.chdir('..')

In [23]:
### Generate UMI stats and write final trimmed tRNA sequences ###
# Note, the cDNA input amount is so large that it is very unlikely to sequence
# the same PCR amplified DNA twice. Therefore, this processing step does not
# attempt to merge possible UMI duplicates.

# From: https://stats.stackexchange.com/questions/296005/the-expected-number-of-unique-elements-drawn-with-replacement
# I get the expected number of unique UMIs:
# E_X = n*(1-((n-1) / n)**k)
# Where k = is the number of sequences (draws)
# and n = to the number of possible UMIs (bins)
n_bins = 4**9 * 2 # number of UMI bins (n)


# Create folder for files:
try:
    os.mkdir(umi_dir)
except:
    shutil.rmtree(umi_dir)
    os.mkdir(umi_dir)
os.chdir(umi_dir)

# Dump all the sequences where no UMI was found:
unmapped_fh = bz2.open('no-UMI_untrimmed.fastq.bz2', "wt")

# Trim UMIs off sequences:
N_umi_obs = list()
N_umi_exp = list()
N_seq_list = list()
for index, row in sample_df.iterrows(): # Process each sample individually
    fastq_name = '../{}/{}.fastq.bz2'.format(sample_fastq_dir, row['sample_name_unique'])
    UMIs = set()
    Nseqs = 0
    with bz2.open('{}_UMI-trimmed.fastq.bz2'.format(row['sample_name_unique']), "wt") as output_fh:
        with bz2.open(fastq_name, "rt") as input_fh:
            for title, seq, qual in FastqGeneralIterator(input_fh):
                umi = seq[0:UMI_LEN]
                if umi[-1] == 'T' or umi[-1] == 'C': # UMI sequence requirement
                    UMIs.add(umi)
                    Nseqs += 1
                    # Add UMI sequence to title:
                    title = title + ':' + umi
                    # Write the trimmed sequence:
                    output_fh.write("@{}\n{}\n+\n{}\n".format(title, seq[10:], qual[10:]))
                else:
                    # Write the untrimmed sequence if UMI was not found:
                    unmapped_fh.write("@{}\n{}\n+\n{}\n".format(title, seq, qual))
    # Calculate the observed and expected UMI count:
    N_seq_list.append(Nseqs)
    k_draws = Nseqs
    N_umi_obs.append(len(UMIs))
    E_X = n_bins*(1-((n_bins-1) / n_bins)**k_draws)
    N_umi_exp.append(round(E_X))

# Collect UMI stats:
sample_df['N_UMI_observed'] = N_umi_obs
sample_df['N_UMI_expected'] = N_umi_exp
sample_df['percent_seqs_after_UMI_trim'] = np.array(N_seq_list) / sample_df['N_total'].values * 100
sample_df['percent_UMI_obs-vs-exp'] = sample_df['N_UMI_observed'].values / sample_df['N_UMI_expected'].values * 100
sample_df.to_excel('sample_UMI_stats.xlsx')

os.chdir('..')
# Move stats files to project folder:
shutil.copy2(umi_dir + '/sample_UMI_stats.xlsx', stats_dir)

'../../projects/pilot_exp_v3/read_processing_stats/sample_UMI_stats.xlsx'