In [1]:
import pandas as pd
from subprocess import Popen, PIPE, STDOUT
import os
import sys
from Bio import SeqIO, bgzf
import gzip
from Bio.SeqIO.QualityIO import FastqGeneralIterator
import numpy as np

In [5]:
seq_folder = '/Users/krdav/Google Drive/MCB/Sullivan_lab/tRNA_charging/tRNAseq_pilot/seq_data'
sample_list = 'sample_list.xlsx'
index_list = 'index_list.xlsx'
os.chdir(seq_folder)

In [None]:
AdapterRemoval_dir = 'AdapterRemoval'
sample_fastq_dir = 'processed_fastq'
umi_dir = 'UMI_trimmed'

In [6]:
### Read sample information ###
sample_df = pd.read_excel(sample_list)
index_df = pd.read_excel(index_list)

# Read index sequences into dict:
index_dict = dict()
for t, i, s in zip(index_df['type'].values, index_df['id'].values, index_df['sequence'].values):
    if t not in index_dict:
        index_dict[t] = dict()
    index_dict[t][i] = s

# Add index sequences to dataframe:
sample_df['P5_index_seq'] = [index_dict['P5_index'][i] for i in sample_df['P5_index'].values]
sample_df['P7_index_seq'] = [index_dict['P7_index'][i] for i in sample_df['P7_index'].values]
sample_df['barcode_seq'] = [index_dict['barcode'][i] for i in sample_df['barcode'].values]

In [7]:
### AdapterRomoval and paired end read merging ###

adapter1_tmp = 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC<P7_index>ATCTCGTATGCCGTCTTCTGCTTG'
adapter2_tmp = 'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT<P5_index>GTGTAGATCTCGGTGGTCGCCGTATCATT'
AR_cmd_tmp = ["AdapterRemoval", "--gzip", "--preserve5p", "--collapse", "--minalignmentlength", "10",]

# Create folder for files:
os.mkdir(AdapterRemoval_dir)
os.chdir(AdapterRemoval_dir)

# Generate list of files to merge:
AR_file_df = sample_df[['fastq_mate1_filename', 'fastq_mate2_filename', 'P5_index', 'P7_index', 'P5_index_seq', 'P7_index_seq']].drop_duplicates()

# Merge files:
N_pairs = list()
N_merged = list()
for index, row in AR_file_df.iterrows():
    AR_cmd = AR_cmd_tmp.copy()
    basename = '{}-{}'.format(row['P5_index'], row['P7_index'])
    adapter1 = adapter1_tmp.replace('<P7_index>', row['P7_index_seq'])
    adapter2 = adapter2_tmp.replace('<P5_index>', row['P5_index_seq'])

    AR_cmd.extend(['--adapter1', adapter1])
    AR_cmd.extend(['--adapter2', adapter2])
    AR_cmd.extend(['--basename', basename])
    AR_cmd.extend(['--file1', '../{}'.format(row['fastq_mate1_filename'])])
    AR_cmd.extend(['--file2', '../{}'.format(row['fastq_mate2_filename'])])

    with Popen(AR_cmd, stdout=PIPE, stderr=STDOUT, bufsize=1) as p, open('logfile.txt', 'a') as file:
        file.write('Starting subprocess with command:')
        file.write(str(AR_cmd))
        file.write('\n')
        for line in p.stdout: # b'\n'-separated lines
            #sys.stdout.write(line) # pass bytes as is
            file.write(line.decode('utf-8'))
        file.write('\n****** DONE ******\n\n\n')

    with open('{}.settings'.format(basename), 'r') as fh:
        for line in fh:
            if 'Total number of read pairs:' in line:
                N_pairs.append(int(line.split(':')[1][1:]))
            if 'Number of full-length collapsed pairs:' in line:
                N_merged.append(int(line.split(':')[1][1:]))

# Write stats:
AR_file_df['N_pairs'] = N_pairs
AR_file_df['N_merged'] = N_merged
AR_file_df['percent_successfully_merged'] = AR_file_df['N_merged'].values / AR_file_df['N_pairs'].values *100
AR_file_df.to_excel('merge_stats.xlsx')

os.chdir('..')

  self.stdout = io.open(c2pread, 'rb', bufsize)
  self.stdout = io.open(c2pread, 'rb', bufsize)
  self.stdout = io.open(c2pread, 'rb', bufsize)
  self.stdout = io.open(c2pread, 'rb', bufsize)


In [8]:
### Splitting into files based on barcode ###

# Create folder for files:
os.mkdir(sample_fastq_dir)
os.chdir(sample_fastq_dir)


# Map barcode sequences to reads:
Nmapped = list()
Ncc = {k:0 for k in sample_df['sample_name'].values}
Ncca = {k:0 for k in sample_df['sample_name'].values}

for index, row in AR_file_df.iterrows(): # Pull out each merged fastq file
    basename = '{}-{}'.format(row['P5_index'], row['P7_index'])
    merged_fastq_fn = '../{}/{}.collapsed.gz'.format(AdapterRemoval_dir, basename)
    
    # List the barcodes and associated sample names:
    mask = (sample_df['P5_index'] == row['P5_index']) & (sample_df['P7_index'] == row['P7_index'])
    bc_fh = [(k, v, gzip.open('{}.fastq.gz'.format(v), "wt")) for k, v in zip(sample_df[mask]['barcode_seq'].values, sample_df[mask]['sample_name'].values)]
    
    # Iterate over each record in the fastq file:
    with gzip.open(merged_fastq_fn, "rt") as input_fh:
        Nmapped.append(0)
        for title, seq, qual in FastqGeneralIterator(input_fh):
            # Search for barcodes and write to barcode specific file:
            for bc, sample_name, fh in bc_fh:
                bc_CC = 'CC' + bc
                bc_CCA = 'CCA' + bc
                if seq[-len(bc_CC):] == bc_CC:
                    fh.write("@{}\n{}\n+\n{}\n".format(title, seq[:-len(bc)], qual[:-len(bc)]))
                    Nmapped[-1] += 1
                    Ncc[sample_name] += 1
                elif seq[-len(bc_CCA):] == bc_CCA:
                    fh.write("@{}\n{}\n+\n{}\n".format(title, seq[:-len(bc)], qual[:-len(bc)]))
                    Nmapped[-1] += 1
                    Ncca[sample_name] += 1
    for bc, sample_name, fh in bc_fh:
        fh.close()

# Collect stats:
AR_file_df['N_mapped'] = Nmapped
AR_file_df['percent_mapped'] = AR_file_df['N_mapped'].values / AR_file_df['N_merged'].values *100

sample_df['N_CC'] = [Ncc[sn] for sn in sample_df['sample_name']]
sample_df['N_CCA'] = [Ncca[sn] for sn in sample_df['sample_name']]
sample_df['N_seqs'] = sample_df['N_CC'].values + sample_df['N_CCA'].values
sample_df['percent_charging'] = sample_df['N_CCA'].values / sample_df['N_seqs'].values *100

AR_file_df.to_excel('index-pair_stats.xlsx')
sample_df.to_excel('sample_stats.xlsx')

os.chdir('..')

In [9]:
### Generate UMI stats and wrie final trimmed tRNA sequences ###
# Note, the cDNA input amount is so large that it is very unlikely to sequence
# the same PCR amplified DNA twice. Therefore, this processing step does not
# attempt to merge possible UMI duplicates.

# From: https://stats.stackexchange.com/questions/296005/the-expected-number-of-unique-elements-drawn-with-replacement
# I get the expected number of unique UMIs:
# E_X = n*(1-((n-1) / n)**k)
# Where k = is the number of sequences (draws)
# and n = to the number of possible UMIs (bins)
n_bins = 4**9 * 2 # number of UMI bins (n)


# Create folder for files:
os.mkdir(umi_dir)
os.chdir(umi_dir)

# Trim UMIs off sequences:
N_umi_obs = list()
N_umi_exp = list()
N_seq_list = list()
for index, row in sample_df.iterrows(): # Process each sample individually
    fastq_name = '../{}/{}.fastq.gz'.format(sample_fastq_dir, row['sample_name'])
    UMIs = set()
    Nseqs = 0
    with gzip.open('{}_UMI-trimmed.fastq.gz'.format(row['sample_name']), "wt") as output_fh:
        with gzip.open(fastq_name, "rt") as input_fh:
            for title, seq, qual in FastqGeneralIterator(input_fh):
                umi = seq[0:10]
                if umi[-1] == 'T' or umi[-1] == 'C': # UMI sequence requirement
                    UMIs.add(umi)
                    Nseqs += 1
                    # Write the trimmed sequence:
                    output_fh.write("@{}\n{}\n+\n{}\n".format(title, seq[10:], qual[10:]))
    # Calculate the observed and expected UMI count:
    N_seq_list.append(Nseqs)
    k_draws = Nseqs
    N_umi_obs.append(len(UMIs))
    E_X = n_bins*(1-((n_bins-1) / n_bins)**k_draws)
    N_umi_exp.append(round(E_X))

# Collect UMI stats:
sample_df['N_UMI_observed'] = N_umi_obs
sample_df['N_UMI_expected'] = N_umi_exp
sample_df['percent_seqs_after_UMI_trim'] = np.array(N_seq_list) / sample_df['N_seqs'].values * 100
sample_df['percent_UMI_obs-vs-exp'] = sample_df['N_UMI_observed'].values / sample_df['N_UMI_expected'].values * 100
sample_df.to_excel('sample_UMI_stats.xlsx')