In [1]:
import sys, os, subprocess, shutil, glob, bz2, json
from Bio import SeqIO
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.backends.backend_pdf
import matplotlib.colors as mcolors
import matplotlib.ticker as ticker
palette = list(mcolors.TABLEAU_COLORS.keys())
%matplotlib inline


# Remember the directory the notebook was started from the first time this
# cell runs; on later runs the stored value is kept, so the cell always brings
# the working directory back to the workbook dir:
globals().setdefault('workbookDir', os.getcwd())
print('workbookDir: ' + workbookDir)
# Undo any change of working directory made by other cells:
os.chdir(workbookDir)

workbookDir: /Users/krdav/Google Drive/MCB/Sullivan_lab/tRNA-charge-seq/3-stats_collection


### Requirements
1. gfsa
2. hgrewsa

In [2]:
# Lookup table: DNA base -> complementary RNA base.
DNAcompRNA = dict(zip('ATGC', 'UACG'))
def anticodon2codon(anticodon):
    """Return the RNA codon complementary to an anticodon written in DNA letters.

    The anticodon is read backwards and each base is replaced by its RNA
    complement from `DNAcompRNA`. A base other than A, T, G or C raises
    a KeyError.
    """
    return ''.join(DNAcompRNA[base] for base in reversed(anticodon))

In [3]:
# Settings that change from batch to batch:
CLEAN_DIR = False   # Delete old stats_collection dir
OVERWRITE = True    # Overwrite old stat file
# The following two are not implemented yet:
DRY_RUN = False               # Do dry-run, print files to run, nothing else
SP_SET = {'mouse', 'human'}   # Only run if species is in set


# Data and project folders of the current batch
# (previous batch: 'data/pilot_exp' and 'projects/pilot_exp'):
data_folder = 'data/pilot_exp_v2'
project_folder = 'projects/pilot_exp_v2'
# Path to the tRNA reference FASTA file of each species:
tRNA_database = {
    'human': '../../../2-align_reads/tRNA_database/human/hg38-tRNAs.fa',
    'mouse': '../../../2-align_reads/tRNA_database/mouse/mm10-tRNAs.fa',
}

In [4]:
# Fixed folder and file names; these stay the same for every batch.
# Folder created inside the data folder to hold the collected stats:
data_dir = "stats_collection"
# Folder searched for the '*.json.bz2' alignment result files:
align_dir = "SWalign"
# Folder holding the UMI processed reads ('*.fastq.bz2'):
umi_dir = "UMI_trimmed"
# Excel sheet in the project folder describing the samples:
sample_list = "sample_list.xlsx"

In [5]:
### Read sample information ###
def _ensure_dir(path, clean=False):
    """Create directory `path`.

    If the directory already exists it is left untouched, unless `clean` is
    true, in which case it is deleted with all its content and recreated.
    Any other failure to create the directory (e.g. a missing parent folder)
    is raised instead of being silently ignored.
    """
    try:
        os.mkdir(path)
    except FileExistsError:
        if clean:
            shutil.rmtree(path)
            os.mkdir(path)


# All paths below are relative, so start from the workbook dir; this makes
# the cell give the same result when it is re-run:
os.chdir(workbookDir)

sample_df = pd.read_excel('../' + project_folder + '/' + sample_list)
# Map each unique sample name to its sample name, replicate, barcode and species:
sample_dict = {
    un: {'sample_name': n, 'replicate': r, 'barcode': b, 'species': sp}
    for un, n, r, b, sp in zip(
        sample_df['sample_name_unique'].values,
        sample_df['sample_name'].values,
        sample_df['replicate'].values,
        sample_df['barcode'].values,
        sample_df['species'].values,
    )
}

# Create folder for data and stats:
os.chdir('../' + data_folder)
stats_dir = '../../' + project_folder + '/stats_collection'
_ensure_dir(stats_dir, clean=CLEAN_DIR)  # For stats

# For manipulations and final data:
_ensure_dir(data_dir, clean=CLEAN_DIR)  # For data
os.chdir(data_dir)

In [6]:
# Build a lookup with the sequence length, codon, anticodon and amino acid of
# every record in the tRNA reference FASTA files. The record ID is split on
# '-': field 1 is stored as the amino acid and field 2 as the anticodon.
tRNA_data = dict()
for species, fasta_fnam in tRNA_database.items():
    for record in SeqIO.parse(fasta_fnam, "fasta"):
        id_fields = record.id.split('-')
        record_anticodon = id_fields[2]
        tRNA_data[record.id] = {
            'len': len(record.seq),
            'codon': anticodon2codon(record_anticodon),
            'anticodon': record_anticodon,
            'amino_acid': id_fields[1],
        }

In [16]:
# Names of the three output files written to the current directory:
stat_csv_fnam = 'stats_collection.csv.bz2'                              # per-read stats, bz2 compressed
agg_csv_fnam = 'stats_filtered_CC-CCA-aggregate.csv'                    # read counts, filtered
agg_strict_csv_fnam = 'stats_strict_filtered_CC-CCA-aggregate.csv'      # read counts, strictly filtered
if OVERWRITE:
    # Remove every old output file on its own: a missing file must not stop
    # the remaining ones from being removed, because the files are later
    # opened in append mode and stale content would otherwise be kept.
    for old_fnam in (stat_csv_fnam, agg_csv_fnam, agg_strict_csv_fnam):
        try:
            os.remove(old_fnam)
        except FileNotFoundError:
            pass  # Nothing to overwrite

def _aggregate_counts(stat_df, row_mask, agg_cols):
    """Count the reads selected by `row_mask` per unique combination of the grouping columns.

    Parameters:
        stat_df: DataFrame with the per-read statistics of one sample.
        row_mask: boolean Series selecting the rows of `stat_df` to count.
        agg_cols: output column names; all but the last are the grouping
            columns, the last is the name given to the read count column.

    Returns:
        DataFrame with the columns of `agg_cols`, in that order, and one row
        per group (sorted by the grouping columns).
    """
    group_cols = agg_cols[:-1]
    selected = stat_df.loc[row_mask, group_cols]
    if selected.empty:
        # Nothing passed the filter: return an empty frame with the right columns.
        return pd.DataFrame(columns=agg_cols)
    counts = selected.groupby(group_cols).size()
    return counts.reset_index(name=agg_cols[-1])[agg_cols]


# Column names of the per-read stats file:
header = ['readID', 'sample_name', 'replicate', 'barcode', 'tRNA_annotation', 'align_score', 'unique_annotation', 'tRNA_annotation_len', 'align_5p_idx', 'align_3p_idx', 'align_5p_nt', 'align_3p_nt', 'codon', 'anticodon', 'amino_acid', '5p_cover', '3p_cover', '5p_non-temp', '3p_non-temp', '5p_UMI', '3p_BC']
header_bin = str.encode(','.join(header) + '\n')
# Column names of the two aggregate files (the last one is the read count):
agg_cols = ['sample_name', 'replicate', 'barcode', 'tRNA_annotation', 'tRNA_annotation_len', 'unique_annotation', 'align_3p_nt', 'codon', 'anticodon', 'amino_acid', 'count']

# Files to collect stats from, oldest modification time first:
json_files = glob.glob('../' + align_dir + '/*.json.bz2')
json_files.sort(key=os.path.getmtime)
# print(json_files)

# Open filehandles and print headers. The "with" statement closes all three
# files even when processing of a sample fails.
# NOTE: the files are opened in append mode and the header is always written,
# so running with OVERWRITE = False on existing files adds a second header
# followed by the samples once more.
with bz2.open(stat_csv_fnam, 'ab') as fh_stats_out, \
        open(agg_csv_fnam, 'a') as fh_agg_out, \
        open(agg_strict_csv_fnam, 'a') as fh_agg_strict_out:
    fh_stats_out.write(header_bin)
    print(','.join(agg_cols), file=fh_agg_out)
    print(','.join(agg_cols), file=fh_agg_strict_out)

    for json_file in json_files:
        # Drop the last '_'-separated token of the file name; the unique
        # sample name is the part in front of the 'UMI' tag (without the
        # separator character preceding the tag):
        fnam_base = '_'.join(os.path.basename(json_file).split('_')[0:-1])
        uidx = fnam_base.index('UMI')
        unique_sample_name = fnam_base[0:uidx-1]
        print('Processing: {}'.format(unique_sample_name))

        # The sample information is the same for all reads of a file, so look
        # it up once. This also fails early (KeyError) for a sample that is
        # missing from the sample list, before any file is parsed:
        sample_info = sample_dict[unique_sample_name]
        sample_name = sample_info['sample_name']
        replicate = sample_info['replicate']
        barcode = sample_info['barcode']

        with bz2.open(json_file, 'rt', encoding="utf-8") as fh_gz:
            SWres = json.load(fh_gz)

        # Extract non-template bases from UMI processed reads:
        fastq_fnam = '../' + umi_dir + '/' + fnam_base + '.fastq.bz2'
        with bz2.open(fastq_fnam, 'rt') as fh_gz:
            for UMIread in SeqIO.parse(fh_gz, "fastq"):
                if UMIread.id not in SWres:
                    continue  # Read has no alignment result
                read_res = SWres[UMIread.id]
                qpos = read_res['qpos'][0]
                read_seq = str(UMIread.seq)
                # Read sequence in front of and after the aligned part
                # (presumably qpos holds the 1-based start and the end of the
                # alignment in the read -- TODO confirm):
                read_res['5p_non-temp'] = read_seq[0:(qpos[0]-1)]
                read_res['3p_non-temp'] = read_seq[qpos[1]:]
                # The last two ':'-separated fields of the last word of the
                # fastq description are stored as 3' barcode and 5' UMI:
                _3p_bc, _5p_umi = UMIread.description.split()[-1].split(':')[-2:]
                read_res['5p_UMI'] = _5p_umi
                read_res['3p_BC'] = _3p_bc

        ### Collect stats on a per sample basis and store in tmp file ###
        with open('tmp_stat.csv', 'w') as tmp_csv:
            print(','.join(header), file=tmp_csv)
            for readID, read_res in SWres.items():
                if '5p_non-temp' not in read_res:
                    raise Exception('5p_non-temp was not defined in SWres[readID]. Should have been added from the UMI trimmed reads. Did any of the fastq headers change such that there is a mismatch between headers in the alignment json and those in the trimmed UMIs?')
                tRNA_annotation = read_res['name']
                # Several annotations are joined by '@'; the tRNA properties
                # are looked up for the first one:
                tRNA_annotation_first = tRNA_annotation.split('@')[0]
                align_score = read_res['score']
                unique_annotation = '@' not in tRNA_annotation
                tRNA_info = tRNA_data[tRNA_annotation_first]
                tRNA_annotation_len = tRNA_info['len']
                align_5p_idx, align_3p_idx = read_res['dpos'][0]
                align_5p_nt = read_res['qseq'][0]
                align_3p_nt = read_res['qseq'][-1]
                # Move index for reads with beta-eliminated A:
                if align_3p_idx == (tRNA_annotation_len - 1) and align_3p_nt == 'C':
                    align_3p_idx += 1
                codon = tRNA_info['codon']
                anticodon = tRNA_info['anticodon']
                amino_acid = tRNA_info['amino_acid']
                # Does the alignment reach the first/last position of the tRNA?
                _5p_cover = align_5p_idx == 1
                _3p_cover = align_3p_idx == tRNA_annotation_len
                _5p_non_temp = read_res['5p_non-temp']
                _3p_non_temp = read_res['3p_non-temp']
                _5p_umi = read_res['5p_UMI']
                _3p_bc = read_res['3p_BC']
                # Print line to tmp file:
                csv_line = ','.join(map(str, [readID, sample_name, replicate, barcode, tRNA_annotation, align_score, unique_annotation, tRNA_annotation_len, align_5p_idx, align_3p_idx, align_5p_nt, align_3p_nt, codon, anticodon, amino_acid, _5p_cover, _3p_cover, _5p_non_temp, _3p_non_temp, _5p_umi, _3p_bc]))
                print(csv_line, file=tmp_csv)
        # Append the sample statistics to the final csv file using bz2 compression.
        # The data is streamed in chunks instead of being read into memory at once:
        with open('tmp_stat.csv', 'rb') as tmp_csv:
            next(tmp_csv) # skip header
            shutil.copyfileobj(tmp_csv, fh_stats_out)

        ### Aggregate filtered data to count charged/uncharged tRNAs ###
        # Read stats from tmp csv file (this is faster than building row by row).
        # keep_default_na=False keeps empty fields as '' instead of NaN:
        stat_df = pd.read_csv('tmp_stat.csv', keep_default_na=False)
        os.remove('tmp_stat.csv')
        # 3' must be covered and no 3' non-template bases:
        row_mask = (stat_df['3p_cover']) & (stat_df['3p_non-temp'] == '')
        agg_df = _aggregate_counts(stat_df, row_mask, agg_cols)
        agg_df.to_csv(fh_agg_out, header=False, index=False)

        # Strict filter: additionally require a unique annotation and 5' coverage:
        row_mask = (stat_df['unique_annotation']) & (stat_df['5p_cover']) & (stat_df['3p_cover']) & (stat_df['3p_non-temp'] == '')
        agg_strict_df = _aggregate_counts(stat_df, row_mask, agg_cols)
        agg_strict_df.to_csv(fh_agg_strict_out, header=False, index=False)

Processing: 100p1
Processing: 75p2
Processing: 25p4
Processing: 25p2
Processing: 0p3
Processing: 100p4N
Processing: 25p2N
Processing: 0p3N
Processing: 50p3N
Processing: 100p4
Processing: 100p1N
Processing: 50p1
Processing: 50p1N
Processing: 25p4N
Processing: 50p3
Processing: 75p2N


In [9]:
# Files to collect stats from:
json_files = glob.glob('../' + align_dir + '/*.json.bz2')
# print(json_files)
file_done = True
for json_file in json_files[::-1]:
    fnam_base = '_'.join(json_file.split('/')[-1].split('_')[0:-1])
    uidx = fnam_base.index('UMI')
    unique_sample_name = fnam_base[0:uidx-1]
    if unique_sample_name != '8U2' and file_done:
        continue
    else:
        file_done = False
    
    print('Processing: {}'.format(unique_sample_name))