In [27]:
import sys, os, subprocess, copy, shutil, re, glob, bz2, json
import xml.etree.ElementTree as ET
from pathlib import Path
from Bio import Seq, SeqIO, SearchIO, SeqRecord
import pandas as pd

# Remember the notebook directory the first time this cell runs, so that
# re-running the cell always brings the kernel back to the same place:
if 'workbookDir' not in globals():
    workbookDir = os.getcwd()
print('workbookDir: ' + workbookDir)
# Undo any change of working directory made by cells further down:
os.chdir(workbookDir)

workbookDir: /Users/krdav/Google Drive/MCB/Sullivan_lab/tRNA-charge-seq/2-align_reads


In [28]:
def indices(lst, element):
    '''Return a list with every position at which `element` occurs in `lst`.

    The positions are in increasing order; the list is empty when
    `element` does not occur at all.
    '''
    found = []
    start = 0
    while True:
        try:
            pos = lst.index(element, start)
        except ValueError:
            # No further occurrence at or after `start`.
            break
        found.append(pos)
        start = pos + 1
    return found

In [29]:
def fast_fasta_count(filename):
    '''Count the ">" characters in a text file, reading it in fixed-size chunks.

    For a Fasta file in which ">" only occurs as the first character of a
    header line, this equals the number of records.
    See: https://stackoverflow.com/a/9631635
    '''
    chunk_size = 65536
    n_headers = 0
    with open(filename, "r", encoding="utf-8", errors='ignore') as fh:
        # iter() with a sentinel keeps calling read() until it returns '' (EOF):
        for chunk in iter(lambda: fh.read(chunk_size), ''):
            n_headers += chunk.count(">")
    return n_headers

In [30]:
def fast_fastq_count_bz(filename):
    '''Count the reads in a bz2 compressed fastq file.

    The file is read in fixed-size chunks (see: https://stackoverflow.com/a/9631635)
    and the number of lines is divided by four. Counting "@" characters is not
    reliable because "@" is also a valid quality character (Phred+33, Q31),
    which inflates the read count.

    Assumes the standard four lines per record, i.e. sequence and quality
    strings are not wrapped over several lines.

    Returns the number of complete records as an integer (0 for an empty file).
    '''
    n_newlines = 0
    last_char = '\n'  # An empty file must not count as having a partial line
    with bz2.open(filename, 'rt', encoding="utf-8", errors='ignore') as f:
        # iter() with a sentinel keeps calling read() until it returns '' (EOF):
        for bl in iter(lambda: f.read(65536), ''):
            n_newlines += bl.count('\n')
            last_char = bl[-1]
    # A final line without a trailing newline is still a line:
    n_lines = n_newlines + (1 if last_char != '\n' else 0)
    return(n_lines // 4)

### Requirements
1. The tRNA database must be formatted as a Fasta file with unique headers that contain no whitespace.
2. Swipe output must be sorted by alignment score (this is default).

In [31]:
# Run options:
MIN_SCORE = 25               # Lowest best-hit alignment score accepted as a mapped read
SP_SET = {'human', 'mouse'}  # Only run if species is in set
CLEAN_DIR = False            # Delete old SWalign and stats dirs before running
OVERWRITE = True             # Overwrite old json files
DRY_RUN = False              # Do dry-run, print swipe commands, nothing deleted

# Experiment to process (paths are relative to the parent of the workbook dir):
sample_list = 'sample_list.xlsx'
data_folder = 'data/pilot_exp_v2'
project_folder = 'projects/pilot_exp_v2'
# Previous experiment:
#data_folder = 'data/pilot_exp'
#project_folder = 'projects/pilot_exp'


In [32]:
# Folder names inside the data folder:
seq_folder = 'raw_fastq'
umi_dir = 'UMI_trimmed'
align_dir = 'SWalign'

# Reference files; the relative paths are resolved from inside the alignment folder:
score_mat = '../../../2-align_reads/nuc_score-matrix.txt'
tRNA_database = {
    'human': '../../../2-align_reads/tRNA_database/human/hg38-tRNAs.fa',
    'mouse': '../../../2-align_reads/tRNA_database/mouse/mm10-tRNAs.fa',
}

In [33]:
# Load the sample sheet and map each unique sample name to its species:
sample_df = pd.read_excel('../{}/{}'.format(project_folder, sample_list))
sp_dict = dict(zip(sample_df['sample_name_unique'].values, sample_df['species'].values))

In [34]:
# Create folder for data and stats:
def _make_dir(path, clean):
    '''Create `path`. If it already exists it is left untouched,
    unless `clean` is true, in which case it is deleted and recreated.
    Any other failure (e.g. a missing parent folder) is raised.'''
    try:
        os.mkdir(path)
    except FileExistsError:
        if clean:
            shutil.rmtree(path)
            os.mkdir(path)

# Start from workbookDir, so the cell can be re-run after
# the working directory has been changed to the alignment folder:
os.chdir(os.path.join(workbookDir, '..', data_folder))
stats_dir = '../../' + project_folder + '/align_reads_stats'
_make_dir(stats_dir, clean=(CLEAN_DIR and not DRY_RUN)) # For stats
# For manipulations and final data:
_make_dir(align_dir, clean=(CLEAN_DIR and not DRY_RUN)) # For data
os.chdir(align_dir)

In [35]:
def parse_Swipe_XML(SWxml, db_id_set, MIN_SCORE):
    '''Collect the highest scoring database hit(s) for every query in a Swipe XML stream.

    SWxml: iterable of (event, element) pairs, e.g. from ET.iterparse().
    db_id_set: container with the database sequence IDs; every hit name
        kept for a query must be found in it.
    MIN_SCORE: lowest best alignment score for a query to be stored as a hit.

    Returns the tuple (query_hits, query_nohits):
    query_hits maps query -> dict with the keys
        'score': the best alignment score (int),
        'name': the sorted names of all best scoring hits, merged with @,
        'qpos'/'dpos': lists of integer tuples, in the same order as the names,
        'qseq'/'aseq'/'dseq': alignment strings of the first hit in that order.
    query_nohits is the set of queries for which no score was picked up,
        or for which the best score was below MIN_SCORE.

    Requires the hits of a query to be listed by decreasing alignment score;
    picking up data stops at the first score lower than the best one seen.
    Raises AssertionError if more than one query is found between two
    "result" tags, or if a hit name is missing from db_id_set.
    '''
    hit_tags = ('score', 'query', 'name', 'qpos', 'dpos', 'qseq', 'aseq', 'dseq')
    query_hits = dict()
    query_nohits = set()
    collected = {tag: [] for tag in hit_tags}  # Data picked up for the current query
    collecting = True   # Switched off once a score below the best one is seen
    best_score = -999

    for _, elem in SWxml:
        tag = elem.tag
        if tag != 'result':
            # Pick up all tags defined in "hit_tags":
            if collecting and tag in collected:
                collected[tag].append(elem.text)
                if tag == 'score':
                    score = int(elem.text)
                    if score >= best_score:
                        best_score = score
                    else:
                        # All highest scoring hits have been picked up.
                        collecting = False
            continue

        # A "result" tag marks the end of the hits for a query:
        elem.clear() # clear for saving memory

        if len(collected['score']) > 0 and best_score >= MIN_SCORE:
            # Positions of the hits sharing the highest score:
            scores = [int(s) for s in collected['score']]
            top_idx = indices(scores, best_score)
            # Discard everything that does not belong to these hits:
            top = {t: [vals[i] for i in top_idx]
                   for t, vals in collected.items() if t != 'score'}
            # Query/database alignment positions, "a,b" -> (a, b):
            top['qpos'] = [tuple(map(int, qp.split(','))) for qp in top['qpos']]
            top['dpos'] = [tuple(map(int, dp.split(','))) for dp in top['dpos']]
            # Only one query sequence may have been picked up:
            ls_query = list(set(top['query']))
            assert(len(ls_query) == 1)
            query = ls_query[0]
            # The database ID is the last space separated field of "name":
            names = [n.split(' ')[-1] for n in top['name']]
            for n in names: # quick assertion that name is in database
                assert(n in db_id_set)
            # Order all per-hit data by database name:
            order = sorted(range(len(names)), key=names.__getitem__)
            first = order[0]
            query_hits[query] = {
                'score': best_score,
                'name': '@'.join(names[i] for i in order),
                'qpos': [top['qpos'][i] for i in order],
                'dpos': [top['dpos'][i] for i in order],
                # Alignment strings are only kept for the first hit:
                'qseq': top['qseq'][first],
                'aseq': top['aseq'][first],
                'dseq': top['dseq'][first],
            }
        else:
            ls_query = list(set(collected['query']))
            if len(ls_query) > 0:
                query_nohits.add(ls_query[0])

        # Reset the variables for the next query:
        collected = {tag: [] for tag in hit_tags}
        collecting = True
        best_score = -999
    return(query_hits, query_nohits)

In [36]:
### Align reads to reference ###

# Swipe command template; the upper-case placeholders are filled in per file:
swipe_cmd_tmp = 'swipe --query INPUT_FILE --db DATABASE_FILE --out OUTPUT_FILE --symtype 1 --outfmt 7 --num_descriptions 3 --num_alignments 3 --evalue 0.000000001 --num_threads 12 --strand 1 --matrix SCORE_MATRIX -G 6 -E 1'
swipe_cmd_tmp = swipe_cmd_tmp.replace('SCORE_MATRIX', score_mat)

# Files to align, skipping the untrimmed sequences:
gz_files = glob.glob('../' + umi_dir + '/*.bz2')
fnam_to_align = [fnam for fnam in gz_files
                 if os.path.basename(fnam) != 'no-UMI_untrimmed.fastq.bz2']
if DRY_RUN:
    print('.bz2 files found:')
    for fnam in fnam_to_align:
        print(fnam)

# The database IDs only depend on the species, so they are read once per species:
db_id_cache = dict()
for fnam in fnam_to_align:
    fnam_r = os.path.basename(fnam)
    fnam_base = fnam_r[:-10]            # Basename without ".fastq.bz2"
    fasta_fnam = fnam[:-10] + '.fasta'  # Temporary Swipe input, next to the fastq file
    UMI_idx = fnam_r.index('_UMI')
    usam_nam = fnam_r[:UMI_idx]
    species = sp_dict[usam_nam]
    # Apply the species filter before the database lookup, so that a species
    # without a database entry is skipped instead of raising a KeyError:
    if species not in SP_SET:
        continue
    sp_tRNA_database = tRNA_database[species]

    # Skip, if results file has already been made, unless OVERWRITE is on:
    SWres_fnam = '{}_SWalign.json.bz2'.format(fnam_base)
    SWnohits_fnam = '{}_SWalign-nohits.fasta.bz2'.format(fnam_base)
    if os.path.isfile(SWres_fnam):
        if not OVERWRITE:
            continue
        if not DRY_RUN:
            print('Overwriting: {}'.format(SWres_fnam))
            os.remove(SWres_fnam)

    # Build the Swipe command as an argument list. The placeholders are
    # substituted per argument, after splitting the template, so a file name
    # containing a space stays a single argument:
    swipe_outfile = '{}_SWalign'.format(fnam_base)
    placeholders = {'DATABASE_FILE': sp_tRNA_database,
                    'INPUT_FILE': fasta_fnam,
                    'OUTPUT_FILE': swipe_outfile}
    swipe_cmd = [placeholders.get(arg, arg) for arg in swipe_cmd_tmp.split(' ')]

    if DRY_RUN:
        # Only report what would have been done:
        print('Basename: {}'.format(fnam_r))
        print('Swipe cmd: {}'.format(' '.join(swipe_cmd)))
        continue

    # Convert to fasta as required by Swipe:
    with bz2.open(fnam, 'rt') as fh_gz:
        SeqIO.convert(fh_gz, "fastq", fasta_fnam, 'fasta')

    # Run Swipe:
    print('Running Swipe on: {}'.format(fnam_r))
    subprocess.check_call(swipe_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    # Add "data" as root for the xml file.
    # Opening with 'w' replaces any left-over file from an earlier run:
    swipe_outfile_xml = swipe_outfile + '.xml'
    with open(swipe_outfile, 'r') as from_file, open(swipe_outfile_xml, 'w') as to_file:
        # The first line of the Swipe output is dropped
        # (presumably the XML declaration -- TODO confirm):
        from_file.readline()
        to_file.write('<data>\n')
        shutil.copyfileobj(from_file, to_file)
        to_file.write('</data>\n')

    # Read the database IDs and use them to verify alignment results:
    if species not in db_id_cache:
        db_id_cache[species] = {record.id for record in SeqIO.parse(sp_tRNA_database, "fasta")}
    db_id_set = db_id_cache[species]

    # Parse XML:
    SWxml = ET.iterparse(swipe_outfile_xml)
    query_hits, query_nohits = parse_Swipe_XML(SWxml, db_id_set, MIN_SCORE)

    # Dump unaligned sequences:
    with bz2.open(SWnohits_fnam, 'wt', encoding="utf-8") as fh_gz:
        for record in SeqIO.parse(fasta_fnam, "fasta"):
            if record.id in query_nohits:
                print('>{}'.format(record.id), file=fh_gz)
                print('{}'.format(record.seq), file=fh_gz)

    # Dump query_hits as JSON. An existing results file was either
    # removed above (OVERWRITE) or made this file be skipped:
    with bz2.open(SWres_fnam, 'wt', encoding="utf-8") as fh_gz:
        json.dump(query_hits, fh_gz)

    # Remove tmp files:
    os.remove(fasta_fnam)
    os.remove(swipe_outfile)
    os.remove(swipe_outfile_xml)


Overwriting: 100p1_UMI-trimmed_SWalign.json.bz2
Running Swipe on: 100p1_UMI-trimmed.fastq.bz2
Overwriting: 75p2_UMI-trimmed_SWalign.json.bz2
Running Swipe on: 75p2_UMI-trimmed.fastq.bz2
Overwriting: 25p4_UMI-trimmed_SWalign.json.bz2
Running Swipe on: 25p4_UMI-trimmed.fastq.bz2
Overwriting: 25p2_UMI-trimmed_SWalign.json.bz2
Running Swipe on: 25p2_UMI-trimmed.fastq.bz2
Overwriting: 0p3_UMI-trimmed_SWalign.json.bz2
Running Swipe on: 0p3_UMI-trimmed.fastq.bz2
Overwriting: 100p4N_UMI-trimmed_SWalign.json.bz2
Running Swipe on: 100p4N_UMI-trimmed.fastq.bz2
Overwriting: 25p2N_UMI-trimmed_SWalign.json.bz2
Running Swipe on: 25p2N_UMI-trimmed.fastq.bz2
Overwriting: 0p3N_UMI-trimmed_SWalign.json.bz2
Running Swipe on: 0p3N_UMI-trimmed.fastq.bz2
Overwriting: 50p3N_UMI-trimmed_SWalign.json.bz2
Running Swipe on: 50p3N_UMI-trimmed.fastq.bz2
Overwriting: 100p4_UMI-trimmed_SWalign.json.bz2
Running Swipe on: 100p4_UMI-trimmed.fastq.bz2
Overwriting: 100p1N_UMI-trimmed_SWalign.json.bz2
Running Swipe on: 100

In [38]:
%%bash
# Debug check: long listing (size and modification time) of one input file
# from the UMI trimmed folder.
ls -lrt ../UMI_trimmed/100p1N_UMI-trimmed.fastq.bz2

-rw-rw----  1 krdav  staff  441330 Nov 10 15:00 ../UMI_trimmed/100p1N_UMI-trimmed.fastq.bz2


In [39]:

# Debug check: run the fastq -> fasta conversion on a single input file,
# writing the result to a scratch file in the current working directory.
with bz2.open('../UMI_trimmed/100p1N_UMI-trimmed.fastq.bz2', mode='rt') as fastq_handle:
    SeqIO.convert(fastq_handle, "fastq", 'test.fasta', 'fasta')

In [None]:
### Generate alignment statistics ###

stats_columns = ['Filename', 'N_reads', 'N_mapped', 'percent_single_annotation', 'percent_multiple_annotation', 'Mapping_percent']
stats_rows = []  # One row per results file; the dataframe is built once at the end
gz_files = glob.glob('*.json.bz2')
for fnam in gz_files:
    fnam_r = fnam.split('/')[-1]
    print(fnam)
    # Read query_hits from JSON:
    with bz2.open(fnam, 'rt', encoding="utf-8") as fh_gz:
        query_hits = json.load(fh_gz)

    # Calculate stats.
    # fnam[:-17] strips "_SWalign.json.bz2" to get back to the input file name:
    N_reads = fast_fastq_count_bz('../' + umi_dir + '/' + fnam[:-17] + '.fastq.bz2')
    N_mapped = len(query_hits)
    # Percentages are undefined (NaN) when the denominator is zero,
    # e.g. for an empty input file or a sample without any mapped reads:
    map_p = N_mapped / N_reads * 100 if N_reads > 0 else float('nan')
    if N_mapped > 0:
        # Multiple annotations are merged with "@" in the hit name:
        P_ma = sum(1 for h in query_hits.values() if '@' in h['name']) / N_mapped * 100
    else:
        P_ma = float('nan')
    P_sa = 100 - P_ma
    stats_rows.append([fnam_r, N_reads, N_mapped, P_sa, P_ma, map_p])

# Write stats:
df_stats = pd.DataFrame(stats_rows, columns=stats_columns)
df_stats.to_excel('alignment_stats.xlsx')

os.chdir('..')
# Copy stats file to project folder:
shutil.copy2(align_dir + '/alignment_stats.xlsx', stats_dir)