In [37]:
# Column names applied to the headerless, tab-separated BLAST result tables
# read by this notebook. Order matters: it must match the column order in the
# files (presumably produced with `-outfmt "6 std slen qlen"` -- TODO confirm).
COLS = [
    # query / subject identifiers
    "qseqid", "sseqid",
    # alignment statistics
    "pident", "length", "mismatch", "gapopen",
    # alignment coordinates on the query and on the subject
    "qstart", "qend", "sstart", "send",
    # significance scores
    "evalue", "bitscore",
    # sequence lengths, used as coverage denominators
    "slen", "qlen",
]

In [38]:
# imports

import pandas as pd
import os

In [39]:
# Root directory of the blastp analysis; the query tables, per-gene search
# results and output tables used below are all resolved relative to it.
# NOTE(review): hard-coded absolute path -- the notebook only runs on a machine
# where this location exists.
in_base = "/ccb/salz4-3/hji20/hg002-q100-annotation/results/analysis/blastp"

In [40]:
def find_close_matches(
    fn : str,
    eval_cutoff : float = 1e-20,
    min_pident : float = 90.0,
    min_qcov : float = 0.9,
    min_scov : float = 0.9
):
    """Find subjects that are close, near-full-length matches in a BLAST table.

    The file is read as a headerless tab-separated table whose columns are
    named by the module-level ``COLS`` list.

    A hit is first required to pass the significance filters
    (``evalue <= eval_cutoff`` and ``pident >= min_pident``). Among those,
    a hit is a *close match* when the aligned span covers strictly more than
    ``min_qcov`` of the query and strictly more than ``min_scov`` of the
    subject.

    Parameters
    ----------
    fn : str
        Path to the tab-separated search result file.
    eval_cutoff : float
        Largest e-value a hit may have.
    min_pident : float
        Smallest percent identity a hit may have.
    min_qcov : float
        Query coverage (fraction of ``qlen``) a close match must exceed.
    min_scov : float
        Subject coverage (fraction of ``slen``) a close match must exceed.

    Returns
    -------
    (set of str, float)
        * Subject IDs of the close matches, each truncated at its first ``.``.
        * The highest percent identity among hits passing the e-value and
          identity filters. Coverage is *not* considered for this value, so
          it may come from a hit that is not in the returned set. It is
          ``-1`` when no hit passes the filters.
    """
    try:
        hits = pd.read_csv(fn, sep='\t', header=None)
    except pd.errors.EmptyDataError:
        # A file without any parsable row simply contains no hits.
        return set(), -1
    hits.columns = COLS

    # Significance filters: e-value and percent identity.
    pident = hits['pident'].astype(float)
    passing = (hits['evalue'].astype(float) <= eval_cutoff) & (pident >= min_pident)
    if not passing.any():
        return set(), -1

    significant = hits[passing]
    max_pident = float(pident[passing].max())

    # BLAST start/end coordinates are 1-based and inclusive, so an alignment
    # from `start` to `end` spans `end - start + 1` residues. Without the +1 a
    # full-length alignment would never reach a coverage of 1.0.
    # Assumes start <= end, as is the case for protein alignments.
    qspan = significant['qend'].astype(int) - significant['qstart'].astype(int) + 1
    sspan = significant['send'].astype(int) - significant['sstart'].astype(int) + 1
    qcov = qspan / significant['qlen'].astype(int)
    scov = sspan / significant['slen'].astype(int)

    well_covered = significant[(qcov > min_qcov) & (scov > min_scov)]
    # Truncate at the first '.' so that different suffixes of one ID collapse.
    close_matches = {str(sid).split('.')[0] for sid in well_covered['sseqid']}
    return close_matches, max_pident

### Process MAT search results

In [46]:
# Directory under `in_base` from which the per-gene MAT search result files
# are read.
wkdir = os.path.join(in_base, 'mat_loff')

In [47]:
# Read the MAT-only query table: a headerless two-column TSV holding a gene
# name and a comma-separated list of gene IDs per row.
fn = os.path.join(in_base, 'mat_only.auto.tsv')
qry_df = pd.read_csv(fn, sep='\t', header=None)
qry_df.columns = ['gene_name', 'gene_ids']

# gene name -> list of its IDs (a repeated gene name keeps its last row)
qry_infos = {
    name: ids.split(',')
    for name, ids in zip(qry_df['gene_name'], qry_df['gene_ids'])
}

print(f'{len(qry_infos)} query genes loaded')

24 query genes loaded


In [48]:
# Summarise the close matches of every MAT query gene in one TSV:
# gene name, comma-separated close-match IDs (or "None"), best percent
# identity (or "NA").
out_fn = os.path.join(in_base, 'mat_only.auto.close_matches.tsv')

# `with` guarantees the table is flushed and closed even if a search result
# file fails to parse part-way through the loop.
with open(out_fn, 'w') as out_fh:
    out_fh.write('gene_name\tclose_match_gene_ids\tmax_pident\n')

    for gene_name in qry_infos:
        in_fn = os.path.join(wkdir, f'{gene_name}_mat.blastp.out')
        if not os.path.exists(in_fn):
            out_fh.write(f'{gene_name}\tNone\tNA\n')
            print(f'no search results file for {gene_name}')
            continue
        if os.path.getsize(in_fn) == 0:
            out_fh.write(f'{gene_name}\tNone\tNA\n')
            print(f'empty search results for {gene_name}')
            continue
        close_matches, max_pident = find_close_matches(in_fn)
        if len(close_matches) == 0:
            out_fh.write(f'{gene_name}\tNone\tNA\n')
        else:
            # Sort the IDs: set iteration order is arbitrary, which would make
            # the output differ between otherwise identical runs.
            out_fh.write(f'{gene_name}\t{",".join(sorted(close_matches))}\t{max_pident}\n')

empty search results for LOC124900189


### Process PAT search results

In [50]:
# Directory under `in_base` from which the per-gene PAT search result files
# are read. Rebinds `wkdir`, so cells below this one read PAT results.
wkdir = os.path.join(in_base, 'pat_loff')

In [51]:
# Read the PAT-only query table: a headerless two-column TSV holding a gene
# name and a comma-separated list of gene IDs per row.
fn = os.path.join(in_base, 'pat_only.auto.tsv')
qry_df = pd.read_csv(fn, sep='\t', header=None)
qry_df.columns = ['gene_name', 'gene_ids']

# gene name -> list of its IDs (a repeated gene name keeps its last row)
qry_infos = {
    name: ids.split(',')
    for name, ids in zip(qry_df['gene_name'], qry_df['gene_ids'])
}

print(f'{len(qry_infos)} query genes loaded')

31 query genes loaded


In [52]:
# Summarise the close matches of every PAT query gene in one TSV:
# gene name, comma-separated close-match IDs (or "None"), best percent
# identity (or "NA").
out_fn = os.path.join(in_base, 'pat_only.auto.close_matches.tsv')

# `with` guarantees the table is flushed and closed even if a search result
# file fails to parse part-way through the loop.
with open(out_fn, 'w') as out_fh:
    out_fh.write('gene_name\tclose_match_gene_ids\tmax_pident\n')

    for gene_name in qry_infos:
        in_fn = os.path.join(wkdir, f'{gene_name}_pat.blastp.out')
        if not os.path.exists(in_fn):
            out_fh.write(f'{gene_name}\tNone\tNA\n')
            print(f'no search results file for {gene_name}')
            continue
        if os.path.getsize(in_fn) == 0:
            out_fh.write(f'{gene_name}\tNone\tNA\n')
            print(f'empty search results for {gene_name}')
            continue
        close_matches, max_pident = find_close_matches(in_fn)
        if len(close_matches) == 0:
            out_fh.write(f'{gene_name}\tNone\tNA\n')
        else:
            # Sort the IDs: set iteration order is arbitrary, which would make
            # the output differ between otherwise identical runs.
            out_fh.write(f'{gene_name}\t{",".join(sorted(close_matches))}\t{max_pident}\n')