In [None]:
from pathlib import Path
import pickle
import sqlite3

import pandas as pd
from tqdm import tqdm

# Space-separated column names of the tab-separated hit table loaded below
# (presumably the same field list that was given to BLAST's -outfmt option;
# confirm against the command that produced the file).
FORMAT = (
    'qseqid sseqid saccver evalue staxids '
    'length qlen slen pident stitle '
    'qseq sseq qcovs frames'
)
# SQLite database queried by taxid2fullnamelineage() (taxid -> lineage names).
FULLNAMELINEAGE_SQLITE3 = Path('fullnamelineage.sqlite3')

# fullnamelineage.dmp (from NCBI: https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/)
# tax_id  |       tax_name  |   lineage |
# 1       |       root    |               |
# 131567  |       cellular organisms      |               |
# 2157    |       Archaea |       cellular organisms;     |
# 1935183 |       Asgard group    |       cellular organisms; Archaea;    |
# 2798909 |       Candidatus Baldrarchaeota       |       cellular organisms; Archaea; Asgard group;      |

# fullnamelineage.sqlite3
# converted from the above .dmp file; the `pickle` column of its `taxid2fullnamelineage` table holds a pickled set object (generated by splitting the 'lineage' text on ";")

In [None]:
def taxid2fullnamelineage(taxid: str, db_path=None) -> set:
    """Return the lineage-name set stored for one taxonomy id.

    Looks ``taxid`` up in the ``taxid2fullnamelineage`` table of the SQLite
    database and unpickles the value found in that row's ``pickle`` column.

    Args:
        taxid: Taxonomy id, normally a string of digits such as ``'2157'``.
            Surrounding whitespace is ignored; an ``int`` is accepted too.
        db_path: Optional path (``str`` or ``Path``) of the SQLite file.
            Defaults to the module-level ``FULLNAMELINEAGE_SQLITE3``.

    Returns:
        The unpickled object of the matching row (expected to be a ``set`` of
        lineage names), or an empty set when no row matches ``taxid``.
    """
    if db_path is None:
        db_path = FULLNAMELINEAGE_SQLITE3

    # The id is bound as a query parameter instead of being pasted into the
    # SQL text, so a malformed value can neither break the statement nor
    # inject SQL. Digit-only ids are bound as integers, which keeps the
    # comparison numeric exactly like the former inline literal; anything
    # else is bound as text and simply matches no row.
    text = str(taxid).strip()
    key = int(text) if text.isdecimal() else text

    con = sqlite3.connect(Path(db_path).as_posix())
    try:
        row = con.execute(
            'SELECT name,pickle FROM taxid2fullnamelineage WHERE taxid = ?',
            (key,),
        ).fetchone()
    finally:
        # Always release the connection, even when the query raises.
        con.close()

    if row is None:
        return set()

    # NOTE(review): pickle.loads can execute arbitrary code. This is only safe
    # while the database file is generated locally from the NCBI dump and is
    # never taken from an untrusted source.
    return pickle.loads(row[1])

In [None]:
def get_d_name(fullname_lineage: set) -> str:
    """Pick the single top-level group label for a collection of lineage names.

    The candidate labels are tested in a fixed priority order and the first
    one contained in ``fullname_lineage`` is returned.

    Args:
        fullname_lineage: Lineage names collected for a group of hits.

    Returns:
        The highest-priority label present, or ``'Other'`` when none is.
    """
    # Order matters: the three domains are tested before the generic
    # 'cellular organisms' label, then the remaining categories.
    priority = (
        'Eukaryota',
        'Bacteria',
        'Archaea',
        'cellular organisms',
        'environmental samples',
        'artificial sequences',
        'vectors',
        'Viruses',
    )
    for label in priority:
        if label in fullname_lineage:
            return label
    return 'Other'

In [None]:
# Load every column listed in FORMAT except the two alignment-sequence
# columns (qseq, sseq), which are not used anywhere below.
col_names = [name for name in FORMAT.split(' ') if name not in ('qseq', 'sseq')]

nr_df = pd.read_csv(
    './nr.blast_results.tsv',
    sep='\t',
    header=None,
    names=FORMAT.split(' '),
    usecols=col_names,
    # staxids is later split on ';' as text. Without an explicit dtype pandas
    # may infer integers for values that hold a single numeric id, and
    # get_taxgroup_superset() ignores non-string values, silently dropping
    # those hits. Missing values still load as NaN.
    dtype={'staxids': str},
)
# nr_df.head()

In [None]:
# Group all (unfiltered) hits by query sequence id so that the complete hit
# list of a query can be fetched with get_group().
nr_groupby = nr_df.groupby(by='qseqid')

In [None]:
# Keep only the hits that satisfy all three thresholds (e-value, percent
# identity, query coverage) and renumber the surviving rows from 0.
nr_filtered_df = (
    nr_df
    .query(" evalue <= 1e-10 & pident >= 95.0 & qcovs >= 50")
    .reset_index(drop=True)
)

# Same grouping key as for the unfiltered table: one group per query id.
filtered_groupby = nr_filtered_df.groupby(by='qseqid')

In [None]:
def get_taxgroup_superset(taxids) -> set:
    """Return the union of the lineage-name sets of every taxid in ``taxids``.

    Args:
        taxids: One ``staxids`` cell: a ';'-separated string of taxonomy ids
            (e.g. ``'9606;10090'``), a single numeric id, or a missing value
            (``None`` / NaN).

    Returns:
        The union of ``taxid2fullnamelineage()`` over all ids in the cell.
        Empty when the cell is missing, contains no id, or none of its ids
        is known.
    """
    if not isinstance(taxids, str):
        # A numeric cell is a single id (pandas infers numbers when a value
        # has no ';'). Missing values (None, NaN) and non-integral numbers
        # carry no id.
        try:
            as_int = int(taxids)
        except (TypeError, ValueError, OverflowError):
            return set()
        if as_int != taxids:
            return set()
        taxids = str(as_int)

    # dict.fromkeys removes repeated ids while keeping their order, so each
    # id is looked up only once. Blank tokens (empty cell, trailing ';') are
    # dropped because they are not ids.
    unique_ids = dict.fromkeys(token.strip() for token in taxids.split(';'))
    unique_ids.pop('', None)

    return set().union(*(taxid2fullnamelineage(taxid=x) for x in unique_ids))

In [None]:
# Results collected by the loop below:
#   return_list          - one {'seqid', 'supergroupname'} record per query
#                          that has no Metamonada hit
#   metamonada_existance - query ids with at least one hit whose lineage
#                          contains 'Metamonada' (value is always True)
return_list = []
metamonada_existance = {}

# Only queries that kept at least one hit after filtering are examined.
for id_ in tqdm(filtered_groupby.groups):
    # Step 1: lineage names over ALL hits of this query (unfiltered table).
    nr_get_df = nr_groupby.get_group(id_)
    taxnames_series = nr_get_df['staxids'].apply(get_taxgroup_superset)
    taxnames_in_hits_set = set().union(*taxnames_series)

    # Any Metamonada hit exempts the query from further classification.
    if 'Metamonada' in taxnames_in_hits_set:
        metamonada_existance[id_] = True
        continue

    # Step 2: lineage names over the FILTERED hits only.
    df_ = filtered_groupby.get_group(id_)
    taxnames_series_filtered = df_['staxids'].apply(get_taxgroup_superset)
    taxnames_in_filtered_hits_set = set().union(*taxnames_series_filtered)

    # No lineage at all means none of the taxids was found in the database.
    if not taxnames_in_filtered_hits_set:
        print('{} was not found, try searching in merged.dmp'.format(set(df_['staxids'])))
    d_name = get_d_name(taxnames_in_filtered_hits_set)

    # Disabled diagnostics kept from the original notebook:
    # if 'cellular organisms' == d_name:
    #     if len(taxnames_series_filtered) == 1:  # if only '2' was included
    #         print('{}: Only 2'.format(id_))
    # elif 'environmental samples' == d_name:
    #     if len(taxnames_series_filtered) == 1:  # if only '417996' was included
    #         print('{}: Only 417996'.format(id_))
    # elif 'Other' == d_name:
    #     print('{}: Something wrong'.format(id_))

    if d_name is None:
        continue
    return_list.append(dict(seqid=id_, supergroupname=d_name))

In [None]:
# Write the ids of all collected records, one per line.
with open('contaminants_seqids.txt', 'w') as w:
    w.writelines(d_['seqid'] + '\n' for d_ in return_list)