In [52]:
import pandas as pd
import numpy as np

In [53]:
class bcolors:
    """ ANSI escape sequences for colouring / styling terminal output. """

    # bright foreground colours
    FAIL = '\033[91m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    OKBLUE = '\033[94m'
    HEADER = '\033[95m'
    OKCYAN = '\033[96m'

    # text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # reset: switches every colour / attribute off again
    ENDC = '\033[0m'


def get_phrog(row):
    """ Return the PHROG cluster number encoded in row['target'], or 0 when the target is not a PHROG.

    A target counts as a PHROG when its string form contains 'phrog_'; the
    number is whatever follows the last underscore.
    """
    target = row['target']

    # anything that is not a phrog_* identifier (including NaN) maps to 0
    if 'phrog_' not in str(target):
        return 0

    return int(target.rpartition('_')[2])
    
    
def curate_columns(row):
    """ Collapse the 'phrog' and 'alan_profile' columns of a row into a single string.

    Returns the string form of the first of the two values (phrog first) that
    is not '0'; returns '0' when both are '0'.
    """
    # both values are read up front, in priority order
    candidates = [str(row[column]) for column in ('phrog', 'alan_profile')]

    return next((value for value in candidates if value != '0'), '0')

In [54]:
print('Load tables for testing... ', end='')


# raw hit tables, one per database
phrogs_df = pd.read_csv('big_tables/phrogs.tsv', sep='\t')
alan_df = pd.read_csv('big_tables/alan.tsv', sep='\t')
pfam_df = pd.read_csv('big_tables/pfam.tsv', sep='\t')
ecod_df = pd.read_csv('big_tables/ecod.tsv', sep='\t')

### metadata tables


### replace PHROGS functions with CUSTOM MGG PHROGS functions.

# load
phrogs_annot_df = pd.read_csv('metadata_tables/phrog_annot_v4.tsv', sep='\t') # load PHROGS
mgg_phrogs_annot_df = pd.read_csv('metadata_tables/v3_phrogs-table-rafal-3_12.csv', sep=';') # load MGG PHROGS

# rename columns and keep exactly one custom function per original function,
# so that the left merge below cannot multiply rows of the PHROGS table
# NOTE(review): if one original function maps to several custom functions, the first one wins - confirm
mgg_phrogs_annot_df = mgg_phrogs_annot_df.rename(columns={'funct.orig': 'annot', 'funct.new': 'annot_mgg'})[['annot', 'annot_mgg']] \
                                         .drop_duplicates(subset='annot')

# map to CUSTOM MGG PHROGS functions
# (the result has to be assigned back; merge/drop/rename return new frames, so
#  without the assignment the mapping is computed and thrown away)
phrogs_annot_df = phrogs_annot_df.merge(mgg_phrogs_annot_df, on='annot', how='left') \
                                 .drop('annot', axis=1) \
                                 .rename(columns={'annot_mgg': 'annot'})

# format
phrogs_annot_df = phrogs_annot_df[['phrog', 'color', 'annot', 'category']] # order columns
phrogs_annot_df = phrogs_annot_df.fillna('unknown function') # missing values (any column) -> 'unknown function'


### ALANDB
alan_annot_df = pd.read_csv('metadata_tables/alan_annot.tsv', sep=',')
alan_annot_df = alan_annot_df.rename(columns={'definition': 'annot', 'funct': 'category', 'profile': 'alan_profile'})
alan_annot_df = alan_annot_df.drop(['abbrev', 'hmm.name', 'family.name', 'family.name.base', 'where.it.occurs', 'is.custom', 'custom.hmm.exists'], axis=1)

print('Done!')


print('Report hits... ', end='')
print('PHROGS metadata... ', end='')
phrogs_df['phrog'] = phrogs_df.apply(get_phrog, axis=1)
phrogs_df = phrogs_df.merge(phrogs_annot_df, on='phrog', how='left')

print('ALANDB metadata... ', end='')
alan_df['alan_profile'] = alan_df['target']
alan_df = alan_df.merge(alan_annot_df, on='alan_profile', how='left')

print('final table... ', end='')
# ignore_index: the four tables each carry their own 0..n index; without it the
# combined frame has repeated index labels, which makes label-based lookups ambiguous
df = pd.concat([phrogs_df, alan_df, pfam_df, ecod_df], axis=0, ignore_index=True)
df[['phrog', 'alan_profile']] = df[['phrog', 'alan_profile']].fillna('0') # rows from the other databases have no phrog / alan_profile
df['phrog'] = df['phrog'].astype(int)
df['phrog/alan_profile'] = df.apply(curate_columns, axis=1)
df = df.drop(['phrog', 'alan_profile'], axis=1)


print('Done!')

Load tables for testing... Done!
Report hits... PHROGS metadata... ALANDB metadata... final table... Done!


In [55]:
# df.query('query == "PC00001"')

In [56]:
def get_no_hit_frames(pcid):
    """ Build the default ('no hit') report frames for one protein cluster.

    Returns a 4-tuple of placeholder frames in the order PHROGS (2 rows),
    ALAN (2 rows), PFAM (1 row), ECOD (1 row). Each frame has 'query' set to
    pcid, plus its 'report_label' and 'report_confidence' columns filled in.
    """
    # one entry per database: the report labels (and therefore the number of rows)
    label_layout = (['PHROGS1', 'PHROGS2'], ['ALAN1', 'ALAN2'], ['PFAM'], ['ECOD'])

    # by default: no hit
    frames = []
    for labels in label_layout:
        frame = get_no_hit_row(len(labels))
        frame['query'] = pcid
        frame['report_confidence'] = get_confidence_column(frame)
        frame['report_label'] = labels
        frames.append(frame)

    return tuple(frames)

def get_confidence_column(df, eval_intervals=(10**-10, 10**-5, 10**-3, 1), col='evalue', verbose=False):
    """ Translate e-values into confidence symbols.

    With the default intervals the mapping is:
        value == 0               -> '-'    (separate category: no hit)
        value <= 10**-10         -> '***'
        10**-10 < value <= 10**-5 -> '**'
        10**-5  < value <= 10**-3 -> '*'
        10**-3  < value <= 1      -> '!'
        value >= 1               -> '!'
        nothing matches          -> '?'
    In general the k-th of n intervals gets n-k stars, except the last interval,
    which gets '!'. Conditions are tested in the order above and the first
    match wins (np.select semantics).

    Parameters
    ----------
    df : object indexable by `col` (e.g. a DataFrame); df[col] must support ==, <=, > and >= comparisons.
    eval_intervals : increasing sequence of upper bounds of the confidence intervals.
    col : name of the column holding the e-values.
    verbose : print the tested conditions and the symbols assigned to them.

    Returns
    -------
    numpy array of confidence symbols, as produced by np.select.
    """
    # example input: df = pd.DataFrame({'evalue': [10**-10, 10**-7, 10**-3, 10, 10**2, 0]})
    values = df[col]

    ### conditions
    # zero as separate category (no hits)
    conditions = [values == 0]
    if verbose: print(' == 0')

    for i, evalue in enumerate(eval_intervals):
        # lower than first number
        if i == 0:
            conditions.append(values <= evalue)
            if verbose: print(f'<= {evalue:.2E}')

        # between numbers
        else:
            lower_evalue = eval_intervals[i-1]
            conditions.append((values > lower_evalue) & (values <= evalue))
            if verbose: print(f'> {lower_evalue:.2E} and <= {evalue:.2E}')

    # higher than last number
    conditions.append(values >= eval_intervals[-1])
    if verbose: print(f'>= {eval_intervals[-1]}')

    ### choices
    # n-1 stars for the first interval down to one star; the last interval and everything above it get '!'
    stars = ['*' * n_stars for n_stars in range(len(eval_intervals) - 1, 0, -1)]
    choices = ['-'] + stars + ['!', '!']

    # print
    if verbose: print(choices)

    return np.select(conditions, choices, default='?')


def get_no_hit_row(n, columns_mapper = {'query': 'string', 'target': 'string', 'prob': 'float', \
                                        'pvalue': 'float', 'ident': 'float', 'qcov': 'float', \
                                        'tcov': 'float', 'bits': 'float', 'qstart': 'int', \
                                        'qend': 'int', 'qlength': 'int', 'tstart': 'int', \
                                        'tend': 'int', 'tlength': 'int', 'evalue': 'float', \
                                        'db': 'string', 'name': 'string', 'color': 'string', \
                                        'annot': 'string', 'category': 'string', 'phrog/alan_profile': 'string',
                                        'report_label': 'string', 'report_function': 'string', 'report_params': 'string'}):

    """ Create a data frame of n identical 'no hit' placeholder rows.

    columns_mapper maps column name -> variable type; columns typed 'string'
    are filled with '-', every other column with 0. All n rows carry the same
    index label (0), because the single-row frame is simply repeated.
    """

    column_names = list(columns_mapper)
    placeholders = ['-' if variable_type == 'string' else 0 for variable_type in columns_mapper.values()]

    # one-row frame (object dtype), then stacked n times
    single_row = pd.Series(placeholders, index=column_names).to_frame().T
    return pd.concat([single_row] * n)


def report_phrogs(df, max_evalue=10**-3, nfunc2report=2, verbose=False):

    """
    Report the best PHROGS functions for one protein cluster (PC).

    Planned procedure:
    1. Filter hits by e-value (<= max_evalue, default 10**-3).
    2. Remove unknown function [REPORT ONLY WHEN NO OTHER FUNCTION, PRIORITY-0]
    3. Remove non-informative functions (lytic tail protein, tail protein, structural protein, virion structural protein, minor tail protein ...) [REPORT ONLY WHEN NO OTHER FUNCTION; PRIORITY-1]
    4. Group by unique functions. For each function report independently max bitscore and max qcov (hits to this function).
    5. Take the nfunc2report functions with highest bitscores.
    6. Report {confidence} {function} in one genbank field, and in separate field bitscore and qcov.
    7. Report the best PHROG hits separately (per hit: function with confidence, and params: bitscore and qcov) [PRIORITY-2]

    The fallbacks of steps 2 and 3 (report unknown / non-informative functions
    when nothing else is left) are not implemented: such hits are always dropped.

    Parameters
    ----------
    df : PHROGS hits of a single PC; reads the columns 'query', 'evalue', 'annot', 'bits' and 'qcov'.
    max_evalue : hits with a higher e-value are ignored.
    nfunc2report : number of rows to return; missing functions are padded with 'no hit' rows.
    verbose : display the resulting frame.

    Returns
    -------
    Data frame with one row per reported function plus the columns 'report_label'
    (PHROGS1, PHROGS2, ...), 'report_function', 'report_params' and 'report_confidence'.
    """
    # TODO: take unknown function if no other function!

    # get PC name
    pcid = df['query'].unique()[0]

    #### get informative hits
    noninformative_functions = ['lytic tail protein', 'tail protein', 'structural protein', 'virion structural protein', 'minor tail protein']

    is_significant = df['evalue'] <= max_evalue
    is_known = df['annot'] != 'unknown function'
    is_informative = ~df['annot'].isin(noninformative_functions)
    informative_df = df[is_significant & is_known & is_informative]

    ### report function
    top_hits_df = informative_df.iloc[:0].copy()
    if not informative_df.empty:
        by_function = informative_df.groupby('annot')

        # select best hit for each unique function (highest bitscore)
        best_hits_df = informative_df.loc[by_function['bits'].idxmax()] \
                                     .sort_values('bits', ascending=False) \
                                     .copy()

        # qcov of a function is the highest qcov among all hits to that function,
        # not necessarily the qcov of its best-bitscore hit
        best_hits_df['qcov'] = best_hits_df['annot'].map(by_function['qcov'].max())

        top_hits_df = best_hits_df.iloc[:nfunc2report].copy()

        # params are reported for every real hit, also when only one function was found
        top_hits_df['report_params'] = [f'bits: {int(bits): <4} qcov: {qcov:.2f}'
                                        for bits, qcov in zip(top_hits_df['bits'], top_hits_df['qcov'])]

    # pad with 'no hit' rows up to nfunc2report (their report_params is '-')
    n_missing = nfunc2report - len(top_hits_df)
    if n_missing > 0:
        holder_rows = get_no_hit_row(n_missing)
        if top_hits_df.empty: top_hits_df = holder_rows
        else: top_hits_df = pd.concat([top_hits_df, holder_rows])

    # report columns
    labels = [f'PHROGS{i}' for i in range(1,len(top_hits_df)+1)]

    top_hits_df['query'] = [pcid] * len(top_hits_df)
    top_hits_df['report_label'] = labels
    top_hits_df['report_function'] = top_hits_df['annot']
    top_hits_df['report_confidence'] = get_confidence_column(top_hits_df)

    if verbose: display(top_hits_df)
    return top_hits_df


def report_alan(df, min_prob=0.95, nfunc2report=2, verbose=True):
    """ Report the best ALANDB hits for one protein cluster (PC).

    Hits with prob >= min_prob are sorted by bitscore (descending) and the
    first nfunc2report of them are reported; missing hits are padded with
    'no hit' rows.

    Parameters
    ----------
    df : ALANDB hits of a single PC; reads the columns 'query', 'prob', 'bits', 'evalue' and 'category'.
    min_prob : minimal probability of a hit to be reported.
    nfunc2report : number of rows to return.
    verbose : display the resulting frame.

    Returns
    -------
    Data frame with one row per reported hit plus the columns 'report_label'
    (ALAN1, ALAN2, ...), 'report_function' (the hit's 'category'),
    'report_params' and 'report_confidence'.
    """

    # get PC name
    pcid = df['query'].unique()[0]

    # sort significant hits
    filter_df = df[df['prob'] >= min_prob].sort_values('bits', ascending=False)

    ### report function
    top_hits_df = filter_df.iloc[:nfunc2report].copy()

    # params only for real hits; 'no hit' rows keep the '-' placeholder
    # (same convention as the default no-hit frames)
    if not top_hits_df.empty:
        top_hits_df['report_params'] = [f'bits: {int(bits): <4} eval: {evalue:.1E}'
                                        for bits, evalue in zip(top_hits_df['bits'], top_hits_df['evalue'])]

    # pad with 'no hit' rows up to nfunc2report
    n_missing = nfunc2report - len(top_hits_df)
    if n_missing > 0:
        holder_rows = get_no_hit_row(n_missing)
        if top_hits_df.empty: top_hits_df = holder_rows
        else: top_hits_df = pd.concat([top_hits_df, holder_rows])

    # report columns
    labels = [f'ALAN{i}' for i in range(1,len(top_hits_df)+1)]

    top_hits_df['query'] = [pcid] * len(top_hits_df)
    top_hits_df['report_label'] = labels
    top_hits_df['report_function'] = top_hits_df['category']
    top_hits_df['report_confidence'] = get_confidence_column(top_hits_df)

    if verbose: display(top_hits_df)

    return top_hits_df

In [57]:
def report_ecod(df, verbose=True):
    """ Report the best ECOD hit (highest bitscore) for one protein cluster (PC).

    No significance filter is applied: the best hit is reported whatever its
    e-value, from the most significant to the non-significant.

    The reported function is built from the hit's 'name':
    field 1 of the '|'-separated name is used as the F index, field 3 holds the
    ECOD levels, which are split on ': ' to pick the T and F level names.
    NOTE(review): assumes names look like
    '<id> | <f-index> | <range> | A: .., X: .., H: .., T: .., F: ..' - confirm against the ECOD table.

    Parameters
    ----------
    df : ECOD hits of a single PC; reads the columns 'query', 'name', 'bits' and 'evalue'.
         An empty frame yields a 'no hit' row with query '-'.
    verbose : display the resulting frame.

    Returns
    -------
    One-row data frame with the columns 'report_label' ('ECOD'),
    'report_function', 'report_params' and 'report_confidence' added.
    """

    if len(df) != 0:
        # get PC name
        pcid = df['query'].unique()[0]

        # best hit
        top_hit = df.sort_values('bits', ascending=False).iloc[0].copy()

        # report function
        name_fields = top_hit['name'].split('|')
        f_index, ecod_levels = name_fields[1].strip(), name_fields[3]
        level_pieces = ecod_levels.split(': ')

        # the T level name is followed by the label of the next level (', F');
        # cut off exactly that suffix - str.strip(', F') would also eat any
        # leading / trailing 'F' of the name itself
        t_name = level_pieces[4]
        if t_name.endswith(', F'): t_name = t_name[:-len(', F')]
        t_name, f_name = t_name.strip(), level_pieces[5].strip()

        report_function = f'T: {t_name}, F: {f_name} [{f_index}]'
        report_params = f'bits: {int(top_hit["bits"]): <4} evalue: {top_hit["evalue"]:.1E}'
    else: # no hit
        pcid = '-'
        top_hit = get_no_hit_row(1).iloc[0].copy()
        report_function, report_params = '-', '-'

    # report columns
    top_hit['query'] = pcid
    top_hit['report_label'] = 'ECOD'
    top_hit['report_function'] = report_function
    top_hit['report_params'] = report_params

    # confidence is computed on the one-row frame, so a plain value is stored in the cell
    top_hit_df = top_hit.to_frame().T
    top_hit_df['report_confidence'] = get_confidence_column(top_hit_df)

    if verbose: display(top_hit_df)
    return top_hit_df


def report_pfam(df, verbose=True):
    """ Report the best informative PFAM hit (highest bitscore) for one protein cluster (PC).

    Hits whose 'name' contains 'DUF' are treated as non-informative and
    dropped, as are hits without a name. If nothing is left a 'no hit' row is
    reported.

    The reported function is built from the first three ';'-separated fields of
    the hit's 'name', used as PFAM id, short function and detailed function.
    NOTE(review): assumes names look like '<pfam id> ; <short> ; <detailed>' - confirm against the PFAM table.

    Parameters
    ----------
    df : PFAM hits of a single PC; reads the columns 'query', 'name', 'bits' and 'evalue'.
    verbose : display the resulting frame.

    Returns
    -------
    One-row data frame with the columns 'report_label' ('PFAM'),
    'report_function', 'report_params' and 'report_confidence' added.
    """
    # TODO: take unknown function if no other function!

    # get PC name
    pcid = df['query'].unique()[0]

    # informative hits (plain substring match; missing names count as non-informative)
    is_noninformative = df['name'].str.contains('DUF', regex=False, na=True)
    df = df[~is_noninformative].sort_values('bits', ascending=False)

    # best hit & function2report
    if len(df) != 0:
        top_hit = df.iloc[0].copy()
        name_fields = [field.strip() for field in top_hit['name'].split(';')]
        pfamID, func_short, func_detailed = name_fields[0], name_fields[1], name_fields[2]
        report_function = f'{func_detailed} [{func_short}] [{pfamID}]'
        report_params = f'bits: {int(top_hit["bits"]): <4} evalue: {top_hit["evalue"]:.1E}'
    else: # no hit
        top_hit = get_no_hit_row(1).iloc[0].copy()
        report_function, report_params = '-', '-'


    # report columns
    top_hit['query'] = pcid
    top_hit['report_label'] = 'PFAM'
    top_hit['report_function'] = report_function
    top_hit['report_params'] = report_params

    # confidence is computed on the one-row frame, so a plain value is stored in the cell
    top_hit_df = top_hit.to_frame().T
    top_hit_df['report_confidence'] = get_confidence_column(top_hit_df)

    if verbose: display(top_hit_df)
    return top_hit_df

In [67]:
# Keep only the first 1/1000 of the rows to speed up testing.
# NOTE: the cut is made on rows, not on protein clusters, so the last PC may be
#       truncated; re-running this cell shrinks `df` again.
print('Subsampling 1/1000')
print('!!!!!!!Groupby PC!!!!!!!!!')
frac = len(df) // 1000
df = df.head(frac)

Subsampling 1/1000
!!!!!!!Groupby PC!!!!!!!!!


In [63]:
%time
best_hits_dfs = []
for pcid, pc in df.groupby('query'):
    
    # default (no hit)
    phrogs_df, alan_df, pfam_df, ecod_df = get_no_hit_frames(pcid)
    
    for dbid, db in pc.groupby('db'):
        
        # report function
        if dbid == 'PHROGS': phrogs_df = report_phrogs(db, max_evalue=10**-3, nfunc2report=2, verbose=False)
        elif dbid == 'ALANDB': alan_df = report_alan(db, min_prob=0.95, nfunc2report=2, verbose=False)
        elif dbid == 'PFAM': pfam_df = report_pfam(db, verbose=False)
        elif dbid == 'ECOD': ecod_df = report_ecod(db, verbose=False)
        else: pass

    best_hits_dfs.append(phrogs_df)
    best_hits_dfs.append(alan_df)
    best_hits_dfs.append(pfam_df)
    best_hits_dfs.append(ecod_df)


    # if pcid == 'PC00010': break

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.25 µs
Take unknown function if no other function!


In [64]:
# stack the per-PC report frames into one table with a fresh 0..n-1 index
best_hits_df = pd.concat(best_hits_dfs, ignore_index=True)

# e-values as text in scientific notation with two decimals
best_hits_df['evalue'] = best_hits_df['evalue'].map(lambda value: f'{value:.2E}')

# keep (and order) only the columns that go into the report
best_hits_df = best_hits_df.loc[:, ['query','target', 'prob', 'qcov', 'tcov', 'bits', 'evalue', 'report_label', 'report_function', 'report_params', 'report_confidence']]

In [65]:
# write the report as a tab-separated file, without the index column
best_hits_df.to_csv(path_or_buf='annot.tsv', index=False, sep='\t')