# Dev work: building a HMMER/Pfam parsing function

In [65]:
# system dependencies
import builtins
import logging
import os
import subprocess
import time
from collections import defaultdict
from pathlib import Path

# library dependencies
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## biopython
from Bio import SearchIO
from Bio import SeqIO
from Bio.SearchIO.HmmerIO.hmmer3_domtab import Hmmer3DomtabHmmqueryIndexer, Hmmer3DomtabHmmqueryParser
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


# local dependencies/utils

## Paths
# Directory holding the local Pfam files. Set the PFAM_DIR environment variable
# to point somewhere else without editing this cell; the default keeps the
# original location.
PFAM_DIR = Path(os.environ.get("PFAM_DIR", "/Users/humoodalanzi/pfam"))
PFAM_PATH = PFAM_DIR / "Pfam-A.hmm"
ID_DB_PATH = PFAM_DIR / "proteins_id.zip"
#probably need path of unit tests

In [3]:
# Absolute paths to the example HMMER domain-table outputs. The relative '..'
# is resolved against the current working directory, so these depend on where
# the notebook is launched from.
meso_output = os.path.abspath(os.path.join('..', 'examples', "meso_output.domtblout"))
thermo_output = os.path.abspath(os.path.join('..', 'examples', "thermo_output.domtblout"))

## Testing out biopython hmmer indexer

In [8]:
# Build Biopython's domtab indexer on the mesophile output.
meso_indexer = Hmmer3DomtabHmmqueryIndexer(meso_output)

In [26]:
# List the attributes and methods available on the indexer object.
dir(meso_indexer)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_handle',
 '_kwargs',
 '_parse',
 '_parser',
 '_query_id_idx',
 'get',
 'get_raw']

In [41]:
# Walk every query result and every hit in the mesophile domain table.
with open(meso_output, 'r') as handle:
    for query_result in Hmmer3DomtabHmmqueryParser(handle):
        #process each query result
        query_id = query_result.id
        hits = query_result.hits
        for hit in hits:
            hit_id = hit.id
            # NOTE: despite its name, `hsps` holds the hit-level E-value here.
            hsps = hit.evalue
# The print sits outside both loops, so it runs once and shows only the values
# left over from the last hit of the last query.
print(query_id, hit_id, hsps)

11636 Rad17 0.29


A simple parsing pass to see which hit-level and HSP-level fields the parser above exposes:

In [42]:
# a dict to save best hits (created here but not filled in by this cell)
best_hits = {}

# Walk query -> hit -> HSP and pull out the hit-level and HSP-level fields.
with open(meso_output, 'r') as handle:
    for query_result in Hmmer3DomtabHmmqueryParser(handle):
        #process each query result
        query_id = query_result.id
        hits = query_result.hits
        for hit in hits:
            for hsp in hit.hsps:
                # hit-level fields
                family_id = hit.id
                evalue = hit.evalue
                seqlen = hit.seq_len
                # HSP-level fields; `hsps` holds this HSP's E-value
                hsps = hsp.evalue
                start = hsp.query_start
                end = hsp.query_end
# Runs once after all loops, so only the last HSP of the last hit of the last
# query is printed.
print(query_id, family_id, evalue, seqlen, hsps, start, end)

            # if hit.id not in best_hits:
            #     # First hit for this hit ID, so store it as the best hit
            #     best_hits[hit.id] = hit
            # else:
            #     # Check if this hit is better than the current best hit
            #     current_best_hit = best_hits[hit.id]
            #     hit_hsps = sorted(hit.hsp, key= lambda h: h.evalue_cond)
            #     hit_id = hit.id
            #     hsps = hit.evalue

11636 Rad17 0.29 186 0.65 48 69


### Past work

---

#### Redoing Evan's code

In [56]:
# Parse the mesophile domain table with SearchIO using the 'hmmscan3-domtab' format.
test = SearchIO.parse(meso_output, 'hmmscan3-domtab')

In [58]:
# Collect every parsed query result into a list so individual results can be indexed.
result_test = list(test)

In [63]:
# Show the hits recorded for the first parsed query result.
result_test[0].hits

[Hit(id='Response_reg', query_id='13026', 1 hsps),
 Hit(id='Trans_reg_C', query_id='13026', 2 hsps),
 Hit(id='FleQ', query_id='13026', 1 hsps)]

In [64]:
def iter():
    """Yield each query result from the module-level ``result_test``.

    ``result_test`` may be any iterable (the list built from the parser output,
    or the parser generator itself). A single iterator is taken from it once,
    up front, so the generator terminates when the records run out.

    Yields
    ------
    object or None
        The next record, or ``None`` in place of a record whose retrieval
        raised an exception (for example a malformed row in the parsed file).
    """
    # This function shadows the builtin ``iter`` at module level, so the
    # builtin has to be reached through the ``builtins`` module.
    records = builtins.iter(result_test)
    while True:
        try:
            record = next(records)
        except StopIteration:
            return
        except Exception:
            # Report the failure to the caller as a ``None`` placeholder and
            # keep going. ``Exception`` (not a bare ``except``) leaves
            # GeneratorExit and KeyboardInterrupt alone.
            record = None
        yield record

In [None]:
# Time one full pass over the parsed records, counting how many could not be
# read (reported as None by iter()) and how many belong to protein ID 11324.
t0 = time.perf_counter()
found = 0
N = 0
failures = 0
# Use a dedicated loop variable: rebinding `result_test` here would overwrite
# the collection that iter() reads from and make this cell impossible to re-run.
for record in iter():
    N += 1
    print(N)
    if record is None:
        failures += 1
        print(f'Did {N}, broek {failures}')
        continue

    if int(record.id) == 11324:
        found += 1
print(N)
print(time.perf_counter() - t0)

---

In [45]:
# define a function to extract the best hit for each protein (essentially what Evan did). Big thanks to Biostar as usual
def get_best_hits(filename):
    """Return the lowest-E-value hit for every query in a HMMER domain table.

    Parameters
    ----------
    filename : str
        Path to a domain-table file readable with the 'hmmscan3-domtab' format.

    Returns
    -------
    dict
        Maps each query ID to ``{'hit_id': ..., 'evalue': ...}`` for the hit
        with the smallest ``evalue``. When several hits tie, the first one
        encountered is kept. Queries that carry no hits are left out.
    """
    best_hits = {}
    for query_result in SearchIO.parse(filename, 'hmmscan3-domtab'):
        # default=None guards against a query with no hits, which would
        # otherwise fail on the attribute access below.
        best_hit = min(query_result, key=lambda hit: hit.evalue, default=None)
        if best_hit is None:
            continue
        best_hits[query_result.id] = {'hit_id': best_hit.id, 'evalue': best_hit.evalue}
    return best_hits

In [46]:
# Best hit per query for the mesophile output.
get_best_hits(meso_output)

{'12897': {'hit_id': 'ABC_tran', 'evalue': 6.9e-29},
 '13026': {'hit_id': 'ABC_tran', 'evalue': 2.8e-29},
 '8203': {'hit_id': 'adh_short_C2', 'evalue': 1.1e-46},
 '3340': {'hit_id': 'TetR_C_16', 'evalue': 2.3e-27},
 '14020': {'hit_id': 'BPD_transp_1', 'evalue': 1.3e-14},
 '3582': {'hit_id': 'TetR_C_33', 'evalue': 8.2e-17},
 '13920': {'hit_id': 'MarR_2', 'evalue': 3.9e-10},
 '6370': {'hit_id': 'MlaD', 'evalue': 2.7e-14},
 '9289': {'hit_id': 'PALP', 'evalue': 3.7e-77},
 '742': {'hit_id': 'adh_short', 'evalue': 1.3e-40},
 '11706': {'hit_id': 'BPD_transp_2', 'evalue': 2.8e-40},
 '753': {'hit_id': 'Aldo_ket_red', 'evalue': 3.7e-75},
 '13575': {'hit_id': 'ABC_tran', 'evalue': 1.2e-23},
 '9179': {'hit_id': 'adh_short_C2', 'evalue': 5.9e-60},
 '14594': {'hit_id': 'BPD_transp_1', 'evalue': 9.9e-16},
 '7818': {'hit_id': 'ABC_tran', 'evalue': 0.024},
 '8772': {'hit_id': 'DUF1707', 'evalue': 1.8e-15},
 '14840': {'hit_id': 'ABC_tran', 'evalue': 3.5e-36},
 '8446': {'hit_id': 'HypA', 'evalue': 2.2e-2

In [None]:
# hell mode
def _best_hits_by_protein(results):
    """Map each query ID to a list of the best hit(s) returned by ``extract_best_hit``."""
    hits_by_protein = {}
    for result in results:
        best = extract_best_hit(result)
        # Normalise to a list so callers can always iterate over hits.
        if best is None:
            hits_by_protein[result.id] = []
        elif isinstance(best, list):
            hits_by_protein[result.id] = best
        else:
            hits_by_protein[result.id] = [best]
    return hits_by_protein


def compare_hmmer_outputs(mesophile_file: str, thermophile_file: str, how: str, **kwargs):
    """
    Compares the HMMER output files for mesophilic and thermophilic sequences to
    determine if they share the same target protein families.

    Parameters
    ----------
    mesophile_file : str
        Path to the HMMER output file for mesophilic sequences.
    thermophile_file : str
        Path to the HMMER output file for thermophilic sequences.
    how : str
        Method used to parse the two outputs:
            - 'extract_best_hit': reduce each query result to its best hit(s)
              using ``extract_best_hit`` before comparing families.
    **kwargs
        Accepted for forward compatibility; currently unused.

    Returns
    -------
    pandas.DataFrame
        One row per mesophilic best hit with columns 'Target Family' (the hit
        ID) and 'Functional'. 'Functional' carries the same value on every
        row: True when the set of mesophilic target families equals the set of
        thermophilic target families.

    Raises
    ------
    NotImplementedError
        If ``how`` is not 'extract_best_hit'.
    ValueError
        If, for positionally paired query results, the number of best hits of
        the mesophilic protein differs from the number of hits of the
        thermophilic protein.

    Notes
    -----
    Errors raised while reading either file propagate to the caller.
    """
    if how != "extract_best_hit":
        raise NotImplementedError(f"Type {how} not supported yet. So far, the only available method is  'extract_best_hit' ")

    # Materialise both result streams: each one is traversed more than once
    # below, and a one-shot iterator would be empty on the second pass.
    mesophile_results = list(SearchIO.parse(mesophile_file, 'hmmscan3-domtab'))
    thermophile_results = list(SearchIO.parse(thermophile_file, 'hmmscan3-domtab'))

    # extract the best hit(s) for each protein in both datasets
    meso_hits = _best_hits_by_protein(mesophile_results)
    thermo_hits = _best_hits_by_protein(thermophile_results)

    # check that there is a corresponding hit for each meso hit in the thermo results
    # NOTE(review): this compares the count of *best* meso hits with the count
    # of *all* thermo hits, and pairs records purely by position -- confirm
    # this is the intended consistency check.
    for meso_res, thermo_res in zip(mesophile_results, thermophile_results):
        protein_id = meso_res.id
        num_meso_hits = len(meso_hits[protein_id])
        num_thermo_hits = len(thermo_res.hits)
        if num_meso_hits != num_thermo_hits:
            raise ValueError(f"The number of hits for meso protein {protein_id} ({num_meso_hits}) does not match the number of hits for its corresponding thermo protein ({num_thermo_hits})")

    # get the target families for each protein
    mesophile_target_families = [hit.id for hits in meso_hits.values() for hit in hits]
    thermophile_target_families = [hit.id for hits in thermo_hits.values() for hit in hits]

    # Compare target families to determine if they are functional or not
    functional = set(mesophile_target_families) == set(thermophile_target_families)

    # Create a dataframe with the target families and functional status
    data = {'Target Family': mesophile_target_families, 'Functional': functional}
    df = pd.DataFrame(data)

    logging.info('compare_hmmer_outputs function completed successfully')

    return df