# Dev-work: hashing as parsing

In [4]:
# system dependencies
import os
from pathlib import Path
import time
import pickle

# library dependencies
import matplotlib.pyplot as plt
import numpy as np

import pandas as pd
import seaborn as sns
from collections import defaultdict

## biopython
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SearchIO

## pyhmmer
import pyhmmer

## datasketch
from datasketch import MinHash

# local dependencies/utils

## Paths
# Each location can be overridden with an environment variable of the same
# name, so the notebook is not tied to a single machine; the original
# absolute paths are kept as the defaults.
PFAM_PATH = Path(os.environ.get("PFAM_PATH", "/Users/humoodalanzi/pfam/Pfam-A.hmm"))
ID_DB_PATH = Path(os.environ.get("ID_DB_PATH", "/Users/humoodalanzi/pfam/proteins_id.zip"))
# TODO: probably need the path of the unit tests as well

In [2]:
# Absolute paths of the two example domain tables that live in ../examples.
meso_output, thermo_output = (
    os.path.abspath(os.path.join('..', 'examples', table_name))
    for table_name in ("meso_output.domtblout", "thermo_output.domtblout")
)

Building on [dev-HA_pyhmmer.ipynb](./dev-HA_pyhmmer.ipynb), I developed the functions below:

In [6]:
def run_hmmer(
        seqs: pd.core.frame.DataFrame,
        input_file: str,
        hmm: str,
        output_file: str,
        cpu: int = 4,
        prefetching=False,
        save_out=False,
        eval_con: float = 1e-10):
    """
    Write the input sequences to a FASTA file and scan them with `run_pyhmmer`.

    Parameters
    ----------
    seqs : pandas.core.frame.DataFrame
        A dataframe with string amino acid sequences in a 'protein_seq' column.
    input_file : str
        Name of the FASTA file to create and then scan.
    hmm : str
        Path to the HMM database.
    output_file : str
        Path to the output file.
    cpu : int, optional
        The number of CPUs to use. Default is 4.
    prefetching : bool, optional
        Whether to use prefetching for faster search. Default is False.
    save_out : bool, optional
        Whether to save the output to file. Default is False.
    eval_con : float, optional
        E-value threshold forwarded to `run_pyhmmer`. Default is 1e-10.

    Returns
    -------
    object
        Whatever `run_pyhmmer` returns for the given arguments.

    Raises
    ------
    ValueError
        If the input dataframe is empty (raised by `read_seq`).
    AttributeError
        If any of the sequences are invalid (raised by `read_seq`).

    Notes
    -----
    This function is a thin pipeline: `read_seq` creates the FASTA input,
    then `run_pyhmmer` runs the scan on it.
    """
    # generate the FASTA input from the dataframe
    read_seq(seqs, input_file)

    # scan the FASTA file against the HMM database and hand the result back
    # to the caller (previously the result was silently dropped)
    return run_pyhmmer(
        hmm,
        input_file,
        output_file,
        cpu,
        prefetching,
        save_out,
        eval_con)


def read_seq(lists: pd.core.frame.DataFrame, inputname: str = "input"):
    """
    Write the sequences of a dataframe to a FASTA file.

    Parameters
    ----------
    lists : pandas.core.frame.DataFrame
        A dataframe with string amino acid sequences in a 'protein_seq'
        column. The index value of each row becomes the record id. Other
        columns are ignored.
    inputname : str, default = 'input'
        Name of the FASTA file to create. ".fasta" is appended unless the
        name already ends with it.

    Returns
    -------
    file : TextIOWrapper
        The handle the records were written through. It is already closed
        when returned.

    Raises
    ------
    ValueError
        If the input dataframe is empty.
    KeyError
        If the dataframe has no 'protein_seq' column.
    AttributeError
        If any of the sequences cannot be turned into a record.
    """
    # check if input is empty
    if lists.empty:
        raise ValueError("Input dataframe is empty")

    # Build one record per row. Iterating the column (instead of unpacking
    # itertuples()) keeps this working when the frame has extra columns.
    records = []
    for index, seq in lists['protein_seq'].items():
        try:
            records.append(SeqRecord(Seq(seq), id=str(index)))
        except Exception as exc:
            # Exception rather than BaseException, so Ctrl-C and interpreter
            # exit are not reported as "invalid sequence".
            raise AttributeError(f"Invalid sequence: {seq}") from exc

    # avoid producing "name.fasta.fasta" when the extension is already there
    fasta_path = str(inputname)
    if not fasta_path.endswith(".fasta"):
        fasta_path = f"{fasta_path}.fasta"

    with open(fasta_path, "w", encoding="utf-8") as file:
        SeqIO.write(records, file, "fasta")
    return file


def run_pyhmmer(
        hmmdb: str,
        input_file: str,
        output_file: str,
        cpu: int = 4,
        prefetching=False,
        save_out=False,
        eval_con: float = 1e-10):
    """
    Run hmmscan on input sequences with HMMs from a database.

    Parameters
    ----------
    hmmdb : str
        Path to the HMM database.
    input_file : str
        Path to the input sequence file. The extension is forced to ".fasta".
    output_file : str
        Path to the output file. The extension is forced to ".domtblout".
    cpu : int, optional
        The number of CPUs to use. Default is 4.
    prefetching : bool, optional
        Whether to use prefetching for faster search. Default is False.
    save_out : bool, optional
        Whether to save the output to file. Default is False.
    eval_con : float, optional
        Value passed to hmmscan as its `E` option. Default is 1e-10.

    Returns
    -------
    all_hits : list or None
        The objects yielded by hmmscan, collected in a list, if `save_out`
        is False, otherwise None.

    Notes
    -----
    The pressed database is stored under "../data/pfam/" using the base name
    of `hmmdb`; it is created on the first call and reused afterwards.
    In prefetching mode the pressed profiles are loaded into an
    OptimizedProfileBlock before the scan; otherwise the pressed ".h3m" file
    is opened with HMMFile and passed to hmmscan directly.
    """
    # One prefix shared by the existence check, hmmpress and the readers, so
    # the files that are looked for are the files that get written.
    pressed_dir = "../data/pfam/"
    pressed_prefix = os.path.join(pressed_dir, os.path.basename(hmmdb))
    if not os.path.exists(pressed_prefix + ".h3m"):
        os.makedirs(pressed_dir, exist_ok=True)
        # the source database is only needed (and only opened) for pressing
        with pyhmmer.plan7.HMMFile(hmmdb) as hmms:
            pyhmmer.hmmer.hmmpress(hmms, pressed_prefix)

    # Ensure input_file has .fasta extension
    if not input_file.endswith('.fasta'):
        input_file = f"{os.path.splitext(input_file)[0]}.fasta"
    # Ensure output_file has .domtblout extension
    if not output_file.endswith('.domtblout'):
        output_file = f"{os.path.splitext(output_file)[0]}.domtblout"

    # amino acid alphabet and (optionally prefetched) targets
    aa = pyhmmer.easel.Alphabet.amino()
    if prefetching:
        # NOTE(review): assumes HMMPressedFile accepts the prefix that
        # hmmpress wrote to -- confirm against the installed pyhmmer version.
        targets = pyhmmer.plan7.OptimizedProfileBlock(
            aa, list(pyhmmer.plan7.HMMPressedFile(pressed_prefix)))
    else:
        targets = pyhmmer.plan7.HMMFile(pressed_prefix + ".h3m")

    all_hits = None
    try:
        with pyhmmer.easel.SequenceFile(input_file, digital=True) as seqs:
            scan = pyhmmer.hmmer.hmmscan(seqs, targets, cpus=cpu, E=eval_con)
            if save_out:
                # binary mode takes no `encoding` argument
                with open(output_file, "wb") as dst:
                    for i, hits in enumerate(scan):
                        hits.write(dst, format="domains", header=i == 0)
            else:
                # Consume the scan while the sequence file is still open, in
                # case hmmscan produces its results lazily.
                all_hits = list(scan)
    finally:
        # only the non-prefetching target is an open file handle
        if not prefetching:
            targets.close()

    return all_hits

In [7]:
### testing

# read df
df_sample = pd.read_csv("learn2therm_sample_50k_exploration.csv", index_col=0)

# split the database into corresponding thermo and meso lists
meso_seq_db = df_sample[["meso_index", "m_protein_seq"]]
thermo_seq_db = df_sample[["thermo_index", "t_protein_seq"]]

# Make the corresponding index the dataframe index and keep only the first
# 50 sequences. The chain builds new frames at every step instead of
# mutating a slice with inplace=True, so the cell can be re-run safely.
meso_seq_list = (
    meso_seq_db
    .set_index("meso_index")
    .iloc[:50]
    .rename_axis(None)
    .rename(columns={'m_protein_seq': 'protein_seq'})
)

thermo_seq_list = (
    thermo_seq_db
    .set_index("thermo_index")
    .iloc[:50]
    .rename_axis(None)
    .rename(columns={'t_protein_seq': 'protein_seq'})
)

## Hashing

In [8]:
# Scan the 50 mesophile sequences against Pfam with prefetched profiles.
run_hmmer(
    seqs=meso_seq_list,
    input_file="meso_input",
    hmm=PFAM_PATH,
    output_file="meso_output",
    cpu=5,
    prefetching=True,
)

In [9]:
# `all_hits` is only ever assigned inside run_pyhmmer, so no such name exists
# at notebook level -- evaluating it here raises the NameError shown below.
all_hits

NameError: name 'all_hits' is not defined

So I have to work with the `run_pyhmmer` function directly.

In [15]:
def run_pyhmmer(
        hmmdb: str,
        input_file: str,
        output_file: str,
        cpu: int = 4,
        prefetching=False,
        save_out=False,
        eval_con: float = 1e-10,
        num_hashes: int = 128
        ):
    """
    Run hmmscan on input sequences and MinHash the accession of every hit.

    Parameters
    ----------
    hmmdb : str
        Path to the HMM database.
    input_file : str
        Path to the input sequence file. The extension is forced to ".fasta".
    output_file : str
        Path to the output file. The extension is forced to ".domtblout".
    cpu : int, optional
        The number of CPUs to use. Default is 4.
    prefetching : bool, optional
        Whether to use prefetching for faster search. Default is False.
    save_out : bool, optional
        Whether to also write the hits to `output_file`. Default is False.
    eval_con : float, optional
        Value passed to hmmscan as its `E` option. Default is 1e-10.
    num_hashes : int, optional
        Number of permutations of each MinHash. Default is 128.

    Returns
    -------
    minhashes : dict
        Maps each distinct hit accession to a MinHash built from the
        individual bytes of that accession.

    Notes
    -----
    The pressed database is stored under "../data/pfam/" using the base name
    of `hmmdb`; it is created on the first call and reused afterwards.
    Hits without an accession are skipped.
    """
    # One prefix shared by the existence check, hmmpress and the readers, so
    # the files that are looked for are the files that get written.
    pressed_dir = "../data/pfam/"
    pressed_prefix = os.path.join(pressed_dir, os.path.basename(hmmdb))
    if not os.path.exists(pressed_prefix + ".h3m"):
        os.makedirs(pressed_dir, exist_ok=True)
        with pyhmmer.plan7.HMMFile(hmmdb) as hmms:
            pyhmmer.hmmer.hmmpress(hmms, pressed_prefix)

    # Force the expected extensions. Re-attaching the suffix unconditionally
    # also covers names that already carry it (they used to lose it).
    input_file = os.path.splitext(input_file)[0] + '.fasta'
    output_file = os.path.splitext(output_file)[0] + '.domtblout'

    # amino acid alphabet and (optionally prefetched) targets
    aa = pyhmmer.easel.Alphabet.amino()
    if prefetching:
        # NOTE(review): assumes HMMPressedFile accepts the prefix that
        # hmmpress wrote to -- confirm against the installed pyhmmer version.
        targets = pyhmmer.plan7.OptimizedProfileBlock(
            aa, list(pyhmmer.plan7.HMMPressedFile(pressed_prefix)))
    else:
        targets = pyhmmer.plan7.HMMFile(pressed_prefix + ".h3m")

    # accession -> MinHash
    minhashes = {}
    dst = None
    try:
        with pyhmmer.easel.SequenceFile(input_file, digital=True) as seqs:
            if save_out:
                dst = open(output_file, "wb")
            # single pass: optionally write each result, then hash its hits
            for i, hits in enumerate(
                    pyhmmer.hmmer.hmmscan(seqs, targets, cpus=cpu, E=eval_con)):
                if dst is not None:
                    hits.write(dst, format="domains", header=i == 0)
                for hit in hits:
                    acc_id = hit.accession
                    # nothing to hash, or already hashed for an earlier hit
                    if acc_id is None or acc_id in minhashes:
                        continue
                    # accept both bytes and str accessions
                    acc_bytes = acc_id.encode() if isinstance(acc_id, str) else acc_id
                    mh = MinHash(num_perm=num_hashes)
                    for d in acc_bytes:
                        mh.update(d.to_bytes(1, 'little'))
                    minhashes[acc_id] = mh
    finally:
        if dst is not None:
            dst.close()
        # only the non-prefetching target is an open file handle
        if not prefetching:
            targets.close()

    return minhashes

In [16]:
hmmdb = PFAM_PATH
input_file = "../data/meso_input"
output_file = "testing3"

# Only the prefetching run is timed here; pass prefetching=False to compare.
# perf_counter is a monotonic clock intended for measuring elapsed time.
start_time = time.perf_counter()
run_pyhmmer(hmmdb, input_file, output_file, cpu=4, prefetching=True, save_out=True)
end_time = time.perf_counter()
print(f"Time with prefetching on: {end_time - start_time:.3f} seconds")

Time with prefetching on: 10.858 seconds


I needed a way to view the minhashes and save them to a file. As a test, I added pickling (via `pickle`) below.

In [6]:
def run_pyhmmer(
        hmmdb: str,
        input_file: str,
        output_file: str,
        cpu: int = 4,
        prefetching=False,
        save_out=False,
        eval_con: float = 1e-10,
        num_hashes: int = 128,
        minhash_file=None,
        ):
    """
    Run hmmscan, MinHash the accession of every hit and optionally persist both.

    Parameters
    ----------
    hmmdb : str
        Path to the HMM database.
    input_file : str
        Path to the input sequence file. The extension is forced to ".fasta".
    output_file : str
        Path to the output file. The extension is forced to ".domtblout".
    cpu : int, optional
        The number of CPUs to use. Default is 4.
    prefetching : bool, optional
        Whether to use prefetching for faster search. Default is False.
    save_out : bool, optional
        Whether to write the hits to `output_file`. Default is False.
    eval_con : float, optional
        Value passed to hmmscan as its `E` option. Default is 1e-10.
    num_hashes : int, optional
        Number of permutations of each MinHash. Default is 128.
    minhash_file : str or None, optional
        If given, the accession -> MinHash dictionary is pickled to this path.

    Returns
    -------
    minhashes : dict or None
        Maps each distinct hit accession to a MinHash built from the
        individual bytes of that accession if `save_out` is False,
        otherwise None.

    Notes
    -----
    The pressed database is stored under "../data/pfam/" using the base name
    of `hmmdb`; it is created on the first call and reused afterwards.
    Hits without an accession are skipped.
    """
    # One prefix shared by the existence check, hmmpress and the readers, so
    # the files that are looked for are the files that get written.
    pressed_dir = "../data/pfam/"
    pressed_prefix = os.path.join(pressed_dir, os.path.basename(hmmdb))
    if not os.path.exists(pressed_prefix + ".h3m"):
        os.makedirs(pressed_dir, exist_ok=True)
        with pyhmmer.plan7.HMMFile(hmmdb) as hmms:
            pyhmmer.hmmer.hmmpress(hmms, pressed_prefix)

    # Force the expected extensions. Re-attaching the suffix unconditionally
    # also covers names that already carry it (they used to lose it).
    input_file = os.path.splitext(input_file)[0] + '.fasta'
    output_file = os.path.splitext(output_file)[0] + '.domtblout'

    # amino acid alphabet and (optionally prefetched) targets
    aa = pyhmmer.easel.Alphabet.amino()
    if prefetching:
        # NOTE(review): assumes HMMPressedFile accepts the prefix that
        # hmmpress wrote to -- confirm against the installed pyhmmer version.
        targets = pyhmmer.plan7.OptimizedProfileBlock(
            aa, list(pyhmmer.plan7.HMMPressedFile(pressed_prefix)))
    else:
        targets = pyhmmer.plan7.HMMFile(pressed_prefix + ".h3m")

    # accession -> MinHash
    minhashes = {}
    dst = None
    try:
        with pyhmmer.easel.SequenceFile(input_file, digital=True) as seqs:
            if save_out:
                dst = open(output_file, "wb")
            # Write and hash in the same pass: iterating the scan a second
            # time would find nothing if hmmscan returns a one-shot iterator.
            for i, hits in enumerate(
                    pyhmmer.hmmer.hmmscan(seqs, targets, cpus=cpu, E=eval_con)):
                if dst is not None:
                    hits.write(dst, format="domains", header=i == 0)
                for hit in hits:
                    acc_id = hit.accession
                    # nothing to hash, or already hashed for an earlier hit
                    if acc_id is None or acc_id in minhashes:
                        continue
                    # accept both bytes and str accessions
                    acc_bytes = acc_id.encode() if isinstance(acc_id, str) else acc_id
                    mh = MinHash(num_perm=num_hashes)
                    for d in acc_bytes:
                        mh.update(d.to_bytes(1, 'little'))
                    minhashes[acc_id] = mh
    finally:
        if dst is not None:
            dst.close()
        # only the non-prefetching target is an open file handle
        if not prefetching:
            targets.close()

    # Save the minhashes to a file
    if minhash_file is not None:
        with open(minhash_file, 'wb') as f:
            pickle.dump(minhashes, f)

    return minhashes if not save_out else None

In [8]:
hmmdb = PFAM_PATH
input_file = "../data/meso_input"
output_file = "testing2"
minhash_file = "testing3.pickle"


# Time a prefetching run that also pickles the minhashes.
# perf_counter is a monotonic clock intended for measuring elapsed time.
start_time = time.perf_counter()
run_pyhmmer(hmmdb, input_file, output_file, cpu=4, prefetching=True, minhash_file=minhash_file)
end_time = time.perf_counter()
print(f"Time with prefetching on: {end_time - start_time:.3f} seconds")

Time with prefetching on: 10.931 seconds


In [9]:
# Reload the pickled accession -> MinHash dictionary.
# Only unpickle files this notebook produced itself; pickle can execute code.
minhashes = pickle.loads(Path('testing3.pickle').read_bytes())

# Show every accession next to its MinHash signature
for acc_id, mh in minhashes.items():
    print(acc_id, mh.digest())

b'PF04542.17' [ 170058183  231972810   13184479   31312792  132721297  162003033
  223352304 1001639860   56920750  689953153  709707179  264093939
  589979148  206606881  728114937  716560562  303782223  518718070
   61898119  345530959  972384886  238376316  242292921  225330139
  478898225  209569695  247430539 1701580170 1631227315  146302257
  208408024 1193163431   63338930  723740236  279173791  137035955
  184549058  760508848 1151800413  820341160  447125243  180635373
  437814554  459883785  422609979  593356018  670234631   53239987
 1516053798   95211786   50762164  834942432   55729082  199573619
   81751156 1413411833 1052487644  433736305  396240731  108931309
  454099809  675714005  117605995  433070387  181700671  273260755
  586056180  589223797  120193397   14029173  308464802  361896725
  728093259  403360895  469689648  310342692  589547676   12057391
  216195692  274538963 1411730339  194981993    1530477  324159111
  908951260 1225962489  533181291   85051013  38