# Dev-work: working on pyhmmer

In [2]:
# system dependecies
import subprocess
import os
from pathlib import Path
import time

# library dependencies
import matplotlib.pyplot as plt
import numpy as np

import pandas as pd
import seaborn as sns
from collections import defaultdict

## biopython
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SearchIO

# pyhmmer
import pyhmmer

# local dependencies/utils

## Paths
PFAM_PATH = Path("/Users/humoodalanzi/pfam/Pfam-A.hmm")
ID_DB_PATH = Path("/Users/humoodalanzi/pfam/proteins_id.zip")
DATA_PATH = Path("../data/")
OUT_PATH = Path("../data/pfam/")
#probably need path of unit tests

For references on how to use the pyhmmer API, check this [link](https://pyhmmer.readthedocs.io/en/stable/api/)

### Press pfam HMM into db for speed purposes

In [2]:
type(pyhmmer)

module

In [3]:
# Create hmms
hmms = pyhmmer.plan7.HMMFile(PFAM_PATH)
# press hmms and store them in the pfam data folder
pyhmmer.hmmer.hmmpress(hmms, "../data/pfam/")

19632

Like my HMMER3 code, this should be the number of HMMs inside my pfam db

#### An aside; to create FASTA files
---

I still need my read_seq function from previous rounds of development. The reason why I didn't directly import it is because I am using a different enviroment for testing

In [4]:
def read_seq(lists: pd.core.frame.DataFrame, inputname: str = "input"):
    """
    Returns a list of SeqRecord objects and creates a corresponding input Fasta of them

    Parameters:
    ------------
    list : pandas.core.frame.DataFrame
        a dataframe with string amino acid sequences in a 'seq' column
    input name : str, default = 'input'
        a name for the input fasta file

    
    Returns:
    ------------
    file : TextIOWrapper
        the input fasta file created from the list of SeqRecord objects

    Raises
    -------
    ValueError : 
        if the input dataframe is empty
    AttributeError :
        if any of the sequences are invalid
    """
    # check if input is empty
    if lists.empty:
        raise ValueError("Input dataframe is empty")
    
    # check if sequences are valid
    for seq in lists['protein_seq']:
        try:
            Seq(seq)
        except:
            raise AttributeError("Invalid sequence")

    # function    
    records = []
    for index, seq in lists.itertuples():
        try:
            record = SeqRecord(Seq(seq), id=str(index))
            records.append(record)
        except AttributeError:
            raise AttributeError(f"Invalid sequence: {seq}")
    
    # raise error if seq not valid
    if not records:
        raise AttributeError("No valid sequences found in input")
    
    with open(f"{inputname}.fasta", "w") as file:
            SeqIO.write(records, file, "fasta")
    return file

In [5]:
df_sample = pd.read_csv('learn2therm_sample_50k_exploration.csv', index_col=0)
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 0 to 49999
Data columns (total 29 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   local_gap_compressed_percent_id        50000 non-null  float64
 1   scaled_local_query_percent_id          50000 non-null  float64
 2   scaled_local_symmetric_percent_id      50000 non-null  float64
 3   query_align_len                        50000 non-null  int64  
 4   query_align_cov                        50000 non-null  float64
 5   subject_align_len                      50000 non-null  int64  
 6   subject_align_cov                      50000 non-null  float64
 7   bit_score                              50000 non-null  int64  
 8   thermo_index                           50000 non-null  int64  
 9   meso_index                             50000 non-null  int64  
 10  prot_pair_index                        50000 non-null  int64  
 11  me

In [6]:
## Meso (+a bit of processing)
meso_seq_db2 = df_sample[["meso_index", "m_protein_seq"]]
meso_seq_list2 = meso_seq_db2.set_index("meso_index").iloc[:100]
meso_seq_list2.index.name = None
meso_seq_list2.rename({'m_protein_seq': 'protein_seq'}, axis="columns", inplace=True)


## Thermo (+ a bit of processing)
thermo_seq_db2 = df_sample[["thermo_index", "t_protein_seq"]]
thermo_seq_list2 = thermo_seq_db2.set_index("thermo_index").iloc[:100]
thermo_seq_list2.index.name = None
thermo_seq_list2.rename({'t_protein_seq': 'protein_seq'}, axis="columns", inplace=True)



# generate meso and thermo files
read_seq(meso_seq_list2, "../data/meso_input")
read_seq(thermo_seq_list2, "../data/thermo_input")

<_io.TextIOWrapper name='../data/thermo_input.fasta' mode='w' encoding='UTF-8'>

---

### Working with pyhmmer proper

In [2]:
with pyhmmer.plan7.HMMFile("../data/pfam/") as hmms:
    with pyhmmer.easel.SequenceFile("../data/meso_input.fasta", digital=True) as seqs:
        t1 = time.time()
        all_hits = list(pyhmmer.hmmer.hmmscan(seqs, hmms, cpus=5, E=1e-10))
        totals = len(all_hits)
        print(f"- hmmscan found {totals} hits without prefetching in {time.time() - t1:.3} seconds")

- hmmscan found 100 hits without prefetching in 6.14 seconds


Let's explore the variable "all_hits"

In [8]:
all_hits[0]

<pyhmmer.plan7.TopHits at 0x7f839b0b9320>

In [9]:
test1 = all_hits[0][0]

let's explore the different methods

In [10]:
dir(test1)

['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_cython__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__setstate_cython__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'accession',
 'best_domain',
 'bias',
 'description',
 'domains',
 'dropped',
 'duplicate',
 'evalue',
 'hits',
 'included',
 'name',
 'new',
 'pre_score',
 'pvalue',
 'reported',
 'score',
 'sum_score']

In [11]:
dir(all_hits[0])

['E',
 'T',
 'Z',
 '__add__',
 '__bool__',
 '__class__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__radd__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'bit_cutoffs',
 'block_length',
 'compare_ranking',
 'copy',
 'domE',
 'domT',
 'domZ',
 'incE',
 'incT',
 'incdomE',
 'incdomT',
 'included',
 'is_sorted',
 'long_targets',
 'merge',
 'query_accession',
 'query_name',
 'reported',
 'searched_models',
 'searched_nodes',
 'searched_residues',
 'searched_sequences',
 'sort',
 'strand',
 'to_msa',
 'write']

In [12]:
len(all_hits[0])

3

We got three hits for the first input; interesting

In [13]:
test1.name
# correct

b'Sigma70_r2'

In [14]:
test1.accession
# that's true

b'PF04542.17'

In [15]:
test1.evalue
# right

2.5999653142493426e-19

In [16]:
test1.pvalue
# this is c-evalue in hmmer3

1.3243507101922078e-23

In [17]:
test1.best_domain

<pyhmmer.plan7.Domain at 0x148baf6c0>

In [18]:
dir(test1.best_domain)

['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_cython__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__setstate_cython__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'alignment',
 'bias',
 'c_evalue',
 'correction',
 'env_from',
 'env_to',
 'envelope_score',
 'hit',
 'i_evalue',
 'included',
 'pvalue',
 'reported',
 'score']

way too much information

In [19]:
dir(test1.hits)
# you can move between top hits and hit; very cool

['E',
 'T',
 'Z',
 '__add__',
 '__bool__',
 '__class__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__radd__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'bit_cutoffs',
 'block_length',
 'compare_ranking',
 'copy',
 'domE',
 'domT',
 'domZ',
 'incE',
 'incT',
 'incdomE',
 'incdomT',
 'included',
 'is_sorted',
 'long_targets',
 'merge',
 'query_accession',
 'query_name',
 'reported',
 'searched_models',
 'searched_nodes',
 'searched_residues',
 'searched_sequences',
 'sort',
 'strand',
 'to_msa',
 'write']

Can I save the pyhmmer results?

In [20]:
dir(all_hits[0])

['E',
 'T',
 'Z',
 '__add__',
 '__bool__',
 '__class__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__radd__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'bit_cutoffs',
 'block_length',
 'compare_ranking',
 'copy',
 'domE',
 'domT',
 'domZ',
 'incE',
 'incT',
 'incdomE',
 'incdomT',
 'included',
 'is_sorted',
 'long_targets',
 'merge',
 'query_accession',
 'query_name',
 'reported',
 'searched_models',
 'searched_nodes',
 'searched_residues',
 'searched_sequences',
 'sort',
 'strand',
 'to_msa',
 'write']

### Saving output

In [115]:
# import collections
# Result = collections.namedtuple("Result", ["name", "query", "accession" , "bitscore"])

# results = []
# with pyhmmer.plan7.HMMFile("../data/pfam/") as hmms:
#     with pyhmmer.easel.SequenceFile("../data/meso_input.fasta", digital=True) as seqs:
#         t1 = time.time()
#         all_hits = list(pyhmmer.hmmer.hmmscan(seqs, hmms, cpus=5, E=1e-10))
#         totals = len(all_hits)
#         print(f"- hmmscan found {totals} hits without prefetching in {time.time() - t1:.3} seconds")
        
#         # find query
#         for top_hits in all_hits:
#             query = top_hits.query_name.decode()
#         for hit in top_hits:
#             if hit.included:
#                 results.append(Result(hit.name.decode(), query, hit.accession, hit.score))

- hmmscan found 100 hits without prefetching in 5.51 seconds


Really inefficent (meomry) I don't like that. Let's see if I can use IOBase

In [28]:
with pyhmmer.plan7.HMMFile("../data/pfam/.h3m") as hmms:
    with pyhmmer.easel.SequenceFile("../data/meso_input.fasta", digital=True) as seqs:
        with open("testing.domtblout", "wb") as dst:
            t1 = time.time()
            for i, hits in enumerate(pyhmmer.hmmer.hmmscan(seqs, hmms, cpus=5, E=1e-10)):
                hits.write(dst, format="domains", header=i==0)
            print(f"- hmmscan took {time.time() - t1:.3} seconds")

- hmmscan took 5.89 seconds


#### prefetching work

In [107]:
#### lab
with pyhmmer.plan7.HMMFile("../data/pfam/.h3m") as hmms:
    test = hmms.is_pressed()
    # target = hmms.optimized_profiles()
    print(test)
    # print(target)

False


In [108]:
#### lab
with pyhmmer.plan7.HMMFile("../data/pfam/.h3p") as hmms:
    test = hmms.is_pressed()
    # target = hmms.optimized_profiles()
    print(test)
    # print(target)

ValueError: format not recognized by HMMER

In [109]:
#### lab
with pyhmmer.plan7.HMMFile("../data/pfam/") as hmms:
    test = hmms.is_pressed()
    # target = hmms.optimized_profiles()
    print(test)
    # print(target)

True


In [32]:
dir(pyhmmer.plan7.Profile)

['L',
 'M',
 '__class__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'accession',
 'alphabet',
 'clear',
 'configure',
 'consensus',
 'consensus_structure',
 'copy',
 'cutoffs',
 'description',
 'evalue_parameters',
 'local',
 'multihit',
 'name',
 'offsets',
 'to_optimized']

how do I save the hmm file/pfam as optimized profiles?

In [71]:
dir(pyhmmer.plan7.OptimizedProfileBlock)

['__class__',
 '__contains__',
 '__copy__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'alphabet',
 'append',
 'clear',
 'copy',
 'extend',
 'index',
 'insert',
 'pop',
 'remove']

In [82]:
# empty list to store profiles
optimized_profiles = []
# amino acid alphabet
aa = pyhmmer.easel.Alphabet.amino()
with pyhmmer.plan7.HMMPressedFile(PFAM_PATH) as hfile:
    for profiles in hfile:
        profile = hfile.read()
        optimized_profiles.append(profile)
test = pyhmmer.plan7.OptimizedProfileBlock(aa, optimized_profiles)
print(test)

OptimizedProfileBlock(pyhmmer.easel.Alphabet.amino(), [<pyhmmer.plan7.OptimizedProfile object at 0x10be58b40>, <pyhmmer.plan7.OptimizedProfile object at 0x1413543c0>, <pyhmmer.plan7.OptimizedProfile object at 0x141356240>, <pyhmmer.plan7.OptimizedProfile object at 0x1413560c0>, <pyhmmer.plan7.OptimizedProfile object at 0x141354280>, <pyhmmer.plan7.OptimizedProfile object at 0x141357040>, <pyhmmer.plan7.OptimizedProfile object at 0x1413545c0>, <pyhmmer.plan7.OptimizedProfile object at 0x141357e40>, <pyhmmer.plan7.OptimizedProfile object at 0x141356a00>, <pyhmmer.plan7.OptimizedProfile object at 0x141354600>, <pyhmmer.plan7.OptimizedProfile object at 0x1413544c0>, <pyhmmer.plan7.OptimizedProfile object at 0x141357c80>, <pyhmmer.plan7.OptimizedProfile object at 0x141356040>, <pyhmmer.plan7.OptimizedProfile object at 0x141354f00>, <pyhmmer.plan7.OptimizedProfile object at 0x141357c00>, <pyhmmer.plan7.OptimizedProfile object at 0x1413570c0>, <pyhmmer.plan7.OptimizedProfile object at 0x14135

Future update: turns out the above was not totally correct, but with new sets of eyes (literally), it was solved!

Hope we figured it out!

In [99]:
# # empty list to store profiles
# optimized_profiles = []

# amino acid alphabet
aa = pyhmmer.easel.Alphabet.amino()

optimized_profiles = list(pyhmmer.plan7.HMMPressedFile(PFAM_PATH))
targets = pyhmmer.plan7.OptimizedProfileBlock(aa, optimized_profiles)

with pyhmmer.easel.SequenceFile("../data/meso_input.fasta", digital=True) as seqs:
    with open("testing42.domtblout", "wb") as dst:
        t1 = time.time()
        for i, hits in enumerate(pyhmmer.hmmer.hmmscan(seqs, targets, cpus=5, E=1e-10)):
                hits.write(dst, format="domains", header=i==0)
        print(f"- hmmscan took {time.time() - t1:.3} seconds")

- hmmscan took 1.92 seconds


Both results have the same number of hits!

---

### Developing my pyhmmer function

Thing I want to do for my pyhmmer func:
- Figured out if pressing can be made faster

In [105]:
def run_pyhmmer(hmmdb: str, input_file: str, output_file:str, cpu: int = 4, prefetching= False, save_out=False, eval_con: int = 1e-10):
    """
    TODO
    """
    # Create hmms
    hmms = pyhmmer.plan7.HMMFile(hmmdb)
    # press hmms and store them in the pfam data folder or w/e destination
    pyhmmer.hmmer.hmmpress(hmms, "../data/pfam/")

    # Ensure input_file has .fasta extension
    if not input_file.endswith('.fasta'):
        input_file = f"{os.path.splitext(input_file)[0]}.fasta"
    # Ensure output_file has .domtblout extension
    if not output_file.endswith('.domtblout'):
        output_file = f"{os.path.splitext(output_file)[0]}.domtblout"

    if prefetching is False:
        if save_out is True:
            with pyhmmer.plan7.HMMFile("../data/pfam/.h3m") as hmms:
                with pyhmmer.easel.SequenceFile(input_file, digital=True) as seqs:
                    with open(output_file, "wb") as dst:
                        for i, hits in enumerate(pyhmmer.hmmer.hmmscan(seqs, hmms, cpus=cpu, E=eval_con)):
                            hits.write(dst, format="domains", header=i==0)
        else:
            with pyhmmer.plan7.HMMFile("../data/pfam/.h3m") as hmms:
                with pyhmmer.easel.SequenceFile(input_file, digital=True) as seqs:
                    all_hits = pyhmmer.hmmer.hmmscan(seqs, hmms, cpus=cpu, E=eval_con)
    else:
        # amino acid alphabet and prefetched inputs
        aa = pyhmmer.easel.Alphabet.amino()
        optimized_profiles = list(pyhmmer.plan7.HMMPressedFile(hmmdb))
        targets = pyhmmer.plan7.OptimizedProfileBlock(aa, optimized_profiles)

        if save_out is True:
            with pyhmmer.easel.SequenceFile(input_file, digital=True) as seqs:
                with open(output_file, "wb") as dst:
                    for i, hits in enumerate(pyhmmer.hmmer.hmmscan(seqs, targets, cpus=cpu, E=eval_con)):
                            hits.write(dst, format="domains", header=i==0)
        else:
            with pyhmmer.easel.SequenceFile(input_file, digital=True) as seqs:
                all_hits = pyhmmer.hmmscan(seqs, targets, cpus=cpu, E=eval_con)

In [52]:
#### Testing function
run_pyhmmer(PFAM_PATH, "../data/meso_input", "testing2", save_out=True)

In [103]:
#### Testing prefetching I
hmmdb = PFAM_PATH
input_file = "../data/meso_input"
output_file = "testing4"

# Test with prefetching off
start_time = time.time()
run_pyhmmer(hmmdb, input_file, output_file, cpu=4, prefetching=False, save_out=True)
end_time = time.time()
print(f"Time with prefetching off: {end_time - start_time:.3f} seconds")

# Test with prefetching on
start_time = time.time()
run_pyhmmer(hmmdb, input_file, output_file, cpu=4, prefetching=True, save_out=True)
end_time = time.time()
print(f"Time with prefetching on: {end_time - start_time:.3f} seconds")


Time with prefetching off: 12.942 seconds
Time with prefetching on: 10.131 seconds


In [106]:
#### Testing prefetching II
hmmdb = PFAM_PATH
input_file = "../data/meso_input"
output_file = "testing4"

# Test with prefetching off
start_time = time.time()
run_pyhmmer(hmmdb, input_file, output_file, cpu=4, prefetching=False)
end_time = time.time()
print(f"Time with prefetching off: {end_time - start_time:.3f} seconds")

# Test with prefetching on
start_time = time.time()
run_pyhmmer(hmmdb, input_file, output_file, cpu=4, prefetching=True)
end_time = time.time()
print(f"Time with prefetching on: {end_time - start_time:.3f} seconds")

Time with prefetching off: 13.036 seconds
Time with prefetching on: 8.615 seconds


Cool!

## Optimizing my pyhmmer function

In [110]:
def run_pyhmmer(hmmdb: str, input_file: str, output_file: str, cpu: int = 4, prefetching=False, save_out=False, eval_con: float = 1e-10):
    """
    Run hmmscan on input sequences with HMMs from a database.

    Parameters
    ----------
    hmmdb : str
        Path to the HMM database.
    input_file : str
        Path to the input sequence file.
    output_file : str
        Path to the output file.
    cpu : int, optional
        The number of CPUs to use. Default is 4.
    prefetching : bool, optional
        Whether to use prefetching for faster search. Default is False.
    save_out : bool, optional
        Whether to save the output to file. Default is False.
    eval_con : float, optional
        E-value threshold for domain reporting. Default is 1e-10.

    Returns
    -------
    all_hits : pyhmmer.plan7.TopHits or None
        The output hits if `save_out` is False, otherwise None.

    Notes
    -----
    This function runs HMMER's hmmscan program on a set of input sequences using HMMs from a given database. 
    The function supports two modes: normal mode and prefetching mode. 
    In normal mode, the HMMs are pressed and stored in a directory before execution. 
    In prefetching mode, the HMMs are kept in memory for faster search.
    """
    # Create hmms
    hmms = pyhmmer.plan7.HMMFile(hmmdb)
    # press hmms and store them in the pfam data folder or w/e destination
    if not os.path.exists(os.path.join("../data/pfam/", os.path.basename(hmmdb) + ".h3m")):
        pyhmmer.hmmer.hmmpress(hmms, "../data/pfam/")

    # Ensure input_file has .fasta extension
    if not input_file.endswith('.fasta'):
        input_file = f"{os.path.splitext(input_file)[0]}.fasta"
    # Ensure output_file has .domtblout extension
    if not output_file.endswith('.domtblout'):
        output_file = f"{os.path.splitext(output_file)[0]}.domtblout"

    # amino acid alphabet and prefetched inputs
    aa = pyhmmer.easel.Alphabet.amino()
    optimized_profiles = list(pyhmmer.plan7.HMMPressedFile(hmmdb))
    targets = pyhmmer.plan7.OptimizedProfileBlock(aa, optimized_profiles) if prefetching else pyhmmer.plan7.HMMFile("../data/pfam/.h3m")


    # HMMscan execution with or without save_out
    with pyhmmer.easel.SequenceFile(input_file, digital=True) as seqs:
        if save_out:
            with open(output_file, "wb") as dst:
                for i, hits in enumerate(pyhmmer.hmmer.hmmscan(seqs, targets, cpus=cpu, E=eval_con)):
                    hits.write(dst, format="domains", header=i==0)
        else:
            all_hits = pyhmmer.hmmscan(seqs, targets, cpus=cpu, E=eval_con)

    return all_hits if not save_out else None

In [111]:
hmmdb = PFAM_PATH
input_file = "../data/meso_input"
output_file = "testing4"

# Test with prefetching off
start_time = time.time()
run_pyhmmer(hmmdb, input_file, output_file, cpu=4, prefetching=False)
end_time = time.time()
print(f"Time with prefetching off: {end_time - start_time:.3f} seconds")

# Test with prefetching on
start_time = time.time()
run_pyhmmer(hmmdb, input_file, output_file, cpu=4, prefetching=True)
end_time = time.time()
print(f"Time with prefetching on: {end_time - start_time:.3f} seconds")

Time with prefetching off: 14.725 seconds
Time with prefetching on: 8.162 seconds


## Making a wrapper for pyhmmer

In [112]:
def HMMER_run(seqs: pd.core.frame.DataFrame, input_file: str, hmm: str, output_file: str, cpu: int = 4, prefetching=False, save_out=False, eval_con: float = 1e-10):
    """
    Runs HMMER's hmmscan program on a set of input sequences using HMMs from a given database.

    Parameters
    ----------
    seqs : pandas.core.frame.DataFrame
        A dataframe with string amino acid sequences in a 'seq' column.
    input_file : str
        Path to the input sequence file.
    hmm : str
        Path to the HMM database.
    output_file : str
        Path to the output file.
    cpu : int, optional
        The number of CPUs to use. Default is 4.
    prefetching : bool, optional
        Whether to use prefetching for faster search. Default is False.
    save_out : bool, optional
        Whether to save the output to file. Default is False.
    eval_con : float, optional
        E-value threshold for domain reporting. Default is 1e-10.

    Returns
    -------
    pyhmmer.plan7.TopHits or None
        The output hits if `save_out` is False, otherwise None.

    Raises
    ------
    ValueError
        If the input dataframe is empty.
    AttributeError
        If any of the sequences are invalid.

    Notes
    -----
    This function runs HMMER's hmmscan program on a set of input sequences using HMMs from a given database. 
    The function supports two modes: normal mode and prefetching mode. 
    In normal mode, the HMMs are pressed and stored in a directory before execution. 
    In prefetching mode, the HMMs are kept in memory for faster search.
    """
    # generate meso and thermo files
    read_seq(seqs, input_file)

    # place files into HMMER/pfam
    run_pyhmmer(
        hmm,
        input_file,
        output_file,
        cpu,
        prefetching,
        save_out,
        eval_con)

def read_seq(lists: pd.core.frame.DataFrame, inputname: str = "input"):
    """
    Returns a list of SeqRecord objects and creates a corresponding input Fasta of them

    Parameters:
    ------------
    list : pandas.core.frame.DataFrame
        a dataframe with string amino acid sequences in a 'seq' column
    input name : str, default = 'input'
        a name for the input fasta file

    
    Returns:
    ------------
    file : TextIOWrapper
        the input fasta file created from the list of SeqRecord objects

    Raises
    -------
    ValueError : 
        if the input dataframe is empty
    AttributeError :
        if any of the sequences are invalid
    """
    # check if input is empty
    if lists.empty:
        raise ValueError("Input dataframe is empty")
    
    # check if sequences are valid
    for seq in lists['protein_seq']:
        try:
            Seq(seq)
        except:
            raise AttributeError("Invalid sequence")

    # function    
    records = []
    for index, seq in lists.itertuples():
        try:
            record = SeqRecord(Seq(seq), id=str(index))
            records.append(record)
        except AttributeError:
            raise AttributeError(f"Invalid sequence: {seq}")
    
    # raise error if seq not valid
    if not records:
        raise AttributeError("No valid sequences found in input")
    
    with open(f"{inputname}.fasta", "w") as file:
            SeqIO.write(records, file, "fasta")
    return file

def run_pyhmmer(hmmdb: str, input_file: str, output_file: str, cpu: int = 4, prefetching=False, save_out=False, eval_con: float = 1e-10):
    """
    Run hmmscan on input sequences with HMMs from a database.

    Parameters
    ----------
    hmmdb : str
        Path to the HMM database.
    input_file : str
        Path to the input sequence file.
    output_file : str
        Path to the output file.
    cpu : int, optional
        The number of CPUs to use. Default is 4.
    prefetching : bool, optional
        Whether to use prefetching for faster search. Default is False.
    save_out : bool, optional
        Whether to save the output to file. Default is False.
    eval_con : float, optional
        E-value threshold for domain reporting. Default is 1e-10.

    Returns
    -------
    all_hits : pyhmmer.plan7.TopHits or None
        The output hits if `save_out` is False, otherwise None.

    Notes
    -----
    This function runs HMMER's hmmscan program on a set of input sequences using HMMs from a given database. 
    The function supports two modes: normal mode and prefetching mode. 
    In normal mode, the HMMs are pressed and stored in a directory before execution. 
    In prefetching mode, the HMMs are kept in memory for faster search.
    """
    # Create hmms
    hmms = pyhmmer.plan7.HMMFile(hmmdb)
    # press hmms and store them in the pfam data folder or w/e destination
    if not os.path.exists(os.path.join("../data/pfam/", os.path.basename(hmmdb) + ".h3m")):
        pyhmmer.hmmer.hmmpress(hmms, "../data/pfam/")

    # Ensure input_file has .fasta extension
    if not input_file.endswith('.fasta'):
        input_file = f"{os.path.splitext(input_file)[0]}.fasta"
    # Ensure output_file has .domtblout extension
    if not output_file.endswith('.domtblout'):
        output_file = f"{os.path.splitext(output_file)[0]}.domtblout"

    # amino acid alphabet and prefetched inputs
    aa = pyhmmer.easel.Alphabet.amino()
    optimized_profiles = list(pyhmmer.plan7.HMMPressedFile(hmmdb))
    targets = pyhmmer.plan7.OptimizedProfileBlock(aa, optimized_profiles) if prefetching else pyhmmer.plan7.HMMFile("../data/pfam/.h3m")


    # HMMscan execution with or without save_out
    with pyhmmer.easel.SequenceFile(input_file, digital=True) as seqs:
        if save_out:
            with open(output_file, "wb") as dst:
                for i, hits in enumerate(pyhmmer.hmmer.hmmscan(seqs, targets, cpus=cpu, E=eval_con)):
                    hits.write(dst, format="domains", header=i==0)
        else:
            all_hits = pyhmmer.hmmscan(seqs, targets, cpus=cpu, E=eval_con)

    return all_hits if not save_out else None

In [113]:
### testing

# read df 
df_sample = pd.read_csv("learn2therm_sample_50k_exploration.csv", index_col=0)

# split the database into corresponding thermo and meso lists
meso_seq_db = df_sample[["meso_index", "m_protein_seq"]]
thermo_seq_db = df_sample[["thermo_index", "t_protein_seq"]]

# make the corresponding index the dataframe index and only sample a 50 sequences
meso_seq_list = meso_seq_db.set_index("meso_index").iloc[:50]
meso_seq_list.index.name = None
meso_seq_list.rename({'m_protein_seq': 'protein_seq'}, axis="columns", inplace=True)

thermo_seq_list = thermo_seq_db.set_index("thermo_index").iloc[:50]
thermo_seq_list.index.name = None
thermo_seq_list.rename({'t_protein_seq': 'protein_seq'}, axis="columns", inplace=True)

In [115]:
### testing
HMMER_run(meso_seq_list, "meso_input", PFAM_PATH, "meso_output", cpu=5)
HMMER_run(thermo_seq_list, "thermo_input" ,PFAM_PATH, "thermo_output", cpu=5)

Nice. It works!

## Old notes

##### w1 Sp23 notes
- pre-fetching ought to be an argument 
- re-make the whole component 
    - test it out on small bit
- time a 100 seq as a resource test with 1 CPU {embarrsibgly parrallel}
- redo with 10 CPUS; compare speed
    - analysis plot
HMMER(path/to/seqs, path/to/hmms, cpus= int, perfetch=boolean, saveOut=boolean)

##### w2 Sp23 notes
- make unittests for pyhmmer function

### After scripting work

In [3]:
def run_hmmer(
        seqs: pd.core.frame.DataFrame,
        input_file: str,
        hmm: str,
        output_file: str,
        cpu: int = 4,
        prefetching=False,
        save_out=False,
        eval_con: float = 1e-10):
    """
    Runs HMMER's hmmscan program on a set of input sequences using HMMs from a given database.

    Parameters
    ----------
    seqs : pandas.core.frame.DataFrame
        A dataframe with string amino acid sequences in a 'seq' column.
    input_file : str
        Path to the input sequence file.
    hmm : str
        Path to the HMM database.
    output_file : str
        Path to the output file.
    cpu : int, optional
        The number of CPUs to use. Default is 4.
    prefetching : bool, optional
        Whether to use prefetching for faster search. Default is False.
    save_out : bool, optional
        Whether to save the output to file. Default is False.
    eval_con : float, optional
        E-value threshold for domain reporting. Default is 1e-10.

    Returns
    -------
    pyhmmer.plan7.TopHits or None
        The output hits if `save_out` is False, otherwise None.

    Raises
    ------
    ValueError
        If the input dataframe is empty.
    AttributeError
        If any of the sequences are invalid.

    Notes
    -----
    This function runs HMMER's hmmscan program on a set of input sequences
    using HMMs from a given database.
    The function supports two modes: normal mode and prefetching mode.
    In normal mode, the HMMs are pressed and stored in a directory before execution.
    In prefetching mode, the HMMs are kept in memory for faster search.
    """
    # generate meso and thermo files
    read_seq(seqs, input_file)

    # place files into HMMER/pfam
    run_pyhmmer(
        hmm,
        input_file,
        output_file,
        cpu,
        prefetching,
        save_out,
        eval_con)


def read_seq(lists: pd.core.frame.DataFrame, inputname: str = "input"):
    """
    Returns a list of SeqRecord objects and creates a corresponding input Fasta of them

    Parameters:
    ------------
    list : pandas.core.frame.DataFrame
        a dataframe with string amino acid sequences in a 'seq' column
    input name : str, default = 'input'
        a name for the input fasta file


    Returns:
    ------------
    file : TextIOWrapper
        the input fasta file created from the list of SeqRecord objects

    Raises
    -------
    ValueError :
        if the input dataframe is empty
    AttributeError :
        if any of the sequences are invalid
    """
    # check if input is empty
    if lists.empty:
        raise ValueError("Input dataframe is empty")

    # check if sequences are valid
    for seq in lists['protein_seq']:
        try:
            Seq(seq)
        except BaseException as exc:
            raise AttributeError("Invalid sequence") from exc

    # function
    records = []
    for index, seq in lists.itertuples():
        try:
            record = SeqRecord(Seq(seq), id=str(index))
            records.append(record)
        except AttributeError as exc:
            raise AttributeError(f"Invalid sequence: {seq}") from exc

    # raise error if seq not valid
    if not records:
        raise AttributeError("No valid sequences found in input")

    with open(f"{inputname}.fasta", "w", encoding="utf-8") as file:
        SeqIO.write(records, file, "fasta")
    return file


def run_pyhmmer(
        hmmdb: str,
        input_file: str,
        output_file: str,
        cpu: int = 4,
        prefetching=False,
        save_out=False,
        eval_con: float = 1e-10):
    """
    Run hmmscan on input sequences with HMMs from a database.

    Parameters
    ----------
    hmmdb : str
        Path to the HMM database.
    input_file : str
        Path to the input sequence file.
    output_file : str
        Path to the output file.
    cpu : int, optional
        The number of CPUs to use. Default is 4.
    prefetching : bool, optional
        Whether to use prefetching for faster search. Default is False.
    save_out : bool, optional
        Whether to save the output to file. Default is False.
    eval_con : float, optional
        E-value threshold for domain reporting. Default is 1e-10.

    Returns
    -------
    all_hits : pyhmmer.plan7.TopHits or None
        The output hits if `save_out` is False, otherwise None.

    Notes
    -----
    This function runs HMMER's hmmscan program on a set of input sequences
    using HMMs from a given database.
    The function supports two modes: normal mode and prefetching mode.
    In normal mode, the HMMs are pressed and stored in a directory before execution.
    In prefetching mode, the HMMs are kept in memory for faster search.
    """
    # Create hmms
    hmms = pyhmmer.plan7.HMMFile(hmmdb)
    # press hmms and store them in the pfam data folder or w/e destination
    if not os.path.exists(
        os.path.join(
            "../data/pfam/",
            os.path.basename(hmmdb) +
            ".h3m")):
        pyhmmer.hmmer.hmmpress(hmms, "../data/pfam/")

    # Ensure input_file has .fasta extension
    if not input_file.endswith('.fasta'):
        input_file = f"{os.path.splitext(input_file)[0]}.fasta"
    # Ensure output_file has .domtblout extension
    if not output_file.endswith('.domtblout'):
        output_file = f"{os.path.splitext(output_file)[0]}.domtblout"

    # amino acid alphabet and prefetched inputs
    aa = pyhmmer.easel.Alphabet.amino()
    optimized_profiles = list(pyhmmer.plan7.HMMPressedFile(hmmdb))
    targets = pyhmmer.plan7.OptimizedProfileBlock(
        aa, optimized_profiles) if prefetching else pyhmmer.plan7.HMMFile("../data/pfam/.h3m")

    # HMMscan execution with or without save_out
    with pyhmmer.easel.SequenceFile(input_file, digital=True) as seqs:
        if save_out:
            with open(output_file, "wb", encoding="utf-8") as dst:
                for i, hits in enumerate(
                    pyhmmer.hmmer.hmmscan(
                        seqs, targets, cpus=cpu, E=eval_con)):
                    hits.write(dst, format="domains", header=i == 0)
        else:
            all_hits = pyhmmer.hmmscan(seqs, targets, cpus=cpu, E=eval_con)

    return all_hits if not save_out else None

In [4]:
### testing

# read df 
df_sample = pd.read_csv("learn2therm_sample_50k_exploration.csv", index_col=0)

# split the database into corresponding thermo and meso lists
meso_seq_db = df_sample[["meso_index", "m_protein_seq"]]
thermo_seq_db = df_sample[["thermo_index", "t_protein_seq"]]

# make the corresponding index the dataframe index and only sample a 50 sequences
meso_seq_list = meso_seq_db.set_index("meso_index").iloc[:50]
meso_seq_list.index.name = None
meso_seq_list.rename({'m_protein_seq': 'protein_seq'}, axis="columns", inplace=True)

thermo_seq_list = thermo_seq_db.set_index("thermo_index").iloc[:50]
thermo_seq_list.index.name = None
thermo_seq_list.rename({'t_protein_seq': 'protein_seq'}, axis="columns", inplace=True)

In [6]:
### testing
run_hmmer(meso_seq_list, "meso_input", PFAM_PATH, "meso_output", cpu=5)
# run_hmmer(thermo_seq_list, "thermo_input" ,PFAM_PATH, "thermo_output", cpu=5)

It works.

In [None]:
def run_hmmer(
        seqs: pd.core.frame.DataFrame,
        input_file: str,
        hmm: str,
        output_file: str,
        cpu: int = 4,
        prefetching=False,
        save_out=False,
        eval_con: float = 1e-10):
    """
    Runs HMMER's hmmscan program on a set of input sequences using HMMs from a given database.

    Parameters
    ----------
    seqs : pandas.core.frame.DataFrame
        A dataframe with string amino acid sequences in a 'seq' column.
    input_file : str
        Path to the input sequence file.
    hmm : str
        Path to the HMM database.
    output_file : str
        Path to the output file.
    cpu : int, optional
        The number of CPUs to use. Default is 4.
    prefetching : bool, optional
        Whether to use prefetching for faster search. Default is False.
    save_out : bool, optional
        Whether to save the output to file. Default is False.
    eval_con : float, optional
        E-value threshold for domain reporting. Default is 1e-10.

    Returns
    -------
    pyhmmer.plan7.TopHits or None
        The output hits if `save_out` is False, otherwise None.

    Raises
    ------
    ValueError
        If the input dataframe is empty.
    AttributeError
        If any of the sequences are invalid.

    Notes
    -----
    This function runs HMMER's hmmscan program on a set of input sequences
    using HMMs from a given database.
    The function supports two modes: normal mode and prefetching mode.
    In normal mode, the HMMs are pressed and stored in a directory before execution.
    In prefetching mode, the HMMs are kept in memory for faster search.
    """
    # generate meso and thermo files
    read_seq(seqs, input_file)

    # place files into HMMER/pfam
    run_pyhmmer(
        hmm,
        input_file,
        output_file,
        cpu,
        prefetching,
        save_out,
        eval_con)


def read_seq(lists: pd.core.frame.DataFrame, inputname: str = "input"):
    """
    Returns a list of SeqRecord objects and creates a corresponding input Fasta of them

    Parameters:
    ------------
    list : pandas.core.frame.DataFrame
        a dataframe with string amino acid sequences in a 'seq' column
    input name : str, default = 'input'
        a name for the input fasta file


    Returns:
    ------------
    file : TextIOWrapper
        the input fasta file created from the list of SeqRecord objects

    Raises
    -------
    ValueError :
        if the input dataframe is empty
    AttributeError :
        if any of the sequences are invalid
    """
    # check if input is empty
    if lists.empty:
        raise ValueError("Input dataframe is empty")

    # check if sequences are valid
    for seq in lists['protein_seq']:
        try:
            Seq(seq)
        except BaseException as exc:
            raise AttributeError("Invalid sequence") from exc

    # function
    records = []
    for index, seq in lists.itertuples():
        try:
            record = SeqRecord(Seq(seq), id=str(index))
            records.append(record)
        except AttributeError as exc:
            raise AttributeError(f"Invalid sequence: {seq}") from exc

    # raise error if seq not valid
    if not records:
        raise AttributeError("No valid sequences found in input")

    with open(f"{inputname}.fasta", "w", encoding="utf-8") as file:
        SeqIO.write(records, file, "fasta")
    return file


def run_pyhmmer(
        hmmdb: str,
        input_file: str,
        output_file: str,
        cpu: int = 4,
        prefetching=False,
        save_out=False,
        eval_con: float = 1e-10):
    """
    Run hmmscan on input sequences with HMMs from a database.

    Parameters
    ----------
    hmmdb : str
        Path to the HMM database.
    input_file : str
        Path to the input sequence file.
    output_file : str
        Path to the output file.
    cpu : int, optional
        The number of CPUs to use. Default is 4.
    prefetching : bool, optional
        Whether to use prefetching for faster search. Default is False.
    save_out : bool, optional
        Whether to save the output to file. Default is False.
    eval_con : float, optional
        E-value threshold for domain reporting. Default is 1e-10.

    Returns
    -------
    all_hits : pyhmmer.plan7.TopHits or None
        The output hits if `save_out` is False, otherwise None.

    Notes
    -----
    This function runs HMMER's hmmscan program on a set of input sequences
    using HMMs from a given database.
    The function supports two modes: normal mode and prefetching mode.
    In normal mode, the HMMs are pressed and stored in a directory before execution.
    In prefetching mode, the HMMs are kept in memory for faster search.
    """
    # Create hmms
    hmms = pyhmmer.plan7.HMMFile(hmmdb)
    # press hmms and store them in the pfam data folder or w/e destination
    if not os.path.exists(
        os.path.join(
            "../data/pfam/",
            os.path.basename(hmmdb) +
            ".h3m")):
        pyhmmer.hmmer.hmmpress(hmms, "../data/pfam/")

    # Ensure input_file has .fasta extension
    if not input_file.endswith('.fasta'):
        input_file = f"{os.path.splitext(input_file)[0]}.fasta"
    # Ensure output_file has .domtblout extension
    if not output_file.endswith('.domtblout'):
        output_file = f"{os.path.splitext(output_file)[0]}.domtblout"

    # amino acid alphabet and prefetched inputs
    aa = pyhmmer.easel.Alphabet.amino()
    optimized_profiles = list(pyhmmer.plan7.HMMPressedFile(hmmdb))
    targets = pyhmmer.plan7.OptimizedProfileBlock(
        aa, optimized_profiles) if prefetching else pyhmmer.plan7.HMMFile("../data/pfam/.h3m")

    # HMMscan execution with or without save_out
    with pyhmmer.easel.SequenceFile(input_file, digital=True) as seqs:
        if save_out:
            with open(output_file, "wb") as dst:
                for i, hits in enumerate(
                    pyhmmer.hmmer.hmmscan(
                        seqs, targets, cpus=cpu, E=eval_con)):
                    hits.write(dst, format="domains", header=i == 0)
        else:
            all_hits = pyhmmer.hmmer.hmmscan(seqs, targets, cpus=cpu, E=eval_con)

    return all_hits if not save_out else None

## Refactoring Work

Goal 1: functionalize the hmmpress part of the original run_pyhmmer

In [1]:
def hmmpress_hmms(hmms_path, pfam_data_folder):
    """
    Presses the HMMs in the given HMM database and stores the resulting files in a specified directory.

    Parameters
    ----------
    hmmdb_path : str
        Path to the HMM database.
    pfam_data_folder : str, optional
        Path to the directory where the HMMs should be stored.

    Returns
    -------
    None

    Notes
    -----
    This function uses HMMER's hmmpress program to compress the HMMs in the given HMM database and
    stores the resulting files in the specified directory for faster access during future HMMER runs.
    If the specified directory does not exist, it will be created.
    """
    if not os.path.exists(os.path.join(pfam_data_folder, os.path.basename(hmms_path) + ".h3m")):
        hmms = pyhmmer.plan7.HMMFile(hmms_path)
        pyhmmer.hmmer.hmmpress(hmms, pfam_data_folder)

In [None]:
def run_pyhmmer(
        hmmdb: str,
        input_file: str,
        output_file: str,
        cpu: int = 4,
        prefetching=False,
        save_out=False,
        eval_con: float = 1e-10):
    """
    Run hmmscan on input sequences with HMMs from a database.

    Parameters
    ----------
    hmmdb : str
        Path to the HMM database.
    input_file : str
        Path to the input sequence file.
    output_file : str
        Path to the output file.
    cpu : int, optional
        The number of CPUs to use. Default is 4.
    prefetching : bool, optional
        Whether to use prefetching for faster search. Default is False.
    save_out : bool, optional
        Whether to save the output to file. Default is False.
    eval_con : float, optional
        E-value threshold for domain reporting. Default is 1e-10.

    Returns
    -------
    all_hits : pyhmmer.plan7.TopHits or None
        The output hits if `save_out` is False, otherwise None.

    Notes
    -----
    This function runs HMMER's hmmscan program on a set of input sequences
    using HMMs from a given database.
    The function supports two modes: normal mode and prefetching mode.
    In normal mode, the HMMs are pressed and stored in a directory before execution.
    In prefetching mode, the HMMs are kept in memory for faster search.
    """
    # Press hmms and store them in the pfam data folder
    hmmpress_hmms(hmmdb, "../data/pfam/")

    # Ensure input_file has .fasta extension
    if not input_file.endswith('.fasta'):
        input_file = f"{os.path.splitext(input_file)[0]}.fasta"
    # Ensure output_file has .domtblout extension
    if not output_file.endswith('.domtblout'):
        output_file = f"{os.path.splitext(output_file)[0]}.domtblout"

    # amino acid alphabet and prefetched inputs
    aa = pyhmmer.easel.Alphabet.amino()
    optimized_profiles = list(pyhmmer.plan7.HMMPressedFile(hmmdb))
    targets = pyhmmer.plan7.OptimizedProfileBlock(
        aa, optimized_profiles) if prefetching else pyhmmer.plan7.HMMFile("../data/pfam/.h3m")
    
    # HMMscan execution with or without save_out
    with pyhmmer.easel.SequenceFile(input_file, digital=True) as seqs:
        if save_out:
            with open(output_file, "wb") as dst:
                for i, hits in enumerate(
                    pyhmmer.hmmer.hmmscan(
                        seqs, targets, cpus=cpu, E=eval_con)):
                    hits.write(dst, format="domains", header=i == 0)
        else:
            all_hits = pyhmmer.hmmer.hmmscan(seqs, targets, cpus=cpu, E=eval_con)

    return all_hits if not save_out else None

Goal 2: functionalize target inputs

In [1]:
def fetch_targets(hmmdb: str, prefetching: bool):
    """
    Load HMM profiles from a given HMM database.

    Parameters
    ----------
    hmmdb : str
        Path to the HMM database.
    prefetching : bool
        Whether to use prefetching for faster search.

    Returns
    -------
    targets : pyhmmer.plan7.OptimizedProfileBlock
        The HMM profiles loaded from the database.

    Notes
    -----
    This function loads the HMM profiles from a given HMM database using the
    PyHMMER package. It supports two modes: normal mode and prefetching mode.
    In normal mode, the HMMs are loaded from the disk on each use.
    In prefetching mode, the HMMs are kept in memory for faster search.
    """
    # amino acid alphabet and prefetched inputs
    aa = pyhmmer.easel.Alphabet.amino()
    optimized_profiles = list(pyhmmer.plan7.HMMPressedFile(hmmdb))
    targets = pyhmmer.plan7.OptimizedProfileBlock(
        aa, optimized_profiles) if prefetching else pyhmmer.plan7.HMMFile("../data/pfam/.h3m")
    return targets

In [None]:
def run_pyhmmer(
        hmmdb: str,
        input_file: str,
        output_file: str,
        cpu: int = 4,
        prefetching=False,
        save_out=False,
        eval_con: float = 1e-10):
    """
    Run hmmscan on input sequences with HMMs from a database.

    Parameters
    ----------
    hmmdb : str
        Path to the HMM database.
    input_file : str
        Path to the input sequence file.
    output_file : str
        Path to the output file.
    cpu : int, optional
        The number of CPUs to use. Default is 4.
    prefetching : bool, optional
        Whether to use prefetching for faster search. Default is False.
    save_out : bool, optional
        Whether to save the output to file. Default is False.
    eval_con : float, optional
        E-value threshold for domain reporting. Default is 1e-10.

    Returns
    -------
    all_hits : pyhmmer.plan7.TopHits or None
        The output hits if `save_out` is False, otherwise None.

    Notes
    -----
    This function runs HMMER's hmmscan program on a set of input sequences
    using HMMs from a given database.
    The function supports two modes: normal mode and prefetching mode.
    In normal mode, the HMMs are pressed and stored in a directory before execution.
    In prefetching mode, the HMMs are kept in memory for faster search.
    """
    # Press hmms and store them in the pfam data folder
    hmmpress_hmms(hmmdb, "../data/pfam/")

    # Ensure input_file has .fasta extension
    if not input_file.endswith('.fasta'):
        input_file = f"{os.path.splitext(input_file)[0]}.fasta"
    # Ensure output_file has .domtblout extension
    if not output_file.endswith('.domtblout'):
        output_file = f"{os.path.splitext(output_file)[0]}.domtblout"

    # amino acid alphabet and prefetched inputs to obtain profile targets
    targets = fetch_targets(hmmdb, prefetching)
    
    # HMMscan execution with or without save_out
    with pyhmmer.easel.SequenceFile(input_file, digital=True) as seqs:
        if save_out:
            with open(output_file, "wb") as dst:
                for i, hits in enumerate(
                    pyhmmer.hmmer.hmmscan(
                        seqs, targets, cpus=cpu, E=eval_con)):
                    hits.write(dst, format="domains", header=i == 0)
        else:
            all_hits = list(pyhmmer.hmmer.hmmscan(seqs, targets, cpus=cpu, E=eval_con))

    return all_hits if not save_out else None

##### Testing

In [3]:
def run_hmmer(
        seqs: pd.core.frame.DataFrame,
        input_file: str,
        hmm: str,
        output_file: str,
        cpu: int = 4,
        prefetching=False,
        save_out=False,
        eval_con: float = 1e-10):
    """
    Runs HMMER's hmmscan program on a set of input sequences using HMMs from a given database.

    Parameters
    ----------
    seqs : pandas.core.frame.DataFrame
        A dataframe with string amino acid sequences in a 'seq' column.
    input_file : str
        Path to the input sequence file.
    hmm : str
        Path to the HMM database.
    output_file : str
        Path to the output file.
    cpu : int, optional
        The number of CPUs to use. Default is 4.
    prefetching : bool, optional
        Whether to use prefetching for faster search. Default is False.
    save_out : bool, optional
        Whether to save the output to file. Default is False.
    eval_con : float, optional
        E-value threshold for domain reporting. Default is 1e-10.

    Returns
    -------
    pyhmmer.plan7.TopHits or None
        The output hits if `save_out` is False, otherwise None.

    Raises
    ------
    ValueError
        If the input dataframe is empty.
    AttributeError
        If any of the sequences are invalid.

    Notes
    -----
    This function runs HMMER's hmmscan program on a set of input sequences
    using HMMs from a given database.
    The function supports two modes: normal mode and prefetching mode.
    In normal mode, the HMMs are pressed and stored in a directory before execution.
    In prefetching mode, the HMMs are kept in memory for faster search.
    """
    # generate meso and thermo files
    read_seq(seqs, input_file)

    # place files into HMMER/pfam
    run_pyhmmer(
        hmm,
        input_file,
        output_file,
        cpu,
        prefetching,
        save_out,
        eval_con)


def read_seq(lists: pd.core.frame.DataFrame, inputname: str = "input"):
    """
    Returns a list of SeqRecord objects and creates a corresponding input Fasta of them

    Parameters:
    ------------
    list : pandas.core.frame.DataFrame
        a dataframe with string amino acid sequences in a 'seq' column
    input name : str, default = 'input'
        a name for the input fasta file


    Returns:
    ------------
    file : TextIOWrapper
        the input fasta file created from the list of SeqRecord objects

    Raises
    -------
    ValueError :
        if the input dataframe is empty
    AttributeError :
        if any of the sequences are invalid
    """
    # check if input is empty
    if lists.empty:
        raise ValueError("Input dataframe is empty")

    # check if sequences are valid
    for seq in lists['protein_seq']:
        try:
            Seq(seq)
        except BaseException as exc:
            raise AttributeError("Invalid sequence") from exc

    # function
    records = []
    for index, seq in lists.itertuples():
        try:
            record = SeqRecord(Seq(seq), id=str(index))
            records.append(record)
        except AttributeError as exc:
            raise AttributeError(f"Invalid sequence: {seq}") from exc

    # raise error if seq not valid
    if not records:
        raise AttributeError("No valid sequences found in input")

    with open(f"{inputname}.fasta", "w", encoding="utf-8") as file:
        SeqIO.write(records, file, "fasta")
    return file


def hmmpress_hmms(hmms_path, pfam_data_folder):
    """
    Presses the HMMs in the given HMM database and stores the resulting files in a specified directory.

    Parameters
    ----------
    hmmdb_path : str
        Path to the HMM database.
    pfam_data_folder : str, optional
        Path to the directory where the HMMs should be stored.

    Returns
    -------
    None

    Notes
    -----
    This function uses HMMER's hmmpress program to compress the HMMs in the given HMM database and
    stores the resulting files in the specified directory for faster access during future HMMER runs.
    If the specified directory does not exist, it will be created.
    """
    if not os.path.exists(
        os.path.join(
            pfam_data_folder,
            os.path.basename(hmms_path) +
            ".h3m")):
        hmms = pyhmmer.plan7.HMMFile(hmms_path)
        pyhmmer.hmmer.hmmpress(hmms, pfam_data_folder)

def fetch_targets(hmmdb: str, prefetching: bool):
    """
    Load HMM profiles from a given HMM database.

    Parameters
    ----------
    hmmdb : str
        Path to the HMM database.
    prefetching : bool
        Whether to use prefetching for faster search.

    Returns
    -------
    targets : pyhmmer.plan7.OptimizedProfileBlock
        The HMM profiles loaded from the database.

    Notes
    -----
    This function loads the HMM profiles from a given HMM database using the
    PyHMMER package. It supports two modes: normal mode and prefetching mode.
    In normal mode, the HMMs are loaded from the disk on each use.
    In prefetching mode, the HMMs are kept in memory for faster search.
    """
    # amino acid alphabet and prefetched inputs
    aa = pyhmmer.easel.Alphabet.amino()
    optimized_profiles = list(pyhmmer.plan7.HMMPressedFile(hmmdb))
    targets = pyhmmer.plan7.OptimizedProfileBlock(
        aa, optimized_profiles) if prefetching else pyhmmer.plan7.HMMFile("../data/pfam/.h3m")
    return targets

def run_pyhmmer(
        hmmdb: str,
        input_file: str,
        output_file: str,
        cpu: int = 4,
        prefetching=False,
        save_out=False,
        eval_con: float = 1e-10):
    """
    Run hmmscan on input sequences with HMMs from a database.

    Parameters
    ----------
    hmmdb : str
        Path to the HMM database.
    input_file : str
        Path to the input sequence file.
    output_file : str
        Path to the output file.
    cpu : int, optional
        The number of CPUs to use. Default is 4.
    prefetching : bool, optional
        Whether to use prefetching for faster search. Default is False.
    save_out : bool, optional
        Whether to save the output to file. Default is False.
    eval_con : float, optional
        E-value threshold for domain reporting. Default is 1e-10.

    Returns
    -------
    all_hits : pyhmmer.plan7.TopHits or None
        The output hits if `save_out` is False, otherwise None.

    Notes
    -----
    This function runs HMMER's hmmscan program on a set of input sequences
    using HMMs from a given database.
    The function supports two modes: normal mode and prefetching mode.
    In normal mode, the HMMs are pressed and stored in a directory before execution.
    In prefetching mode, the HMMs are kept in memory for faster search.
    """
    # Press hmms and store them in the pfam data folder
    hmmpress_hmms(hmmdb, "../data/pfam/")

    # Ensure input_file has .fasta extension
    if not input_file.endswith('.fasta'):
        input_file = f"{os.path.splitext(input_file)[0]}.fasta"
    # Ensure output_file has .domtblout extension
    if not output_file.endswith('.domtblout'):
        output_file = f"{os.path.splitext(output_file)[0]}.domtblout"

    # amino acid alphabet and prefetched inputs to obtain profile targets
    targets = fetch_targets(hmmdb, prefetching)
    
    # HMMscan execution with or without save_out
    with pyhmmer.easel.SequenceFile(input_file, digital=True) as seqs:
        if save_out:
            with open(output_file, "wb") as dst:
                for i, hits in enumerate(
                    pyhmmer.hmmer.hmmscan(
                        seqs, targets, cpus=cpu, E=eval_con)):
                    hits.write(dst, format="domains", header=i == 0)
        else:
            all_hits = pyhmmer.hmmer.hmmscan(seqs, targets, cpus=cpu, E=eval_con)

    return all_hits if not save_out else None

In [4]:
### testing

# read df 
df_sample = pd.read_csv("learn2therm_sample_50k_exploration.csv", index_col=0)

# split the database into corresponding thermo and meso lists
meso_seq_db = df_sample[["meso_index", "m_protein_seq"]]
thermo_seq_db = df_sample[["thermo_index", "t_protein_seq"]]

# make the corresponding index the dataframe index and only sample a 50 sequences
meso_seq_list = meso_seq_db.set_index("meso_index").iloc[:50]
meso_seq_list.index.name = None
meso_seq_list.rename({'m_protein_seq': 'protein_seq'}, axis="columns", inplace=True)

thermo_seq_list = thermo_seq_db.set_index("thermo_index").iloc[:50]
thermo_seq_list.index.name = None
thermo_seq_list.rename({'t_protein_seq': 'protein_seq'}, axis="columns", inplace=True)

In [5]:
### testing
run_hmmer(meso_seq_list, "meso_input", PFAM_PATH, "meso_output", cpu=5)

---

- A lot of the code has been reworked in the `compute_pyhmmer.py` script