# Dev-work: working on pyhmmer

In [95]:
# system dependecies
import subprocess
import os
from pathlib import Path
import time

# library dependencies
import matplotlib.pyplot as plt
import numpy as np

import pandas as pd
import seaborn as sns
from collections import defaultdict

## biopython
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SearchIO

# pyhmmer
import pyhmmer

# local dependencies/utils

## Paths
PFAM_PATH = Path("/Users/humoodalanzi/pfam/Pfam-A.hmm")
ID_DB_PATH = Path("/Users/humoodalanzi/pfam/proteins_id.zip")
DATA_PATH = Path("../data/")
OUT_PATH = Path("../data/pfam/")
#probably need path of unit tests

For references on how to use the pyhmmer API, check this [link](https://pyhmmer.readthedocs.io/en/stable/api/)

### Press pfam HMM into db for speed purposes

In [2]:
type(pyhmmer)

module

In [93]:
# Create hmms
hmms = pyhmmer.plan7.HMMFile(PFAM_PATH)
# press hmms and store them in the pfam data folder
pyhmmer.hmmer.hmmpress(hmms, "../data/pfam/")

19632

Like my HMMER3 code, this should be the number of HMMs inside my pfam db

#### An aside; to create FASTA files
---

I still need my read_seq function from previous rounds of development. The reason why I didn't directly import it is because I am using a different enviroment for testing

In [4]:
def read_seq(lists: pd.core.frame.DataFrame, inputname: str = "input"):
    """
    Returns a list of SeqRecord objects and creates a corresponding input Fasta of them

    Parameters:
    ------------
    list : pandas.core.frame.DataFrame
        a dataframe with string amino acid sequences in a 'seq' column
    input name : str, default = 'input'
        a name for the input fasta file

    
    Returns:
    ------------
    file : TextIOWrapper
        the input fasta file created from the list of SeqRecord objects

    Raises
    -------
    ValueError : 
        if the input dataframe is empty
    AttributeError :
        if any of the sequences are invalid
    """
    # check if input is empty
    if lists.empty:
        raise ValueError("Input dataframe is empty")
    
    # check if sequences are valid
    for seq in lists['protein_seq']:
        try:
            Seq(seq)
        except:
            raise AttributeError("Invalid sequence")

    # function    
    records = []
    for index, seq in lists.itertuples():
        try:
            record = SeqRecord(Seq(seq), id=str(index))
            records.append(record)
        except AttributeError:
            raise AttributeError(f"Invalid sequence: {seq}")
    
    # raise error if seq not valid
    if not records:
        raise AttributeError("No valid sequences found in input")
    
    with open(f"{inputname}.fasta", "w") as file:
            SeqIO.write(records, file, "fasta")
    return file

In [6]:
df_sample = pd.read_csv('learn2therm_sample_50k_exploration.csv', index_col=0)
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 0 to 49999
Data columns (total 29 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   local_gap_compressed_percent_id        50000 non-null  float64
 1   scaled_local_query_percent_id          50000 non-null  float64
 2   scaled_local_symmetric_percent_id      50000 non-null  float64
 3   query_align_len                        50000 non-null  int64  
 4   query_align_cov                        50000 non-null  float64
 5   subject_align_len                      50000 non-null  int64  
 6   subject_align_cov                      50000 non-null  float64
 7   bit_score                              50000 non-null  int64  
 8   thermo_index                           50000 non-null  int64  
 9   meso_index                             50000 non-null  int64  
 10  prot_pair_index                        50000 non-null  int64  
 11  me

In [7]:
## Meso (+a bit of processing)
meso_seq_db2 = df_sample[["meso_index", "m_protein_seq"]]
meso_seq_list2 = meso_seq_db2.set_index("meso_index").iloc[:100]
meso_seq_list2.index.name = None
meso_seq_list2.rename({'m_protein_seq': 'protein_seq'}, axis="columns", inplace=True)


## Thermo (+ a bit of processing)
thermo_seq_db2 = df_sample[["thermo_index", "t_protein_seq"]]
thermo_seq_list2 = thermo_seq_db2.set_index("thermo_index").iloc[:100]
thermo_seq_list2.index.name = None
thermo_seq_list2.rename({'t_protein_seq': 'protein_seq'}, axis="columns", inplace=True)



# generate meso and thermo files
read_seq(meso_seq_list2, "../data/meso_input")
read_seq(thermo_seq_list2, "../data/thermo_input")

<_io.TextIOWrapper name='../data/thermo_input.fasta' mode='w' encoding='UTF-8'>

---

### Working with pyhmmer proper

In [94]:
with pyhmmer.plan7.HMMFile("../data/pfam/") as hmms:
    with pyhmmer.easel.SequenceFile("../data/meso_input.fasta", digital=True) as seqs:
        t1 = time.time()
        all_hits = list(pyhmmer.hmmer.hmmscan(seqs, hmms, cpus=5, E=1e-10))
        totals = len(all_hits)
        print(f"- hmmscan found {totals} hits without prefetching in {time.time() - t1:.3} seconds")

ValueError: format not recognized by HMMER

Let's explore the variable "all_hits"

In [64]:
all_hits[0]

<pyhmmer.plan7.TopHits at 0x7f9e0f96bc50>

In [65]:
test1 = all_hits[0][0]

let's explore the different methods

In [66]:
dir(test1)

['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_cython__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__setstate_cython__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'accession',
 'best_domain',
 'bias',
 'description',
 'domains',
 'dropped',
 'duplicate',
 'evalue',
 'hits',
 'included',
 'name',
 'new',
 'pre_score',
 'pvalue',
 'reported',
 'score',
 'sum_score']

In [55]:
dir(all_hits[0])

['E',
 'T',
 'Z',
 '__add__',
 '__bool__',
 '__class__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__radd__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'bit_cutoffs',
 'block_length',
 'compare_ranking',
 'copy',
 'domE',
 'domT',
 'domZ',
 'incE',
 'incT',
 'incdomE',
 'incdomT',
 'included',
 'is_sorted',
 'long_targets',
 'merge',
 'query_accession',
 'query_name',
 'reported',
 'searched_models',
 'searched_nodes',
 'searched_residues',
 'searched_sequences',
 'sort',
 'strand',
 'to_msa',
 'write']

In [69]:
len(all_hits[0])

3

We got three hits for the first input; interesting

In [71]:
test1.name
# correct

b'Sigma70_r2'

In [73]:
test1.accession
# that's true

b'PF04542.17'

In [74]:
test1.evalue
# right

2.5999653142493426e-19

In [76]:
test1.pvalue
# this is c-evalue in hmmer3

1.3243507101922078e-23

In [77]:
test1.best_domain

<pyhmmer.plan7.Domain at 0x148472480>

In [78]:
dir(test1.best_domain)

['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_cython__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__setstate_cython__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'alignment',
 'bias',
 'c_evalue',
 'correction',
 'env_from',
 'env_to',
 'envelope_score',
 'hit',
 'i_evalue',
 'included',
 'pvalue',
 'reported',
 'score']

way too much information

In [80]:
dir(test1.hits)
# you can move between top hits and hit; very cool

['E',
 'T',
 'Z',
 '__add__',
 '__bool__',
 '__class__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__radd__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'bit_cutoffs',
 'block_length',
 'compare_ranking',
 'copy',
 'domE',
 'domT',
 'domZ',
 'incE',
 'incT',
 'incdomE',
 'incdomT',
 'included',
 'is_sorted',
 'long_targets',
 'merge',
 'query_accession',
 'query_name',
 'reported',
 'searched_models',
 'searched_nodes',
 'searched_residues',
 'searched_sequences',
 'sort',
 'strand',
 'to_msa',
 'write']

In [None]:
def run_pyhmmer(hmm: str, input_file: str, destination:str, cpu: int = 4, prefetching= False, saveout=False):
    """
    TODO
    """
    # Create hmms
    hmms = pyhmmer.plan7.HMMFile(hmm)
    # press hmms and store them in the pfam data folder or w/e destination
    pyhmmer.hmmer.hmmpress(hmms, destination)

    if prefetching is False:
        with pyhmmer.plan7.HMMFile(destination) as hmms:
            with pyhmmer.easel.SequenceFile(input, digital=True) as seqs:
                t1 = time.time()
                all_hits = list(pyhmmer.hmmer.hmmscan(seqs, hmms, cpus=cpu, E=1e-10))
                totals = len(all_hits)
                print(f"- hmmscan found {totals} hits without prefetching in {time.time() - t1:.3} seconds")
    else:
        print("TODO")
    

Cool!

## Making a wrapper for pyhmmer

In [None]:
def HMMER_run(seqs: pd.core.frame.DataFrame, input_file: str, hmm: str, destination: str, cpu: int = 4, prefetching= False, saveout=False):
    """
    Executes HMMER against pfam/hmm
    
    Parameters:
    ------------
    seqs : pandas.core.frame.DataFrame
        a dataframe with string amino acid sequences in a 'protein_seq' column
        (has to be processed in a certain way)
    input_filename : str
        A file name for the input of the transformed seq to FASTA
    pfam_path : str
        path of the HMMER/pfam db
    input_filename_with_ext : str
        A file name for the input FASTA has to include the ext. FASTA
    output_filename_with_ext : str
        output file name perferred extension is domtblout
    cpu : 4
        number of cpus for i/o

    Returns:
    ------------
    file : TextIOWrapper (Input fasta file)
        the input fasta file created from the list of SeqRecord objects
    
    (Optional) 
    file : TextIOWrapper (Output domtblout file)
        an output domtblout file of the HMMER/pfam results
    """
    # generate meso and thermo files
    read_seq(seqs, input_file)

    # place files into HMMER/pfam
    run_pyhmmer(
        hmm,
        input_file,
        destination,
        cpu)

def read_seq(lists: pd.core.frame.DataFrame, inputname: str = "input"):
    """
    Returns a list of SeqRecord objects and creates a corresponding input Fasta of them

    Parameters:
    ------------
    list : pandas.core.frame.DataFrame
        a dataframe with string amino acid sequences in a 'seq' column
    input name : str, default = 'input'
        a name for the input fasta file

    
    Returns:
    ------------
    file : TextIOWrapper
        the input fasta file created from the list of SeqRecord objects

    Raises
    -------
    ValueError : 
        if the input dataframe is empty
    AttributeError :
        if any of the sequences are invalid
    """
    # check if input is empty
    if lists.empty:
        raise ValueError("Input dataframe is empty")
    
    # check if sequences are valid
    for seq in lists['protein_seq']:
        try:
            Seq(seq)
        except:
            raise AttributeError("Invalid sequence")

    # function    
    records = []
    for index, seq in lists.itertuples():
        try:
            record = SeqRecord(Seq(seq), id=str(index))
            records.append(record)
        except AttributeError:
            raise AttributeError(f"Invalid sequence: {seq}")
    
    # raise error if seq not valid
    if not records:
        raise AttributeError("No valid sequences found in input")
    
    with open(f"{inputname}.fasta", "w") as file:
            SeqIO.write(records, file, "fasta")
    return file

- pre-fetching ought to be an argument 
- re-make the whole component 
    - test it out on small bit
- time a 100 seq as a resource test with 1 CPU {embarrsibgly parrallel}
- redo with 10 CPUS; compare speed
    - analysis plot


HMMER(path/to/seqs, path/to/hmms, cpus= int, perfetch=boolean, saveOut=boolean)