# Getting the sequences information by using the API sysytem

-------

# Import Libraries and read the input CSV file:

In this part, the libraries have been imported, and the CSV file has been read.

In [9]:
# Library dependencies
import pandas as pd
import numpy as np
import os

# We need them to get the information from URL and for parsing the results and make the error notice in the function
import requests
import urllib.parse
import time

import nest_asyncio
import asyncio
import httpx



In [6]:
# Read the input data and print here
df = pd.read_csv("/Users/amin/ValidProt/FAFSA/learn2therm_sample_50k.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,prot_pair_index,meso_seq,thermo_seq,meso_ogt,thermo_ogt,scaled_local_symmetric_percent_id,local_E_value,scaled_local_query_percent_id,local_gap_compressed_percent_id
0,1256842,126227630,MLLSDRDLVSEIKSGDLSLEPFEPALLQPSSIDVRLDRFFRVFNNH...,MLLSDRDLRKELESGRLELDPFDPAMLQPSSIDVRLDRFFRVFDNT...,27.5,45.0,0.777202,0.0,0.773196,0.802139
1,1456567,169784592,MRFEGTSGYVATDDLKVAVNAAIALERPLLVKGEPGTGKTVLAVEV...,MKFTGSDSYVATEDLMIAVNAAVTLERPLLVKGEPGTGKTELARQV...,30.0,54.0,0.782143,0.0,0.784946,0.784946
2,874464,31933768,MAYETINVDVQDHVCLIKLHRPEALNALNAALVSELCTALEEADAS...,MAYKTIIVEIEDHVALIKLNRPEALNALNSELLGELAQAVTEADAN...,19.5,54.0,0.775194,0.0,0.775194,0.775194
3,560201,32409414,MAIRKYKPTTPGRRGSSVADFAEITRSTPEKSLLRPLSKTGGRNNQ...,MGIRKYKPTTPGRRGASVADFVELTRREPEKSLLRPLPKKGGRNNR...,28.0,52.5,0.78777,0.0,0.790614,0.802198
4,33257,175862226,MLQRLQDRVAVVTGGGSGIGLATVRRFAAEGAKVVVADIDAAAGEA...,MSEDIICRRLTGRTAVVTGAGSGIGLASARRLASEGANVVCADVDE...,28.0,45.0,0.78835,0.0,0.780769,0.802372


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Unnamed: 0                         10000 non-null  int64  
 1   prot_pair_index                    10000 non-null  int64  
 2   meso_seq                           10000 non-null  object 
 3   thermo_seq                         10000 non-null  object 
 4   meso_ogt                           10000 non-null  float64
 5   thermo_ogt                         10000 non-null  float64
 6   scaled_local_symmetric_percent_id  10000 non-null  float64
 7   local_E_value                      10000 non-null  float64
 8   scaled_local_query_percent_id      10000 non-null  float64
 9   local_gap_compressed_percent_id    10000 non-null  float64
dtypes: float64(6), int64(2), object(2)
memory usage: 781.4+ KB


In [8]:
df.iloc[1]

Unnamed: 0                                                                     1456567
prot_pair_index                                                              169784592
meso_seq                             MRFEGTSGYVATDDLKVAVNAAIALERPLLVKGEPGTGKTVLAVEV...
thermo_seq                           MKFTGSDSYVATEDLMIAVNAAVTLERPLLVKGEPGTGKTELARQV...
meso_ogt                                                                          30.0
thermo_ogt                                                                        54.0
scaled_local_symmetric_percent_id                                             0.782143
local_E_value                                                                      0.0
scaled_local_query_percent_id                                                 0.784946
local_gap_compressed_percent_id                                               0.784946
Name: 1, dtype: object

-------------

-----------

-----------

# Implemented different functions to have faster code:

I uploaded function 1 last quarter which is too slow. Function 20 can be the best one, and it is designed for 1000 sequences. At the end we have the Best Function and its unit test.

### Function 1

In [43]:
# 1

"""
This script takes a user defined dataframe and an integer k, which send HTTPs requests to the HMMER API
the packages you need to run this script are:

- pandas
- requests
- urllib.parse
- time

They were imported in part 1.
"""


def hmmerscanner(df: pd.DataFrame, k: int):
    """
    This function sends HTTP requests to the HMMER API to get information for protein sequences.
    -------------
    Parameters:
    -------------
    df: pandas.core.DataFrame
        A DataFrame that has string amino acid sequences. This function has been used Meso s
        equence, we can change that to Thermo sequence according to our needed.
    k: int
        The number of sequences to scan.
    -------------
    Raises:
    -------------
    Exception:
        Raises an exception if the status is pending for too long, if the internet isn't working,
        or if the URL system doesn't wholly answer.
    -------------
    Returns:
    -------------
    results_df: pandas.core.DataFrame
        All the families are in the rows, and we have many columns that show the information that
        we need in the future. We can drop some columns and keep the needed information.
    """
    # Check if we need to use the local function instead of the API for large values of k.
    if k > 300:
        print("Use local function for the number of sequences more than 300.")
        return pd.DataFrame()

    # Create an empty DataFrame to store the results.
    results_df = pd.DataFrame()

    # Loop through the sequences to check them.
    for i in range(k):
        # This is for meso protein sequences; we can change that in the future according to our request.
        sequence = df['m_protein_seq'][i]

        # Send an HTTP request to the HMMER API to get information for the current sequence.
        url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
        headers = {'Content-Type': 'application/x-www-form-urlencoded',
                   'Accept': 'application/json'}
        data = {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}
        data = urllib.parse.urlencode(data).encode('ascii')
        response = requests.post(url, headers=headers,
                                 data=data, allow_redirects=False)
        redirect_url = response.headers.get('Location')

        if redirect_url is None:
            # If the server doesn't work, show this error.
            print("Error: No redirect URL found in response.")
        elif redirect_url == 'late':
            # Raises an exception if the status is pending for too long.
            response.raise_for_status()
            time.sleep(180)
            raise IOError("Error notice after 3 minutes.")
        else:
            response2 = requests.get(redirect_url, headers=headers)

            # Put the results in the empty DataFrame.
            results = response2.json()
            hits = results['results']['hits']
            dfff = pd.json_normalize(
                hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
            dfff.insert(0, 'sequence', sequence)
            dfff = dfff.set_index('sequence')
            results_df = pd.concat([results_df, dfff])
            if redirect_url == 'late':
                # Raises an exception if the status is pending for too long.
                response2.raise_for_status()
                time.sleep(180)
                raise IOError("Error notice after 3 minutes.")

    return results_df

In [44]:
# Test Function 1
%time hmmerscanner(df, 50)

CPU times: user 3.7 s, sys: 476 ms, total: 4.18 s
Wall time: 2min 9s


Unnamed: 0_level_0,alisqacc,aliIdCount,alirfline,is_included,alihmmname,bitscore,display,ievalue,alisqto,aliSim,...,act_site.SNO.90.C,act_site.SNO.232.E,act_site.SNO.230.H,act_site.ABC_tran.178.E,act_site.ABC_tran.160.E,act_site.adh_short_C2.159.Y,act_site.adh_short_C2.166.K,act_site.adh_short_C2.162.Y,act_site.adh_short.166.K,act_site.adh_short.162.Y
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,27,,1,Sigma70_r2,67.940552,1.0,0.0,113,0.884058,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,22,,1,Sigma70_r4_2,50.895485,1.0,0.0,196,0.870370,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,3,,0,Sigma70_r4,-2.076300,,3000,110,1.000000,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,12,,1,Sigma70_r4,39.875805,0.0,0.0,197,0.795918,...,,,,,,,,,,
MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMPDMAVLDIKMPRMDGMDLLQRLRQKTSMPVIFLTSKDDEIDEVLGLRMGADDYVKKPFSQRLLVERIRALLRRQDVIGGEVVEETEDNKVMVRGELTMDPLRHAVKWKGNDVSLTVTEFLLLQALAQRPGFVKSRDQLMDVAYDDQIYVDDRTIDSHIKRLRKKMRQADDEFSAIETLYGIGYRYNEA,,43,,1,Response_reg,98.908218,1.0,2.0e-28,113,0.854545,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,28,,1,AAA_21,31.065155,0.0,0.0,192,0.806122,...,,,,,,,,,,
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,24,,0,AAA,18.856600,,0.0018,195,0.616071,...,,,,,,,,,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,97,,1,adh_short_C2,216.260788,1.0,0.0,253,0.828326,...,,,,,,Similarity to P71079,Similarity to P71079,Similarity to Q12634,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,83,,1,adh_short,154.287918,0.0,0.0,206,0.755208,...,,,,,,,,,Similarity to Q06136,Similarity to P0AET8


In [None]:
%time hmmerscanner(df, 1000)

### Function 2

In [10]:
#2

import pandas as pd
import requests
import urllib.parse
import time
import concurrent.futures

def send_request(sequence):
    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}
    data = {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}
    data = urllib.parse.urlencode(data).encode('ascii')
    response = requests.post(url, headers=headers,
                             data=data, allow_redirects=False)
    return response

def process_response(sequence, response):
    redirect_url = response.headers.get('Location')

    if redirect_url is None:
        print(f"Error: No redirect URL found in response for sequence {sequence}")
        return None

    try:
        response2 = requests.get(redirect_url, headers={'Accept': 'application/json'})
        response2.raise_for_status()
        results = response2.json()
        hits = results['results']['hits']
        dfff = pd.json_normalize(hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
        dfff.insert(0, 'sequence', sequence)
        dfff = dfff.set_index('sequence')
        return dfff
    except requests.exceptions.RequestException as e:
        print(f"Error: Request error for sequence {sequence}: {e}")
        return None
    except ValueError as e:
        print(f"Error: Could not parse JSON response for sequence {sequence}: {e}")
        print(response2.content)
        return None


def hmmerscanner2(df: pd.DataFrame, k: int, max_workers=50):
    if k > 300:
        print("Use local function for the number of sequences more than 300.")
        return pd.DataFrame()

    results_df = pd.DataFrame()
    sequences = df['m_protein_seq'][:k]

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures_to_sequences = {executor.submit(send_request, seq): seq for seq in sequences}
        for future in concurrent.futures.as_completed(futures_to_sequences):
            seq = futures_to_sequences[future]
            try:
                response = future.result()
            except Exception as exc:
                print(f"Error sending request for sequence {seq}: {exc}")
            else:
                dfff = process_response(seq, response)
                if dfff is not None:
                    results_df = pd.concat([results_df, dfff])

    return results_df


In [11]:
# Test function 2
%time hmmerscanner2(df, 50)

KeyError: 'm_protein_seq'

### Function 3

In [47]:
#3

import pandas as pd
import requests
import urllib.parse
import concurrent.futures

def send_request(sequence):
    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}
    data = {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}
    data = urllib.parse.urlencode(data).encode('ascii')
    response = requests.post(url, headers=headers,
                             data=data, allow_redirects=False)
    return response

def process_response(sequence, response):
    redirect_url = response.headers.get('Location')

    if redirect_url is None:
        print("Error: No redirect URL found in response.")
    else:
        headers = {'Accept': 'application/json'}
        response2 = requests.get(redirect_url, headers=headers)
        results = response2.json()
        hits = results['results']['hits']
        dfff = pd.json_normalize(
            hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
        dfff.insert(0, 'sequence', sequence)
        dfff = dfff.set_index('sequence')
        return dfff

def hmmerscanner3(df: pd.DataFrame, k: int, max_workers=50):
    if k > 300:
        print("Use local function for the number of sequences more than 300.")
        return pd.DataFrame()

    sequences = df['m_protein_seq'][:k]

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures_to_sequences = {executor.submit(send_request, seq): seq for seq in sequences}
        results = []
        for future in concurrent.futures.as_completed(futures_to_sequences):
            seq = futures_to_sequences[future]
            try:
                response = future.result()
            except Exception as exc:
                print(f"Error sending request for sequence {seq}: {exc}")
            else:
                dfff = process_response(seq, response)
                if dfff is not None:
                    results.append(dfff)

    # Concatenate all DataFrames in the results list at once.
    results_df = pd.concat(results)
    return results_df


In [48]:
# Test function 3
%time hmmerscanner3(df, 50)

CPU times: user 2.97 s, sys: 480 ms, total: 3.45 s
Wall time: 1min 16s


Unnamed: 0_level_0,alisqacc,aliIdCount,alirfline,is_included,alihmmname,bitscore,display,ievalue,alisqto,aliSim,...,act_site.ABC_tran.186.E,act_site.CheB_methylest.166.S,act_site.CheB_methylest.193.H,act_site.CheB_methylest.290.D,act_site.ABC_tran.178.E,act_site.ABC_tran.174.E,act_site.ABC_tran.168.E,act_site.adh_short.152.K,act_site.adh_short.148.Y,act_site.adh_short_C2.148.Y
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,27,,1,Sigma70_r2,67.940552,1.0,0.0,113,0.884058,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,22,,1,Sigma70_r4_2,50.895485,1.0,0.0,196,0.870370,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,3,,0,Sigma70_r4,-2.076300,,3000,110,1.000000,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,12,,1,Sigma70_r4,39.875805,0.0,0.0,197,0.795918,...,,,,,,,,,,
MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMPDMAVLDIKMPRMDGMDLLQRLRQKTSMPVIFLTSKDDEIDEVLGLRMGADDYVKKPFSQRLLVERIRALLRRQDVIGGEVVEETEDNKVMVRGELTMDPLRHAVKWKGNDVSLTVTEFLLLQALAQRPGFVKSRDQLMDVAYDDQIYVDDRTIDSHIKRLRKKMRQADDEFSAIETLYGIGYRYNEA,,43,,1,Response_reg,98.908218,1.0,2.0e-28,113,0.854545,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MVTTGERQRNARGEGARLRLEIVAATQALLADGETATLRSIARRAGISAPSIYRHFPDVDAVMSAVADDAFDELVDALVQKRDRHTDPVARLWAISDGYLDFARDRPHIYRVMFGGVWNAAAALELHPGEDAHFREMGMNAFRLLVAAIQACVDDGTSSSTDPRRDAAALWAGLHGLAQLLVTAPLFDWPAETDRAVVRSLARLKA,,16,,1,TetR_N,30.954098,1.0,0.0,64,0.804878,...,,,,,,,,,,
MKDTVVFVTGAARGIGAHTARLAVARGARVALVGLEPHLLADLAAELGDAARWFEADVTDQAALDAAVRGTVDAFGGIDAVVANAGIANRGTIAVGDLEALVRTIEVNLLGTVRTVGATVAAVSARRGYYLLVSSAAAFAALPGMAAYCAAKAGVEHFGNAIRLELAHRGVDVGTAHMSWVDTDLVRDVKDDLPTFRAALDRLPGPFGRSVPVERCAARFLDAIAHRRRRVYVPRSVAVASAFRSVANGPLAGWLTRRAAATSVPELEAQLDALGRGYGRNTAPQQR,,66,,1,adh_short,137.959335,1.0,0.0,192,0.797872,...,,,,,,,,Similarity to Q06136,Similarity to P0AET8,
MKDTVVFVTGAARGIGAHTARLAVARGARVALVGLEPHLLADLAAELGDAARWFEADVTDQAALDAAVRGTVDAFGGIDAVVANAGIANRGTIAVGDLEALVRTIEVNLLGTVRTVGATVAAVSARRGYYLLVSSAAAFAALPGMAAYCAAKAGVEHFGNAIRLELAHRGVDVGTAHMSWVDTDLVRDVKDDLPTFRAALDRLPGPFGRSVPVERCAARFLDAIAHRRRRVYVPRSVAVASAFRSVANGPLAGWLTRRAAATSVPELEAQLDALGRGYGRNTAPQQR,,66,,1,adh_short_C2,121.099525,0.0,0.0,217,0.769231,...,,,,,,,,,,Similarity to Q12634
MKDTVVFVTGAARGIGAHTARLAVARGARVALVGLEPHLLADLAAELGDAARWFEADVTDQAALDAAVRGTVDAFGGIDAVVANAGIANRGTIAVGDLEALVRTIEVNLLGTVRTVGATVAAVSARRGYYLLVSSAAAFAALPGMAAYCAAKAGVEHFGNAIRLELAHRGVDVGTAHMSWVDTDLVRDVKDDLPTFRAALDRLPGPFGRSVPVERCAARFLDAIAHRRRRVYVPRSVAVASAFRSVANGPLAGWLTRRAAATSVPELEAQLDALGRGYGRNTAPQQR,,36,,1,KR,37.597282,0.0,0.0,158,0.668831,...,,,,,,,,,,


### Function 4:

Here's a brief explanation of the changes:

1- I replaced `requests` with the `httpx` library, which supports asynchronous I/O operations.

2- I added the `async` keyword to the send_request, `process_response`, and `hmmerscanner` functions. This allows them to be used with asynchronous I/O operations.

3- In the `hmmerscanner` function, I used `asyncio.create_task` to create tasks for processing responses, and then used `asyncio.gather` to await the completion of all tasks.

4- I created a `run_hmmerscanner` function that wraps the `hmmerscanner` function call in an `asyncio.run` call. This allows you to run the asynchronous code from a synchronous context.

In [49]:
#4

import pandas as pd
import urllib.parse
import asyncio
import httpx
import nest_asyncio


async def send_request(sequence):
    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}
    data = {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}
    data = urllib.parse.urlencode(data).encode('ascii')
    async with httpx.AsyncClient() as client:
        response = await client.post(url, headers=headers, data=data, follow_redirects=False)
    return response


async def process_response(sequence, response):
    redirect_url = response.headers.get('Location')

    if redirect_url is None:
        print("Error: No redirect URL found in response.")
    else:
        headers = {'Accept': 'application/json'}
        async with httpx.AsyncClient() as client:
            response2 = await client.get(redirect_url, headers=headers)
        results = response2.json()
        hits = results['results']['hits']
        dfff = pd.json_normalize(
            hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
        dfff.insert(0, 'sequence', sequence)
        dfff = dfff.set_index('sequence')
        return dfff

async def hmmerscanner(df: pd.DataFrame, k: int):
    if k > 300:
        print("Use local function for the number of sequences more than 300.")
        return pd.DataFrame()

    sequences = df['m_protein_seq'][:k]
    tasks = []

    for seq in sequences:
        response = await send_request(seq)
        task = asyncio.create_task(process_response(seq, response))
        tasks.append(task)

    results = await asyncio.gather(*tasks)
    results_df = pd.concat([result for result in results if result is not None])
    return results_df

# Wrap the async function call in an event loop.
def run_hmmerscanner4(df: pd.DataFrame, k: int):
    nest_asyncio.apply()
    return asyncio.run(hmmerscanner(df, k))




In [50]:
# Test function 4
%time run_hmmerscanner4(df, 50)

CPU times: user 3.24 s, sys: 308 ms, total: 3.55 s
Wall time: 48.3 s


Unnamed: 0_level_0,alisqacc,aliIdCount,alirfline,is_included,alihmmname,bitscore,display,ievalue,alisqto,aliSim,...,act_site.SNO.90.C,act_site.SNO.232.E,act_site.SNO.230.H,act_site.ABC_tran.178.E,act_site.ABC_tran.160.E,act_site.adh_short_C2.159.Y,act_site.adh_short_C2.166.K,act_site.adh_short_C2.162.Y,act_site.adh_short.166.K,act_site.adh_short.162.Y
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,27,,1,Sigma70_r2,67.940552,1.0,0.0,113,0.884058,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,22,,1,Sigma70_r4_2,50.895485,1.0,0.0,196,0.870370,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,3,,0,Sigma70_r4,-2.076300,,3000,110,1.000000,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,12,,1,Sigma70_r4,39.875805,0.0,0.0,197,0.795918,...,,,,,,,,,,
MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMPDMAVLDIKMPRMDGMDLLQRLRQKTSMPVIFLTSKDDEIDEVLGLRMGADDYVKKPFSQRLLVERIRALLRRQDVIGGEVVEETEDNKVMVRGELTMDPLRHAVKWKGNDVSLTVTEFLLLQALAQRPGFVKSRDQLMDVAYDDQIYVDDRTIDSHIKRLRKKMRQADDEFSAIETLYGIGYRYNEA,,43,,1,Response_reg,98.908218,1.0,2.0e-28,113,0.854545,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,28,,1,AAA_21,31.065155,0.0,0.0,192,0.806122,...,,,,,,,,,,
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,24,,0,AAA,18.856600,,0.0018,195,0.616071,...,,,,,,,,,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,97,,1,adh_short_C2,216.260788,1.0,0.0,253,0.828326,...,,,,,,Similarity to P71079,Similarity to P71079,Similarity to Q12634,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,83,,1,adh_short,154.287918,0.0,0.0,206,0.755208,...,,,,,,,,,Similarity to Q06136,Similarity to P0AET8


### Function 5

In [51]:
#5

import pandas as pd
import urllib.parse
import asyncio
import httpx
import nest_asyncio


async def send_request(semaphore, sequence):
    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}
    data = {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}
    data = urllib.parse.urlencode(data).encode('ascii')

    async with semaphore:
        async with httpx.AsyncClient() as client:
            response = await client.post(url, headers=headers, data=data, follow_redirects=False)

    return response


async def process_response(semaphore, sequence, response, max_retries=3):
    redirect_url = response.headers.get('Location')

    if redirect_url is None:
        print("Error: No redirect URL found in response.")
    else:
        headers = {'Accept': 'application/json'}
        for attempt in range(max_retries):
            try:
                async with semaphore:
                    async with httpx.AsyncClient() as client:
                        response2 = await client.get(redirect_url, headers=headers)
                break
            except httpx.ReadTimeout:
                if attempt < max_retries - 1:
                    await asyncio.sleep(5 ** attempt)  # Exponential backoff
                else:
                    raise
        results = response2.json()
        hits = results['results']['hits']
        if hits:
            dfff = pd.json_normalize(
                hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
            dfff.insert(0, 'sequence', sequence)
            dfff = dfff.set_index('sequence')
            return dfff
        else:
            return None


async def hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int = 50):
    sequences = df['m_protein_seq'][:k]
    tasks = []
    semaphore = asyncio.Semaphore(max_concurrent_requests)

    for seq in sequences:
        task = asyncio.create_task(send_request(semaphore, seq))
        tasks.append(task)

    responses = await asyncio.gather(*tasks)

    tasks = []
    for seq, response in zip(sequences, responses):
        task = asyncio.create_task(process_response(semaphore, seq, response))
        tasks.append(task)

    results = await asyncio.gather(*tasks)
    results_df = pd.concat([result for result in results if result is not None])
    return results_df


# Wrap the async function call in an event loop.
def run_hmmerscanner5(df: pd.DataFrame, k: int, max_concurrent_requests: int = 50):
    nest_asyncio.apply()
    return asyncio.run(hmmerscanner(df, k, max_concurrent_requests))


In [52]:
# Test function 5
%time run_hmmerscanner5(df, 50,20)

CPU times: user 2.2 s, sys: 270 ms, total: 2.47 s
Wall time: 15.8 s


Unnamed: 0_level_0,alisqacc,aliIdCount,alirfline,is_included,alihmmname,bitscore,display,ievalue,alisqto,aliSim,...,act_site.SNO.90.C,act_site.SNO.232.E,act_site.SNO.230.H,act_site.ABC_tran.178.E,act_site.ABC_tran.160.E,act_site.adh_short_C2.159.Y,act_site.adh_short_C2.166.K,act_site.adh_short_C2.162.Y,act_site.adh_short.166.K,act_site.adh_short.162.Y
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,27,,1,Sigma70_r2,67.940552,1.0,0.0,113,0.884058,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,22,,1,Sigma70_r4_2,50.895485,1.0,0.0,196,0.870370,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,3,,0,Sigma70_r4,-2.076300,,3000,110,1.000000,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,12,,1,Sigma70_r4,39.875805,0.0,0.0,197,0.795918,...,,,,,,,,,,
MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMPDMAVLDIKMPRMDGMDLLQRLRQKTSMPVIFLTSKDDEIDEVLGLRMGADDYVKKPFSQRLLVERIRALLRRQDVIGGEVVEETEDNKVMVRGELTMDPLRHAVKWKGNDVSLTVTEFLLLQALAQRPGFVKSRDQLMDVAYDDQIYVDDRTIDSHIKRLRKKMRQADDEFSAIETLYGIGYRYNEA,,43,,1,Response_reg,98.908218,1.0,2.0e-28,113,0.854545,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,28,,1,AAA_21,31.065155,0.0,0.0,192,0.806122,...,,,,,,,,,,
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,24,,0,AAA,18.856600,,0.0018,195,0.616071,...,,,,,,,,,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,97,,1,adh_short_C2,216.260788,1.0,0.0,253,0.828326,...,,,,,,Similarity to P71079,Similarity to P71079,Similarity to Q12634,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,83,,1,adh_short,154.287918,0.0,0.0,206,0.755208,...,,,,,,,,,Similarity to Q06136,Similarity to P0AET8


### Function 6

In [48]:
#6

import pandas as pd
import urllib.parse
import asyncio
import httpx
import nest_asyncio


async def send_request(semaphore, sequence):
    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}
    data = {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}
    data = urllib.parse.urlencode(data).encode('ascii')

    async with semaphore:
        async with httpx.AsyncClient() as client:
            response = await client.post(url, headers=headers, data=data, follow_redirects=False)

    return response


async def process_response(semaphore, sequence, response, max_retries=3):
    redirect_url = response.headers.get('Location')

    if redirect_url is None:
        print("Error: No redirect URL found in response.")
    else:
        headers = {'Accept': 'application/json'}
        for attempt in range(max_retries):
            try:
                async with semaphore:
                    async with httpx.AsyncClient() as client:
                        response2 = await client.get(redirect_url, headers=headers)
                break
            except httpx.ReadTimeout:
                if attempt < max_retries - 1:
                    await asyncio.sleep(5 ** attempt)  # Exponential backoff
                else:
                    raise
        results = response2.json()
        hits = results['results']['hits']
        if hits:
            dfff = pd.json_normalize(
                hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
            dfff.insert(0, 'sequence', sequence)
            dfff = dfff.set_index('sequence')
            return dfff
        else:
            return None


async def hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int = 50):
    sequences = df['m_protein_seq'][:k]
    tasks = []
    semaphore = asyncio.Semaphore(max_concurrent_requests)

    for seq in sequences:
        task = asyncio.create_task(send_request(semaphore, seq))
        tasks.append(task)

    responses = await asyncio.gather(*tasks)

    tasks = []
    for seq, response in zip(sequences, responses):
        task = asyncio.create_task(process_response(semaphore, seq, response))
        tasks.append(task)

    results = await asyncio.gather(*tasks)
    results_df = pd.concat([result for result in results if result is not None])
    return results_df


# Wrap the async function call in an event loop.
def run_hmmerscanner6(df: pd.DataFrame, k: int, max_concurrent_requests: int = 50):
    nest_asyncio.apply()
    return asyncio.run(hmmerscanner(df, k, max_concurrent_requests))


In [49]:
# Test function 6
%time run_hmmerscanner6(df, 50,10)

ReadTimeout: 

### Function 7

In [50]:
#7

import pandas as pd
import urllib.parse
import asyncio
import httpx
import nest_asyncio


async def send_request(semaphore, sequence):
    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}
    data = {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}
    data = urllib.parse.urlencode(data).encode('ascii')

    async with semaphore:
        async with httpx.AsyncClient() as client:
            response = await client.post(url, headers=headers, data=data, follow_redirects=False, timeout=15000)

    return response


async def process_response(semaphore, sequence, response, max_retries=3):
    redirect_url = response.headers.get('Location')

    if redirect_url is None:
        print("Error: No redirect URL found in response.")
    else:
        headers = {'Accept': 'application/json'}
        for attempt in range(max_retries):
            try:
                async with semaphore:
                    async with httpx.AsyncClient() as client:
                        response2 = await client.get(redirect_url, headers=headers)
                break
            except httpx.ReadTimeout:
                if attempt < max_retries - 1:
                    await asyncio.sleep(5 ** attempt)  # Exponential backoff
                else:
                    raise
        results = response2.json()
        hits = results['results']['hits']
        if hits:
            dfff = pd.json_normalize(
                hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
            dfff.insert(0, 'sequence', sequence)
            dfff = dfff.set_index('sequence')
            return dfff
        else:
            return None


async def hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int = 1000):
    sequences = df['m_protein_seq'][:k]
    tasks = []
    semaphore = asyncio.Semaphore(max_concurrent_requests)

    for seq in sequences:
        task = asyncio.create_task(send_request(semaphore, seq))
        tasks.append(task)

    responses = await asyncio.gather(*tasks)

    tasks = []
    for seq, response in zip(sequences, responses):
        task = asyncio.create_task(process_response(semaphore, seq, response))
        tasks.append(task)

    results = await asyncio.gather(*tasks)
    results_df = pd.concat([result for result in results if result is not None])
    return results_df


# Wrap the async function call in an event loop.
def run_hmmerscanner7(df: pd.DataFrame, k: int, max_concurrent_requests: int = 1000):
    nest_asyncio.apply()
    return asyncio.run(hmmerscanner(df, k, max_concurrent_requests))

In [51]:
# Test function 7
%time run_hmmerscanner7(df, 50, 20)

CPU times: user 1.95 s, sys: 228 ms, total: 2.18 s
Wall time: 22.1 s


Unnamed: 0_level_0,alisqacc,aliIdCount,alirfline,is_included,alihmmname,bitscore,display,ievalue,alisqto,aliSim,...,act_site.SNO.90.C,act_site.SNO.232.E,act_site.SNO.230.H,act_site.ABC_tran.178.E,act_site.ABC_tran.160.E,act_site.adh_short_C2.159.Y,act_site.adh_short_C2.166.K,act_site.adh_short_C2.162.Y,act_site.adh_short.166.K,act_site.adh_short.162.Y
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,27,,1,Sigma70_r2,67.940552,1.0,0.0,113,0.884058,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,22,,1,Sigma70_r4_2,50.895485,1.0,0.0,196,0.870370,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,3,,0,Sigma70_r4,-2.076300,,3000,110,1.000000,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,12,,1,Sigma70_r4,39.875805,0.0,0.0,197,0.795918,...,,,,,,,,,,
MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMPDMAVLDIKMPRMDGMDLLQRLRQKTSMPVIFLTSKDDEIDEVLGLRMGADDYVKKPFSQRLLVERIRALLRRQDVIGGEVVEETEDNKVMVRGELTMDPLRHAVKWKGNDVSLTVTEFLLLQALAQRPGFVKSRDQLMDVAYDDQIYVDDRTIDSHIKRLRKKMRQADDEFSAIETLYGIGYRYNEA,,43,,1,Response_reg,98.908218,1.0,2.0e-28,113,0.854545,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,28,,1,AAA_21,31.065155,0.0,0.0,192,0.806122,...,,,,,,,,,,
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,24,,0,AAA,18.856600,,0.0018,195,0.616071,...,,,,,,,,,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,97,,1,adh_short_C2,216.260788,1.0,0.0,253,0.828326,...,,,,,,Similarity to P71079,Similarity to P71079,Similarity to Q12634,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,83,,1,adh_short,154.287918,0.0,0.0,206,0.755208,...,,,,,,,,,Similarity to Q06136,Similarity to P0AET8


### Function 8

In [52]:
# 8

import pandas as pd
import urllib.parse
import asyncio
import httpx
import nest_asyncio


async def send_request(semaphore, sequence):
    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}
    data = {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}
    data = urllib.parse.urlencode(data).encode('ascii')

    async with semaphore:
        async with httpx.AsyncClient() as client:
            response = await client.post(url, headers=headers, data=data, follow_redirects=False, timeout=15000)

    return response


async def process_response(semaphore, sequence, response, max_retries=3):
    redirect_url = response.headers.get('Location')

    if redirect_url is None:
        print("Error: No redirect URL found in response.")
    else:
        headers = {'Accept': 'application/json'}
        for attempt in range(max_retries):
            try:
                async with semaphore:
                    async with httpx.AsyncClient() as client:
                        response2 = await client.get(redirect_url, headers=headers, timeout=15000)
                break
            except httpx.ReadTimeout:
                if attempt < max_retries - 1:
                    await asyncio.sleep(5 ** attempt)  # Exponential backoff
                else:
                    raise
        results = response2.json()
        hits = results['results']['hits']
        if hits:
            dfff = pd.json_normalize(
                hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
            dfff.insert(0, 'sequence', sequence)
            dfff = dfff.set_index('sequence')
            return dfff
        else:
            return None


async def hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    if k > 10000:
        print("Use local function for the number of sequences more than 10000.")
        return pd.DataFrame()
    sequences = df['m_protein_seq'][:k]
    tasks = []
    semaphore = asyncio.Semaphore(max_concurrent_requests)

    for seq in sequences:
        task = asyncio.create_task(send_request(semaphore, seq))
        tasks.append(task)

    responses = await asyncio.gather(*tasks)

    tasks = []
    for seq, response in zip(sequences, responses):
        task = asyncio.create_task(process_response(semaphore, seq, response))
        tasks.append(task)

    results = await asyncio.gather(*tasks)
    results_df = pd.concat([result for result in results if result is not None])
    return results_df


# Wrap the async function call in an event loop.
def run_hmmerscanner8(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    nest_asyncio.apply()
    return asyncio.run(hmmerscanner(df, k, max_concurrent_requests))


In [53]:
# Test function 8
%time run_hmmerscanner8(df, 50, 20)

CPU times: user 1.9 s, sys: 195 ms, total: 2.1 s
Wall time: 24.6 s


Unnamed: 0_level_0,alisqacc,aliIdCount,alirfline,is_included,alihmmname,bitscore,display,ievalue,alisqto,aliSim,...,act_site.SNO.90.C,act_site.SNO.232.E,act_site.SNO.230.H,act_site.ABC_tran.178.E,act_site.ABC_tran.160.E,act_site.adh_short_C2.159.Y,act_site.adh_short_C2.166.K,act_site.adh_short_C2.162.Y,act_site.adh_short.166.K,act_site.adh_short.162.Y
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,27,,1,Sigma70_r2,67.940552,1.0,0.0,113,0.884058,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,22,,1,Sigma70_r4_2,50.895485,1.0,0.0,196,0.870370,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,3,,0,Sigma70_r4,-2.076300,,3000,110,1.000000,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,12,,1,Sigma70_r4,39.875805,0.0,0.0,197,0.795918,...,,,,,,,,,,
MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMPDMAVLDIKMPRMDGMDLLQRLRQKTSMPVIFLTSKDDEIDEVLGLRMGADDYVKKPFSQRLLVERIRALLRRQDVIGGEVVEETEDNKVMVRGELTMDPLRHAVKWKGNDVSLTVTEFLLLQALAQRPGFVKSRDQLMDVAYDDQIYVDDRTIDSHIKRLRKKMRQADDEFSAIETLYGIGYRYNEA,,43,,1,Response_reg,98.908218,1.0,2.0e-28,113,0.854545,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,28,,1,AAA_21,31.065155,0.0,0.0,192,0.806122,...,,,,,,,,,,
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,24,,0,AAA,18.856600,,0.0018,195,0.616071,...,,,,,,,,,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,97,,1,adh_short_C2,216.260788,1.0,0.0,253,0.828326,...,,,,,,Similarity to P71079,Similarity to P71079,Similarity to Q12634,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,83,,1,adh_short,154.287918,0.0,0.0,206,0.755208,...,,,,,,,,,Similarity to Q06136,Similarity to P0AET8


### Function 9

In [22]:
# 9

import pandas as pd
import urllib.parse
import asyncio
import httpx
import nest_asyncio


async def send_request(semaphore, sequence):
    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}
    data = {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}
    data = urllib.parse.urlencode(data).encode('ascii')

    async with semaphore:
        async with httpx.AsyncClient() as client:
            response = await client.post(url, headers=headers, data=data, follow_redirects=False, timeout=15000)

    return response


async def process_response(semaphore, sequence, response, max_retries=3):
    redirect_url = response.headers.get('Location')

    if redirect_url is None:
        print("Error: No redirect URL found in response.")
    else:
        headers = {'Accept': 'application/json'}
        for attempt in range(max_retries):
            try:
                async with semaphore:
                    async with httpx.AsyncClient() as client:
                        response2 = await client.get(redirect_url, headers=headers, timeout=15000)
                results = response2.json()
                hits = results['results']['hits']
                if hits:
                    dfff = pd.json_normalize(
                        hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
                    dfff.insert(0, 'sequence', sequence)
                    dfff = dfff.set_index('sequence')
                    return dfff
                else:
                    return None
            except json.JSONDecodeError:
                if attempt < max_retries - 1:
                    await asyncio.sleep(300 ** attempt)  # Exponential backoff
                else:
                    print("Error: Could not parse response as JSON.")
                    return None



async def hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    sequences = df['m_protein_seq'][:k]
    tasks = []
    semaphore = asyncio.Semaphore(max_concurrent_requests)

    for seq in sequences:
        task = asyncio.create_task(send_request(semaphore, seq))
        tasks.append(task)

    responses = await asyncio.gather(*tasks)

    tasks = []
    for seq, response in zip(sequences, responses):
        task = asyncio.create_task(process_response(semaphore, seq, response))
        tasks.append(task)

    results = await asyncio.gather(*tasks)
    results_df = pd.concat([result for result in results if result is not None])
    return results_df


# Wrap the async function call in an event loop.
def run_hmmerscanner9(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    nest_asyncio.apply()
    return asyncio.run(hmmerscanner(df, k, max_concurrent_requests))


In [24]:
# Test function 9
%time run_hmmerscanner9(df, 50, 20)

CPU times: user 1.88 s, sys: 184 ms, total: 2.06 s
Wall time: 32.9 s


Unnamed: 0_level_0,alisqacc,aliIdCount,alirfline,is_included,alihmmname,bitscore,display,ievalue,alisqto,aliSim,...,act_site.SNO.90.C,act_site.SNO.232.E,act_site.SNO.230.H,act_site.ABC_tran.178.E,act_site.ABC_tran.160.E,act_site.adh_short_C2.159.Y,act_site.adh_short_C2.166.K,act_site.adh_short_C2.162.Y,act_site.adh_short.166.K,act_site.adh_short.162.Y
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,27,,1,Sigma70_r2,67.940552,1.0,0.0,113,0.884058,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,22,,1,Sigma70_r4_2,50.895485,1.0,0.0,196,0.870370,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,3,,0,Sigma70_r4,-2.076300,,3000,110,1.000000,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,12,,1,Sigma70_r4,39.875805,0.0,0.0,197,0.795918,...,,,,,,,,,,
MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMPDMAVLDIKMPRMDGMDLLQRLRQKTSMPVIFLTSKDDEIDEVLGLRMGADDYVKKPFSQRLLVERIRALLRRQDVIGGEVVEETEDNKVMVRGELTMDPLRHAVKWKGNDVSLTVTEFLLLQALAQRPGFVKSRDQLMDVAYDDQIYVDDRTIDSHIKRLRKKMRQADDEFSAIETLYGIGYRYNEA,,43,,1,Response_reg,98.908218,1.0,2.0e-28,113,0.854545,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,28,,1,AAA_21,31.065155,0.0,0.0,192,0.806122,...,,,,,,,,,,
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,24,,0,AAA,18.856600,,0.0018,195,0.616071,...,,,,,,,,,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,97,,1,adh_short_C2,216.260788,1.0,0.0,253,0.828326,...,,,,,,Similarity to P71079,Similarity to P71079,Similarity to Q12634,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,83,,1,adh_short,154.287918,0.0,0.0,206,0.755208,...,,,,,,,,,Similarity to Q06136,Similarity to P0AET8


### Function 10

In [25]:
"""
This script takes a user-defined data frame and an integer k, which sends multiple requests to
the HAMMER API at the same time.
The packages you need to run this script are:

- pandas
- requests
- urllib.parse
- time
- httpx
- nest_asyncio
"""


async def send_request(semaphore, sequence):
    """
    This function sends a POST request to the HMMER API with the given protein sequence.
    -------------
    Parameters:
    -------------
    semaphore: asyncio.Semaphore
        A semaphore object used to limit the number of concurrent requests.
    sequence: str
        A protein sequence to scan.
    -------------
    Returns:
    -------------
    response: httpx.Response
        The response object containing the search results.
    """

    # Set up the POST request with the protein sequence and send it to the HMMER server
    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}
    data = {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}
    data = urllib.parse.urlencode(data).encode('ascii')

    async with semaphore:
        async with httpx.AsyncClient() as client:
            response = await client.post(url, headers=headers, data=data, follow_redirects=False, timeout=15000)

    return response


async def process_response(semaphore, sequence, response, max_retries=3):
    """
    This function extracts the redirect URL from the POST response, sends a GET request to the URL to retrieve 
    the search results, processes the JSON response into a pandas DataFrame, and returns the DataFrame.
    -------------
    Parameters:
    -------------
    semaphore: asyncio.Semaphore
        A semaphore object used to limit the number of concurrent requests.
    sequence: str
        A protein sequence.
    response: httpx.Response
        The response object containing the search results.
    max_retries: int
        The maximum number of times to retry the GET request if it times out.
    -------------
    Returns:
    -------------
    dfff: pandas.core.DataFrame
        The DataFrame containing the search results for the given protein sequence.
    """

    # Extract the redirect URL from the POST response, send a GET request to the URL to retrieve the results
    # and process the JSON response into a pandas DataFrame
    redirect_url = response.headers.get('Location')

    if redirect_url is None:
        print("Error: No redirect URL found in response.")
    else:
        headers = {'Accept': 'application/json'}
        for attempt in range(max_retries):
            try:
                async with semaphore:
                    async with httpx.AsyncClient() as client:
                        response2 = await client.get(redirect_url, headers=headers, timeout=15000)
                break
            except httpx.ReadTimeout:
                if attempt < max_retries - 1:
                    await asyncio.sleep(5 ** attempt)  # Exponential backoff
                else:
                    raise
        try:
            results = response2.json()
            hits = results['results']['hits']
        except KeyError:
            print(
                f"Error: 'results' key not found in response for sequence {sequence}.")
            return None

        if hits:
            dfff = pd.json_normalize(
                hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
            dfff.insert(0, 'sequence', sequence)
            dfff = dfff.set_index('sequence')
            return dfff
        else:
            return None


async def hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    """
    This function performs an HMMER search for a given number of protein sequences in parallel.
    -------------
    Parameters:
    -------------
    df: pandas.core.DataFrame
        A DataFrame that contains protein sequences.
    k: int
        The number of protein sequences to search.
    max_concurrent_requests: int
        The maximum number of concurrent requests to the HMMER API.
    -------------
    Returns:
    -------------
    results_df: pandas.core.DataFrame
        A DataFrame containing the search results for all protein sequences.
    """

    # Check if k is greater than 10000 and print a warning message if so
    if k > 1000:
        print("Use local function for the number of sequences more than 1000.")
        return pd.DataFrame()
    # Extract the protein sequences from the input DataFrame, send them as POST requests
    # to the HMMER server, and retrieve the results asynchronously
    sequences = df['m_protein_seq'][:k]
    tasks = []
    semaphore = asyncio.Semaphore(max_concurrent_requests)

    for seq in sequences:
        task = asyncio.create_task(send_request(semaphore, seq))
        tasks.append(task)

    responses = await asyncio.gather(*tasks)

    tasks = []
    for seq, response in zip(sequences, responses):
        task = asyncio.create_task(process_response(semaphore, seq, response))
        tasks.append(task)

    results = await asyncio.gather(*tasks)
    results_df = pd.concat(
        [result for result in results if result is not None])
    return results_df


# Wrap the async function call in an event loop.
def run_hmmerscanner10(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    """
    This function runs the hmmerscanner function within an event loop and returns the search results as
    a DataFrame.
    -------------
    Parameters:
    -------------
    df: pandas.core.DataFrame
        A DataFrame that contains protein sequences.
    k: int
       The number of protein sequences to search.
    max_concurrent_requests: int
       The maximum number of concurrent requests to the HMMER API.
    -------------
    Returns:
    -------------
    results_df: pandas.core.DataFrame
       A DataFrame containing the search results for all protein sequences.
    """

    # Set up the event loop and call the hmmerscanner function
    nest_asyncio.apply()
    return asyncio.run(hmmerscanner(df, k, max_concurrent_requests))

In [24]:
# Test function 10
%time run_hmmerscanner10(df, 100, 10)

CPU times: user 4.44 s, sys: 537 ms, total: 4.98 s
Wall time: 24.7 s


Unnamed: 0_level_0,alisqacc,aliIdCount,alirfline,is_included,alihmmname,bitscore,display,ievalue,alisqto,aliSim,...,act_site.adh_short_C2.158.Y,act_site.adh_short.158.Y,act_site.adh_short.162.K,act_site.ABC_tran.156.E,act_site.adh_short_C2.150.Y,act_site.adh_short.150.Y,act_site.ABC_tran.175.E,act_site.ABC_tran.161.E,act_site.ABC_tran.169.E,act_site.Aminotran_1_2.211.K
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,27,,1,Sigma70_r2,67.940552,1.0,0.0,113,0.884058,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,22,,1,Sigma70_r4_2,50.895485,1.0,0.0,196,0.870370,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,3,,0,Sigma70_r4,-2.076300,,3000,110,1.000000,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,12,,1,Sigma70_r4,39.875805,0.0,0.0,197,0.795918,...,,,,,,,,,,
MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMPDMAVLDIKMPRMDGMDLLQRLRQKTSMPVIFLTSKDDEIDEVLGLRMGADDYVKKPFSQRLLVERIRALLRRQDVIGGEVVEETEDNKVMVRGELTMDPLRHAVKWKGNDVSLTVTEFLLLQALAQRPGFVKSRDQLMDVAYDDQIYVDDRTIDSHIKRLRKKMRQADDEFSAIETLYGIGYRYNEA,,43,,1,Response_reg,98.908218,1.0,2.0e-28,113,0.854545,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MGLYNTHLGNTMLRNVRNAVIIFLLLLFALATLFPIYFMFISSFGDPVEAGAVSYSLWPEKITLDSYKFFFDYSEYSYRWLGNSLLVATVTMASNVVFATMAGYAFAKIRFRGSKALFGVLLVAMMIPYQVTQVPLYILIVNVFNISNSYTALIAPSLVTVYNIFLAKQFMGSIPKEILESAKVEGCSQWQIFTRIVMPLSKTVMAVMAILTFMESWNTFFWPFLVTNTMDMQTIQVGLKNFRFANTTYFAPMMAGATVSALPMFILFFSLQRYFLEGVTVGAVKG,,32,,1,BPD_transp_1,68.931107,1.0,0.0,277,0.700565,...,,,,,,,,,,
MRIVKAINNNVALAVNEQGHELVIMGKGVGFQKKMDDIIEDAVIEKVFVLETDELSEKLMDLLGEIPAIHLEIADEIVNFAKETFDAKISDNVYLTLTDHINFAIARHEKGMLIRNVMLWEIKKFYKDEFRVGLKALEIIKERLGVQLGEDEAGFIALHIVNARTDGQGMKTTVDMTQVVQDVLNIVTYHFNVVLDETSLNFTRFVTHLQYFAQRLLRNEIVDSGDDFLFEQVQLKYPESFECTGKIDAYLQTAHHATLTKDERVYLTLHIHRVTERNRTNE,,33,,1,PRD,76.673180,1.0,0.0,163,0.806818,...,,,,,,,,,,
MRIVKAINNNVALAVNEQGHELVIMGKGVGFQKKMDDIIEDAVIEKVFVLETDELSEKLMDLLGEIPAIHLEIADEIVNFAKETFDAKISDNVYLTLTDHINFAIARHEKGMLIRNVMLWEIKKFYKDEFRVGLKALEIIKERLGVQLGEDEAGFIALHIVNARTDGQGMKTTVDMTQVVQDVLNIVTYHFNVVLDETSLNFTRFVTHLQYFAQRLLRNEIVDSGDDFLFEQVQLKYPESFECTGKIDAYLQTAHHATLTKDERVYLTLHIHRVTERNRTNE,,23,,1,PRD,64.256958,1.0,0.0,273,0.820225,...,,,,,,,,,,
MRIVKAINNNVALAVNEQGHELVIMGKGVGFQKKMDDIIEDAVIEKVFVLETDELSEKLMDLLGEIPAIHLEIADEIVNFAKETFDAKISDNVYLTLTDHINFAIARHEKGMLIRNVMLWEIKKFYKDEFRVGLKALEIIKERLGVQLGEDEAGFIALHIVNARTDGQGMKTTVDMTQVVQDVLNIVTYHFNVVLDETSLNFTRFVTHLQYFAQRLLRNEIVDSGDDFLFEQVQLKYPESFECTGKIDAYLQTAHHATLTKDERVYLTLHIHRVTERNRTNE,,29,,1,CAT_RBD,71.655800,1.0,4.0e-20,55,0.818182,...,,,,,,,,,,


### Function 11

In [36]:
"""
This script takes a user-defined data frame and an integer k, which sends multiple requests to
the HAMMER API at the same time.
The packages you need to run this script are:

- pandas
- requests
- urllib.parse
- time
- httpx
- nest_asyncio
"""


async def send_request(semaphore, sequence):
    """
    This function sends a POST request to the HMMER API with the given protein sequence.
    -------------
    Parameters:
    -------------
    semaphore: asyncio.Semaphore
        A semaphore object used to limit the number of concurrent requests.
    sequence: str
        A protein sequence to scan.
    -------------
    Returns:
    -------------
    response: httpx.Response
        The response object containing the search results.
    """

    # Set up the POST request with the protein sequence and send it to the HMMER server
    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}
    data = {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}
    data = urllib.parse.urlencode(data).encode('ascii')

    async with semaphore:
        async with httpx.AsyncClient() as client:
            response = await client.post(url, headers=headers, data=data, follow_redirects=False, timeout=15000)

    return response


async def process_response(semaphore, sequence, response, max_retries=3):
    """
    This function extracts the redirect URL from the POST response, sends a GET request to the URL to retrieve 
    the search results, processes the JSON response into a pandas DataFrame, and returns the DataFrame.
    -------------
    Parameters:
    -------------
    semaphore: asyncio.Semaphore
        A semaphore object used to limit the number of concurrent requests.
    sequence: str
        A protein sequence.
    response: httpx.Response
        The response object containing the search results.
    max_retries: int
        The maximum number of times to retry the GET request if it times out.
    -------------
    Returns:
    -------------
    dfff: pandas.core.DataFrame
        The DataFrame containing the search results for the given protein sequence.
    """

    # Extract the redirect URL from the POST response, send a GET request to the URL to retrieve the results
    # and process the JSON response into a pandas DataFrame
    redirect_url = response.headers.get('Location')

    if redirect_url is None:
        print("Error: No redirect URL found in response.")
    else:
        headers = {'Accept': 'application/json'}
        for attempt in range(max_retries):
            try:
                async with semaphore:
                    async with httpx.AsyncClient() as client:
                        response2 = await client.get(redirect_url, headers=headers, timeout=15000)
                break
            except httpx.ReadTimeout:
                if attempt < max_retries - 1:
                    await asyncio.sleep(5 ** attempt)  # Exponential backoff
                else:
                    raise
        try:
            results = response2.json()
            hits = results['results']['hits']
        except KeyError:
            print(
                f"Error: 'results' key not found in response for sequence {sequence}.")
            return None
        except json.JSONDecodeError:
            print(f"Error: JSONDecodeError for sequence {sequence}. Response text: {response2.text}")
            return None

        if hits:
            dfff = pd.json_normalize(
                hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
            dfff.insert(0, 'sequence', sequence)
            dfff = dfff.set_index('sequence')
            #dfff = dfff.iloc[:, :47]  # Keep the first 47 columns
            return dfff
        else:
            return None


async def hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    """
    This function performs an HMMER search for a given number of protein sequences in parallel.
    -------------
    Parameters:
    -------------
    df: pandas.core.DataFrame
        A DataFrame that contains protein sequences.
    k: int
        The number of protein sequences to search.
    max_concurrent_requests: int
        The maximum number of concurrent requests to the HMMER API.
    -------------
    Returns:
    -------------
    results_df: pandas.core.DataFrame
        A DataFrame containing the search results for all protein sequences.
    """

    # Check if k is greater than 10000 and print a warning message if so
    if k > 1000:
        print("Use local function for the number of sequences more than 1000.")
        return pd.DataFrame()
    # Extract the protein sequences from the input DataFrame, send them as POST requests
    # to the HMMER server, and retrieve the results asynchronously
    sequences = df['m_protein_seq'][:k]
    tasks = []
    semaphore = asyncio.Semaphore(max_concurrent_requests)

    for seq in sequences:
        task = asyncio.create_task(send_request(semaphore, seq))
        tasks.append(task)

    responses = await asyncio.gather(*tasks)

    tasks = []
    for seq, response in zip(sequences, responses):
        task = asyncio.create_task(process_response(semaphore, seq, response))
        tasks.append(task)

    results = await asyncio.gather(*tasks)
    common_columns = set.intersection(*(set(df.columns) for df in results if df is not None))
    results_df = pd.concat(
        [result[common_columns] for result in results if result is not None])
    return results_df


# Wrap the async function call in an event loop.
def run_hmmerscanner11(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    """
    This function runs the hmmerscanner function within an event loop and returns the search results as
    a DataFrame.
    -------------
    Parameters:
    -------------
    df: pandas.core.DataFrame
    A DataFrame that contains protein sequences.
    k: int
    The number of protein sequences to search.
    max_concurrent_requests: int
    The maximum number of concurrent requests to the HMMER API.
    -------------
    Returns:
    -------------
    results_df: pandas.core.DataFrame
    A DataFrame containing the search results for all protein sequences.
    """

    # Set up the event loop and call the hmmerscanner function
    nest_asyncio.apply()
    return asyncio.run(hmmerscanner(df, k, max_concurrent_requests))

In [39]:
%time run_hmmerscanner11(df, 50, 20)

CPU times: user 1.78 s, sys: 178 ms, total: 1.96 s
Wall time: 14.5 s


  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not

Unnamed: 0_level_0,aliaseq,alihmmto,alicsline,significant,alihmmfrom,bitscore,alisqfrom,evalue,uniq,aliId,...,aliIdCount,alintseq,score,acc,alihmmname,ievalue,jali,oasc,alippline,alisqto
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,LYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQG...,70,HHHHHHHHHHHHHHHHCTCHHHHHHHHHHHHHHHHHHHHGCCTTTC...,1.0,1,67.940552,45,0.0,1.0,0.391304,...,27,,68.9,PF04542.17,Sigma70_r2,0.0,113,0.96,688899****************************************...,113
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,RRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTR...,54,HHHHHHHHTTS-HHHHHHHHHHHTS---HHHHHHHHT--HHHHHHH...,1.0,1,50.895485,143,0.0,2.0,0.407407,...,22,,51.7,PF08281.15,Sigma70_r4_2,0.0,196,0.95,6899******************************************...,196
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,HRRAVDRVRA,50,HHHHHHHHHH,,41,-2.076300,101,0.0,,0.300000,...,3,,41.0,PF04545.19,Sigma70_r4,3000,110,0.86,799***9995,110
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,CLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLR,49,HHHTS-HHHHHHHHHHTTST--HHHHHHHHTS-HHHHHHHHHHHHHHHH,1.0,1,39.875805,149,0.0,3.0,0.244898,...,12,,41.0,PF04545.19,Sigma70_r4,0.0,197,0.98,69**********************************************9,197
MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMPDMAVLDIKMPRMDGMDLLQRLRQKTSMPVIFLTSKDDEIDEVLGLRMGADDYVKKPFSQRLLVERIRALLRRQDVIGGEVVEETEDNKVMVRGELTMDPLRHAVKWKGNDVSLTVTEFLLLQALAQRPGFVKSRDQLMDVAYDDQIYVDDRTIDSHIKRLRKKMRQADDEFSAIETLYGIGYRYNEA,IALVDDDRNILTSVSMTLEAEGF-EVETYNDGQSALDAFNKRMPDM...,112,EEEESSSHHHHHHHHHHHHHTTEEEEEEESSHHHHHHHHHHHHESE...,1.0,1,98.908218,4,0.0,1.0,0.390909,...,43,,99.6,PF00072.27,Response_reg,2.0e-28,113,0.98,679********************.**********************...,113
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,VWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIM-------...,298,,1.0,166,31.065155,95,0.0,2.0,0.285714,...,28,,47.9,PF13304.9,AAA_21,0.0,192,0.83,566677777778888888888888888888888888886..........,192
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,ALLGPNGAGKTTLISIVCGLVNpsTGTVAVEGHDIIQDyrkarali...,113,EEESSTTSSHHHHHHHHHHHHT..SEEEEEETTTSSCS...........,,2,18.856600,35,0.000076,,0.214286,...,24,,23.3,PF00004.32,AAA,0.0018,195,0.70,589*****************97446679999999997778888888...,195
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,GASRGIGAAIALAFAQEGAAVVINYLQNKEgAEKVASSCREAGGDg...,233,TTTTSHHHHHHHHHHHTT-EEEEEESSGGH.HHHHHHHHHHTTSE....,1.0,1,216.260788,13,0.0,1.0,0.416309,...,97,,216.4,PF13561.9,adh_short_C2,0.0,253,0.95,999***************************9999999999999886...,253
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,KVVLVTGASRGIGAAIALAFAQEGAAVVInYLQNKEGAEKVASSCR...,192,SEEEEESTTSHHHHHHHHHHHHTTSEEEE.EESCHHHHHHHHHHHH...,1.0,1,154.287918,7,0.0,2.0,0.432292,...,83,,154.5,PF00106.28,adh_short,0.0,206,0.95,89********************************************...,206


### Functon 12

In [6]:
import pandas as pd
import requests
import urllib.parse
import time
import httpx
import nest_asyncio
import asyncio
import json


async def send_request(semaphore, sequence, client):
    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}
    data = {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}
    data = urllib.parse.urlencode(data).encode('ascii')

    async with semaphore:
        response = await client.post(url, headers=headers, data=data, follow_redirects=False, timeout=15000)

    return response


async def process_response(semaphore, sequence, response, client, max_retries=3):
    redirect_url = response.headers.get('Location')

    if redirect_url is None:
        print("Error: No redirect URL found in response.")
    else:
        headers = {'Accept': 'application/json'}
        async with semaphore:
            for attempt in range(max_retries):
                try:
                    response2 = await client.get(redirect_url, headers=headers, timeout=15000)
                    break
                except httpx.ReadTimeout:
                    if attempt < max_retries - 1:
                        await asyncio.sleep(5 ** attempt)  # Exponential backoff
                    else:
                        raise
        try:
            results = response2.json()
            hits = results['results']['hits']
        except KeyError:
            print(f"Error: 'results' key not found in response for sequence {sequence}.")
            return None
        except json.JSONDecodeError:
            print(f"Error: JSONDecodeError for sequence {sequence}. Response text: {response2.text}")
            return None

        if hits:
            dfff = pd.json_normalize(
                hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
            dfff.insert(0, 'sequence', sequence)
            dfff = dfff.set_index('sequence')
            return dfff
        else:
            return None


async def hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    if k > 1000:
        print("Use local function for the number of sequences more than 1000.")
        return pd.DataFrame()

    sequences = df['m_protein_seq'][:k]
    tasks = []
    semaphore = asyncio.Semaphore(max_concurrent_requests)

    async with httpx.AsyncClient() as client:
        for seq in sequences:
            task = asyncio.create_task(send_request(semaphore, seq, client))
            tasks.append(task)

        responses = await asyncio.gather(*tasks)

        tasks = []
        for seq, response in zip(sequences, responses):
            task = asyncio.create_task(process_response(semaphore, seq, response, client))
            tasks.append(task)

        results = await asyncio.gather(*tasks)
    common_columns = set.intersection(*(set(df.columns) for df in results if df is not None))
    results_df = pd.concat(
        [result[common_columns] for result in results if result is not None])
    return results_df


def run_hmmerscanner12(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    nest_asyncio.apply()
    return asyncio.run(hmmerscanner(df, k, max_concurrent_requests))


In [7]:
%time run_hmmerscanner12(df, 50, 50)

CPU times: user 1.02 s, sys: 115 ms, total: 1.14 s
Wall time: 20.7 s


  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not

Unnamed: 0_level_0,oasc,alintseq,bias,alihmmto,aliaseq,alisqfrom,aliM,evalue,alimmline,aliId,...,alisqname,jenv,alihindex,alisqto,desc,acc,alirfline,ienv,aliIdCount,cevalue
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,0.96,,1.66,70,LYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQG...,45,71,0.0,,0.391304,...,>seq,114,16681,113,Sigma-70 region 2,PF04542.17,,45,27,0.0
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,0.95,,0.05,54,RRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTR...,143,54,0.0,,0.407407,...,>seq,196,16684,196,"Sigma-70, region 4",PF08281.15,,143,22,0.0
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,0.86,,0.16,50,HRRAVDRVRA,101,50,0.0,,0.300000,...,>seq,110,16683,110,"Sigma-70, region 4",PF04545.19,,100,3,0.47
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,0.98,,0.02,49,CLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLR,149,50,0.0,,0.244898,...,>seq,198,16683,197,"Sigma-70, region 4",PF04545.19,,149,12,0.0
MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMPDMAVLDIKMPRMDGMDLLQRLRQKTSMPVIFLTSKDDEIDEVLGLRMGADDYVKKPFSQRLLVERIRALLRRQDVIGGEVVEETEDNKVMVRGELTMDPLRHAVKWKGNDVSLTVTEFLLLQALAQRPGFVKSRDQLMDVAYDDQIYVDDRTIDSHIKRLRKKMRQADDEFSAIETLYGIGYRYNEA,0.98,,0.46,112,IALVDDDRNILTSVSMTLEAEGF-EVETYNDGQSALDAFNKRMPDM...,4,112,0.0,,0.390909,...,>seq,113,15602,113,Response regulator receiver domain,PF00072.27,,4,43,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,0.83,,0.02,298,VWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIM-------...,95,304,0.0,,0.285714,...,>seq,194,118,192,"AAA domain, putative AbiEii toxin, Type IV TA ...",PF13304.9,,77,28,0.0
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,0.70,,0.29,113,ALLGPNGAGKTTLISIVCGLVNpsTGTVAVEGHDIIQDyrkarali...,35,132,0.000076,,0.214286,...,>seq,210,105,195,ATPase family associated with various cellular...,PF00004.32,,34,24,0.0
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,0.95,,0.01,233,GASRGIGAAIALAFAQEGAAVVINYLQNKEgAEKVASSCREAGGDg...,13,234,0.0,,0.416309,...,>seq,254,415,253,Enoyl-(Acyl carrier protein) reductase,PF13561.9,,13,97,0.0
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,0.95,,0.05,192,KVVLVTGASRGIGAAIALAFAQEGAAVVInYLQNKEGAEKVASSCR...,7,195,0.0,,0.432292,...,>seq,209,414,206,short chain dehydrogenase,PF00106.28,,7,83,0.0


### Function 13

In [4]:
import pandas as pd
import requests
import urllib.parse
import time
import httpx
import nest_asyncio
import asyncio
import json
from concurrent.futures import ProcessPoolExecutor


async def send_request(semaphore, sequence, client):
    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}
    data = {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}
    data = urllib.parse.urlencode(data).encode('ascii')

    async with semaphore:
        response = await client.post(url, headers=headers, data=data, follow_redirects=False, timeout=15000)

    return response


async def process_response(semaphore, sequence, response, client, max_retries=3):
    redirect_url = response.headers.get('Location')

    if redirect_url is None:
        print("Error: No redirect URL found in response.")
    else:
        headers = {'Accept': 'application/json'}
        async with semaphore:
            for attempt in range(max_retries):
                try:
                    response2 = await client.get(redirect_url, headers=headers, timeout=15000)
                    break
                except httpx.ReadTimeout:
                    if attempt < max_retries - 1:
                        await asyncio.sleep(5 ** attempt)  # Exponential backoff
                    else:
                        raise
        try:
            results = response2.json()
            hits = results['results']['hits']
        except KeyError:
            print(f"Error: 'results' key not found in response for sequence {sequence}.")
            return None
        except json.JSONDecodeError:
            print(f"Error: JSONDecodeError for sequence {sequence}. Response text: {response2.text}")
            return None

        if hits:
            loop = asyncio.get_event_loop()
            dfff = await loop.run_in_executor(None, pd.json_normalize, hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
            dfff.insert(0, 'sequence', sequence)
            dfff = dfff.set_index('sequence')
            return dfff
        else:
            return None


async def hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    if k > 1000:
        print("Use local function for the number of sequences more than 1000.")
        return pd.DataFrame()

    sequences = df['m_protein_seq'][:k]
    tasks = []
    semaphore = asyncio.Semaphore(max_concurrent_requests)

    # Use a process pool to parallelize JSON processing and DataFrame creation
    with ProcessPoolExecutor() as executor:
        loop = asyncio.get_event_loop()
        async with httpx.AsyncClient() as client:
            for seq in sequences:
                task = asyncio.create_task(send_request(semaphore, seq, client))
                tasks.append(task)

            responses = await asyncio.gather(*tasks)

            tasks = []
            for seq, response in zip(sequences, responses):
                task = asyncio.create_task(process_response(semaphore, seq, response, client))
                tasks.append(task)

            results = await asyncio.gather(*tasks)
    common_columns = set.intersection(*(set(df.columns) for df in results if df is not None))
    results_df = pd.concat(
        [result[common_columns] for result in results if result is not None])
    return results_df


def run_hmmerscanner13(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    """
    This function runs the hmmerscanner function within an event loop and returns the search results as
    a DataFrame.
    -------------
    Parameters:
    -------------
    df: pandas.core.DataFrame
    A DataFrame that contains protein sequences.
    k: int
    The number of protein sequences to search.
    max_concurrent_requests: int
    The maximum number of concurrent requests to the HMMER API.
    -------------
    Returns:
    -------------
    results_df: pandas.core.DataFrame
    A DataFrame containing the search results for all protein sequences.
    """

    # Set up the event loop and call the hmmerscanner function
    nest_asyncio.apply()
    return asyncio.run(hmmerscanner(df, k, max_concurrent_requests))


In [5]:
%time run_hmmerscanner13(df, 50, 20)

CPU times: user 1.14 s, sys: 149 ms, total: 1.29 s
Wall time: 20.8 s


  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not

Unnamed: 0_level_0,oasc,alintseq,bias,alihmmto,aliaseq,alisqfrom,aliM,evalue,alimmline,aliId,...,alisqname,jenv,alihindex,alisqto,desc,acc,alirfline,ienv,aliIdCount,cevalue
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,0.96,,1.66,70,LYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQG...,45,71,0.0,,0.391304,...,>seq,114,16681,113,Sigma-70 region 2,PF04542.17,,45,27,0.0
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,0.95,,0.05,54,RRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTR...,143,54,0.0,,0.407407,...,>seq,196,16684,196,"Sigma-70, region 4",PF08281.15,,143,22,0.0
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,0.86,,0.16,50,HRRAVDRVRA,101,50,0.0,,0.300000,...,>seq,110,16683,110,"Sigma-70, region 4",PF04545.19,,100,3,0.47
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,0.98,,0.02,49,CLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLR,149,50,0.0,,0.244898,...,>seq,198,16683,197,"Sigma-70, region 4",PF04545.19,,149,12,0.0
MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMPDMAVLDIKMPRMDGMDLLQRLRQKTSMPVIFLTSKDDEIDEVLGLRMGADDYVKKPFSQRLLVERIRALLRRQDVIGGEVVEETEDNKVMVRGELTMDPLRHAVKWKGNDVSLTVTEFLLLQALAQRPGFVKSRDQLMDVAYDDQIYVDDRTIDSHIKRLRKKMRQADDEFSAIETLYGIGYRYNEA,0.98,,0.46,112,IALVDDDRNILTSVSMTLEAEGF-EVETYNDGQSALDAFNKRMPDM...,4,112,0.0,,0.390909,...,>seq,113,15602,113,Response regulator receiver domain,PF00072.27,,4,43,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,0.83,,0.02,298,VWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIM-------...,95,304,0.0,,0.285714,...,>seq,194,118,192,"AAA domain, putative AbiEii toxin, Type IV TA ...",PF13304.9,,77,28,0.0
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,0.70,,0.29,113,ALLGPNGAGKTTLISIVCGLVNpsTGTVAVEGHDIIQDyrkarali...,35,132,0.000076,,0.214286,...,>seq,210,105,195,ATPase family associated with various cellular...,PF00004.32,,34,24,0.0
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,0.95,,0.01,233,GASRGIGAAIALAFAQEGAAVVINYLQNKEgAEKVASSCREAGGDg...,13,234,0.0,,0.416309,...,>seq,254,415,253,Enoyl-(Acyl carrier protein) reductase,PF13561.9,,13,97,0.0
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,0.95,,0.05,192,KVVLVTGASRGIGAAIALAFAQEGAAVVInYLQNKEGAEKVASSCR...,7,195,0.0,,0.432292,...,>seq,209,414,206,short chain dehydrogenase,PF00106.28,,7,83,0.0


### Function 14

In [119]:
import pandas as pd
import requests
import urllib.parse
import time
import httpx
import nest_asyncio
import asyncio
import json
from concurrent.futures import ProcessPoolExecutor


async def send_request(semaphore, sequence, client):
    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}
    data = {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}
    data = urllib.parse.urlencode(data).encode('ascii')

    async with semaphore:
        response = await client.post(url, headers=headers, data=data, follow_redirects=False, timeout=15000)

    return response


async def process_response(semaphore, sequence, response, client, max_retries=3):
    redirect_url = response.headers.get('Location')

    if redirect_url is None:
        print("Error: No redirect URL found in response.")
    else:
        headers = {'Accept': 'application/json'}
        async with semaphore:
            for attempt in range(max_retries):
                try:
                    response2 = await client.get(redirect_url, headers=headers, timeout=15000)
                    break
                except httpx.ReadTimeout:
                    if attempt < max_retries - 1:
                        await asyncio.sleep(5 ** attempt)  # Exponential backoff
                    else:
                        raise
        try:
            results = response2.json()
            hits = results['results']['hits']
        except KeyError:
            print(f"Error: 'results' key not found in response for sequence {sequence}.")
            return None
        except json.JSONDecodeError:
            print(f"Error: JSONDecodeError for sequence {sequence}. Response text: {response2.text}")
            return None

        if hits:
            loop = asyncio.get_event_loop()
            dfff = await loop.run_in_executor(None, pd.json_normalize, hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
            dfff.insert(0, 'sequence', sequence)
            first_col = dfff.columns[0]
            dfff = dfff[[first_col, 'sequence'] + [col for col in dfff.columns if col not in [first_col, 'sequence']]]
            #dfff = df['sequence']
            return dfff
        else:
            return None


async def hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    if k > 1000:
        print("Use local function for the number of sequences more than 1000.")
        return pd.DataFrame()

    sequences = df['m_protein_seq'][:k]
    meso_protein_int_indexes = 'meso_protein_int_index'
    tasks = []
    semaphore = asyncio.Semaphore(max_concurrent_requests)

    # Use a process pool to parallelize JSON processing and DataFrame creation
    with ProcessPoolExecutor() as executor:
        loop = asyncio.get_event_loop()
        async with httpx.AsyncClient() as client:
            for seq in sequences:
                task = asyncio.create_task(send_request(semaphore, seq, client))
                tasks.append(task)

            responses = await asyncio.gather(*tasks)

            tasks = []
            for seq, response in zip(sequences, responses):
                task = asyncio.create_task(process_response(semaphore, seq, response, client))
                tasks.append(task)

            results = await asyncio.gather(*tasks)
    common_columns = set.intersection(*(set(df.columns) for df in results if df is not None))
    results_df = pd.concat(
        [result[common_columns] for result in results if result is not None])
    results_df['meso_protein_int_index'] = meso_protein_int_indexes 
    results_df = results_df.set_index('meso_protein_int_index')

    return results_df


def run_hmmerscanner14(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    """
    This function runs the hmmerscanner function within an event loop and returns the search results as
    a DataFrame.
    -------------
    Parameters:
    -------------
    df: pandas.core.DataFrame
    A DataFrame that contains protein sequences.
    k: int
    The number of protein sequences to search.
    max_concurrent_requests: int
    The maximum number of concurrent requests to the HMMER API.
    -------------
    Returns:
    -------------
    results_df: pandas.core.DataFrame
    A DataFrame containing the search results for all protein sequences.
    """

    # Set up the event loop and call the hmmerscanner function
    nest_asyncio.apply()
    return asyncio.run(hmmerscanner(df, k, max_concurrent_requests))



In [124]:
%time run_hmmerscanner14(df, 1000, 20)

Error: No redirect URL found in response.


  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not

CPU times: user 17.8 s, sys: 1.52 s, total: 19.3 s
Wall time: 4min 5s


Unnamed: 0_level_0,aliaseq,alicsline,alihmmto,alihmmfrom,bitscore,alisqfrom,evalue,aliId,cevalue,alisqdesc,...,alintseq,score,acc,alisqto,alihmmname,ievalue,jali,oasc,alippline,alisqacc
meso_protein_int_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
meso_protein_int_index,LYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQG...,HHHHHHHHHHHHHHHHCTCHHHHHHHHHHHHHHHHHHHHGCCTTTC...,70,1,67.940552,45,0.0,0.391304,0.0,,...,,68.9,PF04542.17,113,Sigma70_r2,0.0,113,0.96,688899****************************************...,
meso_protein_int_index,RRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTR...,HHHHHHHHTTS-HHHHHHHHHHHTS---HHHHHHHHT--HHHHHHH...,54,1,50.895485,143,0.0,0.407407,0.0,,...,,51.7,PF08281.15,196,Sigma70_r4_2,0.0,196,0.95,6899******************************************...,
meso_protein_int_index,HRRAVDRVRA,HHHHHHHHHH,50,41,-2.076300,101,0.0,0.300000,0.47,,...,,41.0,PF04545.19,110,Sigma70_r4,3000,110,0.86,799***9995,
meso_protein_int_index,CLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLR,HHHTS-HHHHHHHHHHTTST--HHHHHHHHTS-HHHHHHHHHHHHHHHH,49,1,39.875805,149,0.0,0.244898,0.0,,...,,41.0,PF04545.19,197,Sigma70_r4,0.0,197,0.98,69**********************************************9,
meso_protein_int_index,IALVDDDRNILTSVSMTLEAEGF-EVETYNDGQSALDAFNKRMPDM...,EEEESSSHHHHHHHHHHHHHTTEEEEEEESSHHHHHHHHHHHHESE...,112,1,98.908218,4,0.0,0.390909,0.0,,...,,99.6,PF00072.27,113,Response_reg,2.0e-28,113,0.98,679********************.**********************...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
meso_protein_int_index,TALVTGATAGIGREFAEQLAAKGI-GLVLVARDVERLATV---SAE...,EEEEETTTSHHHHHHHHHHHHHHS-EEEEE-SS..--HHHHHHHHH...,165,2,59.709538,2,0.0,0.320988,0.0,,...,,60.1,PF08659.13,163,KR,0.0,163,0.86,679********************8.59******5544444...456...,
meso_protein_int_index,ALVTGATAGIGREFAEQLAAKGIGLVLVARDVERLATVSAE-Lrsa...,EEEETTTSHHHHHHHHHHHHTTSEEEEEES-SSTTTCHHTHHG......,163,1,34.150902,3,0.0,0.269939,4.0e-12,,...,,34.5,PF01370.24,169,Epimerase,2.0e-08,169,0.82,59********************9988888885544444433.2578...,
meso_protein_int_index,DGVHLDVGAGEIVALVGESGCGKTTLARTLLGLERPSAGTVSYAGR...,EEEEEEEETTSEEEEEESTTSSHHHHHHHHTTSS--SEEEEEETTC...,137,2,112.488411,44,0.0,0.389706,0.0,,...,,112.8,PF00005.30,194,ABC_tran,0.0,194,0.95,7899******************************************...,
meso_protein_int_index,VETGPVEKILVAPEHPYTQALLSVLPEARAGIP--VVLSGEPPDPS...,,59,1,54.188602,245,0.0,0.421053,0.0,,...,,55.4,PF08352.15,301,oligo_HPY,0.0,301,0.97,8********************************..***********...,


### Function 15

In [115]:
import pandas as pd
import requests
import urllib.parse
import time
import httpx
import nest_asyncio
import asyncio
import json
from concurrent.futures import ProcessPoolExecutor


async def send_request(semaphore, sequence, client):
    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}
    data = {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}
    data = urllib.parse.urlencode(data).encode('ascii')

    async with semaphore:
        response = await client.post(url, headers=headers, data=data, follow_redirects=False, timeout=15000)

    return response


async def process_response(semaphore, sequence, response, client, max_retries=3):
    redirect_url = response.headers.get('Location')

    if redirect_url is None:
        print("Error: No redirect URL found in response.")
    else:
        headers = {'Accept': 'application/json'}
        async with semaphore:
            for attempt in range(max_retries):
                try:
                    response2 = await client.get(redirect_url, headers=headers, timeout=15000)
                    break
                except httpx.ReadTimeout:
                    if attempt < max_retries - 1:
                        await asyncio.sleep(5 ** attempt)  # Exponential backoff
                    else:
                        raise
        try:
            results = response2.json()
            hits = results['results']['hits']
        except KeyError:
            print(f"Error: 'results' key not found in response for sequence {sequence}.")
            return None
        except json.JSONDecodeError:
            print(f"Error: JSONDecodeError for sequence {sequence}. Response text: {response2.text}")
            return None

        if hits:
            loop = asyncio.get_event_loop()
            dfff = await loop.run_in_executor(None, pd.json_normalize, hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
            dfff.insert(0, 'sequence', sequence)
            #dfff('sequence') = df('sequence')
            #dfff = dfff.set_index('sequence')
            #index = 'meso_protein_int_index'
            #dfff=dfff.set_index('index')
            return dfff
        else:
            return None


async def hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    if k > 1000:
        print("Use local function for the number of sequences more than 1000.")
        return pd.DataFrame()

    sequences = df['m_protein_seq'][:k]
    tasks = []
    semaphore = asyncio.Semaphore(max_concurrent_requests)

    # Use a process pool to parallelize JSON processing and DataFrame creation
    with ProcessPoolExecutor() as executor:
        loop = asyncio.get_event_loop()
        async with httpx.AsyncClient() as client:
            for seq in sequences:
                task = asyncio.create_task(send_request(semaphore, seq, client))
                tasks.append(task)

            responses = await asyncio.gather(*tasks)

            tasks = []
            for seq, response in zip(sequences, responses):
                task = asyncio.create_task(process_response(semaphore, seq, response, client))
                tasks.append(task)

            results = await asyncio.gather(*tasks)
    common_columns = set.intersection(*(set(df.columns) for df in results if df is not None))
    results_df = pd.concat(
        [result[common_columns] for result in results if result is not None])
    return results_df


def run_hmmerscanner15(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    """
    This function runs the hmmerscanner function within an event loop and returns the search results as
    a DataFrame.
    -------------
    Parameters:
    -------------
    df: pandas.core.DataFrame
    A DataFrame that contains protein sequences.
    k: int
    The number of protein sequences to search.
    max_concurrent_requests: int
    The maximum number of concurrent requests to the HMMER API.
    -------------
    Returns:
    -------------
    results_df: pandas.core.DataFrame
    A DataFrame containing the search results for all protein sequences.
    """

    # Set up the event loop and call the hmmerscanner function
    nest_asyncio.apply()
    return asyncio.run(hmmerscanner(df, k, max_concurrent_requests))


In [118]:
%time run_hmmerscanner15(df, 1000, 20)

                                             aliaseq  alihmmto  \
0  LYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQG...        70   
1  RRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTR...        54   
2                                         HRRAVDRVRA        50   
3  CLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLR        49   

                                           alicsline  significant  alihmmfrom  \
0  HHHHHHHHHHHHHHHHCTCHHHHHHHHHHHHHHHHHHHHGCCTTTC...          1.0           1   
1  HHHHHHHHTTS-HHHHHHHHHHHTS---HHHHHHHHT--HHHHHHH...          1.0           1   
2                                         HHHHHHHHHH          NaN          41   
3  HHHTS-HHHHHHHHHHTTST--HHHHHHHHTS-HHHHHHHHHHHHHHHH          1.0           1   

    bitscore  alisqfrom  uniq evalue     aliId  ... aliIdCount alintseq score  \
0  67.940552         45   1.0    0.0  0.391304  ...         27           68.9   
1  50.895485        143   2.0    0.0  0.407407  ...         22           51.7   
2  -2.076300        

  [result[common_columns] for result in results if result is not None])


### function 16

In [12]:
import pandas as pd
import requests
import urllib.parse
import time
import httpx
import nest_asyncio
import asyncio
import json
from concurrent.futures import ProcessPoolExecutor


async def send_request(semaphore, sequence, client):
    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}
    data = {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}
    data = urllib.parse.urlencode(data).encode('ascii')

    async with semaphore:
        response = await client.post(url, headers=headers, data=data, follow_redirects=False, timeout=15000)

    return response


async def process_response(semaphore, sequence, response, client, max_retries=3):
    redirect_url = response.headers.get('Location')

    if redirect_url is None:
        print("Error: No redirect URL found in response.")
    else:
        headers = {'Accept': 'application/json'}
        async with semaphore:
            for attempt in range(max_retries):
                try:
                    response2 = await client.get(redirect_url, headers=headers, timeout=15000)
                    break
                except httpx.ReadTimeout:
                    if attempt < max_retries - 1:
                        await asyncio.sleep(5 ** attempt)  # Exponential backoff
                    else:
                        raise
        try:
            results = response2.json()
            hits = results['results']['hits']
        except KeyError:
            print(f"Error: 'results' key not found in response for sequence {sequence}.")
            return None
        except json.JSONDecodeError:
            print(f"Error: JSONDecodeError for sequence {sequence}. Response text: {response2.text}")
            return None

        if hits:
            loop = asyncio.get_event_loop()
            dfff = await loop.run_in_executor(None, pd.json_normalize, hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
            dfff.insert(0, 'sequence', sequence)
            dfff = dfff.set_index('sequence')
            return dfff
        else:
            return None


async def hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    if k > 1000:
        print("Use local function for the number of sequences more than 1000.")
        return pd.DataFrame()

    sequences = df['m_protein_seq'][:k]
    tasks = []
    semaphore = asyncio.Semaphore(max_concurrent_requests)

    # Use a process pool to parallelize JSON processing and DataFrame creation
    with ProcessPoolExecutor() as executor:
        loop = asyncio.get_event_loop()
        async with httpx.AsyncClient() as client:
            for seq in sequences:
                task = asyncio.create_task(send_request(semaphore, seq, client))
                tasks.append(task)

            responses = await asyncio.gather(*tasks)

            tasks = []
            for seq, response in zip(sequences, responses):
                task = asyncio.create_task(process_response(semaphore, seq, response, client))
                tasks.append(task)

            results = await asyncio.gather(*tasks)
    common_columns = set.intersection(*(set(df.columns) for df in results if df is not None))
    results_df = pd.concat(
        [result[common_columns] for result in results if result is not None])
    return

    if hits:
        loop = asyncio.get_event_loop()
        dfff = await loop.run_in_executor(None, pd.json_normalize, hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
        dfff.insert(0, 'sequence', sequence)
        dfff['prot_pair_index'] = dfff.groupby('sequence').ngroup()
        dfff = dfff.set_index(['prot_pair_index', 'sequence'])
        return dfff
    else:
        return pd.DataFrame()


def run_hmmerscanner16(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    """
    This function runs the hmmerscanner function within an event loop and returns the search results as
    a DataFrame.
    -------------
    Parameters:
    -------------
    df: pandas.core.DataFrame
    A DataFrame that contains protein sequences.
    k: int
    The number of protein sequences to search.
    max_concurrent_requests: int
    The maximum number of concurrent requests to the HMMER API.
    -------------
    Returns:
    -------------
    results_df: pandas.core.DataFrame
    A DataFrame containing the search results for all protein sequences.
    """

    # Set up the event loop and call the hmmerscanner function
    nest_asyncio.apply()
    return asyncio.run(hmmerscanner(df, k, max_concurrent_requests))


In [13]:
%time run_hmmerscanner16(df, 50, 20)

CPU times: user 829 ms, sys: 115 ms, total: 944 ms
Wall time: 14.6 s


  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not

### function 17

In [15]:
import pandas as pd
import requests
import urllib.parse
import time
import httpx
import nest_asyncio
import asyncio
import json
from concurrent.futures import ProcessPoolExecutor


async def send_request(semaphore, sequence, client):
    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}
    data = {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}
    data = urllib.parse.urlencode(data).encode('ascii')

    async with semaphore:
        response = await client.post(url, headers=headers, data=data, follow_redirects=False, timeout=15000)

    return response


async def process_response(semaphore, sequence, response, client, prot_pair_index, max_retries=3):
    redirect_url = response.headers.get('Location')

    if redirect_url is None:
        print("Error: No redirect URL found in response.")
    else:
        headers = {'Accept': 'application/json'}
        async with semaphore:
            for attempt in range(max_retries):
                try:
                    response2 = await client.get(redirect_url, headers=headers, timeout=15000)
                    break
                except httpx.ReadTimeout:
                    if attempt < max_retries - 1:
                        await asyncio.sleep(5 ** attempt)  # Exponential backoff
                    else:
                        raise
        try:
            results = response2.json()
            hits = results['results']['hits']
        except KeyError:
            print(f"Error: 'results' key not found in response for sequence {sequence}.")
            return None
        except json.JSONDecodeError:
            print(f"Error: JSONDecodeError for sequence {sequence}. Response text: {response2.text}")
            return None

        if hits:
            loop = asyncio.get_event_loop()
            dfff = await loop.run_in_executor(None, pd.json_normalize, hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
            dfff.insert(0, 'sequence', sequence)
            dfff.insert(0, 'prot_pair_index', prot_pair_index)  # Add new column here
            dfff = dfff.set_index('prot_pair_index')  # Set new column as index
            return dfff
        else:
            return None


async def hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    if k > 1000:
        print("Use local function for the number of sequences more than 1000.")
        return pd.DataFrame()

    sequences = df['m_protein_seq'][:k]
    tasks = []
    semaphore = asyncio.Semaphore(max_concurrent_requests)
    
    

    # Use a process pool to parallelize JSON processing and DataFrame creation
    with ProcessPoolExecutor() as executor:
        loop = asyncio.get_event_loop()
        async with httpx.AsyncClient() as client:
            for i, seq in enumerate(sequences):
                task = asyncio.create_task(send_request(semaphore, seq, client))
                tasks.append(task)

            responses = await asyncio.gather(*tasks)

            tasks = []
            for i, (seq, response) in enumerate(zip(sequences, responses)):
                task = asyncio.create_task(process_response(semaphore, seq, response, client, i))  # Include index here
                tasks.append(task)

            results = await asyncio.gather(*tasks)
    common_columns = set.intersection(*(set(df.columns) for df in results if df is not None))
    results_df = pd.concat(
        [result[common_columns] for result in results if result is not None])
    return results_df


def run_hmmerscanner17(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    """
    This function runs the hmmerscanner function within an event loop and returns the search results as
    a DataFrame.
    -------------
    Parameters:
    -------------
    df: pandas.core.DataFrame
    A DataFrame that contains protein sequences.
    k: int
    The number of protein sequences to search.
    max_concurrent_requests: int
    The maximum number of concurrent requests to the HMMER API.
    -------------
    Returns:
    -------------
    results_df: pandas.core.DataFrame
    A DataFrame containing the search results for all protein sequences.
    """

    # Set up the event loop and call the hmmerscanner function
    nest_asyncio.apply()
    return asyncio.run(hmmerscanner(df, k, max_concurrent_requests))


In [16]:
%time run_hmmerscanner17(df, 50, 20)

CPU times: user 1.05 s, sys: 130 ms, total: 1.18 s
Wall time: 19.9 s


  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not

Unnamed: 0_level_0,oasc,alintseq,bias,alihmmto,aliaseq,alisqfrom,aliM,evalue,alimmline,aliId,...,alisqname,jenv,alihindex,alisqto,desc,acc,alirfline,ienv,aliIdCount,cevalue
prot_pair_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.96,,1.66,70,LYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQG...,45,71,0.0,,0.391304,...,>seq,114,16681,113,Sigma-70 region 2,PF04542.17,,45,27,0.0
0,0.95,,0.05,54,RRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTR...,143,54,0.0,,0.407407,...,>seq,196,16684,196,"Sigma-70, region 4",PF08281.15,,143,22,0.0
0,0.86,,0.16,50,HRRAVDRVRA,101,50,0.0,,0.300000,...,>seq,110,16683,110,"Sigma-70, region 4",PF04545.19,,100,3,0.47
0,0.98,,0.02,49,CLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLR,149,50,0.0,,0.244898,...,>seq,198,16683,197,"Sigma-70, region 4",PF04545.19,,149,12,0.0
1,0.98,,0.46,112,IALVDDDRNILTSVSMTLEAEGF-EVETYNDGQSALDAFNKRMPDM...,4,112,0.0,,0.390909,...,>seq,113,15602,113,Response regulator receiver domain,PF00072.27,,4,43,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48,0.83,,0.02,298,VWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIM-------...,95,304,0.0,,0.285714,...,>seq,194,118,192,"AAA domain, putative AbiEii toxin, Type IV TA ...",PF13304.9,,77,28,0.0
48,0.70,,0.29,113,ALLGPNGAGKTTLISIVCGLVNpsTGTVAVEGHDIIQDyrkarali...,35,132,0.000076,,0.214286,...,>seq,210,105,195,ATPase family associated with various cellular...,PF00004.32,,34,24,0.0
49,0.95,,0.01,233,GASRGIGAAIALAFAQEGAAVVINYLQNKEgAEKVASSCREAGGDg...,13,234,0.0,,0.416309,...,>seq,254,415,253,Enoyl-(Acyl carrier protein) reductase,PF13561.9,,13,97,0.0
49,0.95,,0.05,192,KVVLVTGASRGIGAAIALAFAQEGAAVVInYLQNKEGAEKVASSCR...,7,195,0.0,,0.432292,...,>seq,209,414,206,short chain dehydrogenase,PF00106.28,,7,83,0.0


### function 18

In [6]:
import pandas as pd
import requests
import urllib.parse
import time
import httpx
import nest_asyncio
import asyncio
import json
from concurrent.futures import ProcessPoolExecutor


async def send_request(semaphore, sequence, client):
    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}
    data = {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}
    data = urllib.parse.urlencode(data).encode('ascii')

    async with semaphore:
        response = await client.post(url, headers=headers, data=data, follow_redirects=False, timeout=15000)

    return response


async def process_response(semaphore, sequence, response, client, prot_pair_index, max_retries=3):
    redirect_url = response.headers.get('Location')

    if redirect_url is None:
        print("Error: No redirect URL found in response.")
    else:
        headers = {'Accept': 'application/json'}
        async with semaphore:
            for attempt in range(max_retries):
                try:
                    response2 = await client.get(redirect_url, headers=headers, timeout=15000)
                    break
                except httpx.ReadTimeout:
                    if attempt < max_retries - 1:
                        await asyncio.sleep(5 ** attempt)  # Exponential backoff
                    else:
                        raise
        try:
            results = response2.json()
            hits = results['results']['hits']
        except KeyError:
            print(f"Error: 'results' key not found in response for sequence {sequence}.")
            return None
        except json.JSONDecodeError:
            print(f"Error: JSONDecodeError for sequence {sequence}. Response text: {response2.text}")
            return None
    

        if hits:
            loop = asyncio.get_event_loop()
            dfff = await loop.run_in_executor(None, pd.json_normalize, hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
            dfff.insert(0, 'sequence', sequence)
            dfff.insert(0, 'prot_pair_index', prot_pair_index)  # Add new column here
            dfff = dfff.set_index('prot_pair_index')  # Set new column as index
            return dfff
        else:
            return None


async def hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    if k > 1000:
        print("Use local function for the number of sequences more than 1000.")
        return pd.DataFrame()

    sequences = df['m_protein_seq'][:k]
    indices = df['prot_pair_index'][:k]  # Get corresponding prot_pair_index values
    tasks = []
    semaphore = asyncio.Semaphore(max_concurrent_requests)

    # Use a process pool to parallelize JSON processing and DataFrame creation
    with ProcessPoolExecutor() as executor:
        loop = asyncio.get_event_loop()
        async with httpx.AsyncClient() as client:
            for seq, idx in zip(sequences, indices):  # Include the index here
                task = asyncio.create_task(send_request(semaphore, seq, client))
                tasks.append(task)

            responses = await asyncio.gather(*tasks)

            tasks = []
            for (seq, idx), response in zip(zip(sequences, indices), responses):  # Include the index here
                task = asyncio.create_task(process_response(semaphore, seq, response, client, idx))  # idx is the prot
                tasks.append(task)
                
            results = await asyncio.gather(*tasks)
    common_columns = set.intersection(*(set(df.columns) for df in results if df is not None))
    results_df = pd.concat(
        [result[common_columns] for result in results if result is not None])
    return results_df


def run_hmmerscanner18(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    """
    This function runs the hmmerscanner function within an event loop and returns the search results as
    a DataFrame.
    -------------
    Parameters:
    -------------
    df: pandas.core.DataFrame
    A DataFrame that contains protein sequences.
    k: int
    The number of protein sequences to search.
    max_concurrent_requests: int
    The maximum number of concurrent requests to the HMMER API.
    -------------
    Returns:
    -------------
    results_df: pandas.core.DataFrame
    A DataFrame containing the search results for all protein sequences.
    """

    # Set up the event loop and call the hmmerscanner function
    nest_asyncio.apply()
    return asyncio.run(hmmerscanner(df, k, max_concurrent_requests))




In [27]:
%time run_hmmerscanner18(df, 1000, 20)

  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not None])
  [result[common_columns] for result in results if result is not

CPU times: user 16.2 s, sys: 1.47 s, total: 17.7 s
Wall time: 1min 57s


Unnamed: 0_level_0,oasc,alintseq,bias,alihmmto,aliaseq,alisqfrom,aliM,evalue,alimmline,aliId,...,alisqname,jenv,alisqto,desc,acc,alirfline,cevalue,ienv,aliIdCount,alihmmdesc
prot_pair_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
48641291,0.96,,1.66,70,LYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQG...,45,71,0.0,,0.391304,...,>seq,114,113,Sigma-70 region 2,PF04542.17,,0.0,45,27,Sigma-70 region 2
48641291,0.95,,0.05,54,RRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTR...,143,54,0.0,,0.407407,...,>seq,196,196,"Sigma-70, region 4",PF08281.15,,0.0,143,22,"Sigma-70, region 4"
48641291,0.86,,0.16,50,HRRAVDRVRA,101,50,0.0,,0.300000,...,>seq,110,110,"Sigma-70, region 4",PF04545.19,,0.47,100,3,"Sigma-70, region 4"
48641291,0.98,,0.02,49,CLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLR,149,50,0.0,,0.244898,...,>seq,198,197,"Sigma-70, region 4",PF04545.19,,0.0,149,12,"Sigma-70, region 4"
92992745,0.98,,0.46,112,IALVDDDRNILTSVSMTLEAEGF-EVETYNDGQSALDAFNKRMPDM...,4,112,0.0,,0.390909,...,>seq,113,113,Response regulator receiver domain,PF00072.27,,0.0,4,43,Response regulator receiver domain
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11830119,0.86,,2.53,165,TALVTGATAGIGREFAEQLAAKGI-GLVLVARDVERLATV---SAE...,2,180,0.0,,0.320988,...,>seq,180,163,KR domain,PF08659.13,,0.0,1,52,KR domain
11830119,0.82,,0.10,163,ALVTGATAGIGREFAEQLAAKGIGLVLVARDVERLATVSAE-Lrsa...,3,241,0.0,,0.269939,...,>seq,187,169,NAD dependent epimerase/dehydratase family,PF01370.24,,4.0e-12,3,44,NAD dependent epimerase/dehydratase family
69817379,0.95,,0.00,137,DGVHLDVGAGEIVALVGESGCGKTTLARTLLGLERPSAGTVSYAGR...,44,137,0.0,,0.389706,...,seq,194,194,ABC transporter,0,xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx...,0.0,43,53,ABC transporter
69817379,0.97,,0.02,59,VETGPVEKILVAPEHPYTQALLSVLPEARAGIP--VVLSGEPPDPS...,245,65,0.0,,0.421053,...,seq,305,301,"Oligopeptide/dipeptide transporter, C-terminal...",0,,0.0,245,24,"Oligopeptide/dipeptide transporter, C-terminal..."


### function 19

In [83]:
import pandas as pd
import httpx
import urllib.parse
import asyncio
import nest_asyncio
import json
from concurrent.futures import ProcessPoolExecutor
from IPython.display import display


async def send_request(semaphore, sequences, client):
    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'}
    data = {'hmmdb': 'pfam', 'seq': '\n'.join(sequences)}
    data = urllib.parse.urlencode(data).encode('ascii')

    async with semaphore:
        response = await client.post(url, headers=headers, data=data, follow_redirects=False, timeout=30)  # Increase the timeout to 30 seconds

    return response


async def process_response(semaphore, sequences, responses, client, indices, max_retries=3):
    tasks = []

    async def fetch_results(url):
        headers = {'Accept': 'application/json'}
        for attempt in range(max_retries):
            try:
                response = await client.get(url, headers=headers, timeout=30)  # Increase the timeout to 30 seconds
                return response
            except httpx.ReadTimeout:
                if attempt < max_retries - 1:
                    await asyncio.sleep(5 ** attempt)  # Exponential backoff
                else:
                    raise

    async with semaphore:
        for response in responses:
            redirect_url = response.headers.get('Location')
            if redirect_url is not None:
                task = asyncio.create_task(fetch_results(redirect_url))
                tasks.append(task)
            else:
                print("Error: No redirect URL found in response.")

    try:
        results = await asyncio.gather(*tasks)
    except asyncio.CancelledError:
        print("Error: Request cancellation occurred.")
        return None

    dfs = []
    for sequence, response, indices in zip(sequences, results, indices):
        try:
            results_json = response.json()
            hits = results_json['results']['hits']
            df = pd.json_normalize(hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
            df.insert(0, 'sequence', sequence)
            df.insert(0, 'prot_pair_index', indices)
            dfs.append(df)
        except KeyError:
            print(f"Error: 'results' key not found in response for sequence {sequence}.")
        except json.JSONDecodeError:
            print(f"Error: JSONDecodeError for sequence {sequence}. Response text: {response.text}")

    if dfs:
        results_df = pd.concat(dfs)
        results_df.set_index('prot_pair_index', inplace=True)
        return results_df
    else:
        return None


async def hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int, batch_size: int = 10):
    if k > 1000:
        print("Use local function for the number of sequences more than 1000.")
        return pd.DataFrame()

    sequences = df['m_protein_seq'][:k]
    indices = df['prot_pair_index'][:k]
    tasks = []
    semaphore = asyncio.Semaphore(max_concurrent_requests)

    with ProcessPoolExecutor() as executor:
        async with httpx.AsyncClient() as client:
            for i in range(0, len(sequences), batch_size):
                batch_sequences = sequences[i:i+batch_size]
                batch_indices = indices[i:i+batch_size]
                task = asyncio.create_task(send_request(semaphore, batch_sequences, client))
                tasks.append((batch_sequences, task, batch_indices))

            responses = await asyncio.gather(*[task for _, task, _ in tasks])

            tasks = []
            for (batch_sequences, response, batch_indices) in zip([t for t, _, _ in tasks], responses, [i for _, _, i in tasks]):
                task = asyncio.create_task(process_response(semaphore, batch_sequences, response, client, batch_indices))
                tasks.append(task)

            results = await asyncio.gather(*tasks)

    if results:
        common_columns = list(set(results[0].columns).intersection(*[set(df.columns) for df in results if df is not None]))
        results_df = pd.concat([df[common_columns] for df in results if df is not None], ignore_index=True)
        return results_df
    else:
        return None



def run_hmmerscanner19(df: pd.DataFrame, k: int, max_concurrent_requests: int, batch_size: int = 10):
    nest_asyncio.apply()
    results = asyncio.run(hmmerscanner(df, k, max_concurrent_requests, batch_size))
    if results is not None:
        display(results)
    else:
        print("No results found.")
    return results



In [84]:
%time run_hmmerscanner19(df, 1000, 20, 10)


No results found.
CPU times: user 927 ms, sys: 84.8 ms, total: 1.01 s
Wall time: 20.7 s


### Function 20

In [99]:
import pandas as pd
import requests
import urllib.parse
import time
import httpx
import nest_asyncio
import asyncio
import json
from concurrent.futures import ProcessPoolExecutor


async def send_request(semaphore, sequence, client):
    """
    Sends a POST request to the HMMER API with a protein sequence.
    -------------
    Parameters:
    -------------
    semaphore: asyncio.Semaphore
        A semaphore to limit concurrent requests.
    sequence: str
        The protein sequence to be sent in the request.
    client: httpx.AsyncClient
        An HTTP client for sending the request.
    -------------
    Returns:
    -------------
    response: httpx.Response
        The response received from the HMMER API.
    """

    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}
    data = {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}
    data = urllib.parse.urlencode(data).encode('ascii')

    async with semaphore:
        response = await client.post(url, headers=headers, data=data, follow_redirects=False, timeout=15000)

    return response


async def process_response(semaphore, sequence, response, client, prot_pair_index, max_retries=3):
    """
    Processes the response received from the HMMER API.
    -------------
    Parameters:
    -------------
    semaphore: asyncio.Semaphore
        A semaphore to limit concurrent requests.
    sequence: str
        The protein sequence associated with the response.
    response: httpx.Response
        The response received from the HMMER API.
    client: httpx.AsyncClient
        An HTTP client for sending subsequent requests.
    prot_pair_index: int
        The protein pair index associated with the sequence.
    max_retries: int, optional
        The maximum number of retries for failed requests (default is 3).
    -------------
    Returns:
    -------------
    dfff: pd.DataFrame or None
        A DataFrame containing the search results for the protein sequence, or None if an error occurred.
    """

    redirect_url = response.headers.get('Location')

    if redirect_url is None:
        print("Error: No redirect URL found in response.")
    else:
        headers = {'Accept': 'application/json'}
        async with semaphore:
            for attempt in range(max_retries):
                try:
                    response2 = await client.get(redirect_url, headers=headers, timeout=15000)
                    break
                except httpx.ReadTimeout:
                    if attempt < max_retries - 1:
                        # Exponential backoff
                        await asyncio.sleep(5 ** attempt)
                    else:
                        raise
        try:
            results = response2.json()
            hits = results['results']['hits']
        except KeyError:
            print(
                f"Error: 'results' key not found in response for sequence {sequence}.")
            return None
        except json.JSONDecodeError:
            print(
                f"Error: JSONDecodeError for sequence {sequence}. Response text: {response2.text}")
            return None

        if hits:
            loop = asyncio.get_event_loop()
            dfff = await loop.run_in_executor(None, pd.json_normalize, hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
            dfff.insert(0, 'sequence', sequence)
            # Add new column here
            dfff.insert(0, 'prot_pair_index', prot_pair_index)
            dfff = dfff.set_index('prot_pair_index')  # Set new column as index
            return dfff
        else:
            return None


async def hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    """
    Runs the HMMER scanner for protein sequences.
    -------------
    Parameters:
    -------------
    df: pd.DataFrame
        A DataFrame that contains protein sequences.
    k: int
        The number of protein sequences to search.
    max_concurrent_requests: int
        The maximum number of concurrent requests to the HMMER API.
    -------------
    Returns:
    -------------
    results_df: pd.DataFrame
        A DataFrame containing the search results for all protein sequences.
    """

    if k > 1000:
        print("Use local function for the number of sequences more than 1000.")
        return pd.DataFrame()

    sequences = df['m_protein_seq'][:k]
    # Get corresponding prot_pair_index values
    indices = df['prot_pair_index'][:k]
    tasks = []
    semaphore = asyncio.Semaphore(max_concurrent_requests)

    # Use a process pool to parallelize JSON processing and DataFrame creation
    with ProcessPoolExecutor() as executor:
        loop = asyncio.get_event_loop()
        async with httpx.AsyncClient() as client:
            for seq, idx in zip(sequences, indices):  # Include the index here
                task = asyncio.create_task(
                    send_request(semaphore, seq, client))
                tasks.append(task)

            responses = await asyncio.gather(*tasks)

            tasks = []
            for (seq, idx), response in zip(zip(sequences, indices), responses):  # Include the index here
                task = asyncio.create_task(process_response(
                    semaphore, seq, response, client, idx))  # idx is the prot
                tasks.append(task)

            results = await asyncio.gather(*tasks)
    common_columns = set.intersection(
        *(set(df.columns) for df in results if df is not None))
    results_df = pd.concat(
        [result[common_columns] for result in results if result is not None])
    return results_df


def run_hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    """
    This function runs the hmmerscanner function within an event loop and returns the search results as
    a DataFrame.
    -------------
    Parameters:
    -------------
    df: pandas.core.DataFrame
    A DataFrame that contains protein sequences.
    k: int
    The number of protein sequences to search.
    max_concurrent_requests: int
    The maximum number of concurrent requests to the HMMER API.
    -------------
    Returns:
    -------------
    
    results_df: pandas.core.DataFrame
    A DataFrame containing the search results for all protein sequences.
    """

    # Set up the event loop and call the hmmerscanner function
    nest_asyncio.apply()
    return asyncio.run(hmmerscanner(df, k, max_concurrent_requests))

In [104]:
%time run_hmmerscanner(df, 2, 20)

CPU times: user 60 ms, sys: 29.3 ms, total: 89.2 ms
Wall time: 2.35 s


Unnamed: 0_level_0,alisqname,alihmmdesc,name,cevalue,alihmmname,alihmmfrom,jenv,aliId,aliSim,alintseq,...,acc,alisqfrom,evalue,aliaseq,bitscore,alimodel,jali,uniq,sequence,aliL
prot_pair_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
48641291,>seq,Sigma-70 region 2,Sigma70_r2,0.0,Sigma70_r2,1,114,0.391304,0.884058,,...,PF04542.17,45,0.0,LYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQG...,67.940552,lverylplvkrlarrllgsgadaeDlvQegflrlwraverfdperg...,113,1.0,MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLY...,206
48641291,>seq,"Sigma-70, region 4",Sigma70_r4_2,0.0,Sigma70_r4_2,1,196,0.407407,0.87037,,...,PF08281.15,143,0.0,RRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTR...,50.895485,rqalrealaeLperqreifllryleglsykEIAellgisegtVksr...,196,2.0,MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLY...,206
48641291,>seq,"Sigma-70, region 4",Sigma70_r4,0.47,Sigma70_r4,41,110,0.3,1.0,,...,PF04545.19,101,0.0,HRRAVDRVRA,-2.0763,ekrAlrkLRk,110,,MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLY...,206
48641291,>seq,"Sigma-70, region 4",Sigma70_r4,0.0,Sigma70_r4,1,198,0.244898,0.795918,,...,PF04545.19,149,0.0,CLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLR,39.875805,aLasLpererevlelrfgeelTleEigerlgiSrerVrqiekrAlrkLR,197,3.0,MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLY...,206
92992745,>seq,Response regulator receiver domain,Response_reg,0.0,Response_reg,1,113,0.390909,0.854545,,...,PF00072.27,4,0.0,IALVDDDRNILTSVSMTLEAEGF-EVETYNDGQSALDAFNKRMPDM...,98.908218,vlivdDdplvrellrqlleeegyeevaeaedgkealellkeekvdl...,113,1.0,MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMP...,233
92992745,>seq,"Transcriptional regulatory protein, C terminal",Trans_reg_C,1.4,Trans_reg_C,49,122,0.272727,0.545455,,...,PF00486.31,108,0.0,LVERIRALLRR,-3.556926,hisrLRkkLed,118,,MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMP...,233
92992745,>seq,"Transcriptional regulatory protein, C terminal",Trans_reg_C,0.0,Trans_reg_C,1,230,0.486842,0.868421,,...,PF00486.31,154,0.0,GNDVSLTVTEFLLLQALAQRPGFVKSRDQLMDVAYDDQIYVDDRTI...,76.683037,geeveltpkefklLelLaenpgrvvsreqLleevwgededvddrtv...,229,2.0,MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMP...,233


# Best function


send_request function: This function sends a single sequence to the HMMER API as a POST request.

process_response function: Once the response for a particular sequence is received from the API, this function processes the response. If there's useful data in the response (like protein family information), it extracts that data and stores it in a DataFrame. If there's no relevant data in the response, it returns None.

hmmerscanner function: This is the main function that orchestrates the previous two functions for multiple sequences. It takes a DataFrame with protein sequences, a number of sequences to process (k), and the maximum number of concurrent requests to handle. It creates tasks to send requests and process responses, and then runs these tasks asynchronously. The results (DataFrames from each processed response) are then gathered and combined into a single DataFrame.

run_hmmerscanner function: This is the entry-point function that users would generally call. It sets up the necessary asyncio event loop (for managing the asynchronous tasks) and then runs the hmmerscanner function inside it. It returns the final DataFrame that is produced by the hmmerscanner function.

In [10]:
import pandas as pd
import requests
import urllib.parse
import time
import httpx
import nest_asyncio
import asyncio
import json
from concurrent.futures import ProcessPoolExecutor


async def send_request(semaphore, sequence, client):
    """
    Sends a POST request to the HMMER API with a protein sequence.
    -------------
    Parameters:
    -------------
    semaphore: asyncio.Semaphore
        A semaphore to limit concurrent requests.
    sequence: str
        The protein sequence to be sent in the request.
    client: httpx.AsyncClient
        An HTTP client for sending the request.
    -------------
    Returns:
    -------------
    response: httpx.Response
        The response received from the HMMER API.
    """

    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}
    data = {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}
    data = urllib.parse.urlencode(data).encode('ascii')

    async with semaphore:
        response = await client.post(url, headers=headers, data=data, follow_redirects=False, timeout=15000)

    return response


async def process_response(semaphore, sequence, response, client, prot_pair_index, max_retries=3):
    """
    Processes the response received from the HMMER API.
    -------------
    Parameters:
    -------------
    semaphore: asyncio.Semaphore
        A semaphore to limit concurrent requests.
    sequence: str
        The protein sequence associated with the response.
    response: httpx.Response
        The response received from the HMMER API.
    client: httpx.AsyncClient
        An HTTP client for sending subsequent requests.
    prot_pair_index: int
        The protein pair index associated with the sequence.
    max_retries: int, optional
        The maximum number of retries for failed requests (default is 3).
    -------------
    Returns:
    -------------
    dfff: pd.DataFrame or None
        A DataFrame containing the search results for the protein sequence, or None if an error occurred.
    """

    redirect_url = response.headers.get('Location')

    if redirect_url is None:
        print("Error: No redirect URL found in response.")
    else:
        headers = {'Accept': 'application/json'}
        async with semaphore:
            for attempt in range(max_retries):
                try:
                    response2 = await client.get(redirect_url, headers=headers, timeout=15000)
                    break
                except httpx.ReadTimeout:
                    if attempt < max_retries - 1:
                        # Exponential backoff
                        await asyncio.sleep(5 ** attempt)
                    else:
                        raise
        try:
            results = response2.json()
            hits = results['results']['hits']
        except KeyError:
            print(
                f"Error: 'results' key not found in response for sequence {sequence}.")
            return None
        except json.JSONDecodeError:
            print(
                f"Error: JSONDecodeError for sequence {sequence}. Response text: {response2.text}")
            return None

        if hits:
            loop = asyncio.get_event_loop()
            dfff = await loop.run_in_executor(None, pd.json_normalize, hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
            dfff.insert(0, 'sequence', sequence)
            # Add new column here
            dfff.insert(0, 'prot_pair_index', prot_pair_index)
            dfff = dfff.set_index('prot_pair_index')  # Set new column as index
            return dfff
        else:
            return None


async def hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    """
    Runs the HMMER scanner for protein sequences.
    -------------
    Parameters:
    -------------
    df: pd.DataFrame
        A DataFrame that contains protein sequences.
    k: int
        The number of protein sequences to search.
    max_concurrent_requests: int
        The maximum number of concurrent requests to the HMMER API.
    -------------
    Returns:
    -------------
    results_df: pd.DataFrame
        A DataFrame containing the search results for all protein sequences.
    """

    if k > 1000:
        print("Use local function for the number of sequences more than 1000.")
        return pd.DataFrame()

    sequences = df['m_protein_seq'][:k]
    # Get corresponding prot_pair_index values
    indices = df['prot_pair_index'][:k]
    tasks = []
    semaphore = asyncio.Semaphore(max_concurrent_requests)

    # Use a process pool to parallelize JSON processing and DataFrame creation
    with ProcessPoolExecutor() as executor:
        loop = asyncio.get_event_loop()
        async with httpx.AsyncClient() as client:
            for seq, idx in zip(sequences, indices):  # Include the index here
                task = asyncio.create_task(
                    send_request(semaphore, seq, client))
                tasks.append(task)

            responses = await asyncio.gather(*tasks)

            tasks = []
            for (seq, idx), response in zip(zip(sequences, indices), responses):  # Include the index here
                task = asyncio.create_task(process_response(
                    semaphore, seq, response, client, idx))  # idx is the prot
                tasks.append(task)

            results = await asyncio.gather(*tasks)
    common_columns = set.intersection(
        *(set(df.columns) for df in results if df is not None))
    results_df = pd.concat(
        [result[list(common_columns)] for result in results if result is not None])
    output = results_df.to_csv("output.csv")
    return results_df


def run_hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    """
    This function runs the hmmerscanner function within an event loop and returns the search results as
    a DataFrame.
    -------------
    Parameters:
    -------------
    df: pandas.core.DataFrame
    A DataFrame that contains protein sequences.
    k: int
    The number of protein sequences to search.
    max_concurrent_requests: int
    The maximum number of concurrent requests to the HMMER API.
    -------------
    Returns:
    -------------
    
    results_df: pandas.core.DataFrame
    A DataFrame containing the search results for all protein sequences.
    """

    # Set up the event loop and call the hmmerscanner function
    nest_asyncio.apply()
    return asyncio.run(hmmerscanner(df, k, max_concurrent_requests))

In [32]:
%time run_hmmerscanner(df, 1000, 20)

CPU times: user 46.4 ms, sys: 48.3 ms, total: 94.7 ms
Wall time: 3.15 s


Unnamed: 0_level_0,alippline,sequence,alihmmname,is_included,iali,alimline,alisqacc,alihmmacc,alimmline,outcompeted,...,aliSimCount,clan,evalue,pvalue,alirfline,is_reported,display,aliaseq,significant,alimodel
prot_pair_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
48641291,688899****************************************...,MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLY...,Sigma70_r2,1,45,l++ + p+v++l rr+l+++a ae+++Qe+++ +wr+++rfdp++g...,,PF04542.17,,0.0,...,61,CL0123,0.0,-52.678535,,1,1.0,LYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQG...,1.0,lverylplvkrlarrllgsgadaeDlvQegflrlwraverfdperg...
48641291,6899******************************************...,MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLY...,Sigma70_r4_2,1,143,r+++r++l++L+ qre+++l y++g+sy ++Aell+++ gtVk+r...,,PF08281.15,,0.0,...,47,CL0123,0.0,-40.444908,,1,1.0,RRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTR...,1.0,rqalrealaeLperqreifllryleglsykEIAellgisegtVksr...
48641291,799***9995,MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLY...,Sigma70_r4,0,101,++rA++++R+,,PF04545.19,,,...,10,,0.0,-32.815961,,0,,HRRAVDRVRA,,ekrAlrkLRk
48641291,69**********************************************9,MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLY...,Sigma70_r4,1,149,+L+ L+ +rev++l +++++++ +++e l ++++V+ +++++l +LR,,PF04545.19,,1.0,...,39,CL0123,0.0,-32.815961,,1,0.0,CLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLR,1.0,aLasLpererevlelrfgeelTleEigerlgiSrerVrqiekrAlrkLR


In [111]:
#with open('output.csv')
amin = pd.read_csv('output.csv')
amin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 50 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   prot_pair_index  7 non-null      int64  
 1   alisqname        7 non-null      object 
 2   alihmmdesc       7 non-null      object 
 3   name             7 non-null      object 
 4   cevalue          7 non-null      float64
 5   alihmmname       7 non-null      object 
 6   alihmmfrom       7 non-null      int64  
 7   jenv             7 non-null      int64  
 8   aliId            7 non-null      float64
 9   aliSim           7 non-null      float64
 10  alintseq         0 non-null      float64
 11  bias             7 non-null      float64
 12  score            7 non-null      float64
 13  aliSimCount      7 non-null      int64  
 14  is_included      7 non-null      int64  
 15  significant      5 non-null      float64
 16  alisqto          7 non-null      int64  
 17  clan             5 n

### Unit test

In [None]:
def test_realData_hmmerscanner():
    """
    In `test_realData_hmmerscanner`, the hmmerscanner function is run on a
    sample DataFrame, and assertions are made to check if the output
    DataFrame contains the expected columns. If an assertion fails,
    an error message is printed to indicate which assertion failed.
    """
    # Read the input data and print here
    # df = pd.read_csv("/Users/amin/ValidProt/FAFSA/learn2therm_sample_50k.csv")
    # 'protein_seq' 'pid'
    
    data = {
        'm_protein_seq': ['MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA'],
        'prot_pair_index': [48641291]
    }

    df = pd.DataFrame(data)

    # run the hmmerscanner function on the sample DataFrame
    results_df = run_hmmerscanner(df, 1, 20)

    # assertion 1: check if the output DataFrame is not empty
    try:
        assert len(results_df) > 0
        print("assertion 1 passed")
    except AssertionError:
        print("assertion 1 failed, because the HMMER server may have a problem")

    # assertion 2: check if the output DataFrame contains 'acc' column
    try:
        assert 'acc' in results_df.columns
        print("assertion 2 passed")
    except AssertionError:
        print("assertion 2 failed, because the HMMER server may have a problem")

    # assertion 3: check if the output DataFrame contains 'name' column
    try:
        assert 'name' in results_df.columns
        print("assertion 3 passed")
    except AssertionError:
        print("assertion 3 failed, because the HMMER server may have a problem")

    # assertion 4: check if the output DataFrame contains 'score' column
    try:
        assert 'score' in results_df.columns
        print("assertion 4 passed")
    except AssertionError:
        print("assertion 4 failed, because the HMMER server may have a problem")

    # assertion 5: check if the output DataFrame contains 'evalue' column
    try:
        assert 'evalue' in results_df.columns
        print("assertion 5 passed")
    except AssertionError:
        print("assertion 5 failed, because the HMMER server may have a problem")

    # assertion 6: check if the output DataFrame contains 'pvalue' column
    try:
        assert 'pvalue' in results_df.columns
        print("assertion 6 passed")
    except AssertionError:
        print("assertion 6 failed, because the HMMER server may have a problem")

    # assertion 7: check if the output DataFrame contains 'desc' column
    try:
        assert 'desc' in results_df.columns
        print("assertion 7 passed")
    except AssertionError:
        print("assertion 7 failed, because the HMMER server may have a problem")

    # assertion 8: check if the output DataFrame does not contain 'tlen' column
    try:
        assert 'tlen' not in results_df.columns
        print("assertion 8 passed")
    except AssertionError:
        print("assertion 8 failed, because the HMMER server may have a problem")

    # assertion 9: check if the output DataFrame does not contain 'ali_len' column
    try:
        assert 'ali_len' not in results_df.columns
        print("assertion 9 passed")
    except AssertionError:
        print("assertion 9 failed, because the HMMER server may have a problem")

    # assertion 10: check if the output DataFrame does not contain 'env_from' column
    try:
        assert 'env_from' not in results_df.columns
        print("assertion 10 passed")
    except AssertionError:
        print("assertion 10 failed, because the HMMER server may have a problem")

In [None]:
test_realData_hmmerscanner()