# Getting the sequence information using the HMMER web API

-------

# Import Libraries and read the input CSV file:

In this part, the libraries are imported and the input CSV file is read.

In [1]:
# Library dependencies
import pandas as pd
import numpy as np
import os

# Needed to send HTTP requests to the API, URL-encode the request data, and wait/raise errors inside the functions
import requests
import urllib.parse
import time

In [2]:
# Read the input data and preview the first rows.
# NOTE(review): this is an absolute path on one machine; the notebook only runs
# elsewhere if the CSV is placed at the same location. Consider a configurable,
# relative data directory.
df = pd.read_csv("/Users/amin/ValidProt/FAFSA/learn2therm_sample_50k.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,thermo_index,...,bit_score_16s,m_ogt,t_ogt,ogt_difference,m_protein_seq,t_protein_seq,m_protein_desc,t_protein_desc,m_protein_len,t_protein_len
0,0,0.287582,0.217822,0.215686,160,0.792079,152,0.737864,131,875,...,1153.0,27.5,50.0,22.5,MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLY...,MPSQITESERIELAERFERDALPLLDQLYSAALRMTRNPADAEDLV...,ECF RNA polymerase sigma factor SigK,sigma-70 family RNA polymerase sigma factor,206,202
1,1,0.319635,0.295359,0.297872,218,0.919831,226,0.969957,282,11324,...,1014.0,25.0,54.0,29.0,MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMP...,MRVLLVEDDPNTSRSIEMMLTHANLNVYATDMGEEGIDLAKLYDYD...,response regulator transcription factor,response regulator transcription factor,233,237
2,2,0.279621,0.234127,0.218924,211,0.837302,210,0.731707,96,875,...,1138.0,28.0,50.0,22.0,MKDTVVFVTGAARGIGAHTARLAVARGARVALVGLEPHLLADLAAE...,MTPEQIFSGQTAIVTGGASGIGAATVEHIARRGGRVFSVDLSYDSP...,SDR family oxidoreductase,SDR family oxidoreductase,287,252
3,3,0.327273,0.200743,0.214712,166,0.6171,163,0.696581,175,875,...,1077.0,28.0,50.0,22.0,MTSGLWERVLDGVWVTIQLLVLSALLATAVSFVVGIARTHRLWIVR...,MAMSRRKRGQLARGIQYAILVIVVVVLALLADWGKIGKAFFDWEAA...,ectoine/hydroxyectoine ABC transporter permeas...,amino acid ABC transporter permease,234,269
4,4,0.33871,0.318182,0.287671,60,0.909091,71,0.8875,61,9827,...,991.0,30.0,50.0,20.0,MIISLRRGLRFIRFIVFFAALVYLFYHVLDLFNGWISPVDQYQMPT...,MKRMVWRTLKVFIIFIACTLLFYFGLRFMHLEYEQFHRYEPPEGPA...,YqzK family protein,YqzK family protein,80,66


-------------

-----------

-----------

# Implemented different functions to have faster code:

Function 1 was uploaded last quarter and is too slow. Function 10 is likely the best one; it is designed for 10,000 sequences.

### Function 1

In [43]:
# 1

"""
This script takes a user-defined DataFrame and an integer k, and sends HTTPS requests to the HMMER API.
The packages you need to run this script are:

- pandas
- requests
- urllib.parse
- time

They were imported in part 1.
"""


def hmmerscanner(df: pd.DataFrame, k: int):
    """
    Send the first k protein sequences to the HMMER web API (hmmscan against Pfam),
    one after another, and collect the reported domains.
    -------------
    Parameters:
    -------------
    df: pandas.core.DataFrame
        A DataFrame with a string column 'm_protein_seq' holding amino acid sequences.
        Only the meso sequences are scanned; change the column name below to scan the
        thermo sequences instead.
    k: int
        The number of sequences to scan. Rows are taken by position from the top of
        df, so the index labels of df do not matter. If k exceeds the number of rows,
        every row is scanned.
    -------------
    Raises:
    -------------
    IOError:
        If a request cannot be completed or the result fetch returns an HTTP error
        status (the exceptions raised by requests derive from IOError).
    -------------
    Returns:
    -------------
    results_df: pandas.core.DataFrame
        One row per reported domain, indexed by the query sequence, with the hit-level
        fields 'acc', 'name', 'score', 'evalue', 'pvalue' and 'desc' repeated on each
        row. Empty when k > 300 or when no sequence produced a result.
    """
    # Check if we need to use the local function instead of the API for large values of k.
    if k > 300:
        print("Use local function for the number of sequences more than 300.")
        return pd.DataFrame()

    # The endpoint and headers are the same for every sequence.
    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}

    # Per-sequence frames are collected here and concatenated once at the end;
    # concatenating inside the loop re-copies all earlier rows on every iteration.
    frames = []

    # .iloc selects by position, so a filtered or re-indexed df works too.
    # max(k, 0) keeps a negative k meaning "scan nothing".
    for sequence in df['m_protein_seq'].iloc[:max(k, 0)]:
        # Submit the sequence; the API answers with a redirect to the result page.
        data = urllib.parse.urlencode(
            {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}).encode('ascii')
        response = requests.post(url, headers=headers,
                                 data=data, allow_redirects=False)
        redirect_url = response.headers.get('Location')

        if redirect_url is None:
            # Without a redirect there is no result to fetch; report and move on.
            print("Error: No redirect URL found in response.")
            continue

        response2 = requests.get(redirect_url, headers=headers)
        # Fail with a clear HTTP error instead of a confusing JSON/Key error later.
        response2.raise_for_status()

        # Flatten hits -> domains into one row per domain.
        hits = response2.json()['results']['hits']
        dfff = pd.json_normalize(
            hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
        dfff.insert(0, 'sequence', sequence)
        frames.append(dfff.set_index('sequence'))

    # pd.concat raises on an empty list, so return an empty frame explicitly.
    return pd.concat(frames) if frames else pd.DataFrame()

In [44]:
# Test Function 1
%time hmmerscanner(df, 50)

CPU times: user 3.7 s, sys: 476 ms, total: 4.18 s
Wall time: 2min 9s


Unnamed: 0_level_0,alisqacc,aliIdCount,alirfline,is_included,alihmmname,bitscore,display,ievalue,alisqto,aliSim,...,act_site.SNO.90.C,act_site.SNO.232.E,act_site.SNO.230.H,act_site.ABC_tran.178.E,act_site.ABC_tran.160.E,act_site.adh_short_C2.159.Y,act_site.adh_short_C2.166.K,act_site.adh_short_C2.162.Y,act_site.adh_short.166.K,act_site.adh_short.162.Y
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,27,,1,Sigma70_r2,67.940552,1.0,0.0,113,0.884058,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,22,,1,Sigma70_r4_2,50.895485,1.0,0.0,196,0.870370,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,3,,0,Sigma70_r4,-2.076300,,3000,110,1.000000,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,12,,1,Sigma70_r4,39.875805,0.0,0.0,197,0.795918,...,,,,,,,,,,
MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMPDMAVLDIKMPRMDGMDLLQRLRQKTSMPVIFLTSKDDEIDEVLGLRMGADDYVKKPFSQRLLVERIRALLRRQDVIGGEVVEETEDNKVMVRGELTMDPLRHAVKWKGNDVSLTVTEFLLLQALAQRPGFVKSRDQLMDVAYDDQIYVDDRTIDSHIKRLRKKMRQADDEFSAIETLYGIGYRYNEA,,43,,1,Response_reg,98.908218,1.0,2.0e-28,113,0.854545,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,28,,1,AAA_21,31.065155,0.0,0.0,192,0.806122,...,,,,,,,,,,
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,24,,0,AAA,18.856600,,0.0018,195,0.616071,...,,,,,,,,,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,97,,1,adh_short_C2,216.260788,1.0,0.0,253,0.828326,...,,,,,,Similarity to P71079,Similarity to P71079,Similarity to Q12634,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,83,,1,adh_short,154.287918,0.0,0.0,206,0.755208,...,,,,,,,,,Similarity to Q06136,Similarity to P0AET8


### Function 2

In [45]:
#2

import pandas as pd
import requests
import urllib.parse
import time
import concurrent.futures

def send_request(sequence):
    """Submit one sequence to the HMMER hmmscan endpoint (Pfam database).

    The sequence is wrapped in a minimal FASTA record and form-encoded.
    Redirects are not followed, so the returned response still carries the
    'Location' header that points at the result.
    """
    form = urllib.parse.urlencode({'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'})
    return requests.post(
        'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan',
        headers={'Content-Type': 'application/x-www-form-urlencoded',
                 'Accept': 'application/json'},
        data=form.encode('ascii'),
        allow_redirects=False,
    )

def process_response(sequence, response):
    """Fetch and flatten the hmmscan result that a submission response points to.

    Parameters
    ----------
    sequence : str
        The query sequence; used as the index of the returned frame.
    response : requests.Response
        The response returned by the submission POST; its 'Location' header is
        the URL of the result.

    Returns
    -------
    pandas.DataFrame or None
        One row per domain, indexed by 'sequence', or None when the result
        could not be obtained or parsed. Failures are printed, not raised, so
        one bad sequence does not abort a whole batch.
    """
    redirect_url = response.headers.get('Location')

    if redirect_url is None:
        print(f"Error: No redirect URL found in response for sequence {sequence}")
        return None

    try:
        response2 = requests.get(redirect_url, headers={'Accept': 'application/json'})
        response2.raise_for_status()
        results = response2.json()
        hits = results['results']['hits']
        dfff = pd.json_normalize(hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
        dfff.insert(0, 'sequence', sequence)
        dfff = dfff.set_index('sequence')
        return dfff
    except requests.exceptions.RequestException as e:
        print(f"Error: Request error for sequence {sequence}: {e}")
        return None
    except KeyError as e:
        # Valid JSON that lacks the expected 'results'/'hits' (or hit-level) keys.
        print(f"Error: Unexpected JSON structure for sequence {sequence}: missing key {e}")
        return None
    except ValueError as e:
        print(f"Error: Could not parse JSON response for sequence {sequence}: {e}")
        print(response2.content)
        return None


def hmmerscanner2(df: pd.DataFrame, k: int, max_workers=50):
    """Scan the first k 'm_protein_seq' entries of df, submitting them from a thread pool.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column 'm_protein_seq'.
    k : int
        Number of sequences to scan; more than 300 is refused.
    max_workers : int
        Size of the thread pool used for the submission requests.

    Returns
    -------
    pandas.DataFrame
        Concatenated per-sequence results in completion order (not input
        order). Empty when k > 300 or when no sequence produced a result.
    """
    if k > 300:
        print("Use local function for the number of sequences more than 300.")
        return pd.DataFrame()

    sequences = df['m_protein_seq'][:k]

    # Collect the per-sequence frames and concatenate once; concatenating inside
    # the loop re-copies all earlier rows on every iteration.
    frames = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures_to_sequences = {executor.submit(send_request, seq): seq for seq in sequences}
        for future in concurrent.futures.as_completed(futures_to_sequences):
            seq = futures_to_sequences[future]
            try:
                response = future.result()
            except Exception as exc:
                # A failed submission is reported and skipped so the batch continues.
                print(f"Error sending request for sequence {seq}: {exc}")
            else:
                # Result fetching happens here, in the calling thread, as each
                # submission completes.
                dfff = process_response(seq, response)
                if dfff is not None:
                    frames.append(dfff)

    # pd.concat raises on an empty list, so return an empty frame explicitly.
    return pd.concat(frames) if frames else pd.DataFrame()


In [46]:
# Test function 2
%time hmmerscanner2(df, 50)

CPU times: user 2.99 s, sys: 512 ms, total: 3.5 s
Wall time: 1min 12s


Unnamed: 0_level_0,alisqacc,aliIdCount,alirfline,is_included,alihmmname,bitscore,display,ievalue,alisqto,aliSim,...,act_site.ABC_tran.162.E,act_site.adh_short_C2.167.Y,act_site.adh_short.171.K,act_site.adh_short.167.Y,act_site.ABC_tran.174.E,act_site.adh_short.155.K,act_site.adh_short.151.Y,act_site.adh_short_C2.151.Y,act_site.adh_short.154.K,act_site.ABC_tran.168.E
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MKDTVVFVTGAARGIGAHTARLAVARGARVALVGLEPHLLADLAAELGDAARWFEADVTDQAALDAAVRGTVDAFGGIDAVVANAGIANRGTIAVGDLEALVRTIEVNLLGTVRTVGATVAAVSARRGYYLLVSSAAAFAALPGMAAYCAAKAGVEHFGNAIRLELAHRGVDVGTAHMSWVDTDLVRDVKDDLPTFRAALDRLPGPFGRSVPVERCAARFLDAIAHRRRRVYVPRSVAVASAFRSVANGPLAGWLTRRAAATSVPELEAQLDALGRGYGRNTAPQQR,,66,,1,adh_short,137.959335,1.0,0.0,192,0.797872,...,,,,,,,,,,
MKDTVVFVTGAARGIGAHTARLAVARGARVALVGLEPHLLADLAAELGDAARWFEADVTDQAALDAAVRGTVDAFGGIDAVVANAGIANRGTIAVGDLEALVRTIEVNLLGTVRTVGATVAAVSARRGYYLLVSSAAAFAALPGMAAYCAAKAGVEHFGNAIRLELAHRGVDVGTAHMSWVDTDLVRDVKDDLPTFRAALDRLPGPFGRSVPVERCAARFLDAIAHRRRRVYVPRSVAVASAFRSVANGPLAGWLTRRAAATSVPELEAQLDALGRGYGRNTAPQQR,,66,,1,adh_short_C2,121.099525,0.0,0.0,217,0.769231,...,,,,,,,,,,
MKDTVVFVTGAARGIGAHTARLAVARGARVALVGLEPHLLADLAAELGDAARWFEADVTDQAALDAAVRGTVDAFGGIDAVVANAGIANRGTIAVGDLEALVRTIEVNLLGTVRTVGATVAAVSARRGYYLLVSSAAAFAALPGMAAYCAAKAGVEHFGNAIRLELAHRGVDVGTAHMSWVDTDLVRDVKDDLPTFRAALDRLPGPFGRSVPVERCAARFLDAIAHRRRRVYVPRSVAVASAFRSVANGPLAGWLTRRAAATSVPELEAQLDALGRGYGRNTAPQQR,,36,,1,KR,37.597282,0.0,0.0,158,0.668831,...,,,,,,,,,,
MKDTVVFVTGAARGIGAHTARLAVARGARVALVGLEPHLLADLAAELGDAARWFEADVTDQAALDAAVRGTVDAFGGIDAVVANAGIANRGTIAVGDLEALVRTIEVNLLGTVRTVGATVAAVSARRGYYLLVSSAAAFAALPGMAAYCAAKAGVEHFGNAIRLELAHRGVDVGTAHMSWVDTDLVRDVKDDLPTFRAALDRLPGPFGRSVPVERCAARFLDAIAHRRRRVYVPRSVAVASAFRSVANGPLAGWLTRRAAATSVPELEAQLDALGRGYGRNTAPQQR,,9,,0,KR,2.940259,,95,234,0.736842,...,,,,,,,,,,
MGSTDRPDLAAMLAPLTRTLIAMERPVLETYGLTMWAYSVLVALSRGPARGQGVLAQEIGADKTRIIAVLDDLQDRGLLHRSPDPADRRARLLELTDEGHRIVAQAQAEIQSREEQLVLQHLSPAERRVFLSALHTLADLPRSLDENDSAAD,,24,,1,MarR_2,38.487373,1.0,0.0,90,0.766667,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MVSTHTAQKSAQVSITRVTQAFRSNGSDLAVLDDLTLDAAPGEFVALVGPSGCGKSTLLRLLAGLDRPLFGTLEVDGTPVRAPSPDRALVFQDPTLFPWRTVRQNVALGPSARGALKKSQQRIEEALELVNLTDFADSWPAQLSGGMAQRAALARALVNDPSVLLLDEPLGKLDALTRRVLQQEILSLWQRQRFTAFLVTHDVNEALLLSDRVVVFSPRPARIREIIEINLPRPRDTASPEFIALRERILGLLDEEGAGHDQPTRETERHD,,13,,0,AAA_21,18.693407,,0.0013,70,0.851852,...,,,,,,,,,,
MVSTHTAQKSAQVSITRVTQAFRSNGSDLAVLDDLTLDAAPGEFVALVGPSGCGKSTLLRLLAGLDRPLFGTLEVDGTPVRAPSPDRALVFQDPTLFPWRTVRQNVALGPSARGALKKSQQRIEEALELVNLTDFADSWPAQLSGGMAQRAALARALVNDPSVLLLDEPLGKLDALTRRVLQQEILSLWQRQRFTAFLVTHDVNEALLLSDRVVVFSPRPARIREIIEINLPRPRDTASPEFIALRERILGLLDEEGAGHDQPTRETERHD,,23,,0,AAA_21,6.316357,,7.7,202,0.725806,...,,,,,,,,,,
MRLPRDERRRQLLRAAHEVFVSNGYHGAAMDEIAEVARVSKPVLYQHFPGKRELYLALLESHLASLTELLVDALQSTTDNKQRVHATMRAYFQFIAQDSQAHRIVFESDLNNDPDVSRRLEEFNAHFADAIAGVISGDTRLSHLEATLLGRAMAGMAQVSARYWLETDGSLDIDAASELIYRLAWRGISRFPKEM,,23,,1,TetR_N,55.683132,1.0,0.0,58,0.872340,...,,,,,,,,,,
MRLPRDERRRQLLRAAHEVFVSNGYHGAAMDEIAEVARVSKPVLYQHFPGKRELYLALLESHLASLTELLVDALQSTTDNKQRVHATMRAYFQFIAQDSQAHRIVFESDLNNDPDVSRRLEEFNAHFADAIAGVISGDTRLSHLEATLLGRAMAGMAQVSARYWLETDGSLDIDAASELIYRLAWRGISRFPKEM,,5,,0,TetR_N,-2.078380,,3800,181,0.777778,...,,,,,,,,,,


### Function 3

In [47]:
#3

import pandas as pd
import requests
import urllib.parse
import concurrent.futures

def send_request(sequence):
    """POST one sequence to the HMMER hmmscan endpoint and return the raw response.

    The request searches the Pfam database and does not follow redirects; the
    caller reads the result URL from the response's 'Location' header.
    """
    endpoint = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    request_headers = {'Content-Type': 'application/x-www-form-urlencoded',
                       'Accept': 'application/json'}
    # The API expects a FASTA record, so prepend a header line to the sequence.
    fasta = f'>seq\n{sequence}'
    body = urllib.parse.urlencode({'hmmdb': 'pfam', 'seq': fasta}).encode('ascii')
    return requests.post(endpoint, headers=request_headers,
                         data=body, allow_redirects=False)

def process_response(sequence, response):
    """Follow the submission's redirect and return the domains as a DataFrame.

    Returns None (after printing a message) when the submission response has
    no 'Location' header. Errors while fetching or parsing the result are not
    caught here and propagate to the caller.
    """
    location = response.headers.get('Location')
    if location is None:
        print("Error: No redirect URL found in response.")
        return None

    payload = requests.get(location, headers={'Accept': 'application/json'}).json()
    # One row per domain; the listed hit-level fields are repeated on each row.
    domains = pd.json_normalize(
        payload['results']['hits'],
        'domains',
        ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'],
    )
    domains.insert(0, 'sequence', sequence)
    return domains.set_index('sequence')

def hmmerscanner3(df: pd.DataFrame, k: int, max_workers=50):
    """Scan the first k 'm_protein_seq' entries of df using a thread pool.

    Per-sequence frames are gathered in a list and concatenated once at the end.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column 'm_protein_seq'.
    k : int
        Number of sequences to scan; more than 300 is refused.
    max_workers : int
        Size of the thread pool used for the submission requests.

    Returns
    -------
    pandas.DataFrame
        Concatenated results in completion order. Empty when k > 300 or when
        no sequence produced a result.
    """
    if k > 300:
        print("Use local function for the number of sequences more than 300.")
        return pd.DataFrame()

    sequences = df['m_protein_seq'][:k]
    results = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures_to_sequences = {executor.submit(send_request, seq): seq for seq in sequences}
        for future in concurrent.futures.as_completed(futures_to_sequences):
            seq = futures_to_sequences[future]
            try:
                response = future.result()
            except Exception as exc:
                # A failed submission is reported and skipped so the batch continues.
                print(f"Error sending request for sequence {seq}: {exc}")
            else:
                dfff = process_response(seq, response)
                if dfff is not None:
                    results.append(dfff)

    # pd.concat raises ValueError on an empty list (k == 0, or every sequence
    # failed), so return an empty frame in that case.
    if not results:
        return pd.DataFrame()

    # Concatenate all DataFrames in the results list at once.
    return pd.concat(results)


In [48]:
# Test function 3
%time hmmerscanner3(df, 50)

CPU times: user 2.97 s, sys: 480 ms, total: 3.45 s
Wall time: 1min 16s


Unnamed: 0_level_0,alisqacc,aliIdCount,alirfline,is_included,alihmmname,bitscore,display,ievalue,alisqto,aliSim,...,act_site.ABC_tran.186.E,act_site.CheB_methylest.166.S,act_site.CheB_methylest.193.H,act_site.CheB_methylest.290.D,act_site.ABC_tran.178.E,act_site.ABC_tran.174.E,act_site.ABC_tran.168.E,act_site.adh_short.152.K,act_site.adh_short.148.Y,act_site.adh_short_C2.148.Y
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,27,,1,Sigma70_r2,67.940552,1.0,0.0,113,0.884058,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,22,,1,Sigma70_r4_2,50.895485,1.0,0.0,196,0.870370,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,3,,0,Sigma70_r4,-2.076300,,3000,110,1.000000,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,12,,1,Sigma70_r4,39.875805,0.0,0.0,197,0.795918,...,,,,,,,,,,
MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMPDMAVLDIKMPRMDGMDLLQRLRQKTSMPVIFLTSKDDEIDEVLGLRMGADDYVKKPFSQRLLVERIRALLRRQDVIGGEVVEETEDNKVMVRGELTMDPLRHAVKWKGNDVSLTVTEFLLLQALAQRPGFVKSRDQLMDVAYDDQIYVDDRTIDSHIKRLRKKMRQADDEFSAIETLYGIGYRYNEA,,43,,1,Response_reg,98.908218,1.0,2.0e-28,113,0.854545,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MVTTGERQRNARGEGARLRLEIVAATQALLADGETATLRSIARRAGISAPSIYRHFPDVDAVMSAVADDAFDELVDALVQKRDRHTDPVARLWAISDGYLDFARDRPHIYRVMFGGVWNAAAALELHPGEDAHFREMGMNAFRLLVAAIQACVDDGTSSSTDPRRDAAALWAGLHGLAQLLVTAPLFDWPAETDRAVVRSLARLKA,,16,,1,TetR_N,30.954098,1.0,0.0,64,0.804878,...,,,,,,,,,,
MKDTVVFVTGAARGIGAHTARLAVARGARVALVGLEPHLLADLAAELGDAARWFEADVTDQAALDAAVRGTVDAFGGIDAVVANAGIANRGTIAVGDLEALVRTIEVNLLGTVRTVGATVAAVSARRGYYLLVSSAAAFAALPGMAAYCAAKAGVEHFGNAIRLELAHRGVDVGTAHMSWVDTDLVRDVKDDLPTFRAALDRLPGPFGRSVPVERCAARFLDAIAHRRRRVYVPRSVAVASAFRSVANGPLAGWLTRRAAATSVPELEAQLDALGRGYGRNTAPQQR,,66,,1,adh_short,137.959335,1.0,0.0,192,0.797872,...,,,,,,,,Similarity to Q06136,Similarity to P0AET8,
MKDTVVFVTGAARGIGAHTARLAVARGARVALVGLEPHLLADLAAELGDAARWFEADVTDQAALDAAVRGTVDAFGGIDAVVANAGIANRGTIAVGDLEALVRTIEVNLLGTVRTVGATVAAVSARRGYYLLVSSAAAFAALPGMAAYCAAKAGVEHFGNAIRLELAHRGVDVGTAHMSWVDTDLVRDVKDDLPTFRAALDRLPGPFGRSVPVERCAARFLDAIAHRRRRVYVPRSVAVASAFRSVANGPLAGWLTRRAAATSVPELEAQLDALGRGYGRNTAPQQR,,66,,1,adh_short_C2,121.099525,0.0,0.0,217,0.769231,...,,,,,,,,,,Similarity to Q12634
MKDTVVFVTGAARGIGAHTARLAVARGARVALVGLEPHLLADLAAELGDAARWFEADVTDQAALDAAVRGTVDAFGGIDAVVANAGIANRGTIAVGDLEALVRTIEVNLLGTVRTVGATVAAVSARRGYYLLVSSAAAFAALPGMAAYCAAKAGVEHFGNAIRLELAHRGVDVGTAHMSWVDTDLVRDVKDDLPTFRAALDRLPGPFGRSVPVERCAARFLDAIAHRRRRVYVPRSVAVASAFRSVANGPLAGWLTRRAAATSVPELEAQLDALGRGYGRNTAPQQR,,36,,1,KR,37.597282,0.0,0.0,158,0.668831,...,,,,,,,,,,


### Function 4

Here's a brief explanation of the changes:

1- I replaced `requests` with the `httpx` library, which supports asynchronous I/O operations.

2- I added the `async` keyword to the `send_request`, `process_response`, and `hmmerscanner` functions. This allows them to be used with asynchronous I/O operations.

3- In the `hmmerscanner` function, I used `asyncio.create_task` to create tasks for processing responses, and then used `asyncio.gather` to await the completion of all tasks.

4- I created a `run_hmmerscanner4` function that wraps the `hmmerscanner` function call in an `asyncio.run` call. This allows you to run the asynchronous code from a synchronous context.

In [49]:
#4

import pandas as pd
import urllib.parse
import asyncio
import httpx
import nest_asyncio


async def send_request(sequence):
    """Asynchronously POST one sequence to the HMMER hmmscan endpoint (Pfam).

    A fresh httpx.AsyncClient is opened for the request. Redirects are not
    followed, so the returned response carries the 'Location' header of the
    result.
    """
    endpoint = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    request_headers = {'Content-Type': 'application/x-www-form-urlencoded',
                       'Accept': 'application/json'}
    form = urllib.parse.urlencode({'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'})
    payload = form.encode('ascii')

    async with httpx.AsyncClient() as client:
        return await client.post(endpoint, headers=request_headers,
                                 data=payload, follow_redirects=False)


async def process_response(sequence, response):
    """Asynchronously fetch the result a submission points to and flatten it.

    Returns a DataFrame with one row per domain, indexed by 'sequence', or
    None (after printing a message) when the submission response has no
    'Location' header. Fetch and parse errors propagate to the caller.
    """
    location = response.headers.get('Location')
    if location is None:
        print("Error: No redirect URL found in response.")
        return None

    async with httpx.AsyncClient() as client:
        reply = await client.get(location, headers={'Accept': 'application/json'})

    # One row per domain; the listed hit-level fields are repeated on each row.
    domains = pd.json_normalize(
        reply.json()['results']['hits'],
        'domains',
        ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'],
    )
    domains.insert(0, 'sequence', sequence)
    return domains.set_index('sequence')

async def hmmerscanner(df: pd.DataFrame, k: int):
    """Asynchronously scan the first k 'm_protein_seq' entries of df.

    Submissions are awaited one at a time; each result fetch is started as a
    task right after its submission, so result fetching overlaps with the
    remaining submissions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column 'm_protein_seq'.
    k : int
        Number of sequences to scan; more than 300 is refused.

    Returns
    -------
    pandas.DataFrame
        Concatenated results in input order. Empty when k > 300 or when no
        sequence produced a result.
    """
    if k > 300:
        print("Use local function for the number of sequences more than 300.")
        return pd.DataFrame()

    sequences = df['m_protein_seq'][:k]
    tasks = []

    for seq in sequences:
        response = await send_request(seq)
        task = asyncio.create_task(process_response(seq, response))
        tasks.append(task)

    results = await asyncio.gather(*tasks)
    frames = [result for result in results if result is not None]

    # pd.concat raises ValueError on an empty list (k == 0, or no sequence
    # returned a frame), so return an empty frame in that case.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

# Wrap the async function call in an event loop.
def run_hmmerscanner4(df: pd.DataFrame, k: int):
    """Synchronous entry point: run the async `hmmerscanner` to completion.

    nest_asyncio is applied first so that asyncio.run can be called from a
    context that already has a running event loop (such as a notebook kernel).
    """
    nest_asyncio.apply()
    scan = hmmerscanner(df, k)
    return asyncio.run(scan)




In [50]:
# Test function 4
%time run_hmmerscanner4(df, 50)

CPU times: user 3.24 s, sys: 308 ms, total: 3.55 s
Wall time: 48.3 s


Unnamed: 0_level_0,alisqacc,aliIdCount,alirfline,is_included,alihmmname,bitscore,display,ievalue,alisqto,aliSim,...,act_site.SNO.90.C,act_site.SNO.232.E,act_site.SNO.230.H,act_site.ABC_tran.178.E,act_site.ABC_tran.160.E,act_site.adh_short_C2.159.Y,act_site.adh_short_C2.166.K,act_site.adh_short_C2.162.Y,act_site.adh_short.166.K,act_site.adh_short.162.Y
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,27,,1,Sigma70_r2,67.940552,1.0,0.0,113,0.884058,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,22,,1,Sigma70_r4_2,50.895485,1.0,0.0,196,0.870370,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,3,,0,Sigma70_r4,-2.076300,,3000,110,1.000000,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,12,,1,Sigma70_r4,39.875805,0.0,0.0,197,0.795918,...,,,,,,,,,,
MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMPDMAVLDIKMPRMDGMDLLQRLRQKTSMPVIFLTSKDDEIDEVLGLRMGADDYVKKPFSQRLLVERIRALLRRQDVIGGEVVEETEDNKVMVRGELTMDPLRHAVKWKGNDVSLTVTEFLLLQALAQRPGFVKSRDQLMDVAYDDQIYVDDRTIDSHIKRLRKKMRQADDEFSAIETLYGIGYRYNEA,,43,,1,Response_reg,98.908218,1.0,2.0e-28,113,0.854545,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,28,,1,AAA_21,31.065155,0.0,0.0,192,0.806122,...,,,,,,,,,,
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,24,,0,AAA,18.856600,,0.0018,195,0.616071,...,,,,,,,,,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,97,,1,adh_short_C2,216.260788,1.0,0.0,253,0.828326,...,,,,,,Similarity to P71079,Similarity to P71079,Similarity to Q12634,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,83,,1,adh_short,154.287918,0.0,0.0,206,0.755208,...,,,,,,,,,Similarity to Q06136,Similarity to P0AET8


### Function 5

In [51]:
#5

import pandas as pd
import urllib.parse
import asyncio
import httpx
import nest_asyncio


async def send_request(semaphore, sequence):
    """Asynchronously POST one sequence to the HMMER hmmscan endpoint (Pfam).

    The request is made while holding `semaphore`, which limits how many
    requests are in flight at once. Redirects are not followed, so the
    returned response carries the 'Location' header of the result.
    """
    endpoint = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    request_headers = {'Content-Type': 'application/x-www-form-urlencoded',
                       'Accept': 'application/json'}
    form = urllib.parse.urlencode({'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'})
    payload = form.encode('ascii')

    async with semaphore, httpx.AsyncClient() as client:
        reply = await client.post(endpoint, headers=request_headers,
                                  data=payload, follow_redirects=False)
    return reply


async def process_response(semaphore, sequence, response, max_retries=3):
    """Asynchronously fetch a submission's result, retrying on read timeouts.

    Each fetch attempt holds `semaphore`. After an httpx.ReadTimeout the call
    sleeps 5 ** attempt seconds and tries again; the timeout raised by the
    last allowed attempt is re-raised.

    Returns a DataFrame with one row per domain, indexed by 'sequence', or
    None when the submission response has no 'Location' header (a message is
    printed) or the result contains no hits.
    """
    location = response.headers.get('Location')
    if location is None:
        print("Error: No redirect URL found in response.")
        return None

    accept_json = {'Accept': 'application/json'}
    for attempt in range(max_retries):
        try:
            async with semaphore:
                async with httpx.AsyncClient() as client:
                    response2 = await client.get(location, headers=accept_json)
            break
        except httpx.ReadTimeout:
            # Out of attempts: let the timeout propagate to the caller.
            if attempt == max_retries - 1:
                raise
            await asyncio.sleep(5 ** attempt)  # Exponential backoff

    hits = response2.json()['results']['hits']
    if not hits:
        return None

    # One row per domain; the listed hit-level fields are repeated on each row.
    domains = pd.json_normalize(
        hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
    domains.insert(0, 'sequence', sequence)
    return domains.set_index('sequence')


async def hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int = 50):
    """Asynchronously scan the first k 'm_protein_seq' entries of df in two phases.

    All submissions are started as tasks and awaited together, then all result
    fetches are started and awaited together. A shared semaphore caps the
    number of requests in flight at `max_concurrent_requests`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column 'm_protein_seq'.
    k : int
        Number of sequences to scan.
    max_concurrent_requests : int
        Upper bound on simultaneously running requests.

    Returns
    -------
    pandas.DataFrame
        Concatenated results in input order. Empty when no sequence produced
        a result.
    """
    sequences = df['m_protein_seq'][:k]
    semaphore = asyncio.Semaphore(max_concurrent_requests)

    # Phase 1: submit every sequence.
    tasks = []
    for seq in sequences:
        task = asyncio.create_task(send_request(semaphore, seq))
        tasks.append(task)

    responses = await asyncio.gather(*tasks)

    # Phase 2: fetch the result behind each submission. gather preserves task
    # order, so responses line up with sequences.
    tasks = []
    for seq, response in zip(sequences, responses):
        task = asyncio.create_task(process_response(semaphore, seq, response))
        tasks.append(task)

    results = await asyncio.gather(*tasks)
    frames = [result for result in results if result is not None]

    # pd.concat raises ValueError on an empty list (k == 0, or no sequence
    # returned a frame), so return an empty frame in that case.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)


def run_hmmerscanner5(df: pd.DataFrame, k: int, max_concurrent_requests: int = 50):
    """
    Blocking wrapper around the asynchronous ``hmmerscanner`` (version 5).

    Applies ``nest_asyncio`` and then drives the coroutine to completion with
    ``asyncio.run``, returning whatever ``hmmerscanner`` returns.
    """
    # Patch asyncio first so asyncio.run() may be nested in a running loop.
    nest_asyncio.apply()
    scan = hmmerscanner(df, k, max_concurrent_requests)
    return asyncio.run(scan)


In [52]:
# Test function 5: time a scan of the first 50 sequences with at most 20 concurrent requests
%time run_hmmerscanner5(df, 50,20)

CPU times: user 2.2 s, sys: 270 ms, total: 2.47 s
Wall time: 15.8 s


Unnamed: 0_level_0,alisqacc,aliIdCount,alirfline,is_included,alihmmname,bitscore,display,ievalue,alisqto,aliSim,...,act_site.SNO.90.C,act_site.SNO.232.E,act_site.SNO.230.H,act_site.ABC_tran.178.E,act_site.ABC_tran.160.E,act_site.adh_short_C2.159.Y,act_site.adh_short_C2.166.K,act_site.adh_short_C2.162.Y,act_site.adh_short.166.K,act_site.adh_short.162.Y
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,27,,1,Sigma70_r2,67.940552,1.0,0.0,113,0.884058,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,22,,1,Sigma70_r4_2,50.895485,1.0,0.0,196,0.870370,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,3,,0,Sigma70_r4,-2.076300,,3000,110,1.000000,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,12,,1,Sigma70_r4,39.875805,0.0,0.0,197,0.795918,...,,,,,,,,,,
MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMPDMAVLDIKMPRMDGMDLLQRLRQKTSMPVIFLTSKDDEIDEVLGLRMGADDYVKKPFSQRLLVERIRALLRRQDVIGGEVVEETEDNKVMVRGELTMDPLRHAVKWKGNDVSLTVTEFLLLQALAQRPGFVKSRDQLMDVAYDDQIYVDDRTIDSHIKRLRKKMRQADDEFSAIETLYGIGYRYNEA,,43,,1,Response_reg,98.908218,1.0,2.0e-28,113,0.854545,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,28,,1,AAA_21,31.065155,0.0,0.0,192,0.806122,...,,,,,,,,,,
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,24,,0,AAA,18.856600,,0.0018,195,0.616071,...,,,,,,,,,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,97,,1,adh_short_C2,216.260788,1.0,0.0,253,0.828326,...,,,,,,Similarity to P71079,Similarity to P71079,Similarity to Q12634,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,83,,1,adh_short,154.287918,0.0,0.0,206,0.755208,...,,,,,,,,,Similarity to Q06136,Similarity to P0AET8


### Function 6

In [53]:
#6

import pandas as pd
import urllib.parse
import asyncio
import httpx
import nest_asyncio


async def send_request(semaphore, sequence):
    """
    Submit one protein sequence to the EBI HMMER hmmscan endpoint (Pfam).
    -------------
    Parameters:
    -------------
    semaphore: asyncio.Semaphore
        Limits how many POST requests run at the same time.
    sequence: str
        The protein sequence to scan; it is sent with a ``>seq`` header line.
    -------------
    Returns:
    -------------
    response: httpx.Response
        The raw POST response. Redirects are not followed, so the caller can
        read the results URL from its ``Location`` header.
    """
    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}
    form = {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}
    payload = urllib.parse.urlencode(form).encode('ascii')

    async with semaphore:
        async with httpx.AsyncClient() as client:
            # The body is already form-encoded bytes, so it goes in `content=`;
            # httpx deprecates passing raw bytes through `data=`.
            response = await client.post(
                url, headers=headers, content=payload, follow_redirects=False)

    return response


async def process_response(semaphore, sequence, response, max_retries=3):
    """
    Retrieve the hmmscan results for one sequence and flatten them to a table.

    Reads the results URL from the ``Location`` header of ``response``,
    downloads it as JSON and turns each hit's ``domains`` entries into rows.
    -------------
    Parameters:
    -------------
    semaphore: asyncio.Semaphore
        Limits how many GET requests run at the same time.
    sequence: str
        The scanned protein sequence; becomes the index of the result.
    response: httpx.Response
        The POST response carrying the ``Location`` header.
    max_retries: int
        Number of GET attempts on read timeout (never fewer than one).
    -------------
    Returns:
    -------------
    dfff: pandas.DataFrame or None
        One row per domain, indexed by ``sequence``; None when the redirect
        URL is missing, the reply is unusable, or no hits were reported.
    -------------
    Raises:
    -------------
    httpx.ReadTimeout
        When every attempt timed out.
    """
    redirect_url = response.headers.get('Location')
    if redirect_url is None:
        print("Error: No redirect URL found in response.")
        return None

    headers = {'Accept': 'application/json'}
    # At least one attempt, otherwise `response2` would be unbound below.
    attempts = max(1, max_retries)
    for attempt in range(attempts):
        try:
            async with semaphore:
                async with httpx.AsyncClient() as client:
                    response2 = await client.get(redirect_url, headers=headers)
            break
        except httpx.ReadTimeout:
            if attempt < attempts - 1:
                await asyncio.sleep(5 ** attempt)  # Exponential backoff
            else:
                raise

    # A bad reply for one sequence is reported and skipped, not raised.
    try:
        results = response2.json()
    except ValueError:
        print("Error: Could not parse response as JSON.")
        return None
    try:
        hits = results['results']['hits']
    except KeyError:
        print(
            f"Error: 'results' key not found in response for sequence {sequence}.")
        return None

    if not hits:
        return None

    dfff = pd.json_normalize(
        hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
    dfff.insert(0, 'sequence', sequence)
    return dfff.set_index('sequence')


async def hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int = 50):
    """
    Run hmmscan for the first ``k`` entries of ``df['m_protein_seq']``.

    Submits every sequence with ``send_request``, waits for all submissions,
    then collects the results with ``process_response`` and concatenates them.
    -------------
    Parameters:
    -------------
    df: pandas.DataFrame
        Must contain an ``m_protein_seq`` column.
    k: int
        Number of leading rows to scan.
    max_concurrent_requests: int
        Size of the semaphore shared by both request phases.
    -------------
    Returns:
    -------------
    results_df: pandas.DataFrame
        All per-sequence tables stacked together, or an empty DataFrame when
        nothing was returned (avoids ``pd.concat`` raising on an empty list).
    """
    sequences = df['m_protein_seq'].iloc[:k]
    semaphore = asyncio.Semaphore(max_concurrent_requests)

    # Phase 1: submit every sequence.
    post_tasks = [asyncio.create_task(send_request(semaphore, seq))
                  for seq in sequences]
    responses = await asyncio.gather(*post_tasks)

    # Phase 2: fetch and tabulate each result.
    fetch_tasks = [asyncio.create_task(process_response(semaphore, seq, response))
                   for seq, response in zip(sequences, responses)]
    results = await asyncio.gather(*fetch_tasks)

    frames = [result for result in results if result is not None]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)


def run_hmmerscanner6(df: pd.DataFrame, k: int, max_concurrent_requests: int = 50):
    """
    Blocking wrapper around the asynchronous ``hmmerscanner`` (version 6).

    Applies ``nest_asyncio`` and then drives the coroutine to completion with
    ``asyncio.run``, returning whatever ``hmmerscanner`` returns.
    """
    # Patch asyncio first so asyncio.run() may be nested in a running loop.
    nest_asyncio.apply()
    scan = hmmerscanner(df, k, max_concurrent_requests)
    return asyncio.run(scan)


In [64]:
# Test function 6: time a scan of the first 50 sequences with at most 20 concurrent requests
%time run_hmmerscanner6(df, 50,20)

CPU times: user 1.93 s, sys: 216 ms, total: 2.15 s
Wall time: 8.93 s


Unnamed: 0_level_0,alisqacc,aliIdCount,alirfline,is_included,alihmmname,bitscore,display,ievalue,alisqto,aliSim,...,act_site.SNO.90.C,act_site.SNO.232.E,act_site.SNO.230.H,act_site.ABC_tran.178.E,act_site.ABC_tran.160.E,act_site.adh_short_C2.159.Y,act_site.adh_short_C2.166.K,act_site.adh_short_C2.162.Y,act_site.adh_short.166.K,act_site.adh_short.162.Y
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,27,,1,Sigma70_r2,67.940552,1.0,0.0,113,0.884058,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,22,,1,Sigma70_r4_2,50.895485,1.0,0.0,196,0.870370,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,3,,0,Sigma70_r4,-2.076300,,3000,110,1.000000,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,12,,1,Sigma70_r4,39.875805,0.0,0.0,197,0.795918,...,,,,,,,,,,
MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMPDMAVLDIKMPRMDGMDLLQRLRQKTSMPVIFLTSKDDEIDEVLGLRMGADDYVKKPFSQRLLVERIRALLRRQDVIGGEVVEETEDNKVMVRGELTMDPLRHAVKWKGNDVSLTVTEFLLLQALAQRPGFVKSRDQLMDVAYDDQIYVDDRTIDSHIKRLRKKMRQADDEFSAIETLYGIGYRYNEA,,43,,1,Response_reg,98.908218,1.0,2.0e-28,113,0.854545,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,28,,1,AAA_21,31.065155,0.0,0.0,192,0.806122,...,,,,,,,,,,
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,24,,0,AAA,18.856600,,0.0018,195,0.616071,...,,,,,,,,,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,97,,1,adh_short_C2,216.260788,1.0,0.0,253,0.828326,...,,,,,,Similarity to P71079,Similarity to P71079,Similarity to Q12634,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,83,,1,adh_short,154.287918,0.0,0.0,206,0.755208,...,,,,,,,,,Similarity to Q06136,Similarity to P0AET8


### Function 7

In [55]:
#7

import pandas as pd
import urllib.parse
import asyncio
import httpx
import nest_asyncio


async def send_request(semaphore, sequence):
    """
    Submit one protein sequence to the EBI HMMER hmmscan endpoint (Pfam).
    -------------
    Parameters:
    -------------
    semaphore: asyncio.Semaphore
        Limits how many POST requests run at the same time.
    sequence: str
        The protein sequence to scan; it is sent with a ``>seq`` header line.
    -------------
    Returns:
    -------------
    response: httpx.Response
        The raw POST response. Redirects are not followed, so the caller can
        read the results URL from its ``Location`` header.
    """
    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}
    form = {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}
    payload = urllib.parse.urlencode(form).encode('ascii')

    async with semaphore:
        async with httpx.AsyncClient() as client:
            # The body is already form-encoded bytes, so it goes in `content=`;
            # httpx deprecates passing raw bytes through `data=`.
            # NOTE(review): httpx timeouts are in seconds, so 15000 is roughly
            # four hours -- confirm this was not meant as milliseconds.
            response = await client.post(
                url, headers=headers, content=payload,
                follow_redirects=False, timeout=15000)

    return response


async def process_response(semaphore, sequence, response, max_retries=3):
    """
    Download the hmmscan results referenced by ``response`` and tabulate them.

    The results URL comes from the ``Location`` header; the JSON found there
    is flattened so that each domain of each hit becomes one DataFrame row.
    -------------
    Parameters:
    -------------
    semaphore: asyncio.Semaphore
        Limits how many GET requests run at the same time.
    sequence: str
        The scanned protein sequence; becomes the index of the result.
    response: httpx.Response
        The POST response carrying the ``Location`` header.
    max_retries: int
        Number of GET attempts on read timeout (never fewer than one).
    -------------
    Returns:
    -------------
    dfff: pandas.DataFrame or None
        One row per domain, indexed by ``sequence``; None when the redirect
        URL is missing, the reply is unusable, or no hits were reported.
    -------------
    Raises:
    -------------
    httpx.ReadTimeout
        When every attempt timed out.
    """
    redirect_url = response.headers.get('Location')
    if redirect_url is None:
        print("Error: No redirect URL found in response.")
        return None

    headers = {'Accept': 'application/json'}
    # At least one attempt, otherwise `response2` would be unbound below.
    attempts = max(1, max_retries)
    for attempt in range(attempts):
        try:
            async with semaphore:
                async with httpx.AsyncClient() as client:
                    response2 = await client.get(redirect_url, headers=headers)
            break
        except httpx.ReadTimeout:
            if attempt < attempts - 1:
                await asyncio.sleep(5 ** attempt)  # Exponential backoff
            else:
                raise

    # A bad reply for one sequence is reported and skipped, not raised.
    try:
        results = response2.json()
    except ValueError:
        print("Error: Could not parse response as JSON.")
        return None
    try:
        hits = results['results']['hits']
    except KeyError:
        print(
            f"Error: 'results' key not found in response for sequence {sequence}.")
        return None

    if not hits:
        return None

    dfff = pd.json_normalize(
        hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
    dfff.insert(0, 'sequence', sequence)
    return dfff.set_index('sequence')


async def hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int = 1000):
    """
    Run hmmscan for the first ``k`` entries of ``df['m_protein_seq']``.

    Submits every sequence with ``send_request``, waits for all submissions,
    then collects the results with ``process_response`` and concatenates them.
    -------------
    Parameters:
    -------------
    df: pandas.DataFrame
        Must contain an ``m_protein_seq`` column.
    k: int
        Number of leading rows to scan.
    max_concurrent_requests: int
        Size of the semaphore shared by both request phases.
    -------------
    Returns:
    -------------
    results_df: pandas.DataFrame
        All per-sequence tables stacked together, or an empty DataFrame when
        nothing was returned (avoids ``pd.concat`` raising on an empty list).
    """
    sequences = df['m_protein_seq'].iloc[:k]
    semaphore = asyncio.Semaphore(max_concurrent_requests)

    # Phase 1: submit every sequence.
    post_tasks = [asyncio.create_task(send_request(semaphore, seq))
                  for seq in sequences]
    responses = await asyncio.gather(*post_tasks)

    # Phase 2: fetch and tabulate each result.
    fetch_tasks = [asyncio.create_task(process_response(semaphore, seq, response))
                   for seq, response in zip(sequences, responses)]
    results = await asyncio.gather(*fetch_tasks)

    frames = [result for result in results if result is not None]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)


def run_hmmerscanner7(df: pd.DataFrame, k: int, max_concurrent_requests: int = 1000):
    """
    Blocking wrapper around the asynchronous ``hmmerscanner`` (version 7).

    Applies ``nest_asyncio`` and then drives the coroutine to completion with
    ``asyncio.run``, returning whatever ``hmmerscanner`` returns.
    """
    # Patch asyncio first so asyncio.run() may be nested in a running loop.
    nest_asyncio.apply()
    scan = hmmerscanner(df, k, max_concurrent_requests)
    return asyncio.run(scan)

In [56]:
# Test function 7: time a scan of the first 50 sequences with at most 20 concurrent requests
%time run_hmmerscanner7(df, 50, 20)

CPU times: user 1.77 s, sys: 188 ms, total: 1.96 s
Wall time: 9.3 s


Unnamed: 0_level_0,alisqacc,aliIdCount,alirfline,is_included,alihmmname,bitscore,display,ievalue,alisqto,aliSim,...,act_site.SNO.90.C,act_site.SNO.232.E,act_site.SNO.230.H,act_site.ABC_tran.178.E,act_site.ABC_tran.160.E,act_site.adh_short_C2.159.Y,act_site.adh_short_C2.166.K,act_site.adh_short_C2.162.Y,act_site.adh_short.166.K,act_site.adh_short.162.Y
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,27,,1,Sigma70_r2,67.940552,1.0,0.0,113,0.884058,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,22,,1,Sigma70_r4_2,50.895485,1.0,0.0,196,0.870370,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,3,,0,Sigma70_r4,-2.076300,,3000,110,1.000000,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,12,,1,Sigma70_r4,39.875805,0.0,0.0,197,0.795918,...,,,,,,,,,,
MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMPDMAVLDIKMPRMDGMDLLQRLRQKTSMPVIFLTSKDDEIDEVLGLRMGADDYVKKPFSQRLLVERIRALLRRQDVIGGEVVEETEDNKVMVRGELTMDPLRHAVKWKGNDVSLTVTEFLLLQALAQRPGFVKSRDQLMDVAYDDQIYVDDRTIDSHIKRLRKKMRQADDEFSAIETLYGIGYRYNEA,,43,,1,Response_reg,98.908218,1.0,2.0e-28,113,0.854545,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,28,,1,AAA_21,31.065155,0.0,0.0,192,0.806122,...,,,,,,,,,,
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,24,,0,AAA,18.856600,,0.0018,195,0.616071,...,,,,,,,,,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,97,,1,adh_short_C2,216.260788,1.0,0.0,253,0.828326,...,,,,,,Similarity to P71079,Similarity to P71079,Similarity to Q12634,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,83,,1,adh_short,154.287918,0.0,0.0,206,0.755208,...,,,,,,,,,Similarity to Q06136,Similarity to P0AET8


### Function 8

In [57]:
# 8

import pandas as pd
import urllib.parse
import asyncio
import httpx
import nest_asyncio


async def send_request(semaphore, sequence):
    """
    Submit one protein sequence to the EBI HMMER hmmscan endpoint (Pfam).
    -------------
    Parameters:
    -------------
    semaphore: asyncio.Semaphore
        Limits how many POST requests run at the same time.
    sequence: str
        The protein sequence to scan; it is sent with a ``>seq`` header line.
    -------------
    Returns:
    -------------
    response: httpx.Response
        The raw POST response. Redirects are not followed, so the caller can
        read the results URL from its ``Location`` header.
    """
    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}
    form = {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}
    payload = urllib.parse.urlencode(form).encode('ascii')

    async with semaphore:
        async with httpx.AsyncClient() as client:
            # The body is already form-encoded bytes, so it goes in `content=`;
            # httpx deprecates passing raw bytes through `data=`.
            # NOTE(review): httpx timeouts are in seconds, so 15000 is roughly
            # four hours -- confirm this was not meant as milliseconds.
            response = await client.post(
                url, headers=headers, content=payload,
                follow_redirects=False, timeout=15000)

    return response


async def process_response(semaphore, sequence, response, max_retries=3):
    """
    Download the hmmscan results referenced by ``response`` and tabulate them.

    The results URL comes from the ``Location`` header; the JSON found there
    is flattened so that each domain of each hit becomes one DataFrame row.
    -------------
    Parameters:
    -------------
    semaphore: asyncio.Semaphore
        Limits how many GET requests run at the same time.
    sequence: str
        The scanned protein sequence; becomes the index of the result.
    response: httpx.Response
        The POST response carrying the ``Location`` header.
    max_retries: int
        Number of GET attempts on read timeout (never fewer than one).
    -------------
    Returns:
    -------------
    dfff: pandas.DataFrame or None
        One row per domain, indexed by ``sequence``; None when the redirect
        URL is missing, the reply is unusable, or no hits were reported.
    -------------
    Raises:
    -------------
    httpx.ReadTimeout
        When every attempt timed out.
    """
    redirect_url = response.headers.get('Location')
    if redirect_url is None:
        print("Error: No redirect URL found in response.")
        return None

    headers = {'Accept': 'application/json'}
    # At least one attempt, otherwise `response2` would be unbound below.
    attempts = max(1, max_retries)
    for attempt in range(attempts):
        try:
            async with semaphore:
                async with httpx.AsyncClient() as client:
                    # NOTE(review): httpx timeouts are in seconds, so 15000 is
                    # roughly four hours -- confirm this is intended.
                    response2 = await client.get(
                        redirect_url, headers=headers, timeout=15000)
            break
        except httpx.ReadTimeout:
            if attempt < attempts - 1:
                await asyncio.sleep(5 ** attempt)  # Exponential backoff
            else:
                raise

    # A bad reply for one sequence is reported and skipped, not raised.
    try:
        results = response2.json()
    except ValueError:
        print("Error: Could not parse response as JSON.")
        return None
    try:
        hits = results['results']['hits']
    except KeyError:
        print(
            f"Error: 'results' key not found in response for sequence {sequence}.")
        return None

    if not hits:
        return None

    dfff = pd.json_normalize(
        hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
    dfff.insert(0, 'sequence', sequence)
    return dfff.set_index('sequence')


async def hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    """
    Run hmmscan for the first ``k`` entries of ``df['m_protein_seq']``.

    Requests for more than 10000 sequences are refused with a printed message.
    Otherwise every sequence is submitted with ``send_request``, the results
    are collected with ``process_response`` and the tables are concatenated.
    -------------
    Parameters:
    -------------
    df: pandas.DataFrame
        Must contain an ``m_protein_seq`` column.
    k: int
        Number of leading rows to scan (at most 10000).
    max_concurrent_requests: int
        Size of the semaphore shared by both request phases.
    -------------
    Returns:
    -------------
    results_df: pandas.DataFrame
        All per-sequence tables stacked together. An empty DataFrame is
        returned both when ``k`` exceeds the limit and when nothing came back
        (``pd.concat`` would otherwise raise on an empty list).
    """
    if k > 10000:
        print("Use local function for the number of sequences more than 10000.")
        return pd.DataFrame()

    sequences = df['m_protein_seq'].iloc[:k]
    semaphore = asyncio.Semaphore(max_concurrent_requests)

    # Phase 1: submit every sequence.
    post_tasks = [asyncio.create_task(send_request(semaphore, seq))
                  for seq in sequences]
    responses = await asyncio.gather(*post_tasks)

    # Phase 2: fetch and tabulate each result.
    fetch_tasks = [asyncio.create_task(process_response(semaphore, seq, response))
                   for seq, response in zip(sequences, responses)]
    results = await asyncio.gather(*fetch_tasks)

    frames = [result for result in results if result is not None]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)


def run_hmmerscanner8(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    """
    Blocking wrapper around the asynchronous ``hmmerscanner`` (version 8).

    Applies ``nest_asyncio`` and then drives the coroutine to completion with
    ``asyncio.run``, returning whatever ``hmmerscanner`` returns.
    """
    # Patch asyncio first so asyncio.run() may be nested in a running loop.
    nest_asyncio.apply()
    scan = hmmerscanner(df, k, max_concurrent_requests)
    return asyncio.run(scan)


In [58]:
# Test function 8: time a scan of the first 50 sequences with at most 20 concurrent requests
%time run_hmmerscanner8(df, 50, 20)

CPU times: user 1.93 s, sys: 215 ms, total: 2.14 s
Wall time: 14.9 s


Unnamed: 0_level_0,alisqacc,aliIdCount,alirfline,is_included,alihmmname,bitscore,display,ievalue,alisqto,aliSim,...,act_site.SNO.90.C,act_site.SNO.232.E,act_site.SNO.230.H,act_site.ABC_tran.178.E,act_site.ABC_tran.160.E,act_site.adh_short_C2.159.Y,act_site.adh_short_C2.166.K,act_site.adh_short_C2.162.Y,act_site.adh_short.166.K,act_site.adh_short.162.Y
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,27,,1,Sigma70_r2,67.940552,1.0,0.0,113,0.884058,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,22,,1,Sigma70_r4_2,50.895485,1.0,0.0,196,0.870370,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,3,,0,Sigma70_r4,-2.076300,,3000,110,1.000000,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,12,,1,Sigma70_r4,39.875805,0.0,0.0,197,0.795918,...,,,,,,,,,,
MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMPDMAVLDIKMPRMDGMDLLQRLRQKTSMPVIFLTSKDDEIDEVLGLRMGADDYVKKPFSQRLLVERIRALLRRQDVIGGEVVEETEDNKVMVRGELTMDPLRHAVKWKGNDVSLTVTEFLLLQALAQRPGFVKSRDQLMDVAYDDQIYVDDRTIDSHIKRLRKKMRQADDEFSAIETLYGIGYRYNEA,,43,,1,Response_reg,98.908218,1.0,2.0e-28,113,0.854545,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,28,,1,AAA_21,31.065155,0.0,0.0,192,0.806122,...,,,,,,,,,,
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,24,,0,AAA,18.856600,,0.0018,195,0.616071,...,,,,,,,,,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,97,,1,adh_short_C2,216.260788,1.0,0.0,253,0.828326,...,,,,,,Similarity to P71079,Similarity to P71079,Similarity to Q12634,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,83,,1,adh_short,154.287918,0.0,0.0,206,0.755208,...,,,,,,,,,Similarity to Q06136,Similarity to P0AET8


### Function 9

In [59]:
# 9

import pandas as pd
import urllib.parse
import asyncio
import httpx
import nest_asyncio


async def send_request(semaphore, sequence):
    """
    Submit one protein sequence to the EBI HMMER hmmscan endpoint (Pfam).
    -------------
    Parameters:
    -------------
    semaphore: asyncio.Semaphore
        Limits how many POST requests run at the same time.
    sequence: str
        The protein sequence to scan; it is sent with a ``>seq`` header line.
    -------------
    Returns:
    -------------
    response: httpx.Response
        The raw POST response. Redirects are not followed, so the caller can
        read the results URL from its ``Location`` header.
    """
    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}
    form = {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}
    payload = urllib.parse.urlencode(form).encode('ascii')

    async with semaphore:
        async with httpx.AsyncClient() as client:
            # The body is already form-encoded bytes, so it goes in `content=`;
            # httpx deprecates passing raw bytes through `data=`.
            # NOTE(review): httpx timeouts are in seconds, so 15000 is roughly
            # four hours -- confirm this was not meant as milliseconds.
            response = await client.post(
                url, headers=headers, content=payload,
                follow_redirects=False, timeout=15000)

    return response


async def process_response(semaphore, sequence, response, max_retries=3):
    """
    Download the hmmscan results referenced by ``response`` and tabulate them.

    The results URL comes from the ``Location`` header. The GET is repeated
    when the body is not valid JSON; once a JSON body is obtained, each domain
    of each hit becomes one DataFrame row.
    -------------
    Parameters:
    -------------
    semaphore: asyncio.Semaphore
        Limits how many GET requests run at the same time.
    sequence: str
        The scanned protein sequence; becomes the index of the result.
    response: httpx.Response
        The POST response carrying the ``Location`` header.
    max_retries: int
        Maximum number of GET attempts when the body cannot be parsed as JSON.
    -------------
    Returns:
    -------------
    dfff: pandas.DataFrame or None
        One row per domain, indexed by ``sequence``; None when the redirect
        URL is missing, the reply stays unparseable, the ``results`` key is
        absent, or no hits were reported.
    """
    redirect_url = response.headers.get('Location')
    if redirect_url is None:
        print("Error: No redirect URL found in response.")
        return None

    headers = {'Accept': 'application/json'}
    for attempt in range(max_retries):
        async with semaphore:
            async with httpx.AsyncClient() as client:
                # NOTE(review): httpx timeouts are in seconds, so 15000 is
                # roughly four hours -- confirm this is intended.
                response2 = await client.get(
                    redirect_url, headers=headers, timeout=15000)

        try:
            results = response2.json()
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass; catching the base
            # class avoids depending on a `json` import this cell never makes.
            if attempt < max_retries - 1:
                # Same 1 s / 300 s waits as before for the default retries,
                # but capped so larger retry counts cannot sleep for days.
                await asyncio.sleep(min(300 ** attempt, 300))
                continue
            print("Error: Could not parse response as JSON.")
            return None

        try:
            hits = results['results']['hits']
        except KeyError:
            print(
                f"Error: 'results' key not found in response for sequence {sequence}.")
            return None

        if not hits:
            return None

        dfff = pd.json_normalize(
            hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
        dfff.insert(0, 'sequence', sequence)
        return dfff.set_index('sequence')

    # Only reached when max_retries <= 0: no request was made.
    return None



async def hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    """
    Run hmmscan for the first ``k`` entries of ``df['m_protein_seq']``.

    Submits every sequence with ``send_request``, waits for all submissions,
    then collects the results with ``process_response`` and concatenates them.
    -------------
    Parameters:
    -------------
    df: pandas.DataFrame
        Must contain an ``m_protein_seq`` column.
    k: int
        Number of leading rows to scan.
    max_concurrent_requests: int
        Size of the semaphore shared by both request phases.
    -------------
    Returns:
    -------------
    results_df: pandas.DataFrame
        All per-sequence tables stacked together, or an empty DataFrame when
        nothing was returned (avoids ``pd.concat`` raising on an empty list).
    """
    sequences = df['m_protein_seq'].iloc[:k]
    semaphore = asyncio.Semaphore(max_concurrent_requests)

    # Phase 1: submit every sequence.
    post_tasks = [asyncio.create_task(send_request(semaphore, seq))
                  for seq in sequences]
    responses = await asyncio.gather(*post_tasks)

    # Phase 2: fetch and tabulate each result.
    fetch_tasks = [asyncio.create_task(process_response(semaphore, seq, response))
                   for seq, response in zip(sequences, responses)]
    results = await asyncio.gather(*fetch_tasks)

    frames = [result for result in results if result is not None]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)


def run_hmmerscanner9(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    """
    Blocking wrapper around the asynchronous ``hmmerscanner`` (version 9).

    Applies ``nest_asyncio`` and then drives the coroutine to completion with
    ``asyncio.run``, returning whatever ``hmmerscanner`` returns.
    """
    # Patch asyncio first so asyncio.run() may be nested in a running loop.
    nest_asyncio.apply()
    scan = hmmerscanner(df, k, max_concurrent_requests)
    return asyncio.run(scan)


In [61]:
# Test function 9: time a scan of the first 50 sequences with at most 20 concurrent requests
%time run_hmmerscanner9(df, 50, 20)

CPU times: user 1.84 s, sys: 199 ms, total: 2.04 s
Wall time: 14 s


Unnamed: 0_level_0,alisqacc,aliIdCount,alirfline,is_included,alihmmname,bitscore,display,ievalue,alisqto,aliSim,...,act_site.SNO.90.C,act_site.SNO.232.E,act_site.SNO.230.H,act_site.ABC_tran.178.E,act_site.ABC_tran.160.E,act_site.adh_short_C2.159.Y,act_site.adh_short_C2.166.K,act_site.adh_short_C2.162.Y,act_site.adh_short.166.K,act_site.adh_short.162.Y
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,27,,1,Sigma70_r2,67.940552,1.0,0.0,113,0.884058,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,22,,1,Sigma70_r4_2,50.895485,1.0,0.0,196,0.870370,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,3,,0,Sigma70_r4,-2.076300,,3000,110,1.000000,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,12,,1,Sigma70_r4,39.875805,0.0,0.0,197,0.795918,...,,,,,,,,,,
MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMPDMAVLDIKMPRMDGMDLLQRLRQKTSMPVIFLTSKDDEIDEVLGLRMGADDYVKKPFSQRLLVERIRALLRRQDVIGGEVVEETEDNKVMVRGELTMDPLRHAVKWKGNDVSLTVTEFLLLQALAQRPGFVKSRDQLMDVAYDDQIYVDDRTIDSHIKRLRKKMRQADDEFSAIETLYGIGYRYNEA,,43,,1,Response_reg,98.908218,1.0,2.0e-28,113,0.854545,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,28,,1,AAA_21,31.065155,0.0,0.0,192,0.806122,...,,,,,,,,,,
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,24,,0,AAA,18.856600,,0.0018,195,0.616071,...,,,,,,,,,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,97,,1,adh_short_C2,216.260788,1.0,0.0,253,0.828326,...,,,,,,Similarity to P71079,Similarity to P71079,Similarity to Q12634,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,83,,1,adh_short,154.287918,0.0,0.0,206,0.755208,...,,,,,,,,,Similarity to Q06136,Similarity to P0AET8


### Function 10

This is probably the best version of the function. It is limited to 10,000 sequences per call; the limit used to be 300 because the earlier code was too slow.

In [62]:
"""
This script takes a user-defined data frame and an integer k, and sends multiple requests to
the HMMER API concurrently.
The packages you need to run this script are:

- pandas
- requests
- urllib.parse
- time
- httpx
- nest_asyncio
"""


async def send_request(semaphore, sequence):
    """
    This function sends a POST request to the HMMER API with the given protein sequence.
    -------------
    Parameters:
    -------------
    semaphore: asyncio.Semaphore
        A semaphore object used to limit the number of concurrent requests.
    sequence: str
        A protein sequence to scan.
    -------------
    Returns:
    -------------
    response: httpx.Response
        The raw POST response. Redirects are not followed, so the caller can
        read the results URL from its ``Location`` header.
    """

    # Set up the POST request with the protein sequence and send it to the HMMER server
    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}
    form = {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}
    payload = urllib.parse.urlencode(form).encode('ascii')

    async with semaphore:
        async with httpx.AsyncClient() as client:
            # The body is already form-encoded bytes, so it goes in `content=`;
            # httpx deprecates passing raw bytes through `data=`.
            # NOTE(review): httpx timeouts are in seconds, so 15000 is roughly
            # four hours -- confirm this was not meant as milliseconds.
            response = await client.post(
                url, headers=headers, content=payload,
                follow_redirects=False, timeout=15000)

    return response


async def process_response(semaphore, sequence, response, max_retries=3):
    """
    This function extracts the redirect URL from the POST response, sends a GET request to the URL to retrieve
    the search results, processes the JSON response into a pandas DataFrame, and returns the DataFrame.
    -------------
    Parameters:
    -------------
    semaphore: asyncio.Semaphore
        A semaphore object used to limit the number of concurrent requests.
    sequence: str
        A protein sequence.
    response: httpx.Response
        The response to the POST request; its 'Location' header holds the results URL.
    max_retries: int
        The maximum number of GET attempts made when the request times out.
        If it is smaller than 1, no request is sent and None is returned.
    -------------
    Returns:
    -------------
    dfff: pandas.core.DataFrame or None
        The DataFrame containing one row per domain hit, indexed by the sequence.
        None is returned when there is no redirect URL, when the body is not valid
        JSON or lacks the expected keys, or when the search produced no hits.
    -------------
    Raises:
    -------------
    httpx.TimeoutException
        If the last allowed GET attempt also times out.
    """

    # Extract the redirect URL from the POST response
    redirect_url = response.headers.get('Location')
    if redirect_url is None:
        print("Error: No redirect URL found in response.")
        return None

    # Send a GET request to the redirect URL, retrying on timeouts
    headers = {'Accept': 'application/json'}
    response2 = None
    for attempt in range(max_retries):
        try:
            async with semaphore:
                async with httpx.AsyncClient() as client:
                    response2 = await client.get(redirect_url, headers=headers, timeout=15000)
            break
        except httpx.TimeoutException:
            # Covers read, connect, write and pool timeouts alike
            if attempt < max_retries - 1:
                await asyncio.sleep(5 ** attempt)  # Exponential backoff: 1 s, 5 s, 25 s, ...
            else:
                raise

    # max_retries < 1 means the loop never ran and there is nothing to parse
    if response2 is None:
        print(
            f"Error: no request was sent for sequence {sequence} (max_retries={max_retries}).")
        return None

    # A non-JSON body (e.g. an HTML error page) must not abort the whole batch
    try:
        results = response2.json()
    except ValueError:
        print(
            f"Error: response for sequence {sequence} is not valid JSON.")
        return None

    try:
        hits = results['results']['hits']
    except (KeyError, TypeError):
        print(
            f"Error: 'results' key not found in response for sequence {sequence}.")
        return None

    if not hits:
        return None

    # One row per domain; hit-level fields are repeated on every domain row
    dfff = pd.json_normalize(
        hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
    dfff.insert(0, 'sequence', sequence)
    dfff = dfff.set_index('sequence')
    return dfff


async def hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    """
    This function performs an HMMER search for a given number of protein sequences in parallel.
    -------------
    Parameters:
    -------------
    df: pandas.core.DataFrame
        A DataFrame that contains protein sequences in the 'm_protein_seq' column.
    k: int
        The number of protein sequences to search (taken from the top of the DataFrame).
    max_concurrent_requests: int
        The maximum number of concurrent requests to the HMMER API.
    -------------
    Returns:
    -------------
    results_df: pandas.core.DataFrame
        A DataFrame containing the search results for all protein sequences.
        It is empty when k is greater than 10000, when no sequences were selected,
        or when none of the sequences produced a result.
    """

    # Check if k is greater than 10000 and print a warning message if so
    if k > 10000:
        print("Use local function for the number of sequences more than 10000.")
        return pd.DataFrame()

    # Positional slice: the first k sequences of the input DataFrame
    sequences = df['m_protein_seq'].iloc[:k]
    if len(sequences) == 0:
        return pd.DataFrame()

    semaphore = asyncio.Semaphore(max_concurrent_requests)

    # Phase 1: submit every sequence as a POST request
    post_tasks = [asyncio.create_task(send_request(semaphore, seq))
                  for seq in sequences]
    responses = await asyncio.gather(*post_tasks)

    # Phase 2: follow each redirect and parse the results
    get_tasks = [asyncio.create_task(process_response(semaphore, seq, response))
                 for seq, response in zip(sequences, responses)]
    results = await asyncio.gather(*get_tasks)

    # pd.concat raises ValueError on an empty list, so handle "no hits at all" explicitly
    frames = [result for result in results if result is not None]
    if not frames:
        return pd.DataFrame()
    results_df = pd.concat(frames)
    return results_df


# Wrap the async function call in an event loop.
def run_hmmerscanner10(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    """
    Synchronous entry point: runs the hmmerscanner coroutine to completion and
    returns its search results as a DataFrame.
    -------------
    Parameters:
    -------------
    df: pandas.core.DataFrame
        A DataFrame that contains protein sequences.
    k: int
        The number of protein sequences to search.
    max_concurrent_requests: int
        The maximum number of concurrent requests to the HMMER API.
    -------------
    Returns:
    -------------
    results_df: pandas.core.DataFrame
        A DataFrame containing the search results for all protein sequences.
    """

    # nest_asyncio is applied before asyncio.run(); presumably this is needed because
    # the notebook already has an event loop running -- confirm if used elsewhere.
    nest_asyncio.apply()

    scan = hmmerscanner(df, k, max_concurrent_requests)
    results_df = asyncio.run(scan)
    return results_df

In [67]:
# Test function 10: scan the first 50 sequences with at most 20 concurrent requests
%time run_hmmerscanner10(df, 50, 20)

CPU times: user 1.71 s, sys: 190 ms, total: 1.9 s
Wall time: 8.47 s


Unnamed: 0_level_0,alisqacc,aliIdCount,alirfline,is_included,alihmmname,bitscore,display,ievalue,alisqto,aliSim,...,act_site.SNO.90.C,act_site.SNO.232.E,act_site.SNO.230.H,act_site.ABC_tran.178.E,act_site.ABC_tran.160.E,act_site.adh_short_C2.159.Y,act_site.adh_short_C2.166.K,act_site.adh_short_C2.162.Y,act_site.adh_short.166.K,act_site.adh_short.162.Y
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,27,,1,Sigma70_r2,67.940552,1.0,0.0,113,0.884058,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,22,,1,Sigma70_r4_2,50.895485,1.0,0.0,196,0.870370,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,3,,0,Sigma70_r4,-2.076300,,3000,110,1.000000,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,12,,1,Sigma70_r4,39.875805,0.0,0.0,197,0.795918,...,,,,,,,,,,
MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMPDMAVLDIKMPRMDGMDLLQRLRQKTSMPVIFLTSKDDEIDEVLGLRMGADDYVKKPFSQRLLVERIRALLRRQDVIGGEVVEETEDNKVMVRGELTMDPLRHAVKWKGNDVSLTVTEFLLLQALAQRPGFVKSRDQLMDVAYDDQIYVDDRTIDSHIKRLRKKMRQADDEFSAIETLYGIGYRYNEA,,43,,1,Response_reg,98.908218,1.0,2.0e-28,113,0.854545,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,28,,1,AAA_21,31.065155,0.0,0.0,192,0.806122,...,,,,,,,,,,
MTNALSIKNLSKTYDTGLTALNGVDLDIRRGEILALLGPNGAGKTTLISIVCGLVNPSTGTVAVEGHDIIQDYRKARALIGLVPQELHTETFETVWDTVSYSRGLFGKKPAPQLVEQILKDLSLFEKKDSKIMALSGGMKRRVMIAKALAHEPRILFLDEPTAGVDVELRKDMWRLVKRLRDTGVTIILTTHYIEEAEEIADRVGVINRGRLLLVEDKAELMRKLGQKQLVLELQKPLEQLPEALSDYALELSDGGTRITYHYDTQATRTGIASLLAALAGAGVTVKDLDTEQRSLEDIFVSLVVEESQ,,24,,0,AAA,18.856600,,0.0018,195,0.616071,...,,,,,,,,,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,97,,1,adh_short_C2,216.260788,1.0,0.0,253,0.828326,...,,,,,,Similarity to P71079,Similarity to P71079,Similarity to Q12634,,
MGKFQGKVVLVTGASRGIGAAIALAFAQEGAAVVINYLQNKEGAEKVASSCREAGGDGWSLQADVTSEAAVHGMIEQISLEMGRIDVVVNNAFKPYVFNPDTRKLLWELKWEDYQDQLDGALRSTHYICQAVLPLMKKQSSGNIVNVISNLVERPIVPYHEYNTAKTALMGYSRNLAAELGPFGIRVNCVAPGLVYPTSASQYTKEEMKEMIIAQTPLRRIARPEDIAGPVLFLASDWSRFMTGQTLFVDGGFIM,,83,,1,adh_short,154.287918,0.0,0.0,206,0.755208,...,,,,,,,,,Similarity to Q06136,Similarity to P0AET8


In [42]:
# Time a run at the 10000-sequence limit with at most 20 concurrent requests
%time run_hmmerscanner10(df, 10000, 20)

CPU times: user 11min 40s, sys: 47.6 s, total: 12min 27s
Wall time: 27min 44s


Unnamed: 0_level_0,alisqacc,aliIdCount,alirfline,is_included,alihmmname,bitscore,display,ievalue,alisqto,aliSim,...,act_site.2-Hacid_dh_C.245.R,act_site.adh_short.179.K,act_site.adh_short.175.Y,act_site.Epimerase.175.Y,act_site.Abhydrolase_6.204.D,act_site.Beta-lactamase2.118.S,act_site.Beta-lactamase2.213.E,act_site.His_Phos_1.244.H,act_site.AhpC-TSA.113.C,act_site.Redoxin.113.C
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,27,,1,Sigma70_r2,67.940552,1.0,0.0,113,0.884058,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,22,,1,Sigma70_r4_2,50.895485,1.0,0.0,196,0.870370,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,3,,0,Sigma70_r4,-2.076300,,3000,110,1.000000,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,12,,1,Sigma70_r4,39.875805,0.0,0.0,197,0.795918,...,,,,,,,,,,
MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMPDMAVLDIKMPRMDGMDLLQRLRQKTSMPVIFLTSKDDEIDEVLGLRMGADDYVKKPFSQRLLVERIRALLRRQDVIGGEVVEETEDNKVMVRGELTMDPLRHAVKWKGNDVSLTVTEFLLLQALAQRPGFVKSRDQLMDVAYDDQIYVDDRTIDSHIKRLRKKMRQADDEFSAIETLYGIGYRYNEA,,43,,1,Response_reg,98.908218,1.0,2.0e-28,113,0.854545,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MNAPAVEITGLVKRYGSTTAVDGLDLRMERGTLLALLGPNGAGKTTTVEICEGFLRPDDGEVRVLGLDPSRDGSALRPRIGVMPQGGGAYPGVRADEMLGLVAACAANPLDPAWLLDVLGLSGARKTPFKRLSGGQQQRLSLACALVGRPELLFLDEPTAGMDPQARRLVWDLLEALRADGVSVLLTTHLMEEAETLADTVVIVDHGKVVVEGSPQSLTVEAGETAQLRFKARTRLDTALLTAALPEGHLVHESAPGTYLVEGAIDPQVVSTVTAWCAQQGVMPEELQVGRRTLEEVFLELTGRELRA,,8,,0,AAA_21,10.365308,,0.45,56,0.772727,...,,,,,,,,,,
MNAPAVEITGLVKRYGSTTAVDGLDLRMERGTLLALLGPNGAGKTTTVEICEGFLRPDDGEVRVLGLDPSRDGSALRPRIGVMPQGGGAYPGVRADEMLGLVAACAANPLDPAWLLDVLGLSGARKTPFKRLSGGQQQRLSLACALVGRPELLFLDEPTAGMDPQARRLVWDLLEALRADGVSVLLTTHLMEEAETLADTVVIVDHGKVVVEGSPQSLTVEAGETAQLRFKARTRLDTALLTAALPEGHLVHESAPGTYLVEGAIDPQVVSTVTAWCAQQGVMPEELQVGRRTLEEVFLELTGRELRA,,25,,1,AAA_21,37.568024,0.0,0.0,189,0.791045,...,,,,,,,,,,
MPSLADHITEVPPSGVRAIFNTALAMSAAGERVTHLAVGEPDLAPEPHVVEAARAAWARGEVRYAPNGGLPALREHLARVTTAQRGTEVTPDQIWVTIGGTQALYLAFTLVLGRRDKVLVPDPGYTTFTMAPAALGARPVPYPLRPERGFAPSLADIAPLLTRRTRAIVVNSPSNPLGTVLSRGRLQQIVDIASASDLWIISDEVYSGLVHDGEHVSIASLPGAEGRVLSVHSVSKTYALTGARVGCLVTPPGWSDVLNAVQEAMVSCVAPPDQHAALAALTGPQEGVARARAHYAENLRLATGLLAERGFEWLPPKGGFYVWVDVRERVAASGLGSVAAWARELLVQRRVSVAPGSAFGASGEGWARLCVAASPEAITTGIDALATF,,92,,1,Aminotran_1_2,186.193817,1.0,0.0,383,0.704545,...,,,,,,,,,,
MPSLADHITEVPPSGVRAIFNTALAMSAAGERVTHLAVGEPDLAPEPHVVEAARAAWARGEVRYAPNGGLPALREHLARVTTAQRGTEVTPDQIWVTIGGTQALYLAFTLVLGRRDKVLVPDPGYTTFTMAPAALGARPVPYPLRPERGFAPSLADIAPLLTRRTRAIVVNSPSNPLGTVLSRGRLQQIVDIASASDLWIISDEVYSGLVHDGEHVSIASLPGAEGRVLSVHSVSKTYALTGARVGCLVTPPGWSDVLNAVQEAMVSCVAPPDQHAALAALTGPQEGVARARAHYAENLRLATGLLAERGFEWLPPKGGFYVWVDVRERVAASGLGSVAAWARELLVQRRVSVAPGSAFGASGEGWARLCVAASPEAITTGIDALATF,,32,,1,DegT_DnrJ_EryC1,28.484140,0.0,0.000001,203,0.737374,...,,,,,,,,,,
