# Getting the sequence information by using the API system

-------

# Import Libraries and read the input CSV file:

In this part, the libraries have been imported, and the CSV file has been read.

In [1]:
# They are for the Best Function
# Libraries
import pandas as pd
import requests
import urllib.parse
import time
import httpx
import nest_asyncio
import asyncio
import json
from concurrent.futures import ProcessPoolExecutor

In [2]:
# Location of the sample CSV inside the GitHub repository.
# Change these values to point at a different file.
username = 'learn2therm'
repository = 'PairProphet'
branch = 'main'
path_to_file = 'notebooks/learn2therm_sample_50k_exploration.csv'

# Assemble the raw-content URL from the pieces above
url = f'https://raw.githubusercontent.com/{username}/{repository}/{branch}/{path_to_file}'

# Load the CSV straight from GitHub and preview the first rows
df = pd.read_csv(url)
df.head()

Unnamed: 0.1,Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,thermo_index,...,bit_score_16s,m_ogt,t_ogt,ogt_difference,m_protein_seq,t_protein_seq,m_protein_desc,t_protein_desc,m_protein_len,t_protein_len
0,0,0.287582,0.217822,0.215686,160,0.792079,152,0.737864,131,875,...,1153.0,27.5,50.0,22.5,MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLY...,MPSQITESERIELAERFERDALPLLDQLYSAALRMTRNPADAEDLV...,ECF RNA polymerase sigma factor SigK,sigma-70 family RNA polymerase sigma factor,206,202
1,1,0.319635,0.295359,0.297872,218,0.919831,226,0.969957,282,11324,...,1014.0,25.0,54.0,29.0,MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMP...,MRVLLVEDDPNTSRSIEMMLTHANLNVYATDMGEEGIDLAKLYDYD...,response regulator transcription factor,response regulator transcription factor,233,237
2,2,0.279621,0.234127,0.218924,211,0.837302,210,0.731707,96,875,...,1138.0,28.0,50.0,22.0,MKDTVVFVTGAARGIGAHTARLAVARGARVALVGLEPHLLADLAAE...,MTPEQIFSGQTAIVTGGASGIGAATVEHIARRGGRVFSVDLSYDSP...,SDR family oxidoreductase,SDR family oxidoreductase,287,252
3,3,0.327273,0.200743,0.214712,166,0.6171,163,0.696581,175,875,...,1077.0,28.0,50.0,22.0,MTSGLWERVLDGVWVTIQLLVLSALLATAVSFVVGIARTHRLWIVR...,MAMSRRKRGQLARGIQYAILVIVVVVLALLADWGKIGKAFFDWEAA...,ectoine/hydroxyectoine ABC transporter permeas...,amino acid ABC transporter permease,234,269
4,4,0.33871,0.318182,0.287671,60,0.909091,71,0.8875,61,9827,...,991.0,30.0,50.0,20.0,MIISLRRGLRFIRFIVFFAALVYLFYHVLDLFNGWISPVDQYQMPT...,MKRMVWRTLKVFIIFIACTLLFYFGLRFMHLEYEQFHRYEPPEGPA...,YqzK family protein,YqzK family protein,80,66


# Best Function between 20 different functions

### This function has been improved and now runs in just two minutes instead of 40.


send_request function: This function sends a single sequence to the HMMER API as a POST request.

process_response function: Once the response for a particular sequence is received from the API, this function processes the response. If there's useful data in the response (like protein family information), it extracts that data and stores it in a DataFrame. If there's no relevant data in the response, it returns None.

hmmerscanner function: This is the main function that orchestrates the previous two functions for multiple sequences. It takes a DataFrame with protein sequences, a number of sequences to process (k), and the maximum number of concurrent requests to handle. It creates tasks to send requests and process responses, and then runs these tasks asynchronously. The results (DataFrames from each processed response) are then gathered and combined into a single DataFrame.

run_hmmerscanner function: This is the entry-point function that users would generally call. It sets up the necessary asyncio event loop (for managing the asynchronous tasks) and then runs the hmmerscanner function inside it. It returns the final DataFrame that is produced by the hmmerscanner function.

* The version in hmmer.py was modified slightly to match the local code, but the core function remains the same.

In [3]:
"""
HMMER Scanner with Async HTTP Requests and Concurrent Execution

This script demonstrates how to use asynchronous HTTP requests and concurrent execution to perform protein sequence searches
using the HMMER API. It sends parallel requests, processes responses, and generates a DataFrame with search results.

Author: Your Name
"""


async def send_request(semaphore, sequence, client, timeout=15000):
    """
    Sends a POST request to the HMMER API with a protein sequence.
    -------------
    Parameters:
    -------------
    semaphore: asyncio.Semaphore
        A semaphore to limit concurrent requests.
    sequence: str
        The protein sequence to be sent in the request.
    client: httpx.AsyncClient
        An HTTP client for sending the request.
    timeout: float, optional
        Timeout value forwarded unchanged to ``client.post`` (default is
        15000, the value that was previously hard-coded).
    -------------
    Returns:
    -------------
    response: httpx.Response
        The response received from the HMMER API. Redirects are not followed,
        so the caller can read the 'Location' header of this response.
    """

    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}

    # The form body is URL-encoded by hand; the sequence is wrapped in a
    # minimal FASTA record ('>seq' header line followed by the residues).
    payload = urllib.parse.urlencode(
        {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}).encode('ascii')

    # The semaphore caps how many POST requests are in flight at once.
    async with semaphore:
        # Raw, already-encoded bytes belong in `content=`; handing bytes to
        # `data=` is deprecated in httpx.
        response = await client.post(url, headers=headers, content=payload,
                                     follow_redirects=False, timeout=timeout)

    return response


async def process_response(semaphore, sequence, response, client, prot_pair_index, max_retries=3):
    """
    Processes the response received from the HMMER API.

    Reads the 'Location' header of the initial response, fetches the result
    from that URL as JSON and converts the reported hits into a DataFrame
    with one row per domain.
    -------------
    Parameters:
    -------------
    semaphore: asyncio.Semaphore
        A semaphore to limit concurrent requests.
    sequence: str
        The protein sequence associated with the response.
    response: httpx.Response
        The response received from the HMMER API.
    client: httpx.AsyncClient
        An HTTP client for sending subsequent requests.
    prot_pair_index: int
        The protein pair index associated with the sequence.
    max_retries: int, optional
        The maximum number of attempts made to fetch the result when a read
        timeout occurs (default is 3). Values below 1 are treated as 1, so
        the result is always requested at least once.
    -------------
    Raises:
    -------------
    httpx.ReadTimeout
        If the last attempt to fetch the result also times out.
    -------------
    Returns:
    -------------
    dfff: pd.DataFrame or None
        A DataFrame indexed by 'prot_pair_index' containing the search results
        for the protein sequence, or None if there is no redirect URL, the
        result cannot be parsed, or there are no hits.
    """

    redirect_url = response.headers.get('Location')

    if redirect_url is None:
        print("Error: No redirect URL found in response.")
        return None

    headers = {'Accept': 'application/json'}
    # Always try at least once so `response2` is guaranteed to be bound.
    attempts = max(1, max_retries)
    for attempt in range(attempts):
        try:
            # Hold a semaphore slot only while the request is in flight.
            async with semaphore:
                response2 = await client.get(redirect_url, headers=headers, timeout=15000)
            break
        except httpx.ReadTimeout:
            if attempt == attempts - 1:
                raise
            # Exponential backoff (1 s, 5 s, 25 s, ...); sleeping outside the
            # semaphore lets other requests proceed in the meantime.
            await asyncio.sleep(5 ** attempt)

    try:
        results = response2.json()
        hits = results['results']['hits']
    except KeyError:
        print(
            f"Error: 'results' key not found in response for sequence {sequence}.")
        return None
    except json.JSONDecodeError:
        print(
            f"Error: JSONDecodeError for sequence {sequence}. Response text: {response2.text}")
        return None

    if not hits:
        return None

    # Flatten the nested hits -> domains structure off the event loop thread
    # (default executor), carrying the listed per-hit fields along as columns.
    loop = asyncio.get_running_loop()
    dfff = await loop.run_in_executor(
        None, pd.json_normalize, hits, 'domains',
        ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
    dfff.insert(0, 'sequence', sequence)
    # Tag every row with the protein pair index and use it as the index.
    dfff.insert(0, 'prot_pair_index', prot_pair_index)
    return dfff.set_index('prot_pair_index')


async def hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    """
    Runs the HMMER scanner for protein sequences.

    Submits the first k sequences of the 'm_protein_seq' column to the HMMER
    API, then fetches and parses every result, with both phases limited to
    max_concurrent_requests simultaneous requests. The per-sequence frames
    are reduced to the columns they all share and concatenated. When there is
    at least one hit, the combined frame is also written to "output.csv" in
    the current working directory.
    -------------
    Parameters:
    -------------
    df: pd.DataFrame
        A DataFrame that contains protein sequences in 'm_protein_seq' and
        the matching identifiers in 'prot_pair_index'.
    k: int
        The number of protein sequences to search.
    max_concurrent_requests: int
        The maximum number of concurrent requests to the HMMER API.
    -------------
    Returns:
    -------------
    results_df: pd.DataFrame
        A DataFrame containing the search results for all protein sequences.
        An empty DataFrame is returned (and no file is written) when k is
        larger than 1000, when there are no sequences to search, or when no
        sequence produced any hit.
    """

    if k > 1000:
        print("Use local function for the number of sequences more than 1000.")
        return pd.DataFrame()

    sequences = df['m_protein_seq'][:k]
    # Get corresponding prot_pair_index values
    indices = df['prot_pair_index'][:k]
    if sequences.empty:
        # Nothing to submit; avoid opening a client and combining zero frames.
        return pd.DataFrame()

    pairs = list(zip(sequences, indices))
    semaphore = asyncio.Semaphore(max_concurrent_requests)

    async with httpx.AsyncClient() as client:
        # Phase 1: submit every sequence; each response carries a redirect.
        responses = await asyncio.gather(
            *(send_request(semaphore, seq, client) for seq, _ in pairs))

        # Phase 2: follow each redirect and parse the result. gather keeps
        # input order, so responses line up with pairs.
        results = await asyncio.gather(
            *(process_response(semaphore, seq, response, client, idx)
              for (seq, idx), response in zip(pairs, responses)))

    frames = [frame for frame in results if frame is not None]
    if not frames:
        # No sequence produced hits; intersecting/concatenating nothing
        # would raise, so report an empty result instead.
        return pd.DataFrame()

    # Keep only columns present in every frame, in the order in which they
    # appear in the first frame so the output layout is deterministic.
    shared = set.intersection(*(set(frame.columns) for frame in frames))
    common_columns = [column for column in frames[0].columns if column in shared]

    results_df = pd.concat([frame[common_columns] for frame in frames])
    results_df.to_csv("output.csv")
    return results_df


def run_hmmerscanner(df: pd.DataFrame, k: int, max_concurrent_requests: int):
    """
    Synchronous entry point: drives the asynchronous hmmerscanner coroutine
    to completion and hands back its DataFrame.
    -------------
    Parameters:
    -------------
    df: pandas.core.DataFrame
        A DataFrame that contains protein sequences.
    k: int
        The number of protein sequences to search.
    max_concurrent_requests: int
        The maximum number of concurrent requests to the HMMER API.
    -------------
    Returns:
    -------------
    results_df: pandas.core.DataFrame
        A DataFrame containing the search results for all protein sequences.
    """

    # Patch asyncio first; presumably needed so asyncio.run also works where
    # an event loop is already running (e.g. inside a notebook).
    nest_asyncio.apply()

    scan = hmmerscanner(df, k, max_concurrent_requests)
    return asyncio.run(scan)

In [4]:
# Test the best function on 1000 sequences (at most 20 concurrent requests)
%time run_hmmerscanner(df, 1000, 20)

CPU times: user 15.3 s, sys: 2.66 s, total: 17.9 s
Wall time: 1min 54s


Unnamed: 0_level_0,alimodel,alimmline,sequence,ienv,oasc,ievalue,aliSim,alicsline,aliSimCount,alisqfrom,...,alihmmdesc,aliIdCount,bias,pvalue,alihmmacc,alihmmfrom,alisqto,iali,score,alirfline
prot_pair_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
48641291,lverylplvkrlarrllgsgadaeDlvQegflrlwraverfdperg...,,MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLY...,45,0.96,0.0,0.884058,HHHHHHHHHHHHHHHHCTCHHHHHHHHHHHHHHHHHHHHGCCTTTC...,61,45,...,Sigma-70 region 2,27,1.66,-52.678535,PF04542.17,1,113,45,68.9,
48641291,rqalrealaeLperqreifllryleglsykEIAellgisegtVksr...,,MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLY...,143,0.95,0.0,0.870370,HHHHHHHHTTS-HHHHHHHHHHHTS---HHHHHHHHT--HHHHHHH...,47,143,...,"Sigma-70, region 4",22,0.05,-40.444908,PF08281.15,1,196,143,51.7,
48641291,ekrAlrkLRk,,MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLY...,100,0.86,3000,1.000000,HHHHHHHHHH,10,101,...,"Sigma-70, region 4",3,0.16,-32.815961,PF04545.19,41,110,101,41.0,
48641291,aLasLpererevlelrfgeelTleEigerlgiSrerVrqiekrAlrkLR,,MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLY...,149,0.98,0.0,0.795918,HHHTS-HHHHHHHHHHTTST--HHHHHHHHTS-HHHHHHHHHHHHHHHH,39,149,...,"Sigma-70, region 4",12,0.02,-32.815961,PF04545.19,1,197,149,41.0,
92992745,vlivdDdplvrellrqlleeegyeevaeaedgkealellkeekvdl...,,MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMP...,4,0.98,2.0e-28,0.854545,EEEESSSHHHHHHHHHHHHHTTEEEEEEESSHHHHHHHHHHHHESE...,94,4,...,Response regulator receiver domain,43,0.46,-74.141257,PF00072.27,1,113,4,99.6,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11830119,tylitGGlGgLGrelakwlaekgarhlvllsRsaaakedsealiae...,,MTALVTGATAGIGREFAEQLAAKGIGLVLVARDVERLATVSAELRS...,1,0.86,0.0,0.691358,EEEEETTTSHHHHHHHHHHHHHHS-EEEEE-SS..--HHHHHHHHH...,112,2,...,KR domain,52,2.53,-45.710097,PF08659.13,2,163,2,60.1,
11830119,ilVtGatGfiGsalvkallekgyevigldrlssasntarledl......,,MTALVTGATAGIGREFAEQLAAKGIGLVLVARDVERLATVSAELRS...,3,0.82,2.0e-08,0.625767,EEEETTTSHHHHHHHHHHHHTTSEEEEEES-SSTTTCHHTHHG......,102,3,...,NAD dependent epimerase/dehydratase family,44,0.10,-27.862561,PF01370.24,1,169,3,34.5,
69817379,knvslklkegekvaivGenGaGKStLlkllagllkpteGeilldgk...,,MSDAVTERTAEPAEPRTAVPVLSAAGLSVRFAGRRGAPPARAVDGV...,43,0.95,0.0,0.838235,EEEEEEEETTSEEEEEESTTSSHHHHHHHHTTSS--SEEEEEETTC...,114,44,...,ABC transporter,53,0.00,-82.983452,PF00005.30,2,194,44,112.8,xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx...
69817379,vErgptsklfeeplHPYTkaLlrsvprldrkkdklvkikgeapsle...,,MSDAVTERTAEPAEPRTAVPVLSAAGLSVRFAGRRGAPPARAVDGV...,245,0.97,0.0,0.807018,,46,245,...,"Oligopeptide/dipeptide transporter, C-terminal...",24,0.02,-42.461251,PF08352.15,1,301,245,55.4,


-------------

-----------

-----------

### Last quarter's function
#### This is last quarter's function, which was too slow and has since been improved. The best function above produces results for 1000 sequences in about 2 minutes, whereas last quarter's function needs about 40 minutes for the same 1000 sequences, which is a considerable difference.

In [9]:
"""
This script takes a user defined dataframe and an integer k, which send HTTPs requests to the HMMER API
the packages you need to run this script are:

- pandas
- requests
- urllib.parse
- time

They were imported in part 1.
"""


def hmmerscanner(df: pd.DataFrame, k: int):
    """
    This function sends HTTP requests to the HMMER API to get information for protein sequences.

    The sequences are processed one after another: each one is posted to the
    hmmscan endpoint and the result is fetched from the URL given in the
    'Location' header of the response.
    -------------
    Parameters:
    -------------
    df: pandas.core.DataFrame
        A DataFrame that has string amino acid sequences. The meso sequences
        ('m_protein_seq' column) are used; this can be changed to the thermo
        sequences if needed.
    k: int
        The number of sequences to scan. The first k rows (by position) are
        used; if df has fewer than k rows, all of them are scanned.
    -------------
    Raises:
    -------------
    Exception:
        Raises an exception if the status is pending for too long, if the internet isn't working,
        or if the URL system doesn't wholly answer.
    -------------
    Returns:
    -------------
    results_df: pandas.core.DataFrame
        One row per reported domain, indexed by 'sequence'. Some columns can
        be dropped later to keep only the needed information. An empty
        DataFrame is returned when k is larger than 1000, when k is not
        positive, or when no sequence could be processed.
    """
    # Check if we need to use the local function instead of the API for large values of k.
    if k > 1000:
        print("Use local function for the number of sequences more than 1000.")
        return pd.DataFrame()

    # Nothing to scan (also prevents a negative k from slicing from the end).
    if k <= 0:
        return pd.DataFrame()

    # Request settings are the same for every sequence.
    url = 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan'
    headers = {'Content-Type': 'application/x-www-form-urlencoded',
               'Accept': 'application/json'}

    # Per-sequence frames are collected here and concatenated once at the end.
    frames = []

    # This is for meso protein sequences; we can change that in the future according to our request.
    for sequence in df['m_protein_seq'].iloc[:k]:
        # Send an HTTP request to the HMMER API to get information for the current sequence.
        data = {'hmmdb': 'pfam', 'seq': f'>seq\n{sequence}'}
        data = urllib.parse.urlencode(data).encode('ascii')
        response = requests.post(url, headers=headers,
                                 data=data, allow_redirects=False)
        redirect_url = response.headers.get('Location')

        if redirect_url is None:
            # If the server doesn't work, show this error.
            print("Error: No redirect URL found in response.")
            continue

        if redirect_url == 'late':
            # Raises an exception if the status is pending for too long.
            response.raise_for_status()
            time.sleep(180)
            raise IOError("Error notice after 3 minutes.")

        response2 = requests.get(redirect_url, headers=headers)

        # Flatten the hits -> domains structure into one row per domain.
        results = response2.json()
        hits = results['results']['hits']
        dfff = pd.json_normalize(
            hits, 'domains', ['acc', 'name', 'score', 'evalue', 'pvalue', 'desc'])
        dfff.insert(0, 'sequence', sequence)
        frames.append(dfff.set_index('sequence'))

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames)

In [10]:
# Test last quarter's function on 1000 sequences
%time hmmerscanner(df, 1000)

CPU times: user 1min 36s, sys: 7.78 s, total: 1min 43s
Wall time: 40min 16s


Unnamed: 0_level_0,alisqacc,aliIdCount,alirfline,is_included,alihmmname,bitscore,display,ievalue,alisqto,aliSim,...,act_site.adh_short.176.K,act_site.adh_short_C2.172.Y,act_site.GDP_Man_Dehyd.147.Y,act_site.GDP_Man_Dehyd.123.S,act_site.GDP_Man_Dehyd.125.E,act_site.Epimerase.147.Y,act_site.RmlD_sub_bind.147.Y,act_site.adh_short.147.Y,act_site.adh_short.151.K,act_site.adh_short_C2.147.Y
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,27,,1,Sigma70_r2,67.940552,1.0,0.0,113,0.884058,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,22,,1,Sigma70_r4_2,50.895485,1.0,0.0,196,0.870370,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,3,,0,Sigma70_r4,-2.076300,,3000,110,1.000000,...,,,,,,,,,,
MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLYDLLAPRVYGLIRRVLRDPALAEEVTQEVLVEVWRRAARFDPAQGSANAWVFTIAHRRAVDRVRAEQKAADRTVRAGAAALDSPYDSVADEVSGRLERRQVRHCLDALTGLQREVVTLAYYQGHSYPQVAELLKTPLGTVKTRMRDGLIRLRDCLGVEATA,,12,,1,Sigma70_r4,39.875805,0.0,0.0,197,0.795918,...,,,,,,,,,,
MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMPDMAVLDIKMPRMDGMDLLQRLRQKTSMPVIFLTSKDDEIDEVLGLRMGADDYVKKPFSQRLLVERIRALLRRQDVIGGEVVEETEDNKVMVRGELTMDPLRHAVKWKGNDVSLTVTEFLLLQALAQRPGFVKSRDQLMDVAYDDQIYVDDRTIDSHIKRLRKKMRQADDEFSAIETLYGIGYRYNEA,,43,,1,Response_reg,98.908218,1.0,2.0e-28,113,0.854545,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MTALVTGATAGIGREFAEQLAAKGIGLVLVARDVERLATVSAELRSAHGVAVEVLPADLSDRDDLERVAQRLRDLDQPIDLLVNNAGYSLNSRFVASDIAAEEQLLDVLVRAVLVLSHAASTAMVARGHGRIINVSSIAGLLASGTYAAAKSYVTTFSESLAGQLAGTGVTVTALLPGYVRTEFHQRAGIDKGGRSGPFWLDATDLVREALVDAGDGKVISVPSTQYKVITGLVRHVPRSLLRSRRVSSLHRKN,,52,,1,KR,59.709538,0.0,0.0,163,0.691358,...,,,,,,,,,,
MTALVTGATAGIGREFAEQLAAKGIGLVLVARDVERLATVSAELRSAHGVAVEVLPADLSDRDDLERVAQRLRDLDQPIDLLVNNAGYSLNSRFVASDIAAEEQLLDVLVRAVLVLSHAASTAMVARGHGRIINVSSIAGLLASGTYAAAKSYVTTFSESLAGQLAGTGVTVTALLPGYVRTEFHQRAGIDKGGRSGPFWLDATDLVREALVDAGDGKVISVPSTQYKVITGLVRHVPRSLLRSRRVSSLHRKN,,44,,1,Epimerase,34.150902,0.0,2.0e-08,169,0.625767,...,,,,,,Similarity to Q7CRQ0,,,,
MSDAVTERTAEPAEPRTAVPVLSAAGLSVRFAGRRGAPPARAVDGVHLDVGAGEIVALVGESGCGKTTLARTLLGLERPSAGTVSYAGRPLSYRSRALRAYRREVQLVLQDPAGSLNPRHTVYEAVAEGLRIHGGAADERERVADALARAGLRPPERFFLRYPHELSGGQRQRVVIAGALVLEPKVIVADEPVASLDASVRGEILALLLRLRDELGLSALVVTHDLGLAWNIADRVAVMYLGRIVETGPVEKILVAPEHPYTQALLSVLPEARAGIPVVLSGEPPDPSRVPPGCRFHVRCPILASGAAEEAGVAERCQTEDPAILAGSGEAQAACHYAAARARA,,53,xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx...,1,ABC_tran,112.488411,1.0,0.0,194,0.838235,...,,,,,,,,,,
MSDAVTERTAEPAEPRTAVPVLSAAGLSVRFAGRRGAPPARAVDGVHLDVGAGEIVALVGESGCGKTTLARTLLGLERPSAGTVSYAGRPLSYRSRALRAYRREVQLVLQDPAGSLNPRHTVYEAVAEGLRIHGGAADERERVADALARAGLRPPERFFLRYPHELSGGQRQRVVIAGALVLEPKVIVADEPVASLDASVRGEILALLLRLRDELGLSALVVTHDLGLAWNIADRVAVMYLGRIVETGPVEKILVAPEHPYTQALLSVLPEARAGIPVVLSGEPPDPSRVPPGCRFHVRCPILASGAAEEAGVAERCQTEDPAILAGSGEAQAACHYAAARARA,,24,,1,oligo_HPY,54.188602,1.0,0.0,301,0.807018,...,,,,,,,,,,
