In [None]:
import requests, os, pandas
from bs4 import BeautifulSoup
from datetime import datetime

# Lookup from three-letter residue codes (as they appear in the scraped table
# cells) to one-letter codes. Cell text that is not a key here is left as-is
# by the code that consults this table.
AA_table = {
    "Ala": "A",
    "Arg": "R",
    "Asn": "N",
    "Asp": "D",
    "Cys": "C",
    "Gln": "Q",
    "Glu": "E",
    "Gly": "G",
    "His": "H",
    "Ile": "I",
    "Leu": "L",
    "Lys": "K",
    "Met": "M",
    "Phe": "F",
    "Pro": "P",
    "Pyl": "O",
    "Ser": "S",
    "Sec": "U",
    "Thr": "T",
    "Trp": "W",
    "Tyr": "Y",
    "Val": "V",
    # Ambiguity / unknown-residue codes.
    "Asx": "B",
    "Glx": "Z",
    "Xaa": "X",
    "Xle": "J",
    # A bare dash cell is written out as "*" (presumably an empty position --
    # TODO confirm against the MEROPS table).
    "-": "*",
}
 
def get_CleavageSite(name) :
    """Download the MEROPS substrates table for ``name`` and cache it as CSV.

    The output is ``./CleavageSite/<name>_Site.csv`` with one line per parsed
    table row: the text of the first cell followed by the eight cells at
    indices 6..13, stored under the columns P4..P4'. Cell text that is a key
    of ``AA_table`` is replaced by its one-letter code; anything else is kept
    unchanged. If the CSV already exists, nothing is downloaded.

    Args:
        name: identifier appended to the ``substrates?id=`` URL; also used as
            the file-name prefix.

    Returns:
        None.

    Raises:
        requests.RequestException: the page could not be fetched (connection
            error, timeout or HTTP error status).
    """
    out_dir = os.path.join(os.path.abspath('./'), 'CleavageSite')
    out_path = os.path.join(out_dir, name + '_Site' + '.csv')

    # The CSV doubles as the cache marker: skip work that was already done.
    if os.path.isfile(out_path) :
        return

    print('get cleavage site %s' % (name))

    url = 'https://www.ebi.ac.uk/merops/cgi-bin/substrates?id=' + name

    # Without a timeout a stalled connection blocks the worker forever;
    # raise_for_status() keeps an HTTP error page from being parsed as data.
    r = requests.get(url, timeout = 60)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')

    table = soup.find('table')
    if table is None :
        # Nothing to parse; do not write a CSV so a later run can retry.
        print('no cleavage site table for %s' % (name))
        return

    Site = []

    # The first <tr> is skipped (treated as the header row).
    for row in table.find_all('tr')[1:] :

        columns = row.find_all('td')

        # Cells 0 and 6..13 are read below; a row with fewer cells cannot
        # supply them, so it is skipped instead of raising IndexError.
        if len(columns) < 14 :
            continue

        line = [str(columns[0].get_text())]

        # get amino acids
        for i in range(6, 14) :
            amino_acid = str(columns[i].get_text())
            line.append(AA_table.get(amino_acid, amino_acid))

        Site.append(line)

    # to_csv() does not create missing directories.
    os.makedirs(out_dir, exist_ok = True)

    df = pandas.DataFrame(Site, columns = ['name', 'P4', 'P3', 'P2', 'P1', 'P1\'', 'P2\'', 'P3\'', 'P4\''])
    df.to_csv(out_path, index = False)


In [None]:
import requests, os, pandas, re
from bs4 import BeautifulSoup
from datetime import datetime


def get_sequence(id) :
    """Fetch a MEROPS ``aaseq`` page and return the sequence letters it shows.

    The text of the first ``<pre>`` element is taken from the first
    occurrence of ``'1'`` followed by eight spaces (presumably the position
    label of the first sequence line -- TODO confirm), and every character
    that is not an uppercase A-Z letter is removed.

    Args:
        id: value appended to the ``aaseq?mernum=`` URL. The name shadows the
            builtin ``id`` but is kept so existing callers are unaffected.

    Returns:
        The remaining uppercase letters as one string, or ``''`` when the page
        has no ``<pre>`` element or the start marker is not found.

    Raises:
        requests.RequestException: the page could not be fetched (connection
            error, timeout or HTTP error status).
    """
    print('get %s' % (id))

    url = 'https://www.ebi.ac.uk/merops/cgi-bin/aaseq?mernum=' + id

    # Bound the wait and reject HTTP error pages before parsing.
    r = requests.get(url, timeout = 60)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')

    content = soup.find('pre')
    if content is None :
        return ''

    text = str(content.text)

    # str.find() returns -1 when the marker is absent; slicing from -1 would
    # silently keep only the last character, so report "no sequence" instead.
    start = text.find('1        ')
    if start == -1 :
        return ''

    return re.sub('[^A-Z]', '', text[start :])


def get_protease(name) :
    """Download the MEROPS sequence-features table for ``name`` and cache it as CSV.

    For every row after the first, the first five cells are read (sequence
    name, species, length, peptidase unit, active-site residues) and the
    sequence itself is fetched with ``get_sequence``. Rows whose sequence name
    was already seen are skipped, as are rows whose fetched sequence length
    differs from the length given in the table. The result is written to
    ``./Protease/<name>_Protease.csv``. If that file already exists, nothing
    is downloaded.

    Args:
        name: identifier appended to the ``sequence_features?mid=`` URL; also
            used as the file-name prefix.

    Returns:
        None.

    Raises:
        requests.RequestException: a page could not be fetched (connection
            error, timeout or HTTP error status).
    """
    out_dir = os.path.join(os.path.abspath('./'), 'Protease')
    out_path = os.path.join(out_dir, name + '_Protease' + '.csv')

    # The CSV doubles as the cache marker: skip work that was already done.
    if os.path.isfile(out_path) :
        return

    print('get protease %s' % (name))

    url = 'https://www.ebi.ac.uk/merops/cgi-bin/sequence_features?mid=' + name

    # Without a timeout a stalled connection blocks the worker forever;
    # raise_for_status() keeps an HTTP error page from being parsed as data.
    r = requests.get(url, timeout = 60)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')

    table = soup.find('table')
    if table is None :
        # Nothing to parse; do not write a CSV so a later run can retry.
        print('no sequence table for %s' % (name))
        return

    st = set()
    Sequence = []

    # The first <tr> is skipped (treated as the header row).
    for row in table.find_all('tr')[1:] :

        columns = row.find_all('td')

        # Cells 0..4 are read below; skip rows that cannot supply them.
        if len(columns) < 5 :
            continue

        sequence_name = str(columns[0].get_text())

        # remove duplicate -- checked before get_sequence() so that a repeated
        # row does not trigger another download
        if sequence_name in st : continue
        st.add(sequence_name)

        Species = str(columns[1].get_text())
        PeptidaseUnit = str(columns[3].get_text())
        ActiveSiteResidus = str(columns[4].get_text())

        try :
            length = int(columns[2].get_text())
        except ValueError :
            # A non-numeric length cell cannot be validated; skip the row
            # rather than abort the whole table.
            print('bad length for ' + sequence_name)
            continue

        sequence = get_sequence(columns[0].get_text())

        # check sequence length
        if len(sequence) != length :
            print('WTF ' + sequence_name)
            continue

        Sequence.append([sequence_name, Species, length, PeptidaseUnit, ActiveSiteResidus, sequence])

    # to_csv() does not create missing directories.
    os.makedirs(out_dir, exist_ok = True)

    # NOTE: the 'Active site reidues' header is misspelled, but it is kept
    # as-is because existing CSV files and their readers may depend on it.
    df = pandas.DataFrame(Sequence, columns = ['MERNUM', 'Species', 'Length', 'Peptidase unit', 'Active site reidues', 'Sequence'])
    df.to_csv(out_path, index = False)


In [None]:
import concurrent.futures, time

# get protease list
def get_list(family) :
    """Return the IDs on the MEROPS peptidase-specificity page that start with ``family``.

    The ID of a row is the text of its first ``<td>`` cell. IDs are returned
    in page order.

    Args:
        family: prefix to match, e.g. ``'S08'``.

    Returns:
        A list of matching ID strings (possibly empty).

    Raises:
        requests.RequestException: the page could not be fetched (connection
            error, timeout or HTTP error status).
    """
    URL = 'https://www.ebi.ac.uk/merops/cgi-bin/peptidase_specificity'

    # Bound the wait and reject HTTP error pages before parsing.
    r = requests.get(URL, timeout = 60)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, 'lxml')

    ret = []

    # NOTE(review): only <tr class="alt"> rows are inspected. If the page
    # leaves other data rows without that class, they are missed -- confirm
    # against the live markup.
    for row in soup.find_all('tr', {"class": "alt"}) :

        cells = row.find_all('td')

        # A matching row without <td> cells has no ID to read.
        if not cells :
            continue

        protease_id = cells[0].text

        if protease_id.startswith(family) :
            ret.append(protease_id)

    return ret



# multi thread
def job(id) :
    """Worker for one ID: print a start line, run both downloads, print a done line."""
    print('Crawling %s.' % id)

    # Protease table first, then the cleavage-site table.
    for fetch in (get_protease, get_CleavageSite) :
        fetch(id)

    print('%s done.' % (id,))


targets = get_list('S08')


# Executor.map() only re-raises a worker's exception when its result iterator
# is consumed, so discarding that iterator hides every failure. Submit each
# job explicitly and inspect every future so failures are reported per target
# while the remaining targets keep running.
with concurrent.futures.ThreadPoolExecutor(max_workers = 10) as executor :
    pending = {executor.submit(job, target): target for target in targets}

    for finished in concurrent.futures.as_completed(pending) :
        error = finished.exception()
        if error is not None :
            print('%s failed: %r' % (pending[finished], error))

