# Load Trained Model 
- The model was trained with proteases from the S8 family.

In [None]:
import gdown
# Delete any existing model.h5 in the working directory before downloading.
!rm model.h5
# Download the file behind the Google Drive link and save it as model.h5
# (quiet = False leaves gdown's progress output enabled).
gdown.download('https://drive.google.com/u/0/uc?id=1DmgtXhVh1nSl-y8mfW0zNjPH6aWkUECh&export=download', 'model.h5', quiet = False)

load model

In [None]:
from tensorflow import keras
# Load the Keras model stored in ./model.h5; predict() reads this
# module-level `model` object.
model = keras.models.load_model('./model.h5')

Make Prediction 

In [None]:
import numpy as np
def predict(X):
    """Run the loaded model on X and binarise its output.

    Args:
        X: array-like input; it is converted with ``np.array`` before being
            handed to ``model.predict``.

    Returns:
        An integer array with the same shape as the model output, holding 0
        where the output is below 0.5 and 1 everywhere else.
    """
    scores = model.predict(np.array(X))
    # Threshold at 0.5: anything not strictly below it is labelled 1.
    return np.where(scores < 0.5, 0, 1)

# Checking User Data format

In [None]:
import re

def check_Sequence(Sequence):
    """Check that a sequence consists only of upper-case Latin letters.

    ``str.isupper`` is not strict enough for this: it ignores uncased
    characters, so strings such as ``"AB1 C"`` would be accepted. The whole
    string is therefore matched against ``[A-Z]+``.

    Args:
        Sequence: the string to validate.

    Returns:
        True when the string is non-empty and every character is in A-Z;
        otherwise a hint is printed and False is returned.
    """
    if re.fullmatch(r"[A-Z]+", Sequence):
        return True
    print('Your sequence should contains only upper case latin characters only.')
    return False

def check_ActiveSite(ActiveSites, unit):
    """Parse and validate a comma-separated list of active sites.

    Each entry must be one upper-case letter followed by digits (for example
    ``H64``), and its numeric position must lie inside ``unit``.

    Args:
        ActiveSites: comma-separated string such as ``"D32, H64, S221"``.
        unit: pair ``(low, high)`` giving the inclusive range of allowed
            positions.

    Returns:
        A list of the stripped active-site strings when all entries are
        valid; otherwise a message is printed and False is returned.
    """
    validated = []
    for raw_site in ActiveSites.split(','):
        site = raw_site.strip()
        # fullmatch (not match) so trailing junk such as "H64x" is rejected
        # here instead of reaching int() below and raising ValueError.
        if not re.fullmatch(r"[A-Z][0-9]+", site):
            print('Active site \'{}\' does not match the format'.format(site))
            return False
        pos = int(site[1:])
        if pos < unit[0] or pos > unit[1]:
            print('Position out of bound!')
            return False
        validated.append(site)
    return validated
    
def check_PeptidaseUnit(PeptidaseUnit, length):
    """Parse and validate a peptidase unit given as ``"start-end"``.

    Args:
        PeptidaseUnit: string holding two integers separated by ``-``.
        length: length of the protease sequence; the unit must fit inside
            positions 1..length (positions are 1-based, matching how gen()
            converts active-site positions to indices).

    Returns:
        The tuple ``(start, end)`` when the input is valid; otherwise a
        message is printed and False is returned.
    """
    parts = PeptidaseUnit.split('-')
    if len(parts) > 2:
        print('Too many numbers!')
        return False
    if len(parts) < 2:
        # A single number used to crash with IndexError on parts[1].
        print('Please give the unit as two numbers in the form start-end.')
        return False
    try:
        start, end = int(parts[0]), int(parts[1])
    except ValueError:
        # Non-numeric or empty bounds used to propagate as ValueError.
        print('Peptidase unit bounds must be integers.')
        return False
    if start > end:
        print('{} > {}'.format(start, end))
        return False
    if start < 1 or end > length:
        print('Position out of bound!')
        return False
    return (start, end)


# Input User data


Read Protease Data

In [None]:

Sequence = None
ActiveSite = None
PeptidaseUnit = None

# NOTE: this code runs at module (cell) level, so plain assignment already
# binds the module-level names. A `global` statement placed after the
# assignments above is a SyntaxError ("assigned to before global
# declaration") on Python 3.6+, so none is used here.

# Keep prompting until the protease sequence passes validation.
while True:
    Sequence = input("Plesase input pretease sequence: ")
    if check_Sequence(Sequence):
        break

# Keep prompting until the peptidase unit parses; the validator returns the
# parsed (start, end) tuple on success and False on failure.
while True:
    PeptidaseUnit = input("Please input peptidase unit: ")
    return_value = check_PeptidaseUnit(PeptidaseUnit, len(Sequence))
    if return_value is False:
        continue
    PeptidaseUnit = return_value
    break

# Keep prompting until every active site is well formed and lies inside the
# peptidase unit; the validator returns the list of sites on success.
while True:
    ActiveSites = input("Please input active sites: ")
    return_value = check_ActiveSite(ActiveSites, PeptidaseUnit)
    if return_value is False:
        continue
    ActiveSite = return_value
    break


# Data preprocessing
Read selected features and AAindex

In [None]:
# Delete any existing copies of the two data files before downloading.
!rm AADP.txt
!rm AAidx.csv
# Download AADP.txt (read below into `features`) and AAidx.csv (read below
# into `AA_index`) from the GitHub repository.
!wget https://raw.githubusercontent.com/mmi366127/iGem/main/AADP.txt
!wget https://raw.githubusercontent.com/mmi366127/iGem/main/AAidx.csv
import pandas as pd

features = None

# Module-level code: plain assignment binds `features`; a `global` statement
# after the assignment above would be a SyntaxError on Python 3.6+.
with open("./AADP.txt", "r") as f:
    # SECURITY NOTE(review): eval() executes whatever expression the first
    # line of the downloaded file contains. If the file is a plain Python
    # literal, ast.literal_eval would be a safe replacement - confirm the
    # file format before switching.
    # gen() indexes each entry as item[0][0], item[0][1] and item[1].
    features = eval(f.readline())


# Read the amino-acid index table and discard every row that has a missing
# value; transform() looks up one column of this table per residue.
AA_index = pd.read_csv('./AAidx.csv').dropna()


Generate Pair features

In [None]:
def transform(CleavageSite):
    """Encode a cleavage-site window as concatenated AA-index columns.

    For every residue in ``CleavageSite`` the matching column of the
    module-level ``AA_index`` table is looked up, and the columns are joined
    end to end in residue order.

    Args:
        CleavageSite: iterable of residue codes (normally a string).

    Returns:
        A 1-D numpy array holding the concatenated column values. An empty
        input yields an empty array.

    Raises:
        KeyError: if a residue has no column in ``AA_index``. Previously a
            message was printed and the lookup then failed with a bare
            pandas KeyError; the error now names the offending residue.
    """
    # Seed with an empty float array so the result dtype matches what the
    # old "start from [] and concatenate repeatedly" loop produced.
    columns = [np.empty(0)]
    for amino_acid in CleavageSite:
        if amino_acid not in AA_index.columns:
            raise KeyError('Unknown Amino Acid: {!r}'.format(amino_acid))
        columns.append(AA_index[amino_acid].values)
    # One concatenate call instead of re-copying the growing array per residue.
    return np.concatenate(columns, axis=0)

def make_feature(Sequence_features, CleavegeSite):
    """Join the protease feature vector with an encoded cleavage-site window.

    Args:
        Sequence_features: 1-D array of protease-level features.
        CleavegeSite: the residue window to encode with transform(). The
            misspelled parameter name is kept so existing keyword callers
            keep working.

    Returns:
        A 1-D numpy array: ``Sequence_features`` followed by the encoded
        window.
    """
    # The body used to read a *global* named CleavageSite instead of this
    # parameter, so the argument passed in was silently ignored.
    return np.concatenate((Sequence_features, transform(CleavegeSite)), axis=0)

Generating features from the input sequence

In [None]:
import numpy as np

def gen(features, Sequence):
    """Count, for every feature, the active sites that match it.

    A feature ``item`` is indexed as ``item[0]`` (a pair of residue codes)
    and ``item[1]`` (an integer offset). An active site at 1-based position
    ``p`` matches when the residue at ``p`` equals ``item[0][0]`` and the
    residue at ``p + item[1]`` equals ``item[0][1]``.

    Args:
        features: iterable of ``(pair, offset)`` entries.
        Sequence: indexable whose element 0 is the protease sequence string
            and element 1 is the list of active-site strings such as
            ``"H64"``. Further elements are not read.

    Returns:
        A numpy array with one match count per feature. Its shape is also
        printed, as before.
    """
    residues = Sequence[0]
    length = len(residues)
    # Convert the 1-based positions to 0-based indices once, not per feature.
    positions = [int(site[1:]) - 1 for site in Sequence[1]]

    counts = []
    for item in features:
        pair, offset = item[0], item[1]
        cnt = 0
        for pos in positions:
            partner = pos + offset
            # Both indices must be inside the sequence. Previously only the
            # partner index was checked, so an out-of-range site raised
            # IndexError and position 0 wrapped around to the last residue.
            if not (0 <= pos < length and 0 <= partner < length):
                continue
            if residues[pos] == pair[0] and residues[partner] == pair[1]:
                cnt += 1
        counts.append(cnt)

    result = np.array(counts)
    print(result.shape)
    return result


# Compute the protease-level feature counts from the user's inputs; the
# result is combined with every candidate window in the scanning loop below.
sequence_features = gen(features, [Sequence, ActiveSite, PeptidaseUnit])


# Input target sequence and testing
Read target sequence

In [None]:
Target_Sequence = None

# Module-level code: plain assignment binds Target_Sequence; a `global`
# statement after the assignment above would be a SyntaxError on Python 3.6+.
# Keep prompting until the target is a valid sequence of at least 8 residues
# (the scanning step slides an 8-residue window over it).
while True:
    Target_Sequence = input('Please input target sequence: ')
    if not check_Sequence(Target_Sequence):
        continue
    if len(Target_Sequence) < 8:
        # Message corrected: a sequence of exactly 8 residues is accepted.
        print('The length of the sequence should be at least 8.')
        continue
    break


Test the target sequence 

In [None]:
# Becomes True as soon as any window is predicted to be a cleavage site.
flag = False

# Slide an 8-residue window over the target sequence and classify each one.
# Module-level code: plain assignment updates `flag`; a `global` statement
# after the assignment above would be a SyntaxError on Python 3.6+.
for i in range(len(Target_Sequence) - 7):
    CleavageSite = Target_Sequence[i: i + 8]
    X = make_feature(sequence_features, CleavageSite)
    # Shape the 1-D feature vector as a single-row batch. -1 lets numpy infer
    # the width instead of hard-coding the feature count (formerly 14478).
    prediction = predict(X.reshape(1, -1))
    if prediction[0]:
        flag = True
        # `i` is the 0-based start index of the window.
        print('Find CleavegeSite: {} at position: {}'.format(CleavageSite, i))

if not flag:
    print('No cleavage site found in the given sequence.')