# Quick tutorial: how to train your deep forest-based model with cross-validation

## Import the required packages

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from deepforest import CascadeForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import matthews_corrcoef, confusion_matrix
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score

## Load the extracted features and their labels

In [2]:
# Read the pre-extracted feature tables for the training and test sets from CSV.
df_train, df_test = map(
    pd.read_csv,
    ("data/train_ATP_feature.csv", "data/test_ATP_feature.csv"),
)

In [3]:
# Display the training table as the cell's output (the rendered view is truncated by pandas).
df_train

Unnamed: 0,Id,Sequence,label,AAC|A,AAC|C,AAC|D,AAC|E,AAC|F,AAC|G,AAC|H,...,PAAC|lambda4,PHYC|IEP,PHYC|Net Charge,PHYC|Hydrophobic Moment,PHYC|Hydrophobicity,PHYC|Transmembrane Propensity,PHYC|Aromacity,PHYC|Alpha Helical Propensity,PHYC|Aliphatic Index,PHYC|Boman Index
0,ParaPep_1406,GNNRPVYIPQPRPPHPRL,0,0.000000,0.000000,0.000000,0.000000,0.000000,5.555556,5.555556,...,0.236673,11.711365,2.846247,0.059213,0.094444,-0.962222,0.055556,0.819444,59.444444,2.975556
1,AP01299,GLFTLIKGAAKLIGKTVPKKQARLGMNLWLVKLPTNVKT,0,7.692308,0.000000,0.000000,0.000000,2.564103,10.256410,0.000000,...,0.149167,11.471863,7.753099,0.123503,-0.164103,-0.248205,0.051282,1.035128,120.000000,0.246667
2,ADAM_5620,SCNCVCGFCCSCSP,0,0.000000,42.857143,0.000000,0.000000,7.142857,7.142857,0.000000,...,0.139160,5.228699,-0.599259,0.161821,-0.635714,-0.228571,0.071429,0.934286,20.714286,0.085714
3,Positive_40,RTKKWIVWI,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.314946,11.166321,2.758094,0.422402,-0.366667,-0.150000,0.222222,1.007778,118.888889,1.116667
4,Positive_183,LLGDFFRKSKEKIGKEFKRIVQRIKDFLRNLVPRTES,1,0.000000,0.000000,5.405405,8.108108,10.810811,5.405405,0.000000,...,0.145662,10.605286,5.764731,0.562441,0.621622,-0.850811,0.108108,1.055676,89.459459,3.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
857,nonAMP_ID_38763,DRLLNIQPPPREKMF,0,0.000000,0.000000,6.666667,6.666667,6.666667,0.000000,0.000000,...,0.225993,8.745544,0.763014,0.237360,0.413333,-0.904000,0.066667,1.031333,78.000000,2.867333
858,Positive_58,EVEPSDTIENVKAKIQ,1,6.250000,0.000000,6.250000,18.750000,0.000000,0.000000,0.000000,...,0.190854,4.407410,-2.158429,0.221016,0.700000,-1.087500,0.000000,1.075000,91.250000,2.416875
859,AVP0965,AVASVPRARGKYWWG,0,20.000000,0.000000,0.000000,0.000000,0.000000,13.333333,0.000000,...,0.245339,11.000549,2.793857,0.189462,-0.286667,-0.222667,0.200000,0.933333,58.666667,1.258667
860,nonAMP_ID_129620,KEQLGEEGYREMGHKGGETRKEQLGEEGYREMGHKGG,0,0.000000,0.000000,0.000000,24.324324,0.000000,27.027027,5.405405,...,0.181168,5.601257,-2.046280,0.085521,0.978378,-1.324595,0.054054,1.057838,21.081081,3.447568


In [4]:
# Take the column at position 2 of each table as a NumPy array of targets.
# NOTE(review): position 2 is presumably the `label` column — confirm against the CSV header.
train_label, test_label = (
    table.iloc[:, 2].values for table in (df_train, df_test)
)

In [5]:
def _split_feature_groups(frame):
    """Return the AAC, DPC, CKSAAGP, PAAC and PHYC column groups of ``frame`` as arrays.

    The groups are selected by position: columns 3-22, 23-422, 423-522,
    523-546 and 547 onwards, in that order.
    """
    group_columns = (
        slice(3, 23),      # AAC
        slice(23, 423),    # DPC
        slice(423, 523),   # CKSAAGP
        slice(523, 547),   # PAAC
        slice(547, None),  # PHYC (everything that remains)
    )
    return tuple(frame.iloc[:, cols].values for cols in group_columns)


train_AAC, train_DPC, train_CKSAAGP, train_PAAC, train_PHYC = _split_feature_groups(df_train)
test_AAC, test_DPC, test_CKSAAGP, test_PAAC, test_PHYC = _split_feature_groups(df_test)

# Stack all five groups side by side into one feature matrix per data set.
# The stacking order is AAC, DPC, CKSAAGP, PAAC, PHYC (CKSAAGP comes before PAAC,
# even though the variable name lists them the other way round).
train_AAC_DPC_PAAC_CKSAAGP_PHYC = np.hstack(
    [train_AAC, train_DPC, train_CKSAAGP, train_PAAC, train_PHYC]
)
test_AAC_DPC_PAAC_CKSAAGP_PHYC = np.hstack(
    [test_AAC, test_DPC, test_CKSAAGP, test_PAAC, test_PHYC]
)

## Train your deep forest-based model with cross-validation

In [6]:
# Grid search over the cascade-forest hyper-parameters. Each combination is
# scored by the mean Matthews correlation coefficient (MCC) over 5 folds.

# A fixed seed makes the shuffled folds reproducible from run to run.
kf = KFold(n_splits=5, shuffle=True, random_state=666)
param_distribution = {
    "n_trees" : [50,100,300,500,700],
    "n_estimators": [2,3]
}

# Build the folds once so that every hyper-parameter combination is evaluated
# on exactly the same train/validation splits; otherwise score differences
# would partly reflect different partitions rather than the parameters.
cv_folds = list(kf.split(train_AAC_DPC_PAAC_CKSAAGP_PHYC))

# MCC lies in [-1, 1]; start below that range so the best combination is
# always recorded, even when no mean score is positive.
best_score, best_n_trees, best_n_estimators = -np.inf, 0, 0
for n_trees in param_distribution["n_trees"]:
    for n_estimators in param_distribution["n_estimators"]:
        print("Begin to search n_trees = {}, n_estimators = {}\n".format(n_trees,n_estimators))
        all_MCC_scores = []
        for train_index, val_index in cv_folds:
            xx_train, xx_val = train_AAC_DPC_PAAC_CKSAAGP_PHYC[train_index], train_AAC_DPC_PAAC_CKSAAGP_PHYC[val_index]
            yy_train, yy_val = train_label[train_index], train_label[val_index]
            # Fit the scaler on the training part of the fold only, so no
            # statistics of the validation part leak into training.
            standardScaler = StandardScaler()
            standardScaler.fit(xx_train)
            xx_train_std = standardScaler.transform(xx_train)
            xx_val_std = standardScaler.transform(xx_val)
            model = CascadeForestClassifier(random_state=666,n_trees=n_trees, n_estimators=n_estimators)
            model.fit(xx_train_std,yy_train)
            all_MCC_scores.append(matthews_corrcoef(yy_val, model.predict(xx_val_std)))
        score = np.mean(all_MCC_scores)
        # Strict ">" keeps the earliest combination in the grid on ties.
        if score > best_score:
            best_score, best_n_trees, best_n_estimators = score, n_trees, n_estimators
            print("Update, best_score = {}, best_n_trees={}, best_n_estimators={}".format(best_score, best_n_trees, best_n_estimators))

Begin to search n_trees = 50, n_estimators = 2

[2023-03-02 07:46:56.347] Start to fit the model:
[2023-03-02 07:46:56.347] Fitting cascade layer = 0 
[2023-03-02 07:46:56.724] layer = 0  | Val Acc = 85.776 % | Elapsed = 0.376 s
[2023-03-02 07:46:56.725] Fitting cascade layer = 1 
[2023-03-02 07:46:57.033] layer = 1  | Val Acc = 86.647 % | Elapsed = 0.307 s
[2023-03-02 07:46:57.034] Fitting cascade layer = 2 
[2023-03-02 07:46:57.337] layer = 2  | Val Acc = 85.922 % | Elapsed = 0.303 s
[2023-03-02 07:46:57.337] Early stopping counter: 1 out of 2
[2023-03-02 07:46:57.338] Fitting cascade layer = 3 
[2023-03-02 07:46:57.639] layer = 3  | Val Acc = 87.663 % | Elapsed = 0.301 s
[2023-03-02 07:46:57.640] Fitting cascade layer = 4 
[2023-03-02 07:46:57.934] layer = 4  | Val Acc = 87.373 % | Elapsed = 0.294 s
[2023-03-02 07:46:57.934] Early stopping counter: 1 out of 2
[2023-03-02 07:46:57.934] Fitting cascade layer = 5 
[2023-03-02 07:46:58.232] layer = 5  | Val Acc = 87.228 % | Elapsed = 0.

[2023-03-02 07:47:10.450] layer = 0  | Val Acc = 85.051 % | Elapsed = 0.518 s
[2023-03-02 07:47:10.451] Fitting cascade layer = 1 
[2023-03-02 07:47:10.893] layer = 1  | Val Acc = 87.228 % | Elapsed = 0.442 s
[2023-03-02 07:47:10.895] Fitting cascade layer = 2 
[2023-03-02 07:47:11.336] layer = 2  | Val Acc = 88.534 % | Elapsed = 0.442 s
[2023-03-02 07:47:11.338] Fitting cascade layer = 3 
[2023-03-02 07:47:11.785] layer = 3  | Val Acc = 88.679 % | Elapsed = 0.447 s
[2023-03-02 07:47:11.786] Fitting cascade layer = 4 
[2023-03-02 07:47:12.238] layer = 4  | Val Acc = 88.244 % | Elapsed = 0.452 s
[2023-03-02 07:47:12.238] Early stopping counter: 1 out of 2
[2023-03-02 07:47:12.240] Fitting cascade layer = 5 
[2023-03-02 07:47:12.713] layer = 5  | Val Acc = 87.518 % | Elapsed = 0.473 s
[2023-03-02 07:47:12.713] Early stopping counter: 2 out of 2
[2023-03-02 07:47:12.713] Handling early stopping
[2023-03-02 07:47:12.714] The optimal number of layers: 4
[2023-03-02 07:47:12.714] Start to ev

[2023-03-02 07:47:31.122] layer = 3  | Val Acc = 87.971 % | Elapsed = 0.596 s
[2023-03-02 07:47:31.123] Early stopping counter: 1 out of 2
[2023-03-02 07:47:31.128] Fitting cascade layer = 4 
[2023-03-02 07:47:31.736] layer = 4  | Val Acc = 87.681 % | Elapsed = 0.608 s
[2023-03-02 07:47:31.736] Early stopping counter: 2 out of 2
[2023-03-02 07:47:31.736] Handling early stopping
[2023-03-02 07:47:31.736] The optimal number of layers: 3
[2023-03-02 07:47:31.737] Start to evalute the model:
[2023-03-02 07:47:31.739] Evaluating cascade layer = 0 
[2023-03-02 07:47:31.758] Evaluating cascade layer = 1 
[2023-03-02 07:47:31.775] Evaluating cascade layer = 2 
[2023-03-02 07:47:31.832] Start to fit the model:
[2023-03-02 07:47:31.832] Fitting cascade layer = 0 
[2023-03-02 07:47:32.542] layer = 0  | Val Acc = 85.507 % | Elapsed = 0.710 s
[2023-03-02 07:47:32.547] Fitting cascade layer = 1 
[2023-03-02 07:47:33.157] layer = 1  | Val Acc = 88.696 % | Elapsed = 0.609 s
[2023-03-02 07:47:33.161] F

[2023-03-02 07:48:01.705] layer = 3  | Val Acc = 88.986 % | Elapsed = 0.867 s
[2023-03-02 07:48:01.710] Fitting cascade layer = 4 
[2023-03-02 07:48:02.597] layer = 4  | Val Acc = 88.841 % | Elapsed = 0.887 s
[2023-03-02 07:48:02.598] Early stopping counter: 1 out of 2
[2023-03-02 07:48:02.605] Fitting cascade layer = 5 
[2023-03-02 07:48:03.488] layer = 5  | Val Acc = 88.261 % | Elapsed = 0.883 s
[2023-03-02 07:48:03.489] Early stopping counter: 2 out of 2
[2023-03-02 07:48:03.489] Handling early stopping
[2023-03-02 07:48:03.489] The optimal number of layers: 4
[2023-03-02 07:48:03.490] Start to evalute the model:
[2023-03-02 07:48:03.492] Evaluating cascade layer = 0 
[2023-03-02 07:48:03.519] Evaluating cascade layer = 1 
[2023-03-02 07:48:03.545] Evaluating cascade layer = 2 
[2023-03-02 07:48:03.570] Evaluating cascade layer = 3 
Begin to search n_trees = 300, n_estimators = 2

[2023-03-02 07:48:03.632] Start to fit the model:
[2023-03-02 07:48:03.632] Fitting cascade layer = 0 


[2023-03-02 07:49:11.410] layer = 4  | Val Acc = 89.550 % | Elapsed = 2.574 s
[2023-03-02 07:49:11.410] Early stopping counter: 2 out of 2
[2023-03-02 07:49:11.410] Handling early stopping
[2023-03-02 07:49:11.412] The optimal number of layers: 3
[2023-03-02 07:49:11.414] Start to evalute the model:
[2023-03-02 07:49:11.416] Evaluating cascade layer = 0 
[2023-03-02 07:49:11.492] Evaluating cascade layer = 1 
[2023-03-02 07:49:11.564] Evaluating cascade layer = 2 
[2023-03-02 07:49:11.677] Start to fit the model:
[2023-03-02 07:49:11.677] Fitting cascade layer = 0 
[2023-03-02 07:49:14.810] layer = 0  | Val Acc = 84.761 % | Elapsed = 3.133 s
[2023-03-02 07:49:14.818] Fitting cascade layer = 1 
[2023-03-02 07:49:17.500] layer = 1  | Val Acc = 87.083 % | Elapsed = 2.682 s
[2023-03-02 07:49:17.507] Fitting cascade layer = 2 
[2023-03-02 07:49:20.131] layer = 2  | Val Acc = 87.083 % | Elapsed = 2.624 s
[2023-03-02 07:49:20.131] Early stopping counter: 1 out of 2
[2023-03-02 07:49:20.138] F

[2023-03-02 07:50:54.718] layer = 1  | Val Acc = 87.101 % | Elapsed = 2.995 s
[2023-03-02 07:50:54.723] Fitting cascade layer = 2 
[2023-03-02 07:50:57.675] layer = 2  | Val Acc = 87.246 % | Elapsed = 2.952 s
[2023-03-02 07:50:57.680] Fitting cascade layer = 3 
[2023-03-02 07:51:00.654] layer = 3  | Val Acc = 87.971 % | Elapsed = 2.973 s
[2023-03-02 07:51:00.659] Fitting cascade layer = 4 
[2023-03-02 07:51:03.629] layer = 4  | Val Acc = 88.116 % | Elapsed = 2.970 s
[2023-03-02 07:51:03.634] Fitting cascade layer = 5 
[2023-03-02 07:51:06.620] layer = 5  | Val Acc = 87.536 % | Elapsed = 2.985 s
[2023-03-02 07:51:06.620] Early stopping counter: 1 out of 2
[2023-03-02 07:51:06.625] Fitting cascade layer = 6 
[2023-03-02 07:51:09.616] layer = 6  | Val Acc = 87.391 % | Elapsed = 2.991 s
[2023-03-02 07:51:09.616] Early stopping counter: 2 out of 2
[2023-03-02 07:51:09.616] Handling early stopping
[2023-03-02 07:51:09.618] The optimal number of layers: 5
[2023-03-02 07:51:09.620] Start to ev

[2023-03-02 07:53:21.579] layer = 4  | Val Acc = 88.986 % | Elapsed = 4.464 s
[2023-03-02 07:53:21.579] Early stopping counter: 2 out of 2
[2023-03-02 07:53:21.579] Handling early stopping
[2023-03-02 07:53:21.583] The optimal number of layers: 3
[2023-03-02 07:53:21.586] Start to evalute the model:
[2023-03-02 07:53:21.588] Evaluating cascade layer = 0 
[2023-03-02 07:53:21.714] Evaluating cascade layer = 1 
[2023-03-02 07:53:21.834] Evaluating cascade layer = 2 
[2023-03-02 07:53:21.999] Start to fit the model:
[2023-03-02 07:53:21.999] Fitting cascade layer = 0 
[2023-03-02 07:53:27.158] layer = 0  | Val Acc = 85.072 % | Elapsed = 5.160 s
[2023-03-02 07:53:27.166] Fitting cascade layer = 1 
[2023-03-02 07:53:31.729] layer = 1  | Val Acc = 87.246 % | Elapsed = 4.563 s
[2023-03-02 07:53:31.736] Fitting cascade layer = 2 
[2023-03-02 07:53:36.228] layer = 2  | Val Acc = 87.826 % | Elapsed = 4.492 s
[2023-03-02 07:53:36.235] Fitting cascade layer = 3 
[2023-03-02 07:53:40.664] layer = 3

[2023-03-02 07:55:56.529] Evaluating cascade layer = 2 
[2023-03-02 07:55:56.641] Evaluating cascade layer = 3 
Begin to search n_trees = 700, n_estimators = 3

[2023-03-02 07:55:56.821] Start to fit the model:
[2023-03-02 07:55:56.821] Fitting cascade layer = 0 
[2023-03-02 07:56:03.981] layer = 0  | Val Acc = 86.357 % | Elapsed = 7.159 s
[2023-03-02 07:56:03.988] Fitting cascade layer = 1 
[2023-03-02 07:56:10.281] layer = 1  | Val Acc = 87.808 % | Elapsed = 6.293 s
[2023-03-02 07:56:10.288] Fitting cascade layer = 2 
[2023-03-02 07:56:16.422] layer = 2  | Val Acc = 87.954 % | Elapsed = 6.134 s
[2023-03-02 07:56:16.430] Fitting cascade layer = 3 
[2023-03-02 07:56:22.621] layer = 3  | Val Acc = 87.808 % | Elapsed = 6.191 s
[2023-03-02 07:56:22.621] Early stopping counter: 1 out of 2
[2023-03-02 07:56:22.628] Fitting cascade layer = 4 
[2023-03-02 07:56:28.739] layer = 4  | Val Acc = 87.808 % | Elapsed = 6.111 s
[2023-03-02 07:56:28.739] Early stopping counter: 2 out of 2
[2023-03-02 