In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import pearsonr
from sklearn.model_selection import GridSearchCV
import joblib

In [65]:
selected_columns = [0, 1, 2, 3, 6, 10, 11, 12, 15, 19, 20, 21, 24, 28, 29, 30, 33, 37, 38, 39, 42, 46, 47, 48, 51, 55, 56, 57, 60, 64, 65, 66, 69, 73, 74, 75, 78]
train_df = pd.read_csv("PDBbind_refined16-core16.csv", usecols=selected_columns)
test_df = pd.read_csv("PDBbind_core16.csv", usecols=selected_columns)
len(train_df), len(test_df)

(3767, 290)

In [66]:
train_df.head()

Unnamed: 0,pbindaff,6.6,7.6,8.6,16.6,6.7,7.7,8.7,16.7,6.8,...,8.17,16.17,6.35,7.35,8.35,16.35,6.53,7.53,8.53,16.53
0,2.0,931,261,244,14,213,59,52,4,135,...,0,0,0,0,0,0,0,0,0,0
1,2.0,762,262,262,20,0,0,0,0,2050,...,0,0,0,0,0,0,0,0,0,0
2,2.0,966,313,394,9,1001,308,396,9,147,...,0,0,0,0,0,0,0,0,0,0
3,2.0,1799,509,472,13,0,0,0,0,615,...,0,0,0,0,0,0,0,0,0,0
4,2.0,886,232,262,3,326,85,97,0,327,...,0,0,0,0,0,0,0,0,0,0


In [67]:
train_df.keys()

Index(['pbindaff', '6.6', '7.6', '8.6', '16.6', '6.7', '7.7', '8.7', '16.7',
       '6.8', '7.8', '8.8', '16.8', '6.9', '7.9', '8.9', '16.9', '6.15',
       '7.15', '8.15', '16.15', '6.16', '7.16', '8.16', '16.16', '6.17',
       '7.17', '8.17', '16.17', '6.35', '7.35', '8.35', '16.35', '6.53',
       '7.53', '8.53', '16.53'],
      dtype='object')

In [68]:
def create_custom_features(df):
    df['c1'] = df.apply(lambda x: x['6.6'] * x['7.6'], axis=1)
    df['c2'] = df.apply(lambda x: x['8.6'] * x['16.6'], axis=1)
    df['c3'] = df.apply(lambda x: x['6.7'] * x['7.7'], axis=1)
#     df['c4'] = df.apply(lambda x: x['8.7'] * x['16.7'], axis=1)
#     df['c5'] = df.apply(lambda x: x['6.8'] * x['7.8'], axis=1)
#     df['c6'] = df.apply(lambda x: x['8.8'] * x['16.8'], axis=1)
    
#     df['c7'] = df.apply(lambda x: x['6.6'] + x['7.6'], axis=1)
#     df['c8'] = df.apply(lambda x: x['8.6'] + x['16.6'], axis=1)
#     df['c9'] = df.apply(lambda x: x['6.7'] + x['7.7'], axis=1)
#     df['c10'] = df.apply(lambda x: x['8.7'] + x['16.7'], axis=1)
#     df['c11'] = df.apply(lambda x: x['6.8'] + x['7.8'], axis=1)
#     df['c12'] = df.apply(lambda x: x['8.8'] + x['16.8'], axis=1)
    
#     df['c13'] = df.apply(lambda x: x['6.6'] / x['7.6'], axis=1)
#     df['c14'] = df.apply(lambda x: x['8.6'] / x['16.6'], axis=1)
#     df['c15'] = df.apply(lambda x: x['6.7'] / x['7.7'], axis=1)
#     df['c16'] = df.apply(lambda x: x['8.7'] / x['16.7'], axis=1)
#     df['c17'] = df.apply(lambda x: x['6.8'] / x['7.8'], axis=1)
#     df['c18'] = df.apply(lambda x: x['8.8'] / x['16.8'], axis=1)
    
    return df

In [69]:
train_df = create_custom_features(train_df)
test_df = create_custom_features(test_df)

In [70]:
param_grid = {"n_estimators": [500], "max_features": [i for i in range(2, len(selected_columns), 1)]}
regressor = RandomForestRegressor(oob_score=True, random_state=1, verbose=False)
model = GridSearchCV(estimator=regressor, n_jobs=-1, param_grid=param_grid, verbose=True)
model.fit(train_df[train_df.columns[1:]].values, train_df["pbindaff"].values)
clf = model.best_estimator_
clf

Fitting 3 folds for each of 35 candidates, totalling 105 fits


[Parallel(n_jobs=-1)]: Done 100 out of 105 | elapsed:   27.9s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done 105 out of 105 | elapsed:   29.9s finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=3, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
           oob_score=True, random_state=1, verbose=False, warm_start=False)

In [71]:
train_r = pearsonr(train_df["pbindaff"].values, clf.predict(train_df[train_df.columns[1:]].values))
test_r = pearsonr(test_df["pbindaff"].values, clf.predict(test_df[test_df.columns[1:]].values))

print(train_r, test_r)

(0.9787921014819102, 0.0) (0.8149155528811943, 3.440282578794928e-70)


In [6]:
train_r = pearsonr(train_df["pbindaff"].values, clf.predict(train_df[train_df.columns[1:]].values))
test_r = pearsonr(test_df["pbindaff"].values, clf.predict(test_df[test_df.columns[1:]].values))

print(train_r, test_r)

(0.9794603803847578, 0.0) (0.8036327660052248, 7.132795428717045e-67)


In [73]:
# testing on 2007 core set
test_07_df = pd.read_csv("PDBbind_core07.csv", usecols=selected_columns)
test_07_df = create_custom_features(test_07_df)
print("Size of the test set: ", len(test_07_df))
test_07_r = pearsonr(test_07_df["pbindaff"].values, clf.predict(test_07_df[test_07_df.columns[1:]].values))
print(test_07_r)

Size of the test set:  195
(0.9007059291039606, 8.353913365948857e-72)


In [7]:
# testing on 2007 core set
test_07_df = pd.read_csv("PDBbind_core07.csv", usecols=selected_columns)
print("Size of the test set: ", len(test_07_df))
test_07_r = pearsonr(test_07_df["pbindaff"].values, clf.predict(test_07_df[test_07_df.columns[1:]].values))
print(test_07_r)

(0.8991170695314169, 3.5719553560904124e-71)


In [9]:
# Save the model
joblib.dump(clf, "RFScore-v1_16")

['RFScore-v1_16']