In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import pearsonr
from sklearn.model_selection import GridSearchCV
import joblib

In [2]:
# CSV column indices to load: index 0, followed by offsets 1, 2, 3 and 6 inside
# each of nine consecutive 9-column groups (37 indices in total).
# NOTE(review): index 0 is presumably the "pbindaff" target, since later cells
# treat the first loaded column as the label — confirm against the CSV header.
selected_columns = [0] + [9 * group + offset for group in range(9) for offset in (1, 2, 3, 6)]
# Read the training and test tables, keeping only the columns listed in
# selected_columns. Both files are read with identical options.
train_df, test_df = (
    pd.read_csv(csv_path, usecols=selected_columns)
    for csv_path in ("PDBbind_refined16-core16.csv", "PDBbind_core16.csv")
)
# Row counts of the two frames, shown as the cell's output.
len(train_df), len(test_df)

(3767, 290)

In [3]:
# Hyper-parameter grid: a fixed forest size of 500 trees, and every
# max_features value from 2 upward. len(selected_columns) is used as the
# exclusive upper bound, so the largest candidate is len(selected_columns) - 1.
param_grid = {
    "n_estimators": [500],
    "max_features": list(range(2, len(selected_columns))),
}
# Base estimator; random_state is fixed so repeated runs build the same forests.
regressor = RandomForestRegressor(oob_score=True, random_state=1, verbose=False)
# Exhaustive search over param_grid using all available cores; cv and scoring
# are left at GridSearchCV's defaults.
model = GridSearchCV(
    estimator=regressor,
    param_grid=param_grid,
    n_jobs=-1,
    verbose=False,
)

In [4]:
# Run the grid search on the training set. The target is the "pbindaff" column;
# the features are every other loaded column. The target is removed by name
# (not by assuming it is the first column) so that it can never end up inside
# the feature matrix if the CSV column order differs. Remaining columns keep
# their original order.
model.fit(train_df.drop("pbindaff", axis=1).values, train_df["pbindaff"].values)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=True, random_state=1,
           verbose=False, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [500], 'max_features': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=False)

In [5]:
# Keep the estimator that the grid search refit with its best-scoring
# parameter combination, and display its configuration as the cell output.
clf = model.best_estimator_
clf

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=2, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
           oob_score=True, random_state=1, verbose=False, warm_start=False)

In [6]:
# Pearson correlation between the recorded "pbindaff" values and the model's
# predictions, on the training set and on the test set. pearsonr returns the
# correlation coefficient together with a p-value, so each result is a pair.
# Features are taken as "all columns except pbindaff" by name, so the target
# cannot be fed to the model if the column order ever changes.
train_r = pearsonr(
    train_df["pbindaff"].values,
    clf.predict(train_df.drop("pbindaff", axis=1).values),
)
test_r = pearsonr(
    test_df["pbindaff"].values,
    clf.predict(test_df.drop("pbindaff", axis=1).values),
)

print(train_r, test_r)

(0.9794603803847578, 0.0) (0.8036327660052248, 7.132795428717045e-67)


In [7]:
# Testing on the 2007 core set: load the same column selection from its CSV,
# report the number of rows, then compute the Pearson correlation between the
# recorded "pbindaff" values and the model's predictions.
test_07_df = pd.read_csv("PDBbind_core07.csv", usecols=selected_columns)
print("Size of the test set: ", len(test_07_df))
# Features are every column except the target, selected by name rather than by
# position so the target can never be passed to the model as an input.
test_07_r = pearsonr(
    test_07_df["pbindaff"].values,
    clf.predict(test_07_df.drop("pbindaff", axis=1).values),
)
print(test_07_r)

(0.8991170695314169, 3.5719553560904124e-71)


In [9]:
# Persist the selected estimator to disk; joblib.dump returns the list of
# file names it wrote, which is shown as the cell output.
joblib.dump(value=clf, filename="RFScore-v1_16")

['RFScore-v1_16']