# Importing Libraries

In [6]:
import numpy as np
import pandas as pd

In [21]:
data = pd.read_csv("PDBbind2015_refined-core.dat", delim_whitespace=True, index_col=False)
data.head()

Unnamed: 0,affinity,score,gauss1,gauss2,repulsion,hydrophobic,hydrogen
0,-2.745424,-3.13245,48.02108,434.90009,1.00229,17.16027,1.04153
1,-2.745424,-4.57551,45.86394,906.5491,4.5499,0.0,7.21115
2,-2.745424,-4.5883,49.45446,708.90695,4.56065,10.12192,5.42312
3,-2.745424,-3.13634,54.99922,768.05907,5.70052,31.01157,2.34365
4,-2.81406,-5.04353,53.45864,1053.90858,1.63114,0.0,2.94989


In [8]:
data.corr()

Unnamed: 0,affinity,score,gauss1,gauss2,repulsion,hydrophobic,hydrogen
affinity,1.0,0.469934,-0.413158,-0.481285,-0.088593,-0.446885,0.012595
score,0.469934,1.0,-0.701213,-0.719736,-0.024761,-0.573668,-0.138938
gauss1,-0.413158,-0.701213,1.0,0.864898,0.521572,0.428093,0.400803
gauss2,-0.481285,-0.719736,0.864898,1.0,0.39746,0.575271,0.265164
repulsion,-0.088593,-0.024761,0.521572,0.39746,1.0,-0.103389,0.753915
hydrophobic,-0.446885,-0.573668,0.428093,0.575271,-0.103389,1.0,-0.281623
hydrogen,0.012595,-0.138938,0.400803,0.265164,0.753915,-0.281623,1.0


In [9]:
from sklearn.model_selection import train_test_split
y = data['affinity']
X = data.iloc[:, 1:6]   
X_train, X_val, y_train, y_val = train_test_split(
     X.values, y.values, test_size=0.15, random_state=42)

### --> To select appropriate model, we train our data using different models below.

# Linear regression using OLSR

In [10]:
from sklearn.linear_model import LinearRegression

lr_model = LinearRegression(normalize=True)
lr_model.fit(X_train, y_train)
acc = lr_model.score(X_val, y_val)

print("The training Score is", lr_model.score(X_train, y_train))
print("The validation score is", acc)

The training Score is 0.2914017444997623
The validation score is 0.2709359009302277


# Using SVM

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

scaler = StandardScaler()
scaler.fit(X_train)
X_train_svm = scaler.transform(X_train)
X_val_svm = scaler.transform(X_val)

svm = SVR()
svm.fit(X_train_svm, y_train)

print("The training Score is", svm.score(X_train_svm, y_train))
print("The validation score is", svm.score(X_val_svm, y_val))

The training Score is 0.3878355766513931
The validation score is 0.38288324461670376


# Using Random Forest Regressor

In [12]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(max_depth=9, n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
acc = rf_model.score(X_val, y_val)

print("The training Score is", rf_model.score(X_train, y_train))
print("The validation score is", acc)

The training Score is 0.6971083509277529
The validation score is 0.41449193753087543


# Model Selection

We have trained our data above using three models and analyzed training score and validation score for all the three models. After observing validation score and training score, its clear that linear regression using OLSR will not be an appropriate model to use for the given data. SVM and Random Forest Regressor is giving a much better result as compared to linear regression. For SVM, training score is very near to validation score so there is no chance of overfitting at all. For Random Forest Regressor, training score is quite higher than validation score so there is chance of overfitting but the validation score for Random Forest Regressor model is better than SVM. Looking at the validation score and observing that difference between validation score and training score is reasonable, Random Forest Regressor is the best model for us to choose for the given data.

## CODE FOR TESTING THE PERFORMANCE

In [13]:
## reading the test data file
test_data = pd.read_csv("PDBbind2015_core.dat",delim_whitespace=True,index_col=False)

In [14]:
y_test = data['affinity']
X_test = data.iloc[:, 1:6] 

In [15]:
## making the predicition on the test data by random forest
y_predict = rf_model.predict(X_test)

print("Testing score is", rf_model.score(X_test, y_test))
print("Correlation : \n", np.corrcoef(y_test, y_predict))

Testing score is 0.6533492065157274
Correlation : 
 [[1.         0.82261183]
 [0.82261183 1.        ]]
