In [11]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from joblib import dump

In [12]:
df_data = pd.read_csv('EGFR_Feature_Extraction.csv')
df_data.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,Molecular Weight,Number of Rotatable Bonds,Number of Atoms,Number of Bonds,Count of Chiral Centers,Number of Rings,Number of Aromatic Rings,Number of Hydrogen Bond Donors,Number of Hydrogen Bond Acceptors,Balaban J Index,Wiener Index,LogP,TPSA,standard_value,class
0,CHEMBL68920,Cc1cc(C)c(/C=C2\C(=O)Nc3ncnc(Nc4ccc(F)c(Cl)c4)...,383.094916,3,27,30,0,4,3,3,4,1.789174,1832.0,4.45034,82.7,41.0,active
1,CHEMBL137635,CN(c1ccccc1)c1ncnc2ccc(N/N=N/Cc3ccccn3)cc12,369.170194,6,28,31,0,4,4,1,6,1.558763,2326.0,4.772,78.66,9300.0,inactive
2,CHEMBL306988,CC(=C(C#N)C#N)c1ccc(NC(=O)CCC(=O)O)cc1,283.095691,5,21,21,0,1,1,2,4,2.78202,1108.0,2.31056,113.98,500000.0,inactive
3,CHEMBL66879,O=C(O)/C=C/c1ccc(O)cc1,164.047344,2,12,12,0,1,1,2,2,2.772026,223.0,1.49,57.53,3000000.0,inactive
4,CHEMBL77085,N#CC(C#N)=Cc1cc(O)ccc1[N+](=O)[O-],215.033091,2,16,16,0,1,1,1,5,3.386653,441.0,1.73096,110.95,96000.0,inactive


In [13]:
df = df_data.drop(df_data.columns[-2], axis=1, inplace=True)
df = df_data.iloc[:, 2:]
print("Null values: " + str(df.isnull().values.any()))
print(df['class'].value_counts())
df.head()

Null values: False
class
active      3810
inactive    3524
Name: count, dtype: int64


Unnamed: 0,Molecular Weight,Number of Rotatable Bonds,Number of Atoms,Number of Bonds,Count of Chiral Centers,Number of Rings,Number of Aromatic Rings,Number of Hydrogen Bond Donors,Number of Hydrogen Bond Acceptors,Balaban J Index,Wiener Index,LogP,TPSA,class
0,383.094916,3,27,30,0,4,3,3,4,1.789174,1832.0,4.45034,82.7,active
1,369.170194,6,28,31,0,4,4,1,6,1.558763,2326.0,4.772,78.66,inactive
2,283.095691,5,21,21,0,1,1,2,4,2.78202,1108.0,2.31056,113.98,inactive
3,164.047344,2,12,12,0,1,1,2,2,2.772026,223.0,1.49,57.53,inactive
4,215.033091,2,16,16,0,1,1,1,5,3.386653,441.0,1.73096,110.95,inactive


In [14]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [15]:
le = LabelEncoder()
y = le.fit_transform(y)
y = pd.DataFrame(y, columns=['class'])

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
eval_set = [(X_train, y_train), (X_test, y_test)]
print("Train size: " + str(X_train.shape[0]))
print("Test size: " + str(X_test.shape[0]))
print("Number of features: " + str(X_train.shape[1]))
print("Number of classes: " + str(len(le.classes_)))

Train size: 5500
Test size: 1834
Number of features: 13
Number of classes: 2


In [17]:
param_grid = {
    'n_estimators': [500, 700, 900], # Between 100 and 1000
    'max_depth': [4, 5, 6], # Between 3 and 10
    'learning_rate': [0.15, 0.2, 0.25], # Between 0.01 and 0.5
}

In [18]:
xgb = XGBClassifier(objective='binary:logistic', eval_metric='auc')
xgb_grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring='accuracy', n_jobs=-1, cv=5)
xgb_grid_search.fit(
    X_train,
    y_train,
    eval_set=eval_set,
)

[0]	validation_0-auc:0.74859	validation_1-auc:0.74275
[1]	validation_0-auc:0.77806	validation_1-auc:0.76918
[2]	validation_0-auc:0.78063	validation_1-auc:0.77240
[0]	validation_0-auc:0.75103	validation_1-auc:0.74133
[0]	validation_0-auc:0.74953	validation_1-auc:0.73836
[3]	validation_0-auc:0.79030	validation_1-auc:0.77781
[0]	validation_0-auc:0.74992	validation_1-auc:0.73962
[1]	validation_0-auc:0.78010	validation_1-auc:0.76641
[1]	validation_0-auc:0.75382	validation_1-auc:0.74753
[4]	validation_0-auc:0.79837	validation_1-auc:0.78663
[2]	validation_0-auc:0.78642	validation_1-auc:0.77249
[2]	validation_0-auc:0.76231	validation_1-auc:0.75311
[1]	validation_0-auc:0.76561	validation_1-auc:0.75047
[5]	validation_0-auc:0.80410	validation_1-auc:0.79091
[3]	validation_0-auc:0.79716	validation_1-auc:0.78035
[3]	validation_0-auc:0.78897	validation_1-auc:0.77561
[2]	validation_0-auc:0.78536	validation_1-auc:0.76521
[6]	validation_0-auc:0.80797	validation_1-auc:0.79362
[4]	validation_0-auc:0.80292

In [19]:
print("Best parameters found: ", xgb_grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(xgb_grid_search.best_score_))

Best parameters found:  {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 700}
Best cross-validation score: 0.82


In [20]:
best_model = xgb_grid_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print("Test set accuracy: {:.2f}".format(test_accuracy))

Test set accuracy: 0.82


In [None]:
saved_model = 'MainXGBRFClassifier.joblib'
dump(best_model, saved_model)