In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
# from joblib import dump

In [2]:
# Load the EGFR feature table; the path is relative to the notebook's working
# directory. Preview the first five rows to sanity-check the parsed columns.
df_data = pd.read_csv(filepath_or_buffer='EGFR_Feature_Extraction.csv')
df_data.head(n=5)

Unnamed: 0,molecule_chembl_id,canonical_smiles,Molecular Weight,Number of Rotatable Bonds,Number of Atoms,Number of Bonds,Count of Chiral Centers,Number of Rings,Number of Aromatic Rings,Number of Hydrogen Bond Donors,Number of Hydrogen Bond Acceptors,Balaban J Index,Wiener Index,LogP,TPSA,standard_value,class
0,CHEMBL68920,Cc1cc(C)c(/C=C2\C(=O)Nc3ncnc(Nc4ccc(F)c(Cl)c4)...,383.094916,3,27,30,0,4,3,3,4,1.789174,1832.0,4.45034,82.7,41.0,active
1,CHEMBL137635,CN(c1ccccc1)c1ncnc2ccc(N/N=N/Cc3ccccn3)cc12,369.170194,6,28,31,0,4,4,1,6,1.558763,2326.0,4.772,78.66,9300.0,inactive
2,CHEMBL306988,CC(=C(C#N)C#N)c1ccc(NC(=O)CCC(=O)O)cc1,283.095691,5,21,21,0,1,1,2,4,2.78202,1108.0,2.31056,113.98,500000.0,inactive
3,CHEMBL66879,O=C(O)/C=C/c1ccc(O)cc1,164.047344,2,12,12,0,1,1,2,2,2.772026,223.0,1.49,57.53,3000000.0,inactive
4,CHEMBL77085,N#CC(C#N)=Cc1cc(O)ccc1[N+](=O)[O-],215.033091,2,16,16,0,1,1,1,5,3.386653,441.0,1.73096,110.95,96000.0,inactive


In [3]:
# Build the modelling frame WITHOUT mutating df_data, so this cell is
# idempotent: an in-place drop would remove one more column on every re-run.
#   * the second-to-last column ('standard_value' in the preview above) is
#     dropped -- presumably the raw measurement the 'class' label was derived
#     from, which would leak the target; TODO confirm against the
#     feature-extraction step.
#   * the first two columns (molecule id and SMILES string) are identifiers,
#     not numeric features, so they are sliced off.
df = df_data.drop(columns=df_data.columns[-2]).iloc[:, 2:]

# Quick sanity checks: missing values and class balance.
print("Null values: " + str(df.isnull().values.any()))
print(df['class'].value_counts())
df.head()

Null values: False
class
active      3674
inactive    3461
Name: count, dtype: int64


Unnamed: 0,Molecular Weight,Number of Rotatable Bonds,Number of Atoms,Number of Bonds,Count of Chiral Centers,Number of Rings,Number of Aromatic Rings,Number of Hydrogen Bond Donors,Number of Hydrogen Bond Acceptors,Balaban J Index,Wiener Index,LogP,TPSA,class
0,383.094916,3,27,30,0,4,3,3,4,1.789174,1832.0,4.45034,82.7,active
1,369.170194,6,28,31,0,4,4,1,6,1.558763,2326.0,4.772,78.66,inactive
2,283.095691,5,21,21,0,1,1,2,4,2.78202,1108.0,2.31056,113.98,inactive
3,164.047344,2,12,12,0,1,1,2,2,2.772026,223.0,1.49,57.53,inactive
4,215.033091,2,16,16,0,1,1,1,5,3.386653,441.0,1.73096,110.95,inactive


In [4]:
# Every column except the last is a feature; the last column ('class') is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [5]:
# Encode the string labels as integers for XGBoost.
# The encoder is fitted on the label column of `df` rather than on `y` itself,
# so re-running this cell never re-encodes already-encoded integers (which
# would overwrite le.classes_ with [0, 1] and lose the original label names).
# LabelEncoder assigns codes in sorted label order: 'active' -> 0, 'inactive' -> 1.
le = LabelEncoder()
y = le.fit_transform(df.iloc[:, -1])

In [6]:
# Two-stage hold-out split with a fixed seed for reproducibility:
#   1) 20 % of all rows are set aside as the test set;
#   2) 12.5 % of the remaining 80 % (= 10 % of all rows) becomes the
#      validation set used for early stopping.
# Net result: roughly 70 % train / 10 % validation / 20 % test.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.125, random_state=42
)

In [7]:
# Evaluation metrics and early stopping are configured on the estimator
# itself: passing `eval_metric` / `early_stopping_rounds` to fit() was
# deprecated in XGBoost 1.6 and removed in 2.0.
model = XGBClassifier(
    eval_metric=["error", "logloss"],
    early_stopping_rounds=10,
)

# Both splits are logged each round (validation_0 = train, validation_1 = val).
# Early stopping watches the LAST metric on the LAST eval_set entry, i.e.
# validation logloss, and stops after 10 rounds without improvement.
eval_set = [(X_train, y_train), (X_val, y_val)]

model.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    verbose=True,
)

[0]	validation_0-error:0.26792	validation_0-logloss:0.62129	validation_1-error:0.25630	validation_1-logloss:0.61704
[1]	validation_0-error:0.26692	validation_0-logloss:0.57888	validation_1-error:0.25210	validation_1-logloss:0.57710
[2]	validation_0-error:0.22467	validation_0-logloss:0.53866	validation_1-error:0.24790	validation_1-logloss:0.54512
[3]	validation_0-error:0.21906	validation_0-logloss:0.51421	validation_1-error:0.23529	validation_1-logloss:0.52444
[4]	validation_0-error:0.20425	validation_0-logloss:0.49444	validation_1-error:0.22129	validation_1-logloss:0.51094
[5]	validation_0-error:0.20064	validation_0-logloss:0.47821	validation_1-error:0.21849	validation_1-logloss:0.50111
[6]	validation_0-error:0.20284	validation_0-logloss:0.46909	validation_1-error:0.21849	validation_1-logloss:0.49591
[7]	validation_0-error:0.18582	validation_0-logloss:0.44747	validation_1-error:0.22409	validation_1-logloss:0.48563
[8]	validation_0-error:0.17942	validation_0-logloss:0.43894	validation_1



[23]	validation_0-error:0.12555	validation_0-logloss:0.33363	validation_1-error:0.18347	validation_1-logloss:0.42535
[24]	validation_0-error:0.12575	validation_0-logloss:0.33172	validation_1-error:0.18487	validation_1-logloss:0.42551
[25]	validation_0-error:0.12255	validation_0-logloss:0.32753	validation_1-error:0.18067	validation_1-logloss:0.42167
[26]	validation_0-error:0.11794	validation_0-logloss:0.32146	validation_1-error:0.18067	validation_1-logloss:0.41996
[27]	validation_0-error:0.11294	validation_0-logloss:0.31364	validation_1-error:0.17647	validation_1-logloss:0.42028
[28]	validation_0-error:0.11013	validation_0-logloss:0.30989	validation_1-error:0.18067	validation_1-logloss:0.41974
[29]	validation_0-error:0.10893	validation_0-logloss:0.30681	validation_1-error:0.18627	validation_1-logloss:0.42038
[30]	validation_0-error:0.10493	validation_0-logloss:0.30169	validation_1-error:0.18768	validation_1-logloss:0.42005
[31]	validation_0-error:0.10433	validation_0-logloss:0.29543	val

In [8]:
# Score the model on the held-out test split, which was not used for fitting
# or early stopping. Rows in the report are the LabelEncoder codes, assigned in
# sorted label order: 0 = 'active', 1 = 'inactive'.
y_pred = model.predict(X_test)
cr = classification_report(y_true=y_test, y_pred=y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.80      0.85      0.82       716
           1       0.84      0.78      0.81       711

    accuracy                           0.82      1427
   macro avg       0.82      0.82      0.82      1427
weighted avg       0.82      0.82      0.82      1427



In [None]:
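# Optional: persist the trained model to disk. To enable, uncomment the two
# lines below AND the `from joblib import dump` import in the first cell.
# saved_model = "XGBClassifierEGFR.joblib"
# dump(model, saved_model)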