In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load the pre-computed EGFR descriptor table and preview its first rows.
df_data = pd.read_csv(filepath_or_buffer='EGFR_Feature_Extraction.csv')
df_data.head(n=5)

Unnamed: 0,molecule_chembl_id,canonical_smiles,Molecular Weight,Number of Rotatable Bonds,Number of Atoms,Number of Bonds,Count of Chiral Centers,Number of Rings,Number of Aromatic Rings,Number of Hydrogen Bond Donors,Number of Hydrogen Bond Acceptors,Balaban J Index,Wiener Index,LogP,TPSA,standard_value,class
0,CHEMBL68920,Cc1cc(C)c(/C=C2\C(=O)Nc3ncnc(Nc4ccc(F)c(Cl)c4)...,383.094916,3,27,30,0,4,3,3,4,1.789174,1832.0,4.45034,82.7,41.0,active
1,CHEMBL137635,CN(c1ccccc1)c1ncnc2ccc(N/N=N/Cc3ccccn3)cc12,369.170194,6,28,31,0,4,4,1,6,1.558763,2326.0,4.772,78.66,9300.0,inactive
2,CHEMBL306988,CC(=C(C#N)C#N)c1ccc(NC(=O)CCC(=O)O)cc1,283.095691,5,21,21,0,1,1,2,4,2.78202,1108.0,2.31056,113.98,500000.0,inactive
3,CHEMBL66879,O=C(O)/C=C/c1ccc(O)cc1,164.047344,2,12,12,0,1,1,2,2,2.772026,223.0,1.49,57.53,3000000.0,inactive
4,CHEMBL77085,N#CC(C#N)=Cc1cc(O)ccc1[N+](=O)[O-],215.033091,2,16,16,0,1,1,1,5,3.386653,441.0,1.73096,110.95,96000.0,inactive


In [3]:
# Build the modelling table: numeric descriptors plus the 'class' label.
#
# The columns are removed by name and `df_data` is left untouched, so this
# cell gives the same result every time it is run. The previous version
# dropped `columns[-2]` with inplace=True, which (a) bound `df` to None and
# (b) removed a *different* column from `df_data` on every re-run.
#
# - 'molecule_chembl_id' / 'canonical_smiles': identifier / structure strings,
#   not numeric features (they are the first two columns in the preview above).
# - 'standard_value': the measured value sitting next to 'class'; presumably
#   the label is derived from it, so keeping it would leak the target --
#   TODO confirm against the feature-extraction step.
df = df_data.drop(
    columns=['molecule_chembl_id', 'canonical_smiles', 'standard_value']
)

# Class balance check, then a preview of the resulting feature table.
print(df['class'].value_counts())
df.head()

class
active      3810
inactive    3524
Name: count, dtype: int64


Unnamed: 0,Molecular Weight,Number of Rotatable Bonds,Number of Atoms,Number of Bonds,Count of Chiral Centers,Number of Rings,Number of Aromatic Rings,Number of Hydrogen Bond Donors,Number of Hydrogen Bond Acceptors,Balaban J Index,Wiener Index,LogP,TPSA,class
0,383.094916,3,27,30,0,4,3,3,4,1.789174,1832.0,4.45034,82.7,active
1,369.170194,6,28,31,0,4,4,1,6,1.558763,2326.0,4.772,78.66,inactive
2,283.095691,5,21,21,0,1,1,2,4,2.78202,1108.0,2.31056,113.98,inactive
3,164.047344,2,12,12,0,1,1,2,2,2.772026,223.0,1.49,57.53,inactive
4,215.033091,2,16,16,0,1,1,1,5,3.386653,441.0,1.73096,110.95,inactive


In [4]:
# Separate the target column from the descriptor columns used as features.
y = df['class']
X = df.drop(columns=['class'])

In [5]:
# Encode the string class labels as integers.
#
# LabelEncoder numbers the labels in sorted order, so with the two labels
# present here 'active' -> 0 and 'inactive' -> 1. The class the binary model
# treats as "positive" (1) is therefore 'inactive'; use
# le.inverse_transform(...) to map predictions back to label names.
#
# The encoder is fitted on df['class'] rather than on `y` itself so that
# re-running this cell does not re-encode already-encoded integers (which
# would replace the label names stored in le.classes_ with 0/1).
le = LabelEncoder()
y = le.fit_transform(df['class'])

In [6]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Summarise the split: row counts, feature count and number of label classes.
print(
    "\n".join(
        caption + str(count)
        for caption, count in (
            ("Train size: ", X_train.shape[0]),
            ("Test size: ", X_test.shape[0]),
            ("Number of features: ", X_train.shape[1]),
            ("Number of classes: ", len(le.classes_)),
        )
    )
)

Train size: 5867
Test size: 1467
Number of features: 13
Number of classes: 2


In [7]:
# Wrap both splits in LightGBM Dataset objects. The evaluation set is created
# with reference=train_data so it is tied to the training Dataset.
train_data = lgb.Dataset(data=X_train, label=y_train)
test_data = lgb.Dataset(data=X_test, label=y_test, reference=train_data)

In [8]:
# LightGBM training configuration for the two-class problem.
params = dict(
    objective='binary',           # binary classification
    metric='binary_logloss',      # evaluation metric computed on validation sets
    boosting_type='dart',         # DART boosting instead of the default GBDT
    num_leaves=100,               # upper bound on leaves per tree
    learning_rate=0.3,            # shrinkage applied to each boosting step
    feature_fraction=0.9,         # fraction of features sampled for each tree
)

In [11]:
# Fit the booster for a fixed 500 rounds. The held-out split is passed as the
# validation set; no callbacks (e.g. early stopping) are supplied here.
model = lgb.train(
    params,
    train_data,
    num_boost_round=500,
    valid_sets=[test_data],
)

[LightGBM] [Info] Number of positive: 2821, number of negative: 3046
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000534 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1468
[LightGBM] [Info] Number of data points in the train set: 5867, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.480825 -> initscore=-0.076738
[LightGBM] [Info] Start training from score -0.076738


In [12]:
# Score the held-out split.
#
# No `num_iteration` is passed: training above uses no early stopping, so
# `model.best_iteration` carries no information, and `predict` already falls
# back to the best iteration on its own whenever one has been recorded.
# For the binary objective the prediction is the probability of the class
# encoded as 1.
y_pred = model.predict(X_test)

# Vectorised 0.5 cut-off (strictly greater than, as before); the result is an
# integer NumPy array rather than a Python list.
y_pred_binary = (y_pred > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred_binary)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8302658486707567
