In [1]:
from pathlib import Path

import joblib
import lightgbm as lgb
import pandas as pd
from molfeat.trans.fp import FPVecTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the preprocessed compound table from disk and preview its first rows.
df_data = pd.read_csv(filepath_or_buffer='data/preprocessed.csv')
df_data.head(n=5)

Unnamed: 0,molecule_chembl_id,canonical_smiles,Molecular Weight,Number of Rotatable Bonds,Number of Atoms,Number of Bonds,Count of Chiral Centers,Number of Rings,Number of Aromatic Rings,Number of Hydrogen Bond Donors,Number of Hydrogen Bond Acceptors,Balaban J Index,Wiener Index,LogP,TPSA,standard_value,class
0,CHEMBL169028,C/C(=C\CCCC#N)[C@H]1CC[C@]2(C)[C@@H]([C@H](C)C...,415.34503,11,30,31,6,2,0,1,2,2.127864,2622.0,7.62238,61.09,7730.0,inactive
1,CHEMBL422548,CC(C)CCC[C@@H](C)[C@H]1CC[C@H]2[C@@H]3CC(=O)O[...,415.34503,9,30,32,7,3,0,0,3,1.693417,2552.0,7.29708,50.09,4050.0,inactive
2,CHEMBL1094636,NC(=O)c1cccc2cn(-c3ccc([C@@H]4CCCNC4)cc3)nc12,320.163711,3,24,27,1,4,3,2,4,1.67003,1433.0,2.5914,72.94,2000.0,inactive
3,CHEMBL172446,CN(C)CCNC(=O)c1nc(NC(=O)c2nc(NC(=O)c3ccc(N(CCC...,577.208341,13,39,41,0,3,3,3,9,1.628615,6303.0,2.2336,129.42,33700.0,inactive
4,CHEMBL4782235,Nc1cccc(-c2nc3sccn3c2-c2ccnc(NCCCCNS(=O)(=O)c3...,587.1385,10,40,44,0,5,5,3,9,1.190365,6631.0,5.2914,127.3,52430.0,inactive


In [3]:
# Build the modelling frame without mutating `df_data`, so this cell can be
# re-run safely. (An in-place positional drop removes a different column on
# every re-run, and `drop(..., inplace=True)` returns None anyway.)
#
# Dropped columns:
#   - 'molecule_chembl_id', 'canonical_smiles': the two leading identifier/text
#     columns (previously removed via `iloc[:, 2:]`).
#   - 'standard_value': the second-to-last column (previously removed via
#     `columns[-2]`). NOTE(review): presumably the measurement the 'class'
#     label was derived from, so keeping it would leak the target -- confirm.
df = df_data.drop(
    columns=['molecule_chembl_id', 'canonical_smiles', 'standard_value']
)

# Show the class balance, then preview the resulting frame.
print(df['class'].value_counts())
df.head()

class
inactive    1238
active       758
Name: count, dtype: int64


Unnamed: 0,Molecular Weight,Number of Rotatable Bonds,Number of Atoms,Number of Bonds,Count of Chiral Centers,Number of Rings,Number of Aromatic Rings,Number of Hydrogen Bond Donors,Number of Hydrogen Bond Acceptors,Balaban J Index,Wiener Index,LogP,TPSA,class
0,415.34503,11,30,31,6,2,0,1,2,2.127864,2622.0,7.62238,61.09,inactive
1,415.34503,9,30,32,7,3,0,0,3,1.693417,2552.0,7.29708,50.09,inactive
2,320.163711,3,24,27,1,4,3,2,4,1.67003,1433.0,2.5914,72.94,inactive
3,577.208341,13,39,41,0,3,3,3,9,1.628615,6303.0,2.2336,129.42,inactive
4,587.1385,10,40,44,0,5,5,3,9,1.190365,6631.0,5.2914,127.3,inactive


In [4]:
# Separate the predictors from the target column.
y = df.loc[:, 'class']
X = df.drop(columns='class')

In [5]:
# Compute molfeat 'desc2D' descriptors from the SMILES strings and append them
# to the descriptor columns already present in `df`.
smiles = df_data['canonical_smiles']
transformer = FPVecTransformer(kind='desc2D', dtype=float)
features = transformer(smiles)

# Reuse the SMILES index so the column-wise concat below aligns row-for-row.
features_df = pd.DataFrame(features, index=smiles.index)

# Rebuild X from `df` rather than extending the previous X: re-running this
# cell then yields the same matrix instead of appending a second copy of the
# descriptor columns.
X = pd.concat([df.drop('class', axis=1), features_df], axis=1)

# Index misalignment in the concat would show up as extra (NaN-padded) rows.
assert X.shape[0] == df.shape[0], "feature rows are not aligned with df"

  from .autonotebook import tqdm as notebook_tqdm
[17:35:58] Initializing MetalDisconnector
[17:35:58] Running MetalDisconnector
[17:35:58] Initializing Normalizer
[17:35:58] Running Normalizer
[17:35:58] Initializing MetalDisconnector
[17:35:58] Running MetalDisconnector
[17:35:58] Initializing Normalizer
[17:35:58] Running Normalizer
[17:35:58] Initializing MetalDisconnector
[17:35:58] Running MetalDisconnector
[17:35:58] Initializing Normalizer
[17:35:58] Running Normalizer
[17:35:58] Initializing MetalDisconnector
[17:35:58] Running MetalDisconnector
[17:35:58] Initializing Normalizer
[17:35:58] Running Normalizer
[17:35:58] Initializing MetalDisconnector
[17:35:58] Running MetalDisconnector
[17:35:58] Initializing Normalizer
[17:35:58] Running Normalizer
[17:35:58] Initializing MetalDisconnector
[17:35:58] Running MetalDisconnector
[17:35:58] Initializing Normalizer
[17:35:58] Running Normalizer
[17:35:58] Initializing MetalDisconnector
[17:35:58] Running MetalDisconnector
[17:35:

In [6]:
# Encode the string class labels as integers for LightGBM.
# LabelEncoder numbers the classes in sorted order, so with the two labels in
# this dataset 'active' -> 0 and 'inactive' -> 1; 'inactive' is therefore the
# positive class in the LightGBM training log and in predicted probabilities.
le = LabelEncoder()

# Fit on df['class'] rather than on `y` itself: re-running the cell would
# otherwise re-encode the already-encoded integers and replace the original
# label names in `le.classes_` with 0/1.
y = le.fit_transform(df['class'])

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the split sizes and problem dimensions.
print(f"Train size: {X_train.shape[0]}")
print(f"Test size: {X_test.shape[0]}")
print(f"Number of features: {X_train.shape[1]}")
print(f"Number of classes: {len(le.classes_)}")

Train size: 1596
Test size: 400
Number of features: 236
Number of classes: 2


In [8]:
# Wrap both splits as LightGBM datasets. The held-out set references the
# training dataset so it is constructed against the same dataset definition.
train_data = lgb.Dataset(data=X_train, label=y_train)
test_data = lgb.Dataset(
    data=X_test,
    label=y_test,
    reference=train_data,
)

In [9]:
# LightGBM training configuration: binary classification with DART boosting,
# monitored with binary log-loss.
# NOTE(review): 'task' is documented as a CLI-only option and
# 'tree_learner': 'data' targets distributed training -- confirm both are
# intended for this single-process notebook run.
# NOTE(review): 'data_sample_strategy': 'bagging' is set, but no bagging
# fraction/frequency is given here -- confirm row subsampling is intended.
params = dict(
    task='train',
    objective='binary',
    boosting_type='dart',
    data_sample_strategy='bagging',
    tree_learner='data',
    metric='binary_logloss',
    num_leaves=71,
    learning_rate=0.3,
    feature_fraction=0.9,
)

In [10]:
# Train for 400 boosting rounds; the held-out dataset is passed as the
# validation set for the configured metric.
model = lgb.train(
    params,
    train_data,
    num_boost_round=400,
    valid_sets=[test_data],
)

[LightGBM] [Info] Number of positive: 996, number of negative: 600
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005043 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24078
[LightGBM] [Info] Number of data points in the train set: 1596, number of used features: 206
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.624060 -> initscore=0.506818
[LightGBM] [Info] Start training from score 0.506818


In [11]:
# Score the held-out rows, then threshold the scores at 0.5 to get hard
# 0/1 labels (strictly greater than 0.5 maps to 1).
# NOTE(review): no early-stopping callback is passed to lgb.train in this
# notebook, so `best_iteration` is presumably unset here -- confirm that
# predicting with it uses the full model as intended.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)
y_pred_binary = [int(score > 0.5) for score in y_pred]

# Fraction of held-out rows whose thresholded prediction matches the label.
accuracy = accuracy_score(y_test, y_pred_binary)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9125


In [None]:
# Persist the two artifacts produced above: the descriptor transformer state
# (YAML) and the trained LightGBM model (joblib pickle).
# Create the output directory first: joblib.dump opens the target path
# directly and fails if the parent directory is missing (e.g. fresh checkout).
Path("models").mkdir(parents=True, exist_ok=True)

transformer.to_state_yaml_file("models/fpv_desc2d_dict.yml")

# NOTE: joblib files are pickles -- only load this artifact from trusted sources.
# Left as the last expression so the cell echoes the written path(s).
joblib.dump(model, "models/lgb_92_model.pkl")

['MLModels/lgb_92_model.pkl']