# BSEP

The BSEP dataset represents the inhibition of the respective membrane transporters BSEP.

reference: https://jcheminf.biomedcentral.com/articles/10.1186/s13321-015-0083-5

endpoint values are bile salt efflux pump inhibition at 100 μM

## Generate rdkit continuous descriptors, splitting dataset, and descriptor preprocessing 

In [1]:
from rdkit import Chem
import pandas as pd
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
import os
currentDirectory = os.getcwd()
d = os.path.join(currentDirectory, "Datasets","BSEP_0801.csv")
dataset = pd.read_csv(d, index_col = 0)

molecules = [Chem.MolFromSmiles(mol) for mol in dataset.SMILES]

calculator = MoleculeDescriptors.MolecularDescriptorCalculator([desc[0] for desc in Descriptors.descList])
X = pd.DataFrame([list(calculator.CalcDescriptors(mol)) for mol in molecules],
                     index=dataset.index,
                     columns=list(calculator.GetDescriptorNames()))

train_set_X, test_set_X = train_test_split(X, test_size=0.2, random_state=42)
train_set_y = dataset.loc[train_set_X.index]['endpoint'].values
test_set_y = dataset.loc[test_set_X.index]['endpoint'].values

In [2]:
from sklearn import pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
pipeline = pipeline.Pipeline([
        ('scaling', MinMaxScaler()),
        ('std_scaler', StandardScaler()),
    ])
train_X_prepared = pipeline.fit_transform(train_set_X)
test_X_prepared = pipeline.transform(test_set_X)

# Random Forest

In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [5, 10, 20, 30, 40, 50],
    'max_features': ['auto'],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [2],
    'n_estimators': [5, 100, 150, 200, 250, 300, 1000],
    'class_weight':['balanced']
}
# Create a based model
rfc = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rfc, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2)

In [4]:
# Fit the grid search to the data
grid_search.fit(train_X_prepared, train_set_y)
grid_search.best_params_

Fitting 5 folds for each of 210 candidates, totalling 1050 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   29.6s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   55.5s
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1050 out of 1050 | elapsed:  1.6min finished


{'bootstrap': True,
 'class_weight': 'balanced',
 'max_depth': 30,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'n_estimators': 100}

In [5]:
from sklearn.metrics import confusion_matrix

def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    tn, fp, fn, tp = confusion_matrix(test_labels, predictions).ravel()
    ppv = tp / (tp + fp)
    specificity = tn / (tn + fp)
    sensitivity = tp / (tp + fn)
    ccr = (specificity + sensitivity) / 2
    
    print('Model Performance')
    print('ppv: {:0.2f}.'.format(ppv))
    print('specificity: {:0.2f}.'.format(specificity))
    print('sensitivity: {:0.2f}.'.format(sensitivity))
    print('ccr: {:0.2f}.'.format(ccr))
    
    return ccr

In [6]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, test_X_prepared, test_set_y)

Model Performance
ppv: 0.81.
specificity: 0.85.
sensitivity: 0.81.
ccr: 0.83.


In [7]:
best_grid = grid_search.best_estimator_
from sklearn.externals import joblib
joblib.dump(best_grid, "BSEP_rf_model_0806.pkl")
#my_model_loaded = joblib.load("my_model.pkl") 



['BSEP_rf_model_0806.pkl']

# SVM

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
# Create the parameter grid based on the results of random search 
param_grid = {
    'probability':[True],
    'class_weight':['balanced'],
    'kernel': ['rbf'],
    'gamma': [1e-2, 1e-3],
    'C': [1,10]}
# Create a based model
svm = SVC()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = svm, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2)

In [9]:
# Fit the grid search to the data
grid_search.fit(train_X_prepared, train_set_y)
grid_search.best_params_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 out of  20 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.6s finished


{'C': 1,
 'class_weight': 'balanced',
 'gamma': 0.01,
 'kernel': 'rbf',
 'probability': True}

In [10]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, test_X_prepared, test_set_y)

Model Performance
ppv: 0.81.
specificity: 0.86.
sensitivity: 0.75.
ccr: 0.81.


In [11]:
best_grid = grid_search.best_estimator_
from sklearn.externals import joblib
joblib.dump(best_grid, "BSEP_svm_model_0806.pkl")

['BSEP_svm_model_0806.pkl']