In [None]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn import svm
import tensorflow as tf
#tf.enable_eager_execution()
from tensorflow import keras
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import itertools
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler


pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [None]:
data=pd.read_csv('chembl_24_bbb2.csv')
data.dropna(axis=1,how='all',inplace=True)
data.drop_duplicates(inplace=True)

In [None]:
BBB=pd.read_csv('../../DrugDatabasesTools/BBB_Files/BBB2.csv')
BBB.drop(columns=['compounds','BBB_value','SMILES'],inplace=True)
BBB.drop_duplicates(inplace=True)
tt=BBB.canonical_smiles.value_counts()>1
#Nc1nnc(-c2cccc(Cl)c2Cl)c(N)n1 True
#CC(Cl)(Cl)Cl True
#CN(C)CCCN1c2ccccc2CCc2ccccc21 True
#O=C1CC(=O)N(c2ccccc2)c2cc(Cl)ccc2N1 True

In [None]:
full_1=pd.merge(data, BBB, left_on='canonical_smiles', right_on='canonical_smiles')
full_1

In [None]:
pubchem=pd.read_csv('../../DrugDatabasesTools/PubChem/pubChem_chembl_24_bbb2.csv')
full_2=pd.merge(full_1, pubchem, left_on='canonical_smiles', right_on='original_smiles')
full_2

In [None]:
rdkit=pd.read_csv('../../DrugDatabasesTools/Rdkit/rdkit_chembl_24_bbb2.csv')
rdkit.drop_duplicates(inplace=True)
full=pd.merge(full_2, rdkit, left_on='canonical_smiles', right_on='smiles')
full.head();

In [None]:
full['smiles_length'] = [len(i) for i in full['canonical_smiles']]
full['fullmolformula_length'] = [len(i) for i in full['full_molformula']]
full['smile_to_formula']=full['smiles_length']/full['fullmolformula_length']

def molecule_type_to_numeric(x):
    if x=='Small molecule':
        return 1
    else:
        return 0
full['molecule_type_nr'] = full['molecule_type'].apply(molecule_type_to_numeric)
full.drop(columns='molecule_type',inplace=True)

def y_n_to_nr(x):
    if x=='Y':
        return 1
    elif x=='N':
        return 0
    else:
        return np.nan
full['ro3_pass_nr'] = full['ro3_pass'].apply(y_n_to_nr)
full.drop(columns='ro3_pass',inplace=True)
def molecular_species_to_nr(x):
    if x=='NEUTRAL':
        return 1
    elif x=='BASE':
        return 2
    elif x=='ACID':
        return 3
    else:
        return np.nan
full['molecular_species_nr'] = full['molecular_species'].apply(molecular_species_to_nr)
full.drop(columns='molecular_species',inplace=True)
def BBB_classification_to_nr(x):
    if x=='-':
        return 1
    elif x=='0':
        return 0
    elif x=='+':
        return 2
    else:
        return np.nan
full['BBB_classification_nr'] = full['BBB_classification'].apply(BBB_classification_to_nr)
full.drop(columns='BBB_classification',inplace=True)

full.drop(columns=["IUPACName","InChI","InChIKey","CanonicalSMILES","original_smiles","MolecularFormula","smiles","molregno", "pref_name","chembl_id","chebi_par_id","standard_inchi_key","canonical_smiles",'usan_year',"structure_type",'usan_substem',"usan_stem_definition","withdrawn_country","withdrawn_reason","molregno-2",
                  "molregno-3","molfile","standard_inchi","molfile","standard_inchi","Dissociation constant, pKa of the compound (units)","Partition coefficient (logP) (units)","full_molformula",'indication_class'],inplace=True)

category enconding, provided by [this](http://pbpython.com/categorical-encoding.html)  
cateogry way

In [None]:
full["usan_stem"] = full["usan_stem"].astype('category')
full["usan_stem_cat"] = full["usan_stem"].cat.codes
full.drop(columns='usan_stem',inplace=True)

in the same link, get dummy way

In [None]:
full=pd.get_dummies(full, columns=["withdrawn_class"])

In [209]:
cols = list(full)
nunique = full.apply(pd.Series.nunique)
cols_to_drop = nunique[nunique == 1].index
full.drop(cols_to_drop, axis=1,inplace=True)

scaling features and removing NA

In [210]:
features=full.columns.tolist()
features.remove('BBB_classification_nr')
full[features] = StandardScaler().fit_transform(full[features])

full.dropna(axis=1,how='any',inplace=True)
full.isna().sum()>0#178

max_phase                                                        False
therapeutic_flag                                                 False
dosed_ingredient                                                 False
oral                                                             False
parenteral                                                       False
topical                                                          False
natural_product                                                  False
first_in_class                                                   False
chirality                                                        False
prodrug                                                          False
inorganic_flag                                                   False
availability_type                                                False
withdrawn_flag                                                   False
mw_freebase                                                      False
alogp 

### Deep-learning with tf

In [None]:
X=full.drop(columns='BBB_classification_nr')
y=full['BBB_classification_nr']

In [None]:
X_train, X_test, y_train, y_test=train_test_split(X,y)

In [None]:
training_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(X[features].values, tf.float32),
            tf.cast(y.values, tf.int32)
        )
    )
)

In [None]:
model = keras.Sequential([
    keras.layers.Flatten(input_shape=(47,)),
    keras.layers.Dense(15, activation=tf.nn.relu),
    keras.layers.Dense(3, activation=tf.nn.softmax)
])
model.compile(optimizer=tf.train.AdamOptimizer(), 
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, y_train, epochs=5)

In [None]:
test_loss, test_acc = model.evaluate(X_test, y_test)

print('Test accuracy:', test_acc)

### Machine-learning --scikit-learn

In [211]:
def check_correlation(df,target,corr_cutoff):
    
    data_train=df.sample(frac=0.7,random_state=200)
    data_test=df.drop(data_train.index)

    data_x=df.drop(columns=[target])
    data_y=df[target]

    data_x_train=data_train.drop(columns=[target])
    data_y_train=data_train[target]

    data_x_test=data_test.drop(columns=[target])
    data_y_test=data_test[target]
    
    corr=data_train.corr()
    #fig,ax=plt.subplots(figsize=(8,6))
    #sns.heatmap(corr)
    features=''
    features_text=''
    if len(corr[target].where(lambda x : x.abs()>corr_cutoff).dropna())>1:
        features=corr[target].where(lambda x : x.abs()>corr_cutoff).dropna()
        features_text=features.index.str.cat(sep=', ')+'\n'
    else:
        features='1'
        features_text='None'
    print('The features correlated with target above the threshold %s are %s' %(corr_cutoff,features_text))
    return len(features)

check_correlation(full,'BBB_classification_nr',0.5)

The features correlated with target above the threshold 0.5 are None


1

In [212]:
result={}
from sklearn import utils
def xLinSVC(df,target,cv=10,C=1.0,multi_class ='ovr'):
    i=0
    f1=0
    score=0
    while i<cv:
        X=df.drop(columns=[target])
        y=df[target]
        X_train, X_test, y_train, y_test=train_test_split(X,y)
        utils.multiclass.type_of_target(y_train.astype('int'))
   
        clf = svm.LinearSVC(C=C,multi_class =multi_class )
        clf.fit(X_train, y_train) 
        score+=clf.score(X_test,y_test)
        f1+=f1_score(y_test,clf.predict(X_test),average='macro')
        i+=1
    return score/cv, f1/cv

for C in [5,10,11]:
        svr_score,f1_s=xLinSVC(full,'BBB_classification_nr',C=C)
        print(" Score is %s. and f1 is %s for %s" % ( svr_score, f1_s,C))
result['Linsvc_score']=svr_score
result['Linsvc_f1']=f1_s

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


 Score is 0.5192307692307693. and f1 is 0.35128138600007675 for 5


  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


 Score is 0.573076923076923. and f1 is 0.37639999783399486 for 10


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


 Score is 0.523076923076923. and f1 is 0.3465759953554071 for 11


In [213]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [214]:
def xSVC(df,target,cv=10):
    i=0
    f1=0
    score=0
    while i<cv:
        X=df.drop(columns=[target])
        y=df[target]
        X_train, X_test, y_train, y_test=train_test_split(X,y)
   
        clf= svm.SVC(gamma='scale', decision_function_shape='ovo')

        clf.fit(X_train, y_train) 
        score+=clf.score(X_test,y_test)
        f1+=f1_score(y_test,clf.predict(X_test),average='macro')
        i+=1
        cnf_matrix=confusion_matrix(y_test,clf.predict(X_test))
    return score/cv, f1/cv

svr_score,f1_s=xSVC(full,'BBB_classification_nr')
print(" Score is %s. and f1 is %s for %s" % ( svr_score, f1_s,C))
result['svc_score']=svr_score
result['svc_f1']=f1_s

 Score is 0.6807692307692308. and f1 is 0.46060779199099383 for 11


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [215]:
def xLGR(df,target,cv=10):
    i=0
    f1=0
    score=0
    while i<cv:
        class_names = ['-','0','+']
        X=df.drop(columns=[target])
        y=df[target]
        X_train, X_test, y_train, y_test=train_test_split(X,y)
   
        lgr= LogisticRegression()

        lgr.fit(X_train, y_train) 
        score+=lgr.score(X_test,y_test)
        f1+=f1_score(y_test,lgr.predict(X_test),average='macro')
        i+=1
        cnf_matrix=confusion_matrix(y_test,lgr.predict(X_test))
        #plt.figure()
        #plot_confusion_matrix(cnf_matrix,classes=class_names)
    return score/cv, f1/cv

svr_score,f1_s=xLGR(full,'BBB_classification_nr')
print(" Score is %s. and f1 is %s" % ( svr_score, f1_s))
result["LogR_score"]=svr_score
result["LogR_f1"]=f1_s

 Score is 0.5884615384615385. and f1 is 0.3916921551620772


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [216]:
pd.DataFrame.from_dict(data=result,orient='index',columns=['result'])

Unnamed: 0,result
Linsvc_score,0.523077
Linsvc_f1,0.346576
svc_score,0.680769
svc_f1,0.460608
LogR_score,0.588462
LogR_f1,0.391692
