In [44]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

import tensorflow as tf
#tf.enable_eager_execution()
from tensorflow import keras



# Widen pandas' console rendering so frames with many columns are not truncated.
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [67]:
# Load the ChEMBL export and discard columns that hold no values at all.
data = pd.read_csv('chembl_24_bbb2.csv').dropna(axis=1, how='all')
data;

In [68]:
# Load the blood-brain-barrier table, keep only the columns used downstream
# and collapse rows that are exact duplicates.
BBB = (
    pd.read_csv('../../DrugDatabasesTools/BBB_Files/BBB2.csv')
    .drop(columns=['compounds', 'BBB_value', 'SMILES'])
    .drop_duplicates()
)
# Boolean Series: True for every canonical_smiles that still occurs more than once.
tt = BBB['canonical_smiles'].value_counts() > 1
# Entries noted as True by the original author:
#Nc1nnc(-c2cccc(Cl)c2Cl)c(N)n1 True
#CC(Cl)(Cl)Cl True
#CN(C)CCCN1c2ccccc2CCc2ccccc21 True
#O=C1CC(=O)N(c2ccccc2)c2cc(Cl)ccc2N1 True

In [69]:
# Inner-join the ChEMBL descriptors with the BBB labels on the SMILES string.
# NOTE(review): a canonical_smiles that occurs several times in BBB yields one
# joined row per occurrence.
full = data.merge(
    BBB,
    how='inner',
    left_on='canonical_smiles',
    right_on='canonical_smiles',
)
full.head();

In [70]:
# Size features: character counts of the SMILES string and of the molecular
# formula, plus the ratio between the two.
full['smiles_length'] = full['canonical_smiles'].map(len)
full['fullmolformula_length'] = full['full_molformula'].map(len)
full['smile_to_formula'] = full['smiles_length'].div(full['fullmolformula_length'])

def molecule_type_to_numeric(x):
    """Return 1 when ``x`` equals 'Small molecule', otherwise 0."""
    return 1 if x == 'Small molecule' else 0
# Swap the textual molecule_type column for its 0/1 encoding.
full['molecule_type_nr'] = full.pop('molecule_type').map(molecule_type_to_numeric)

def y_n_to_nr(x):
    """Encode a yes/no flag: 'Y' -> 1, 'N' -> 0, anything else -> NaN."""
    for flag, code in (('Y', 1), ('N', 0)):
        if x == flag:
            return code
    return np.nan
# Swap the Y/N ro3_pass column for its numeric encoding.
full['ro3_pass_nr'] = full.pop('ro3_pass').map(y_n_to_nr)
def molecular_species_to_nr(x):
    """Encode the molecular species: NEUTRAL -> 1, BASE -> 2, ACID -> 3, anything else -> NaN."""
    for species, code in (('NEUTRAL', 1), ('BASE', 2), ('ACID', 3)):
        if x == species:
            return code
    return np.nan
# Swap the textual molecular_species column for its numeric encoding.
full['molecular_species_nr'] = full.pop('molecular_species').map(molecular_species_to_nr)
def BBB_classification_to_nr(x):
    """Encode the BBB class label: '-' -> 1, '0' -> 0, '+' -> 2, anything else -> NaN."""
    for label, code in (('-', 1), ('0', 0), ('+', 2)):
        if x == label:
            return code
    return np.nan
# Swap the textual BBB_classification column for its numeric class code (the prediction target).
full['BBB_classification_nr'] = full.pop('BBB_classification').map(BBB_classification_to_nr)

# Remove identifiers, free-text fields and structure dumps that are not used as
# model inputs. Each label is listed once; naming a label twice drops the same
# column.
full = full.drop(columns=[
    "molregno",
    "pref_name",
    "chembl_id",
    "chebi_par_id",
    "standard_inchi_key",
    "canonical_smiles",
    "usan_year",
    "structure_type",
    "usan_substem",
    "usan_stem_definition",
    "withdrawn_country",
    "withdrawn_reason",
    "molregno-2",
    "molregno-3",
    "molfile",
    "standard_inchi",
    "Dissociation constant, pKa of the compound (units)",
    "Partition coefficient (logP) (units)",
    "full_molformula",
    "indication_class",
])

In [15]:
# Discard every column that contains at least one missing value, then show,
# per remaining column, whether any NaN is left.
full = full.dropna(axis='columns', how='any')
full.isna().any()  # 178 (figure noted by the original author)

max_phase                False
therapeutic_flag         False
dosed_ingredient         False
oral                     False
parenteral               False
topical                  False
natural_product          False
first_in_class           False
chirality                False
prodrug                  False
inorganic_flag           False
availability_type        False
polymer_flag             False
withdrawn_flag           False
mw_freebase              False
alogp                    False
hba                      False
hbd                      False
psa                      False
rtb                      False
num_ro5_violations       False
acd_logp                 False
acd_logd                 False
full_mwt                 False
aromatic_rings           False
heavy_atoms              False
qed_weighted             False
mw_monoisotopic          False
hba_lipinski             False
hbd_lipinski             False
smiles_length            False
fullmolformula_length    False
smile_to

Category encoding, following [this guide](http://pbpython.com/categorical-encoding.html).  
First approach: convert the column to the pandas `category` dtype and use its integer codes.

In [78]:
# Label-encode usan_stem: cast it to a pandas categorical and keep only the
# integer category codes (pandas assigns -1 to missing values).
full['usan_stem_cat'] = full.pop('usan_stem').astype('category').cat.codes

From the same guide: one-hot (dummy-variable) encoding with `pd.get_dummies`.

In [79]:
# One-hot encode withdrawn_class: get_dummies replaces the column with one
# indicator column per distinct value, named withdrawn_class_<value>.
full=pd.get_dummies(full, columns=["withdrawn_class"])

In [80]:
# Names of all model-input columns: every column except the encoded target.
features = list(full.columns)
del features[features.index('BBB_classification_nr')]


In [81]:
# Separate the encoded BBB class (target) from the feature matrix.
y = full['BBB_classification_nr']
X = full.drop(columns=['BBB_classification_nr'])

In [82]:
from sklearn.model_selection import train_test_split
# Pin the shuffle seed: without random_state the partition, and therefore every
# accuracy figure reported from it, changes on each kernel restart.
X_train, X_test, y_train, y_test=train_test_split(X,y,random_state=42)

In [72]:
# Wrap the complete (unsplit) feature matrix and labels in a tf.data.Dataset
# that yields one (features, label) pair per row.
# NOTE(review): no other cell visible in this notebook reads `training_dataset`.
# NOTE(review): the labels are cast to int32; confirm that y holds no NaN
# (BBB_classification_to_nr returns NaN for unknown labels) before relying on it.
training_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(X[features].values, tf.float32),
            tf.cast(y.values, tf.int32)
        )
    )
)

In [90]:
# Two-layer dense classifier over the tabular features.
# The input width is read from X_train instead of being hard-coded to 47, and
# the first Dense layer takes the 2-D (samples, features) matrix directly, so
# no Flatten layer with an extra trailing axis is required.
model = keras.Sequential([
    keras.layers.Dense(20, activation=tf.nn.relu, input_shape=(X_train.shape[1],)),
    keras.layers.Dense(3, activation=tf.nn.softmax)  # one output per BBB class code (0, 1, 2)
])
# The string 'adam' selects the Adam optimizer in tf.keras on both TF 1.x and
# TF 2.x, whereas tf.train.AdamOptimizer exists only in TF 1.x.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1b82b5dfbe0>

In [91]:
# Score the fitted model on the held-out split. The model was compiled with
# metrics=['accuracy'], so evaluate returns the loss followed by the accuracy.
test_loss, test_acc = model.evaluate(X_test, y_test)

print('Test accuracy:', test_acc)

Test accuracy: 0.3333333366447025


In [None]:
def check_correlation(df,target,corr_cutoff,frac=0.7,random_state=200):
    """Report the columns whose correlation with ``target`` exceeds a cutoff.

    The correlation matrix is computed on a random sample of the rows and the
    columns with ``abs(correlation) > corr_cutoff`` are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the candidate columns and the ``target`` column.
    target : str
        Name of the column the others are compared against.
    corr_cutoff : float
        Threshold applied to the absolute correlation.
    frac : float, default 0.7
        Share of rows sampled before computing the correlations.
    random_state : int, default 200
        Seed for the row sample.

    Returns
    -------
    int
        Number of columns above the cutoff. The target itself (correlation
        1.0 with itself) is part of that count and of the printed list. When
        no more than one column passes, 'None' is printed and 1 is returned.
    """
    data_train = df.sample(frac=frac, random_state=random_state)
    target_corr = data_train.corr()[target]
    # NaN correlations compare as False and are therefore excluded.
    features = target_corr[target_corr.abs() > corr_cutoff]
    if len(features) > 1:
        features_text = features.index.str.cat(sep=', ')+'\n'
        n_features = len(features)
    else:
        features_text = 'None'
        n_features = 1
    print('The features correlated with target above the threshold %s are %s' %(corr_cutoff,features_text))
    return n_features

# Print the columns whose absolute correlation with the encoded BBB class
# exceeds 0.5; the cell output is the count returned by the function.
check_correlation(full,'BBB_classification_nr',0.5)

In [None]:
def xMlr(df,target,frac=0.7,cv=10):
    """Evaluate a linear regression on repeated random hold-out splits.

    Each of the ``cv`` rounds shuffles the rows with its own seed, fits
    ``linear_model.LinearRegression`` on the first ``frac`` share of the
    shuffled rows and scores it on the remaining rows only, so the reported
    numbers are out-of-sample.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the ``target`` column.
    target : str
        Name of the column to predict.
    frac : float, default 0.7
        Share of rows used for fitting in every round.
    cv : int, default 10
        Number of random splits to average over.

    Returns
    -------
    tuple of (float, float)
        Mean squared error and the value of ``regr.score``, each averaged
        over the ``cv`` rounds.

    Raises
    ------
    ValueError
        If ``cv`` is smaller than 1, or if ``frac`` leaves the training part
        or the test part without rows.
    """
    if cv < 1:
        raise ValueError("cv must be at least 1, got %s" % cv)
    n_rows = len(df)
    n_train = int(round(frac * n_rows))
    if not 0 < n_train < n_rows:
        raise ValueError(
            "frac=%s leaves no rows for training or testing (%s rows in total)" % (frac, n_rows)
        )

    # Retained from the earlier implementation so the global NumPy RNG ends up
    # in the same state for callers; the splits below use their own generators.
    np.random.seed(seed=123)

    inputs = df.drop(columns=[target])
    labels = df[target]
    mse = 0
    score = 0
    for round_nr in range(cv):
        # A different seed per round gives a different split; positional
        # indexing keeps the split correct even if df has a non-unique index.
        order = np.random.RandomState(200 + round_nr).permutation(n_rows)
        train_rows, test_rows = order[:n_train], order[n_train:]
        x_test = inputs.iloc[test_rows]
        y_test = labels.iloc[test_rows]

        regr = linear_model.LinearRegression()
        regr.fit(inputs.iloc[train_rows], labels.iloc[train_rows])
        mse += np.mean((regr.predict(x_test) - y_test) ** 2)
        score += regr.score(x_test, y_test)
    return mse/cv, score/cv
# xMlr returns a mean squared error (no square root is taken), so the message
# labels it MSE rather than RMSE.
mlr_mse,mlr_score=xMlr(full,'BBB_classification_nr')
print("MSE is %s. Score is %s." % (mlr_mse, mlr_score))

In [None]:
def xSVR(df,target,frac=0.7,cv=10):
    """Evaluate a support vector regressor on repeated random hold-out splits.

    Each of the ``cv`` rounds shuffles the rows with its own seed, fits
    ``SVR(gamma='scale', C=1.0, epsilon=0.1)`` on the first ``frac`` share of
    the shuffled rows and scores it on the remaining rows only, so the
    reported numbers are out-of-sample.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the ``target`` column.
    target : str
        Name of the column to predict.
    frac : float, default 0.7
        Share of rows used for fitting in every round.
    cv : int, default 10
        Number of random splits to average over.

    Returns
    -------
    tuple of (float, float)
        Mean squared error and the value of ``clf.score``, each averaged over
        the ``cv`` rounds.

    Raises
    ------
    ValueError
        If ``cv`` is smaller than 1, or if ``frac`` leaves the training part
        or the test part without rows.
    """
    if cv < 1:
        raise ValueError("cv must be at least 1, got %s" % cv)
    n_rows = len(df)
    n_train = int(round(frac * n_rows))
    if not 0 < n_train < n_rows:
        raise ValueError(
            "frac=%s leaves no rows for training or testing (%s rows in total)" % (frac, n_rows)
        )

    # Retained from the earlier implementation so the global NumPy RNG ends up
    # in the same state for callers; the splits below use their own generators.
    np.random.seed(seed=123)

    inputs = df.drop(columns=[target])
    labels = df[target]
    mse = 0
    score = 0
    for round_nr in range(cv):
        # A different seed per round gives a different split; positional
        # indexing keeps the split correct even if df has a non-unique index.
        order = np.random.RandomState(200 + round_nr).permutation(n_rows)
        train_rows, test_rows = order[:n_train], order[n_train:]
        x_test = inputs.iloc[test_rows]
        y_test = labels.iloc[test_rows]

        clf = SVR(gamma='scale', C=1.0, epsilon=0.1)
        clf.fit(inputs.iloc[train_rows], labels.iloc[train_rows])
        mse += np.mean((clf.predict(x_test) - y_test) ** 2)
        score += clf.score(x_test, y_test, sample_weight=None)
    return mse/cv, score/cv
# Run the SVR evaluation on the encoded BBB class and print the averaged
# mean squared error and score returned by xSVR.
svr_mse,svr_score=xSVR(full,'BBB_classification_nr')
print("MSE is %s. Score is %s." % (svr_mse, svr_score))