# Dependencies

In [22]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import pickle

# Functions

In [23]:
# Function to find drugs in DrugBank that are commercially available

def filt_MCE(db_set, MCE_set, max_name_len=40, verbose=True):
    """Match DrugBank entries against a vendor catalogue by name.

    For every row of ``db_set`` whose ``'name'`` is shorter than
    ``max_name_len`` characters, catalogue rows are searched for that name as
    a case-sensitive literal substring: first in ``'Product Name'`` and, only
    if nothing is found there, in ``'Synonyms'``. Among the catalogue rows
    that match, the one(s) with the shortest ``'Product Name'`` are kept.

    Parameters
    ----------
    db_set : pandas.DataFrame
        Must contain a string column ``'name'``.
    MCE_set : pandas.DataFrame
        Must contain string columns ``'Product Name'`` and ``'Synonyms'``.
    max_name_len : int, default 40
        Names with ``len(name) >= max_name_len`` are skipped and reported
        with an ``error:`` line.
    verbose : bool, default True
        When true, print the set sizes, per-row progress, skipped names and
        the final number of matched DrugBank rows.

    Returns
    -------
    (db_filt, MCE_filt) : tuple of pandas.DataFrame
        ``db_filt`` holds the matched rows of ``db_set`` and ``MCE_filt`` the
        selected catalogue rows, both taken after ``reset_index()`` (so the
        previous index is present as a column). If several catalogue rows
        share the shortest product name, all of them are included, so the
        two frames are not guaranteed to have the same length.
    """
    if verbose:
        print("Size of DrugBank set:", db_set.shape[0])
        print("Size of MCE set:", MCE_set.shape[0])

    # After reset_index the labels are 0..n-1, i.e. identical to positions.
    db_set = db_set.reset_index()
    MCE_set = MCE_set.reset_index()
    n_total = db_set.shape[0]

    product_names = MCE_set['Product Name']
    synonyms = MCE_set['Synonyms']

    # Collect row positions and slice once at the end; DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic.
    db_hits = []
    mce_hits = []

    for position, name in enumerate(db_set['name']):
        if verbose:
            print(position + 1, 'out of', n_total)

        if len(name) >= max_name_len:
            if verbose:
                print("error: ", name)
            continue

        # regex=False keeps this a literal substring test (names contain
        # characters such as '(' and '[' that are regex metacharacters).
        mask = product_names.str.contains(name, regex=False, na=False)
        if not mask.any():
            mask = synonyms.str.contains(name, regex=False, na=False)
        if not mask.any():
            continue

        candidates = product_names[mask]
        shortest = min(candidates, key=len)
        is_shortest = (candidates == shortest).to_numpy()
        mce_hits.extend(candidates.index[is_shortest])
        db_hits.append(position)

    if verbose:
        print(len(db_hits))
    return db_set.iloc[db_hits], MCE_set.iloc[mce_hits]

# Import data

In [40]:
# Read the drugs of the first (preliminary) set from CSV; the 'drugbank_id'
# column of this table is used later to exclude already-tested compounds.
prelim_set = pd.read_csv(filepath_or_buffer='Data_files/MCE_all_drugs_1.csv')

In [47]:
# Load preliminary model
# NOTE(security): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
def _load_pickle(path):
    """Return the object pickled at `path`, closing the file handle afterwards."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


clf = _load_pickle("Data_files/prelim_best_model.p")        # fitted model; .predict() is used below
descs = _load_pickle("Data_files/prelim_best_features.p")   # column selection passed to clf.predict
scaler = _load_pickle("Data_files/prelim_scaler.p")         # scaler; .transform() is used below
X = _load_pickle("Data_files/prelim_X.p")                   # its .columns name the columns to scale
data = pd.read_csv('Data_files/RG_for_ML.csv')              # NOTE(review): not used in the cells shown here

# generated AlvaDesc descriptors from molecules in DrugBank that are hydrophobic, ionizable,
# and have physiological charge greater than or equal to 0
db_molecules = pd.read_csv('Data_files/descriptors_db_filt.csv',low_memory=False)

# keep_default_na=False reads empty cells as '' instead of NaN, so the
# substring tests on 'Product Name' / 'Synonyms' always see strings
MCE_plus = pd.read_csv('Data_files/MCE_screening_plus.csv',keep_default_na=False)

# Pre-processing

In [48]:
# Remove descriptor rows that contain NA values or where an error occurred
# (rows are kept only when the ERROR column equals '-').
# The explicit .copy() makes the filtered frame independent of the original,
# so the column assignment below does not raise SettingWithCopyWarning.
db_molecules = db_molecules.dropna()
db_molecules = db_molecules[db_molecules['ERROR']=='-'].copy()

# Transform descriptors with the scaler used in the model fit.
# NOTE: this overwrites the descriptor columns of db_molecules, so re-running
# this cell without re-loading db_molecules would apply the scaling twice.
db_molecules[X.columns] = scaler.transform(db_molecules[X.columns])

# Searching for purchasable compounds from predicted siRNA complexing drugs

In [49]:
# Report how many drugs in the filtered DrugBank set the preliminary model
# predicts to complex siRNA, next to the total number of drugs in the set.
n_predicted = sum(clf.predict(db_molecules[descs]))
n_in_set = len(db_molecules)
print('Number predicted to complex siRNA: ', n_predicted)
print('Total in set: ', n_in_set)

Number predicted to complex siRNA:  329
Total in set:  1570


In [52]:
# Build a compact table of the DrugBank molecules (name, SMILES, DrugBank ID)
# together with the preliminary model's siRNA-complexation prediction.
db_names = (
    db_molecules[['NAME', 'SMILES', 'drugbank_id']]
    .rename(columns={'NAME': 'Name', 'drugbank_id': 'DrugBank ID'})
    .assign(Prediction=clf.predict(db_molecules[descs]))
)
db_names

Unnamed: 0,Name,SMILES,DrugBank ID,Prediction
0,INDECAINIDE,CC(C)NCCCC1(C(N)=O)C2=CC=CC=C2C3=CC=CC=C13,DB00192,False
1,reserpine,C12CC(OC(=O)C3=CC(OC)=C(OC)C(OC)=C3)C(OC)C(C(=...,DB00206,False
2,Azithromycin,CCC1OC(=O)C(C)C(OC2CC(C)(OC)C(O)C(C)O2)C(C)C(O...,DB00207,False
3,Ticlopidine,ClC1=CC=CC=C1CN2CCC3=C(C2)C=CS3,DB00208,False
4,Citalopram,CN(C)CCCC1(OCC2=C1C=CC(=C2)C#N)C3=CC=C(F)C=C3,DB00215,False
...,...,...,...,...
1581,BP-14979,COCCC(=O)NC1CCC(CCN2CCN(CC2)C3=CC=CC(=C3)C#N)CC1,DB16302,False
1582,Revaprazan,CC1N(CCC2=C1C=CC=C2)C3=NC(NC4=CC=C(F)C=C4)=NC(...,DB16308,False
1583,Vicagrel,COC(=O)C(N1CCC2=C(C1)C=C(OC(C)=O)S2)C3=C(Cl)C=...,DB16349,False
1584,Volinanserin,COC1=C(OC)C(=CC=C1)C(O)C2CCN(CCC3=CC=C(F)C=C3)CC2,DB16351,False


In [63]:
# Make list of DrugBank compounds that are predicted to complex siRNA and
# were not in the preliminary set.
# Both conditions are combined into one boolean mask and applied with a single
# .loc, instead of chained indexing (df[a][b]) followed by assignment on the
# slice, which raises SettingWithCopyWarning.
predicted_mask = db_names['Prediction'].astype(bool)
not_in_prelim_mask = ~db_names['DrugBank ID'].isin(prelim_set['drugbank_id'])

# filt_MCE looks up the lower-case 'name' column, so duplicate 'Name' under that key.
db_new = db_names.loc[predicted_mask & not_in_prelim_mask].assign(name=lambda frame: frame['Name'])
db_new

In [65]:
# Look up the predicted siRNA-complexing DrugBank drugs in the MedChemExpress
# screening library; db_out holds the matched DrugBank rows and MCE_out the
# corresponding catalogue rows.
db_out, MCE_out = filt_MCE(db_new, MCE_plus)

Size of DrugBank set: 290
Size of MCE set: 14614
1 out of 290
2 out of 290
3 out of 290
4 out of 290
5 out of 290
6 out of 290
7 out of 290
8 out of 290
9 out of 290
10 out of 290
11 out of 290
12 out of 290
13 out of 290
14 out of 290
15 out of 290
error:  2-{2-hydroxy-[1,1'-biphenyl]-3-yl}-1H-1,3-benzodiazole-5-carboximidamide
16 out of 290
error:  1,2-dihexadecanoyl-sn-glycero-3-phosphoethanolamine
17 out of 290
18 out of 290
error:  4-[5-[2-(1-Phenyl-Ethylamino)-Pyrimidin-4-Yl]-1-Methyl-4-(3-Trifluoromethylphenyl)-1h-Imidazol-2-Yl]-Piperidine
19 out of 290
20 out of 290
error:  1-(2,6-Dichlorophenyl)-5-(2,4-Difluorophenyl)-7-Piperidin-4-Yl-3,4-Dihydroquinolin-2(1h)-One
21 out of 290
error:  5-[4-TERT-BUTYLPHENYLSULFANYL]-2,4-QUINAZOLINEDIAMINE
22 out of 290
error:  DI-STEAROYL-3-SN-PHOSPHATIDYLETHANOLAMINE
23 out of 290
24 out of 290
25 out of 290
26 out of 290
27 out of 290
error:  3,8-Diamino-6-Phenyl-5-[6-[1-[2-[(1,2,3,4-Tetrahydro-9-Acridinyl)Amino]Ethyl]-1h-1,2,3-Triazol-4-Yl]

147 out of 290
error:  3-FLUORO-5-MORPHOLIN-4-YL-N-[3-(2-PYRIDIN-4-YLETHYL)-1H-INDOL-5-YL]BENZAMIDE
148 out of 290
149 out of 290
error:  N-{4-METHYL-3-[(3-PYRIMIDIN-4-YLPYRIDIN-2-YL)AMINO]PHENYL}-3-(TRIFLUOROMETHYL)BENZAMIDE
150 out of 290
error:  2-({4-[4-(pyridin-4-ylmethyl)-1H-pyrazol-3-yl]phenoxy}methyl)quinoline
151 out of 290
error:  2-{[4-(4-pyridin-4-yl-1H-pyrazol-3-yl)phenoxy]methyl}quinoline
152 out of 290
153 out of 290
error:  6,7-DIMETHOXY-4-[(3R)-3-(2-NAPHTHYLOXY)PYRROLIDIN-1-YL]QUINAZOLINE
154 out of 290
155 out of 290
156 out of 290
157 out of 290
error:  6-amino-2-[(1-naphthylmethyl)amino]-3,7-dihydro-8H-imidazo[4,5-g]quinazolin-8-one
158 out of 290
error:  4-[4-(4-Fluorophenyl)-2-[4-[(R)-methylsulfinyl]phenyl]-1H-imidazol-5-yl]pyridine
159 out of 290
error:  1-(3-bromophenyl)-7-chloro-6-methoxy-3,4-dihydroisoquinoline
160 out of 290
error:  (2E)-N-{4-[(3-bromophenyl)amino]quinazolin-6-yl}-4-(dimethylamino)but-2-enamide
161 out of 290
162 out of 290
error:  3-FLUORO-5

# Output

In [69]:
# Save the matched MedChemExpress catalogue rows.
# 83 of these drugs were purchased from MedChemExpress for screening.
output_csv = 'Data_files/MCE_compounds_new_predictions.csv'
MCE_out.to_csv(output_csv)