# This Python script generates predictions, for new molecules, of the chance of RNA versus protein binding.

## Software License:

MIT License

Copyright (c) 2022 Yazdani et al. 

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

## The following script was written in a Jupyter notebook on the Anaconda platform, using Python version 3.8.2.
$\;\;\;\;\;\;$



![Python logo](https://upload.wikimedia.org/wikipedia/commons/thumb/c/c3/Python-logo-notext.svg/300px-Python-logo-notext.svg.png)

$\;\;\;\;\;\;$

In [229]:
# Report the version of the running Python interpreter
from platform import python_version
print("Python version:", python_version(), sep="\n")

Python version:
3.8.2


## macOS Catalina version 10.15.7 was used when running this code on the Anaconda platform. 

## This code was written with the following package versions. Please install these exact versions to ensure consistent results.

### pandas --> version 1.4.2

In [230]:
# Report the installed pandas version
import pandas as pd
print(f"pandas version:\n{pd.__version__}")

pandas version:
1.4.2


### numpy --> version 1.20.3

In [231]:
# Report the installed numpy version
import numpy as np
print(f"numpy version:\n{np.__version__}")

numpy version:
1.20.3


### tensorflow --> version 2.3.1

In [232]:
# Report the installed tensorflow version
import tensorflow as tf
print(f"tensorflow version:\n{tf.__version__}")

tensorflow version:
2.3.1


### keras --> version 2.4.0


In [233]:
# Report the version of the Keras API bundled with tensorflow
# (relies on `tf` having been imported in an earlier cell)
print(f"keras version:\n{tf.keras.__version__}")

keras version:
2.4.0


In [234]:
# Remove warnings
# TF_CPP_MIN_LOG_LEVEL controls TensorFlow's C++ log output; '3' is the most restrictive setting.
# It has to be in the environment before TensorFlow is imported for the first time in the
# process to affect import-time messages, so it is set ahead of the import below.
# NOTE: if TensorFlow was already imported earlier in this session (e.g. by a version-check
# cell), restart the kernel and run this cell first for the setting to take full effect.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf

In [235]:
# Import the required packages for data processing and running the MLP model.
# This cell also loads the training-set artefacts used by the later cells:
# the per-feature medians (`median_values_dict`), the ordered list of feature
# names (`features`) and the saved Keras model (`reconstructed_model`).
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
import pickle

# Derive the base folder from the current working directory by cutting off its last 10 characters
# (`os` is expected to be imported already by an earlier cell).
# NOTE(review): presumably this strips a 10-character sub-folder suffix (the folder holding this
# notebook) to reach the project root - confirm; the result is wrong if the notebook is started
# from a directory whose trailing component has a different length.
folder_dir = os.getcwd()[:-10]

# Load medians of features from the training set to fill in missing values of the new compound set
df_medians = pd.read_csv(folder_dir + "/ML_CSVs/df_all_3_feature_medians.csv")

# Map each feature name (column "feature") to its median (column "median_value") as a dictionary
median_values_dict = dict(zip(df_medians.feature, df_medians.median_value))

# Get the feature names of the training set, in file order, for use later
features = df_medians["feature"].tolist()

# Load in the saved Keras model
reconstructed_model = keras.models.load_model(folder_dir + "/MLP_saved_model")

# Signal that all artefacts above were loaded without raising
print("Processes complete!")

Processes complete!


In [238]:
# Preparation of the new compounds array to be run in the MLP model

# Location of the CSV file holding the Mordred features of the new compound set.
# NOTE: this is an absolute path on the original author's machine - point it at your own
# Mordred output before running. For example, to use the test compounds referenced by the
# original notebook:
# new_compounds_csv = folder_dir + "/data/Mordred_files/Mordred_Test_Compounds_3D.csv"
new_compounds_csv = "/Users/yazdanik2/Desktop/Mordred_FDA_Approved_3D.csv"

# Read in Mordred features of the new compound set
df_Mordred_extra = pd.read_csv(new_compounds_csv)

# Take the compound names as a list as they will be used later
Mordred_extra_names = df_Mordred_extra["name"].tolist()

# Remove the first column (dropped by position; treated as the name column)
df_Mordred_extra_no_name = df_Mordred_extra.iloc[:, 1:]

# Keep exactly the training-set features, in the training-set order.
# reindex drops every column that is not a training feature, guarantees the column order
# matches `features` (the scaler is applied to the raw value matrix later, so order matters),
# and creates an all-NaN column for any training feature missing from the new file so that
# it is median-filled below instead of raising a KeyError.
df_Mordred_extra_no_name = df_Mordred_extra_no_name.reindex(columns=features)

# Replace infinity in the feature values by nan values
# (non-inplace, so a slice of the original frame is never modified in place)
df_Mordred_extra_no_name = df_Mordred_extra_no_name.replace([np.inf, -np.inf], np.nan)

# Fill nan values of the test matrix with the training-set median of the matching column
# (vectorised per column instead of a Python loop over every cell)
df_Mordred_extra_no_name = df_Mordred_extra_no_name.fillna(value=median_values_dict)

# Display the prepared feature table
df_Mordred_extra_no_name

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,21.867940,16.331012,0,0,37.431719,2.436602,4.800101,37.431719,1.336847,4.266345,...,10.162500,77.168493,386.120132,8.393916,2325,42,146,171,7.388889,6.277778
1,20.769965,17.451224,0,0,34.008889,2.508394,4.961642,34.008889,1.259588,4.218296,...,10.241352,76.164837,456.000008,10.857143,1808,44,140,166,9.861111,6.027778
2,31.336126,22.837287,0,2,52.636761,2.513332,4.919404,52.636761,1.315919,4.619682,...,10.601846,91.481771,540.260531,7.300818,5935,66,212,251,12.555556,8.805556
3,21.041115,17.918386,0,0,34.697216,2.325186,4.650372,34.697216,1.239186,4.221055,...,9.890352,62.447305,384.196886,7.248998,2118,38,134,149,10.388889,6.333333
4,27.662693,20.272208,0,2,46.602898,2.480060,4.960120,46.602898,1.294525,4.503189,...,10.500372,72.593450,530.172022,8.032909,4488,60,186,219,11.833333,8.166667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2323,11.738453,11.251707,0,0,17.008240,2.222634,4.445268,17.008240,1.063015,3.629248,...,9.245804,47.426776,278.013009,9.267100,510,16,74,75,9.347222,3.361111
2324,15.475469,12.527073,0,1,27.290379,2.358294,4.716589,27.290379,1.364519,3.922549,...,9.679406,53.145804,278.284227,4.969361,802,27,100,114,4.444444,4.555556
2325,19.327242,15.916846,1,0,32.135027,2.487998,4.917628,32.135027,1.285401,4.141513,...,10.068366,73.229474,339.109878,8.477747,1591,37,128,149,8.138889,5.555556
2326,23.500933,18.102267,1,0,39.796535,2.414591,4.731796,39.796535,1.283759,4.341191,...,10.129308,80.252042,438.174453,7.426686,3316,45,154,177,10.250000,7.111111


In [239]:
# Run the model on the new compound dataset

# Import the StandardScaler class (presumably the class of the pickled scaler loaded below)
from sklearn.preprocessing import StandardScaler

# Load in the standardization used in standardizing the training set.
# The with-block closes the file handle as soon as the scaler has been read.
# NOTE: unpickling can execute arbitrary code - only load a scaler.pkl from a trusted source.
with open(folder_dir + "/ML_CSVs/scaler.pkl", 'rb') as scaler_file:
    sc = pickle.load(scaler_file)

# Standardize the new compound dataset using the standardization parameters derived from the training set
X_extra_test = df_Mordred_extra_no_name.values
X_extra_test = sc.transform(X_extra_test)

# Use the MLP model to predict the chance of RNA binding over protein binding
# (the first element of each prediction row is taken as the probability)
prediction = [item[0] for item in reconstructed_model.predict(X_extra_test).tolist()]

# Print compound name followed by the probability predicted by model, as a percentage
# rounded to one decimal place
for compound_name, probability in zip(Mordred_extra_names, prediction):
    print(compound_name)
    print(round(probability*100, 1))
    print()

axitinib
0.0

azd6244 (selumetinib)
30.1

nintedanib (bibf 1120)
0.3

bortezomib (velcade)
0.0

bosutinib (ski-606)
0.7

dasatinib
0.0

deforolimus (mk-8669)
0.0

gefitinib (zd1839)
0.6

imatinib mesylate (sti571)
0.0

lapatinib (gw-572016) ditosylate
94.8

lenalidomide (cc-5013)
99.5

panobinostat (lbh589)
94.6

nilotinib (amn-107)
0.0

pazopanib hcl
0.0

rapamycin (sirolimus)
0.0

sorafenib tosylate
1.4

sunitinib malate
5.8

vorinostat
2.5

entinostat
7.0

enzastaurin (ly317615)
0.0

olaparib
0.7

masitinib (ab1010)
0.1

vismodegib (gdc-0449)
0.1

belinostat (pxd101)
0.0

rucaparib
0.0

everolimus (rad001)
0.0

cabozantinib
0.1

malotilate
92.3

ivacaftor (vx-770)
43.9

docetaxel (taxotere)
0.0

paclitaxel (taxol)
0.0

regorafenib (bay 73-4506)
0.0

danoprevir (itmn-191)
0.0

ritonavir
0.0

anastrozole
3.5

aprepitant (mk-0869)
0.0

bicalutamide (casodex)
0.0

fulvestrant (faslodex)
0.0

raltitrexed (tomudex)
0.0

exemestane
0.0

dutasteride
0.0

tivozanib (av-951)
8.2

doxorubicin 

17.9

oxcarbazepine
0.2

trichlormethiazide (achletin)
0.3

suprofen (profenal)
0.5

pranlukast
0.4

oxfendazole
6.6

tizanidine hcl
0.8

erythromycin ethylsuccinate
0.0

pralatrexate(folotyn)
0.0

alprostadil(caverject)
0.0

saxagliptin
0.0

roxatidine acetate hcl
70.3

protionamide (prothionamide)
99.9

sulfamethizole (proklar)
99.9

tropicamide
63.1

sulbactam
1.1

pranoprofen
0.0

rimantadine (flumadine)
82.6

meglumine
0.1

diclazuril
0.0

flunarizine 2hcl
0.0

fenticonazole nitrate
5.6

rebamipide
0.0

bromhexine hcl
1.0

lovastatin (mevacor)
0.0

tiopronin (thiola)
4.7

balofloxacin
47.0

lafutidine
0.0

moxonidine
82.0

argatroban
0.0

famotidine (pepcid)
0.0

moexipril hcl
0.8

clevidipine butyrate
86.1

adiphenine hcl
0.8

rivastigmine tartrate (exelon)
0.0

ambrisentan
1.0

bexarotene
0.0

temocapril hcl
0.1

gabexate mesylate
0.6

rasagiline mesylate
35.0

flunixin meglumin
62.3

dronedarone hcl (multaq)
0.0

conivaptan hcl (vaprisol)
0.0

ibutilide fumarate
0.0

probucol
0

4.8

daminozide
98.6

menadiol diacetate
35.3

benzyl isothiocyanate
0.0

alpha-asarone
99.9

harmaline
1.1

drostanolone propionate
0.0

trenbolone acetate
0.4

methandrostenolone
0.1

melibiose
62.3

maltitol
66.0

flavone
25.6

protocatechuic acid
0.0

(-)-borneol
99.9

cefuroxime axetil
0.0

potassium acetate
27.7

cefcapene pivoxil hydrochloride
0.0

cefotiam hydrochloride
0.0

methyl 4-hydroxybenzoate
0.0

l(+)-arabinose
0.0

l-tryptophan
0.2

d-(+)-trehalose dihydrate
99.9

guaiazulene
0.0

thioctic acid
98.1

oxaceprol
13.5

ceftizoxime
0.0

atenolol
0.0

acotiamide hydrochloride
84.4

bedaquiline fumarate
0.0

asunaprevir
0.0

delamanid
3.3

tmc-435
0.0

azathramycin
0.0

anamorelin
0.0

chlorthalidone
0.1

madecassoside
0.0

propacetamol hydrochloride
58.0

elagolix sodium
0.0

tiamulin fumarate
0.0

nadolol
0.7

sophoricoside
0.0

propantheline bromide
0.0

eperisone hydrochloride
0.9

neticonazole hydrochloride
11.2

valpromide
44.0

ibudilast
5.8

gadopentetate dimeglumine

dolasetron
2.7

cefmetazole sodium
0.0

cefminox sodium
0.0

ceftiofur
0.0

diflorasone
0.0

bendazac
30.2

ethoxyquin
81.9

betrixaban
0.1

cyclofenil
0.4

propylparaben
97.9

sultamicillin
0.0

loxoprofen sodium
7.9

ertugliflozin
70.5

diflucortolone valerate
0.0

saikosaponin a
0.0

emedastine difumarate
19.7

iguratimod
77.1

zofenopril calcium
57.8

l-ornithine hydrochloride
0.0

hydrocortisone acetate
0.1

landiolol hydrochloride
0.0

midecamycin
0.3

cilastatin
0.0

benzalkonium chloride
28.5

cefpiramide sodium
0.0

vitamin k1
0.0

lithium carbonate
7.6

latamoxef sodium
0.0

ethyl gallate
45.5

ghrp-2
0.0

eptifibatide acetate
0.0

atosiban acetate
0.0

azasetron hcl
0.0

apremilast (cc-10004)<nl>apremilast (cc-10004)
0.7

difloxacin hydrochloride
91.1

edoxaban
0.0

orcinol
99.5

oxalic acid
100.0

ligustilide
4.1

rhynchophylline
0.5

yangonin
0.3

triptonide
100.0

malic acid
100.0

cabergoline
1.5

cinitapride hydrogen tartrate
0.1

metronidazole benzoate
28.8

hyaluronic