# This Python script generates predictions of the chance of RNA vs. protein binding for new molecules.

## Software License:

MIT License

Copyright (c) 2022 Yazdani et al. 

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

## This script was written in a Jupyter notebook on the Anaconda platform, using Python version 3.8.2.
$\;\;\;\;\;\;$



![Python logo](https://upload.wikimedia.org/wikipedia/commons/thumb/c/c3/Python-logo-notext.svg/300px-Python-logo-notext.svg.png)

$\;\;\;\;\;\;$

In [359]:
# Report which Python interpreter version is executing this notebook
from platform import python_version
print("Python version:", python_version(), sep="\n")

Python version:
3.8.2


## macOS Catalina version 10.15.7 was used when running this code on the Anaconda platform. 

## This code was written with the following package versions. Please install these exact versions to ensure consistent results.

### pandas --> version 1.4.2

In [360]:
# Report the installed pandas version
import pandas as pd
print("pandas version:", pd.__version__, sep="\n")

pandas version:
1.4.2


### numpy --> version 1.20.3

In [361]:
# Report the installed numpy version
import numpy as np
print("numpy version:", np.__version__, sep="\n")

numpy version:
1.20.3


### sklearn --> version 1.0.2

In [370]:
# Report the installed scikit-learn version
import sklearn
print("sklearn version:", sklearn.__version__, sep="\n")

sklearn version:
1.0.2


### tensorflow --> version 2.3.1

In [362]:
# Report the installed tensorflow version
import tensorflow as tf
print("tensorflow version:", tf.__version__, sep="\n")

tensorflow version:
2.3.1


### keras --> version 2.4.0


In [363]:
# Report the version of the Keras API bundled with tensorflow
# (relies on `tf` having been imported by an earlier cell)
print("keras version:", tf.keras.__version__, sep="\n")

keras version:
2.4.0


In [364]:
# Remove warnings: silence TensorFlow's native (C++) log output.
# A TF_CPP_MIN_LOG_LEVEL of '3' filters out INFO, WARNING and ERROR messages.
# The variable is assigned BEFORE the tensorflow import below so that, in a
# fresh kernel, it is already in place when TensorFlow's native logging emits
# its import-time messages.
# NOTE: if tensorflow was already imported earlier in the same session, this
# setting may come too late to affect messages that were already printed.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf

In [365]:
# Packages needed for data handling and for running the MLP model
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
import pickle
# Silence TensorFlow's Python-side logger for everything below the ERROR level
import logging
tf.get_logger().setLevel(logging.ERROR)

# Base folder used to build every file path below: the current working
# directory with its last 10 characters removed.
# NOTE(review): presumably this strips the name of the sub-folder the notebook
# lives in - confirm the slice length matches your directory layout.
# (relies on `os` having been imported by an earlier cell)
folder_dir = os.getcwd()[:-10]

# Training-set feature medians, used later to fill in missing values of the new compound set
medians_csv_path = folder_dir + "/ML_CSVs/df_all_3_feature_medians.csv"
df_medians = pd.read_csv(medians_csv_path)

# Names of the training-set features, kept as a list for later use
features = df_medians["feature"].tolist()

# Dictionary mapping each feature name to its assigned median value
median_values_dict = dict(zip(df_medians["feature"], df_medians["median_value"]))

# Restore the previously saved Keras model from disk
saved_model_path = folder_dir + "/MLP_saved_model"
reconstructed_model = keras.models.load_model(saved_model_path)

print("Processes complete!")

Processes complete!


In [366]:
# Preparation of the new compounds array to be run in the MLP model.
# Relies on `folder_dir`, `features` and `median_values_dict` defined in an earlier cell.

# Read in Mordred features of the new compound set
# As an example, the input here is the 8 compounds shown as test cases in the paper.
df_Mordred_extra = pd.read_csv(folder_dir + "/data/Mordred_files/Mordred_Test_Compounds_3D.csv")

# Take the compound names as a list as they will be used later
Mordred_extra_names = df_Mordred_extra["name"].tolist()

# Remove the name column (assumed to be the first column of the CSV)
df_Mordred_extra_no_name = df_Mordred_extra.iloc[:, 1:]

# Fail early with a clear message if any training feature is absent from the
# new compound set; every training feature is needed for the median fill below.
missing_features = [item for item in features if item not in df_Mordred_extra_no_name.columns]
if missing_features:
    raise KeyError(
        "New compound set is missing training features: " + ", ".join(map(str, missing_features))
    )

# Only keep features of the new compounds that also exist in df_all_3.
# The original column order of the Mordred file is preserved, and .copy()
# makes the result an independent frame rather than a slice of df_Mordred_extra,
# so the edits below cannot trigger pandas chained-assignment problems.
training_feature_set = set(features)
kept_columns = [item for item in df_Mordred_extra_no_name.columns if item in training_feature_set]
df_Mordred_extra_no_name = df_Mordred_extra_no_name.loc[:, kept_columns].copy()

# Replace infinity in the feature values by nan values
df_Mordred_extra_no_name = df_Mordred_extra_no_name.replace([np.inf, -np.inf], np.nan)

# Fill nan values of the test matrix with the median of the matching df_all_3
# column (one vectorized per-column fill instead of a cell-by-cell loop)
df_Mordred_extra_no_name = df_Mordred_extra_no_name.fillna(value=median_values_dict)

# Display the prepared feature table as the cell output
df_Mordred_extra_no_name

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,32.697969,21.600512,0,0,53.26632,2.535517,4.951115,53.26632,1.331658,4.658581,...,10.871099,94.294166,540.102961,9.644696,6054,75,232.0,286.0,12.0,8.333333
1,18.966332,15.550824,0,0,29.660194,2.474341,4.823314,29.660194,1.235841,4.11535,...,10.181233,73.176731,351.065318,9.751814,1391,38,130.0,153.0,9.201389,5.027778
2,21.241713,16.020566,0,1,35.725953,2.40332,4.792133,35.725953,1.323183,4.233444,...,10.113384,75.174897,382.15758,7.799134,2021,39,142.0,165.0,7.138889,5.972222
3,25.416672,21.303532,0,1,38.989563,2.676186,5.352371,38.989563,1.218424,4.419651,...,11.001783,69.521365,445.160542,7.809834,2350,76,186.0,239.0,14.680556,6.666667
4,29.198227,19.51697,0,2,49.161634,2.372244,4.744487,49.161634,1.328693,4.541483,...,10.415502,73.587263,494.266285,7.163279,5324,56,194.0,224.0,9.972222,8.083333
5,26.805756,19.185768,0,0,45.149909,2.538045,4.923438,45.149909,1.327938,4.46747,...,10.481701,84.821561,440.196074,7.722738,3665,56,182.0,217.0,9.222222,7.527778
6,22.374448,18.015057,0,0,36.166453,2.476482,4.952965,36.166453,1.247119,4.284712,...,10.29123,64.526738,404.256274,6.219327,2246,47,150.0,175.0,10.972222,6.388889
7,16.199155,13.295586,1,0,26.927238,2.535864,4.933973,26.927238,1.346362,3.994834,...,10.161882,73.059238,265.109485,8.033621,676,39,114.0,141.0,5.388889,4.277778


In [367]:
# Run the model on the new compound dataset.
# Relies on `folder_dir`, `pickle`, `reconstructed_model`, `df_Mordred_extra_no_name`
# and `Mordred_extra_names` defined in earlier cells.

# Load in the StandardScaler class from scikit-learn
from sklearn.preprocessing import StandardScaler

# Load in the standardization used in standardizing the training set.
# The file is opened in a `with` block so the handle is always closed.
# NOTE: unpickling can execute arbitrary code - only load scaler files from a trusted source.
with open(folder_dir + "/ML_CSVs/scaler.pkl", 'rb') as scaler_file:
    sc = pickle.load(scaler_file)

# Standardize the new compound dataset using the standardization parameters derived from the training set
X_extra_test = df_Mordred_extra_no_name.values
X_extra_test = sc.transform(X_extra_test)

# Use the MLP model to predict the chance of RNA binding over protein binding;
# the first value of each prediction row is kept as that compound's score
prediction = [item[0] for item in reconstructed_model.predict(X_extra_test).tolist()]

# Print compound name followed by the probability predicted by model (as a percentage, one decimal)
for compound_name, probability in zip(Mordred_extra_names, prediction):
    print(compound_name)
    print(round(probability*100, 1))
    print()

ADQ
100.0

HIV TAR compound 4
100.0

Ribocil-A
88.5

Tetracycline
97.1

Imatinib
0.1

Ibrutinib
7.4

Lovastatin
0.0

Nevirapine
0.1

