In [None]:
# Install the condacolab helper with pip, then call its installer.
# NOTE(review): condacolab.install() presumably sets up conda inside the Colab
# runtime and may restart the kernel — confirm before adding code to this cell.
!pip install -q condacolab
import condacolab
condacolab.install()

In [None]:
# Install mamba from conda-forge (answering yes to prompts), then use it to
# install pandas, matplotlib, seaborn and rdkit from conda-forge quietly.
!conda install -c conda-forge mamba -y
!mamba install -q -y -c conda-forge pandas matplotlib seaborn rdkit

In [None]:
# Upgrade keras and scikit-learn to the newest versions pip can resolve.
# NOTE(review): the versions are not pinned, so a later re-run may pick up
# different releases — consider pinning for reproducibility.
!pip install --upgrade keras
!pip install --upgrade scikit_learn

In [None]:
from pathlib import Path
from warnings import filterwarnings

filterwarnings("ignore")

import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import MACCSkeys, Draw, rdFingerprintGenerator
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn import metrics
import seaborn as sns
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import ModelCheckpoint

%matplotlib inline

In [None]:
# Read the cleaned input table and drop the columns listed below in one chained step.
unused_columns = ["IC50", "units", "class", "molecular_weight", "logp", "n_hbd", "n_hba", "ROMol"]
df = pd.read_csv("/content/final_cleaned_file_from_substructures.csv").drop(columns=unused_columns)
df


In [None]:
# Keep only the SMILES strings and the pIC50 values.
# .copy() makes `chembl_df` an independent frame, so adding a column to it
# later is not an assignment on a slice of `df` (the SettingWithCopyWarning
# this would raise is currently hidden by filterwarnings("ignore")).
chembl_df = df[["smiles", "pIC50"]].copy()
chembl_df

In [None]:
def smiles_to_fp(smiles, method="maccs", n_bits=2048):
    """
    Encode a molecule given as a SMILES string into a fingerprint array.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.
    method : str
        Fingerprint type: "maccs" (default), "morgan2" or "morgan3".
        Any other value prints a warning and falls back to MACCS keys.
    n_bits : int
        Passed to the Morgan generator as `fpSize`; not used for MACCS keys.

    Returns
    -------
    numpy.ndarray
        The fingerprint converted with `np.array`.

    Raises
    ------
    ValueError
        If RDKit cannot parse `smiles` (``Chem.MolFromSmiles`` returned None).
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Fail with a readable message instead of an opaque error from inside
        # the fingerprint call when it receives None.
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")

    if method == "morgan2":
        radius = 2
    elif method == "morgan3":
        radius = 3
    else:
        if method != "maccs":
            print(f"Warning: Wrong method specified: {method}." " Default will be used instead.")
        return np.array(MACCSkeys.GenMACCSKeys(mol))

    fpg = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
    return np.array(fpg.GetCountFingerprint(mol))

In [None]:
# Compute one fingerprint per SMILES (smiles_to_fp with its default method)
# and attach the result as a new column.
fingerprints = chembl_df["smiles"].apply(smiles_to_fp)
chembl_df["fingerprints_df"] = fingerprints

print("Shape of dataframe:", chembl_df.shape)
chembl_df

In [None]:
# Write the table, fingerprint column included, to CSV without the index.
maccs_training_csv = Path('/content/sample_data/MACCS_fingerprints_from_bioactive_data.csv')
chembl_df.to_csv(maccs_training_csv, index=False)

In [None]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle.
features = chembl_df["fingerprints_df"]
targets = chembl_df[["pIC50"]]
x_train, x_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.3, random_state=42
)
print("Shape of training data:", x_train.shape)
print("Shape of test data:", x_test.shape)

In [None]:
def neural_network_model(hidden1, hidden2):
    """
    Build and compile a sequential network with two ReLU hidden layers
    and a single linear output unit.

    Parameters
    ----------
    hidden1 : int
        Number of units in the first hidden layer.
    hidden2 : int
        Number of units in the second hidden layer.

    Returns
    -------
    model
        The compiled model (mean-squared-error loss, Adam optimizer,
        reporting "mse" and "mae" as metrics).
    """
    layers = [
        Dense(hidden1, activation="relu", name="layer1"),
        Dense(hidden2, activation="relu", name="layer2"),
        Dense(1, activation="linear", name="layer3"),
    ]
    network = Sequential(layers)
    network.compile(loss="mean_squared_error", optimizer="adam", metrics=["mse", "mae"])
    return network

In [None]:
# Hyperparameters shared by the training cells below.
layer1_size, layer2_size = 64, 32  # units in the first / second hidden layer
nb_epoch = 50  # epochs per training run
batch_sizes = [16, 32, 64]  # mini-batch sizes to compare

In [None]:
# Compare training and validation loss across batch sizes, one panel per batch size.
# plt, sns and np come from the notebook's import cell; the duplicate imports
# that used to open this cell were removed.

# Apply the seaborn style and palette before the figure exists: axes read the
# rcParams (including the colour cycle) when they are created, so setting them
# after plt.subplots() does not affect the axes already created.
sns.set_style("white")
sns.set_palette("Set2")

fig, axes = plt.subplots(1, len(batch_sizes), figsize=(12, 6))

# The fingerprint matrices are identical for every run, so build them once
# instead of on every loop iteration.
x_train_array = np.array(list(x_train)).astype(float)
x_test_array = np.array(list(x_test)).astype(float)

# np.atleast_1d handles the bare Axes object returned for a single batch size.
for ax, batch in zip(np.atleast_1d(axes), batch_sizes):
    # A fresh, untrained network for each batch size.
    model = neural_network_model(layer1_size, layer2_size)

    history = model.fit(
        x_train_array,
        y_train.values,
        batch_size=batch,
        validation_data=(x_test_array, y_test.values),
        verbose=0,
        epochs=nb_epoch,
    )
    train_loss = history.history["loss"]
    val_loss = history.history["val_loss"]

    ax.plot(train_loss, label="train")
    ax.plot(val_loss, label="test")
    ax.legend(loc="upper right")
    ax.set_ylabel("Loss")
    ax.set_xlabel("Epoch")
    ax.set_ylim((0, 15))
    # Read the last recorded epoch directly rather than assuming the history
    # holds exactly nb_epoch entries.
    ax.set_title(f"Test Loss = {val_loss[-1]:.2f}, Batch Size = {batch}")
    sns.despine(ax=ax)

plt.tight_layout()
plt.savefig("training_loss_plot.png", dpi=600)
plt.show()


In [None]:
from keras.callbacks import ModelCheckpoint

# Train the final network from scratch. Previously this cell kept fitting
# whatever `model` was left bound by the batch-size comparison loop, so the
# final model silently depended on that loop's last iteration.
model = neural_network_model(layer1_size, layer2_size)

# Save weights only, and only when the monitored training loss improves.
filepath = "/content/sample_data/best_weights.weights.h5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor="loss",
    verbose=0,
    save_best_only=True,
    mode="min",
    save_weights_only=True,
)
callbacks_list = [checkpoint]
model.fit(
    np.array(list(x_train)).astype(float),
    y_train.values,
    epochs=nb_epoch,
    batch_size=64,
    callbacks=callbacks_list,
    verbose=0,
)

In [None]:
# Score the model on the held-out test set. The inputs are cast to float,
# matching how the training data was passed to fit().
# `scores` is indexed as [loss, mse, mae], following the metrics set at compile time.
print("Evaluate the model on the test data")
scores = model.evaluate(np.array(list(x_test)).astype(float), y_test.values, verbose=0)
print(f" loss: {scores[0]:.2f}")
print(f" mse (same as loss): {scores[1]:.2f}")
print(f" mae: {scores[2]:.2f}")

In [None]:
# Predict pIC50 for the test molecules, casting the inputs to float as in training.
y_pred = model.predict(np.array(list(x_test)).astype(float))

# Print the first five predictions. The earlier list comprehension was used
# only for its print() side effect and left `first_5_prediction` holding a
# list of None; it now holds the formatted values that are printed.
first_5_prediction = [f"{value[0]:.2f}" for value in y_pred[0:5]]
for prediction in first_5_prediction:
    print(prediction)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Predicted vs. true values for the test set; the dashed red line marks y = x.
limits = 0, 15
fig, ax = plt.subplots(figsize=(10, 10))
sns.set_style("white")

ax.scatter(y_pred, y_test, marker="o", color="orange", edgecolor="black", alpha=0.7, s=100)

# Identity line across the plotted range.
lin = np.linspace(limits[0], limits[1], 100)
ax.plot(lin, lin, color="red", linestyle="--", linewidth=2)

# Square axes with identical limits on both sides.
ax.set_aspect("equal", adjustable="box")
ax.set(xlim=limits, ylim=limits)

label_font = dict(fontsize=14, weight='bold')
ax.set_xlabel("Predicted Values", **label_font)
ax.set_ylabel("True Values", **label_font)
ax.set_title("Scatter Plot: pIC50 Values", fontsize=16, weight='bold')

ax.grid(False)
sns.despine(ax=ax)
ax.set_facecolor('white')

fig.savefig("pIC50_scatter_plot.png", dpi=600, bbox_inches='tight')
plt.show()


In [None]:
from keras.models import model_from_json

# Write the architecture as JSON and the weights to a separate file,
# both in the current working directory.
model_json = model.to_json()
Path("model.json").write_text(model_json)
model.save_weights("model.weights.h5")
print("Saved model to disk")

In [None]:
# Load the external molecules table and give it a default integer index.
external_data = pd.read_csv("/content/molecules.csv").reset_index(drop=True)
external_data.head()

In [None]:
# Fingerprint every external molecule with the same function (and default
# method) used for the training table.
external_fingerprints = external_data["canonical_smiles"].apply(smiles_to_fp)
external_data["fingerprints_df"] = external_fingerprints
print("Shape of dataframe : ", external_data.shape)
external_data.head(3)

In [None]:
# Save the external molecules together with their fingerprints.
# This cell used to write `chembl_df` (the training table) to the
# drug-library file, so the external fingerprints were never saved.
external_data.to_csv('/content/sample_data/MACCS_fingerprints_from_drug_library.csv', index=False)

In [None]:
# Rebuild the network from the saved JSON architecture and restore its weights.
# The context manager closes the file even if reading raises.
with open("model.json", "r") as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
loaded_model.load_weights("model.weights.h5")
print("Loaded model from disk")

In [None]:
# Predict pIC50 for the external molecules from their fingerprints.
# The training-time `callbacks_list` (a ModelCheckpoint) is no longer passed
# to predict(): it is a training callback and this cell should not depend on it.
# NOTE(review): predictions come from the in-memory `model`; the `loaded_model`
# restored from disk is not used here — confirm which one is intended.
external_features = np.array(list(external_data["fingerprints_df"])).astype(float)
predictions = model.predict(external_features)

# Give the prediction column the same index as `external_data` so that join()
# pairs every prediction with its molecule.
predicted_pIC50 = pd.DataFrame(
    predictions, columns=["predicted_pIC50"], index=external_data.index
)
predicted_pIC50_df = external_data.join(predicted_pIC50)

predicted_pIC50_df

In [None]:
# Write the predictions table to CSV; the index is written as the first column.
predictions_csv = Path("/content/sample_data/predicted_pIC50_df.csv")
predicted_pIC50_df.to_csv(predictions_csv)

In [None]:
# Reload the saved predictions (first CSV column becomes the index) and keep
# the three rows with the largest predicted pIC50.
predicted_pIC50_df = pd.read_csv(
    "/content/sample_data/predicted_pIC50_df.csv",
    index_col=0,
)
top3_drug = predicted_pIC50_df.nlargest(n=3, columns="predicted_pIC50")
top3_drug

In [None]:
# SMILES of the three top-ranked rows, looked up by their index labels.
highest_pIC50 = predicted_pIC50_df.loc[top3_drug.index, "canonical_smiles"]

# One RDKit molecule per SMILES, with the predicted value as its legend.
mols_EGFR = list(map(Chem.MolFromSmiles, highest_pIC50))
pIC50_EGFR = top3_drug["predicted_pIC50"].tolist()
pIC50_values = [f"pIC50 value: {value:.2f}" for value in pIC50_EGFR]

Draw.MolsToGridImage(mols_EGFR, molsPerRow=3, subImgSize=(450, 300), legends=pIC50_values)