In [1]:
import pycaret
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from rdkit import Chem
from rdkit.Chem import inchi

from tqdm import tqdm
from time import sleep
from tqdm.notebook import tqdm

import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem

from rdkit import DataStructs

[typing.Tuple[int, int, int], <class 'NoneType'>]
[<class 'datetime.date'>, <class 'NoneType'>]
[typing.List[typing.Any], ~_T]
[<class 'str'>, typing.List[str]]
[<class 'str'>, ~_T]
[ForwardRef('Distribution'), <class 'NoneType'>]
[<class 'setuptools._vendor.packaging._elffile.ELFFile'>, <class 'NoneType'>]
[<class 'setuptools._vendor.packaging._musllinux._MuslVersion'>, <class 'NoneType'>]
[<class 'setuptools.extern.packaging._structures.InfinityType'>, <class 'setuptools.extern.packaging._structures.NegativeInfinityType'>]
[<class 'setuptools.extern.packaging._structures.InfinityType'>, <class 'setuptools.extern.packaging._structures.NegativeInfinityType'>, typing.Tuple[str, int]]
[<class 'setuptools.extern.packaging._structures.InfinityType'>, <class 'setuptools.extern.packaging._structures.NegativeInfinityType'>, <class 'int'>, <class 'str'>]
[<class 'setuptools.extern.packaging._structures.InfinityType'>, <class 'setuptools.extern.packaging._structures.NegativeInfinityType'>, <cla

Tanimoto similarity

Validate the 7 triple positive hits: are they similar to the known Mcl-1/Bcl-2 inhibitors?

In [2]:
# Load the Mcl-1 and Bcl-2 datasets (first CSV column is used as the index)
mcl_raw = pd.read_csv('../data_preparation/inhibitors/mcl/mcl_molecules.csv', index_col=0)  # 865 compounds
bcl_raw = pd.read_csv('../data_preparation/inhibitors/bcl/bcl_molecules.csv', index_col=0)  # 428 compounds

# Keep only the positive molecules (Class == 'Inhibitor'), renumber the rows
# from 0 and rename the SMILES column to 'papyrus_SMILES', the column name the
# fingerprint helpers read.
mcl = (
    mcl_raw.loc[mcl_raw['Class'] == 'Inhibitor']
    .reset_index(drop=True)
    .rename(columns={'SMILES': 'papyrus_SMILES'})
)  # 298 compounds

bcl = (
    bcl_raw.loc[bcl_raw['Class'] == 'Inhibitor']
    .reset_index(drop=True)
    .rename(columns={'SMILES': 'papyrus_SMILES'})
)  # 125 compounds

In [3]:
# Load the seven triple positive hits, keeping only the SMILES and score columns
columns_to_keep = ['papyrus_SMILES', 'composite_score']
hits = pd.read_csv('triple_pos.csv')[columns_to_keep]

In [4]:
def generate_smiles(df):
    """Return the 'papyrus_SMILES' column of ``df`` as a plain Python list.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that contains a 'papyrus_SMILES' column.

    Returns
    -------
    list
        The column values, in row order.
    """
    return df['papyrus_SMILES'].tolist()

In [5]:
def generate_mols(smiles_list):
    """Parse SMILES strings into molecules, skipping entries that cannot be parsed.

    Each entry is passed to ``Chem.MolFromSmiles``. An entry is reported with
    its position in ``smiles_list`` and left out of the result when the call
    raises or when it returns ``None``. The returned list therefore contains
    only usable molecules, so it can be handed directly to a step such as
    ``Chem.AddHs`` that does not accept ``None``.

    Parameters
    ----------
    smiles_list : list
        SMILES strings to convert.

    Returns
    -------
    list
        The successfully parsed molecules, in input order. The list is shorter
        than ``smiles_list`` when entries were skipped.
    """
    mols = []

    # A single pass keeps the reported index tied to the input position even
    # after earlier entries have been skipped.
    for index, smiles in enumerate(smiles_list):
        try:
            mol = Chem.MolFromSmiles(smiles)
        except Exception as exc:
            # e.g. a non-string entry such as a missing value
            print(f"Error with SMILES string at index {index}: {smiles} ({exc})")
            continue

        # A None result is treated as a failed parse
        if mol is None:
            print(f"Error with SMILES string at index {index}: {smiles}")
            continue

        mols.append(mol)

    return mols

In [6]:
def add_hydrogens(mols):
    """Return a new list holding ``Chem.AddHs(mol)`` for every molecule in ``mols``.

    Parameters
    ----------
    mols : list
        Molecules to process.

    Returns
    -------
    list
        One result per input molecule, in the same order.
    """
    return [Chem.AddHs(molecule) for molecule in mols]

In [7]:
def morgan_fp(mols_H, radius=2, nBits=2048):
    """Compute a Morgan bit-vector fingerprint for every molecule in ``mols_H``.

    Parameters
    ----------
    mols_H : list
        Molecules to fingerprint.
    radius : int, optional
        Radius passed to the fingerprint call (default 2).
    nBits : int, optional
        Number of bits passed to the fingerprint call (default 2048).

    Returns
    -------
    list
        One fingerprint per input molecule, in the same order.
    """
    print('Calculating Morgan fingerprints...')

    # NOTE(review): recent RDKit releases flag GetMorganFingerprintAsBitVect as
    # deprecated in favour of rdFingerprintGenerator.GetMorganGenerator - verify
    # that the bits are identical before switching.
    return [
        Chem.rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius, nBits)
        for mol in mols_H
    ]

In [8]:
# Morgan fingerprints for the known Mcl-1 inhibitors:
# SMILES column -> molecules -> molecules with hydrogens -> fingerprints
df = mcl
smiles_list = generate_smiles(df)
mols = generate_mols(smiles_list)
mols_H = add_hydrogens(mols)
mcl_morgan = morgan_fp(mols_H)



Calculating Morgan fingerprints...


In [9]:
# Morgan fingerprints for the known Bcl-2 inhibitors:
# SMILES column -> molecules -> molecules with hydrogens -> fingerprints
df = bcl
smiles_list = generate_smiles(df)
mols = generate_mols(smiles_list)
mols_H = add_hydrogens(mols)
bcl_morgan = morgan_fp(mols_H)



Calculating Morgan fingerprints...


In [10]:
# Morgan fingerprints for the triple positive hits:
# SMILES column -> molecules -> molecules with hydrogens -> fingerprints
df = hits
smiles_list = generate_smiles(df)
mols = generate_mols(smiles_list)
mols_H = add_hydrogens(mols)
hits_morgan = morgan_fp(mols_H)


Calculating Morgan fingerprints...


In [11]:
# Similarity matrix of the triple hits against the known Mcl-1 inhibitors:
# one row per hit fingerprint, one column per Mcl-1 inhibitor fingerprint,
# each entry being the Tanimoto similarity of that pair.
mcl_similarity_matrix = [
    [DataStructs.TanimotoSimilarity(hit_fp, mcl_fp) for mcl_fp in mcl_morgan]
    for hit_fp in hits_morgan
]

print("Mcl-1 Similarity matrix:")
for row in mcl_similarity_matrix:
    print(row)

Mcl-1 Similarity matrix:
[0.16923076923076924, 0.1557377049180328, 0.20618556701030927, 0.1346153846153846, 0.14563106796116504, 0.13385826771653545, 0.1262135922330097, 0.12264150943396226, 0.1532258064516129, 0.16363636363636364, 0.12931034482758622, 0.13402061855670103, 0.17518248175182483, 0.15503875968992248, 0.13533834586466165, 0.1, 0.10576923076923077, 0.13740458015267176, 0.16129032258064516, 0.11023622047244094, 0.15702479338842976, 0.14173228346456693, 0.1568627450980392, 0.16, 0.11320754716981132, 0.14130434782608695, 0.1323529411764706, 0.12037037037037036, 0.11650485436893204, 0.12781954887218044, 0.17117117117117117, 0.15492957746478872, 0.14035087719298245, 0.11570247933884298, 0.10752688172043011, 0.1532258064516129, 0.08421052631578947, 0.1553398058252427, 0.1297709923664122, 0.13709677419354838, 0.11224489795918367, 0.1732283464566929, 0.10576923076923077, 0.12380952380952381, 0.1623931623931624, 0.16363636363636364, 0.12878787878787878, 0.14018691588785046, 0.136752

In [12]:
print('Mcl-1 average, maximum and min similarity:')

# One summary line per row of the hit-vs-Mcl-1 matrix (rows are reported 1-based)
for i, row in enumerate(mcl_similarity_matrix):
    mcl_avg_similarity = sum(row) / len(row)
    mcl_max_similarity, mcl_min_similarity = max(row), min(row)

    print(f"Row {i + 1}: Average = {mcl_avg_similarity}, Maximum = {mcl_max_similarity}, Minimum = {mcl_min_similarity}")

Mcl-1 average, maximum and min similarity:
Row 1: Average = 0.1415352616166376, Maximum = 0.20618556701030927, Minimum = 0.07476635514018691
Row 2: Average = 0.11846183225324305, Maximum = 0.20512820512820512, Minimum = 0.0449438202247191
Row 3: Average = 0.11049620029356313, Maximum = 0.16521739130434782, Minimum = 0.04310344827586207
Row 4: Average = 0.1226363752497254, Maximum = 0.18095238095238095, Minimum = 0.05813953488372093
Row 5: Average = 0.12008113045778163, Maximum = 0.20300751879699247, Minimum = 0.06481481481481481
Row 6: Average = 0.11498880197731452, Maximum = 0.18181818181818182, Minimum = 0.04950495049504951
Row 7: Average = 0.11717151966551975, Maximum = 0.1893939393939394, Minimum = 0.0625


In [13]:
# Similarity matrix of the triple hits against the known Bcl-2 inhibitors:
# one row per hit fingerprint, one column per Bcl-2 inhibitor fingerprint,
# each entry being the Tanimoto similarity of that pair.
bcl_similarity_matrix = [
    [DataStructs.TanimotoSimilarity(hit_fp, bcl_fp) for bcl_fp in bcl_morgan]
    for hit_fp in hits_morgan
]

# The label names the target actually compared against (Bcl-2, not "Bcl-1")
print("Bcl-2 Similarity matrix:")
for row in bcl_similarity_matrix:
    print(row)

Bcl-1 Similarity matrix:
[0.152, 0.12195121951219512, 0.1532258064516129, 0.1532258064516129, 0.12264150943396226, 0.1328125, 0.17073170731707318, 0.12931034482758622, 0.144, 0.10526315789473684, 0.14960629921259844, 0.16260162601626016, 0.16, 0.15267175572519084, 0.1553398058252427, 0.1484375, 0.14615384615384616, 0.15, 0.14285714285714285, 0.13846153846153847, 0.15079365079365079, 0.1323529411764706, 0.1415929203539823, 0.14, 0.10101010101010101, 0.15217391304347827, 0.12295081967213115, 0.15254237288135594, 0.1566265060240964, 0.14705882352941177, 0.11538461538461539, 0.13333333333333333, 0.140625, 0.1532258064516129, 0.1487603305785124, 0.13793103448275862, 0.15126050420168066, 0.15966386554621848, 0.1650485436893204, 0.14516129032258066, 0.12121212121212122, 0.15503875968992248, 0.13541666666666666, 0.13178294573643412, 0.1328125, 0.1, 0.14173228346456693, 0.14634146341463414, 0.14960629921259844, 0.14, 0.16071428571428573, 0.15384615384615385, 0.13978494623655913, 0.1512605042016

In [14]:
print('Bcl-2 average, max and min similarity:')

# One summary line per row of the hit-vs-Bcl-2 matrix (rows are reported 1-based)
for i, row in enumerate(bcl_similarity_matrix):
    bcl_avg_similarity = sum(row) / len(row)
    bcl_max_similarity, bcl_min_similarity = max(row), min(row)

    print(f"Row {i + 1}: Average = {bcl_avg_similarity}, Maximum = {bcl_max_similarity}, Minimum = {bcl_min_similarity}")

Bcl-2 average, max and min similarity:
Row 1: Average = 0.14431487916505342, Maximum = 0.1919191919191919, Minimum = 0.08490566037735849
Row 2: Average = 0.12442965842007124, Maximum = 0.19047619047619047, Minimum = 0.057692307692307696
Row 3: Average = 0.1122278548296454, Maximum = 0.1678832116788321, Minimum = 0.051470588235294115
Row 4: Average = 0.12934033439964399, Maximum = 0.1875, Minimum = 0.06451612903225806
Row 5: Average = 0.12487079448391519, Maximum = 0.18627450980392157, Minimum = 0.06923076923076923
Row 6: Average = 0.12400665008335794, Maximum = 0.18584070796460178, Minimum = 0.05309734513274336
Row 7: Average = 0.11877359537917402, Maximum = 0.17037037037037037, Minimum = 0.0625


Mcl-to-Mcl Similarity

In [15]:
# All-against-all Tanimoto similarity among the known Mcl-1 inhibitors:
# entry [i][j] compares inhibitor fingerprint i with inhibitor fingerprint j.
mcl_mcl_similarity_matrix = [
    [DataStructs.TanimotoSimilarity(mcl1_fp, mcl2_fp) for mcl2_fp in mcl_morgan]
    for mcl1_fp in mcl_morgan
]

In [16]:
# Average each row of the Mcl-Mcl similarity matrix over its values that are
# not exactly 1, then report the smallest and largest of those row averages.

# Running extremes of the row averages
min_average = float('inf')
max_average = float('-inf')

for row in mcl_mcl_similarity_matrix:

    # Sum and count of the values in this row that differ from 1
    total_sum_row = 0
    count_non_ones_row = 0

    for value in row:
        # Values equal to 1 are left out, presumably to exclude each
        # compound's comparison with itself.
        # NOTE(review): this also drops any pair of different compounds whose
        # fingerprints are identical - confirm that this is intended.
        if value != 1:
            total_sum_row += value
            count_non_ones_row += 1

    # A row that contains only 1s has nothing to average and is counted as 0
    average_row = total_sum_row / count_non_ones_row if count_non_ones_row > 0 else 0

    min_average = min(min_average, average_row)
    max_average = max(max_average, average_row)

# Print the min and max average values over rows
print(f"Mcl-Mcl Minimum Average over Rows: {min_average}")
print(f"Mcl-Mcl Maximum Average over Rows: {max_average}")

Mcl-Mcl Minimum Average over Rows: 0.08886871218874835
Mcl-Mcl Maximum Average over Rows: 0.3538841447258775


Bcl-to-Bcl similarity

In [17]:
# All-against-all Tanimoto similarity among the known Bcl-2 inhibitors:
# entry [i][j] compares inhibitor fingerprint i with inhibitor fingerprint j.
bcl_bcl_similarity_matrix = [
    [DataStructs.TanimotoSimilarity(bcl1_fp, bcl2_fp) for bcl2_fp in bcl_morgan]
    for bcl1_fp in bcl_morgan
]




In [18]:
# Average each row of the Bcl-Bcl similarity matrix over its values that are
# not exactly 1, then report the smallest and largest of those row averages.

# Running extremes of the row averages
min_average = float('inf')
max_average = float('-inf')

for row in bcl_bcl_similarity_matrix:

    # Sum and count of the values in this row that differ from 1
    total_sum_row = 0
    count_non_ones_row = 0

    for value in row:
        # Values equal to 1 are left out, presumably to exclude each
        # compound's comparison with itself.
        # NOTE(review): this also drops any pair of different compounds whose
        # fingerprints are identical - confirm that this is intended.
        if value != 1:
            total_sum_row += value
            count_non_ones_row += 1

    # A row that contains only 1s has nothing to average and is counted as 0
    average_row = total_sum_row / count_non_ones_row if count_non_ones_row > 0 else 0

    min_average = min(min_average, average_row)
    max_average = max(max_average, average_row)

# Print the min and max average values over rows
print(f"Bcl-Bcl Minimum Average over Rows: {min_average}")
print(f"Bcl-Bcl Maximum Average over Rows: {max_average}")

Bcl-Bcl Minimum Average over Rows: 0.09811442474390832
Bcl-Bcl Maximum Average over Rows: 0.35757393678992727
