In [1]:
!pip install rdkit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rdkit
  Downloading rdkit-2022.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 29.3/29.3 MB 31.2 MB/s eta 0:00:00
Installing collected packages: rdkit
Successfully installed rdkit-2022.9.4


In [2]:
# Mount Google Drive inside the Colab VM so that files under MyDrive can be
# read and written through the regular filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors

In [4]:
# Load the molecule table; the later cells read its 'SMILES' column.
INPUT_CSV = '/content/drive/MyDrive/Thesis_Implementation/Dataset/test/forranking2.csv'
df = pd.read_csv(INPUT_CSV)

In [5]:
def calculate_pka(smiles):
    """Return a crude pKa-like score for a molecule given as a SMILES string.

    This is a heuristic, not a physical pKa prediction. For every pH value on
    the grid ``np.arange(0, 14, 0.5)`` the molecule gets explicit hydrogens,
    each hydrogen atom is assigned the formal charge ``int(pH - 7)``, RDKit's
    ``Descriptors.MolLogP`` is evaluated on that molecule and ``7 - logP`` is
    recorded. The mean of the recorded values is returned.

    NOTE(review): in this notebook's displayed result table every row has
    ``pKa == 7 - LogP``, so the per-pH hydrogen charges do not appear to
    influence the logP value -- confirm before relying on this number.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.

    Returns
    -------
    float
        Mean of ``7 - MolLogP`` over the pH grid.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with a readable message instead of an opaque error from AddHs.
        raise ValueError(f"Could not parse SMILES: {smiles!r}")

    pka_list = []
    for pH in np.arange(0, 14, 0.5):
        # A fresh hydrogenated copy per pH so charges from the previous
        # iteration never carry over.
        mol_H = Chem.AddHs(mol, addCoords=True)
        charge = int(pH - 7)  # truncates toward zero: -7 ... 6 over the grid
        for atom in mol_H.GetAtoms():
            if atom.GetAtomicNum() == 1:
                atom.SetFormalCharge(charge)
        pka_list.append(7 - Descriptors.MolLogP(mol_H))
    return np.mean(pka_list)

In [6]:
def validate_molecule(row):
    """Label a molecule 'valid' or 'invalid' from its precomputed descriptors.

    A row is 'valid' only when all of the following hold:

    * ``MolWeight`` < 500
    * ``LogP`` < 5
    * ``HBD`` <= 5   (hydrogen-bond donors)
    * ``HBA`` <= 10  (hydrogen-bond acceptors)
    * 1 <= ``pKa`` <= 13

    The donor/acceptor limits follow Lipinski's rule of five (at most 5
    donors and at most 10 acceptors).

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'MolWeight', 'LogP', 'HBD', 'HBA' and 'pKa'.

    Returns
    -------
    str
        'valid' if every criterion is met, otherwise 'invalid'. A NaN in any
        field makes its comparison false, so such rows are 'invalid'.
    """
    passes_all = (
        row['MolWeight'] < 500
        and row['LogP'] < 5
        and row['HBD'] <= 5
        and row['HBA'] <= 10
        and 1 <= row['pKa'] <= 13
    )
    return 'valid' if passes_all else 'invalid'

In [7]:
def measure_length(smiles):
    """Return the number of characters in the SMILES string ``smiles``."""
    n_chars = len(smiles)
    return n_chars

In [8]:
# Parse every SMILES once and reuse the RDKit molecules for all descriptors.
smiles = df['SMILES']
mols = list(map(Chem.MolFromSmiles, smiles))

# RDKit descriptor columns, added to the frame in this order.
descriptor_functions = {
    'MolWeight': Descriptors.MolWt,
    'LogP': Descriptors.MolLogP,
    'HBD': Descriptors.NumHDonors,
    'HBA': Descriptors.NumHAcceptors,
}
for column_name, descriptor in descriptor_functions.items():
    df[column_name] = [descriptor(mol) for mol in mols]

# Columns derived from the SMILES text and from the descriptor columns above.
# 'Validity' must come after 'pKa' because validate_molecule reads it.
df['pKa'] = smiles.apply(calculate_pka)
df['Validity'] = df.apply(validate_molecule, axis=1)
df['Length'] = smiles.apply(measure_length)

In [9]:
# Show the frame with the added descriptor, pKa, Validity and Length columns.
df

Unnamed: 0,SMILES,Affinity,MolWeight,LogP,HBD,HBA,pKa,Validity,Length
0,ONC(N1CC2OC(C(=O)NCc3ccc(-c4ccccc4)cc3)C(C1)O2)=O,6.962898,383.404,1.4943,3,5,5.5057,valid,49
1,C(NC(=O)C1C2OC(CN(C(NO)=O)C2)O1)c1ccc(-c2ccccc...,8.267059,383.404,1.4943,3,5,5.5057,valid,51
2,c1(CNC(C2OC3OC2CN(C(NO)=O)C3)=O)ccc(-c2ccccc2)cc1,7.110216,383.404,1.4943,3,5,5.5057,valid,49
3,ONC(N1CC2OC(C(NCc3ccc(-c4ccccc4)cc3)=O)C(C1)O2)=O,6.818207,383.404,1.4943,3,5,5.5057,valid,49
4,c1cc(CNC(=O)C2C3OC(CN(C(=O)NO)C3)O2)ccc1-c1ccccc1,9.155649,383.404,1.4943,3,5,5.5057,valid,49
5,C12CN(C(NO)=O)CC(O1)C(C(=O)NCc1ccc(-c3ccccc3)c...,7.597659,383.404,1.4943,3,5,5.5057,valid,51
6,O1C2CN(C(NO)=O)CC1OC2C(=O)NCc1ccc(-c2ccccc2)cc1,7.44872,383.404,1.4943,3,5,5.5057,valid,47
7,N(C(=O)N1CC2OC(C(NCc3ccc(-c4ccccc4)cc3)=O)C(O2...,6.552645,383.404,1.4943,3,5,5.5057,valid,51
8,c1cc(CNC(=O)C2OC3OC2CN(C(=O)NO)C3)ccc1-c1ccccc1,9.241339,383.404,1.4943,3,5,5.5057,valid,47
9,C12OC(C(NCc3ccc(-c4ccccc4)cc3)=O)C(O1)CN(C(NO)...,6.931837,383.404,1.4943,3,5,5.5057,valid,51


In [10]:
# Write the annotated table back to Drive; index=False leaves the row index
# out of the file.
OUTPUT_CSV = '/content/drive/MyDrive/Thesis_Implementation/Dataset/test/validated_molecules3.csv'
df.to_csv(OUTPUT_CSV, index=False)