<a href="https://colab.research.google.com/github/jrdeborja/thesis-supp.-data/blob/main/descriptors_fingerprints_calculation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Calculation and generation of molecular descriptors (Mordred) and fingerprints (MACCS keys using RDKit)**

- authored by Joshua R. de Borja

# **Installs and Imports**

In [None]:
! pip install 'mordred[full]'
! pip install rdkit

Collecting mordred[full]
  Downloading mordred-1.2.0.tar.gz (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.8/128.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting networkx==2.* (from mordred[full])
  Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: mordred
  Building wheel for mordred (setup.py) ... [?25l[?25hdone
  Created wheel for mordred: filename=mordred-1.2.0-py3-none-any.whl size=176721 sha256=e68ae4b3cfeb4b77221ab30496c3e2cfbd86f0638255fda8587c4569227f1399
  Stored in directory: /root/.cache/pip/wheels/a7/4f/b8/d4c6591f6ac944aaced7865b349477695f662388ad958743c7
Successfully built mordred
Installing collected packages: networkx, mordred
  Attempting uninstall: networkx
    Found existing installation: networkx 3.2.1
    

In [None]:
from pathlib import Path

import pandas as pd
import numpy as np

from mordred import Calculator, descriptors

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import MACCSkeys
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

# **Dataset**
- Dataset used to generate the 1D, 2D and 3D molecular descriptors and the MACCS keys

In [None]:
# Screened training/test compounds, read straight from the thesis code repository.
train_test_dataset = pd.read_csv(
    'https://raw.githubusercontent.com/jrdeborja/Thesis-code-repository/main/training_dataset_screened.csv'
)

# Screened ROCK1 compounds from the same repository.
rock1_dataset = pd.read_csv(
    'https://raw.githubusercontent.com/jrdeborja/Thesis-code-repository/main/rock1_screened.csv'
)

In [None]:
# Stack the two datasets row-wise into a single frame. ignore_index=True gives the
# result a fresh 0..n-1 index instead of repeating each file's own row labels.
df = pd.concat([train_test_dataset, rock1_dataset], ignore_index=True)

# **Mordred descriptors**

In [None]:
# Define a function to calculate all Mordred descriptors

def AllMordredDescriptors(data):
    """Calculate all Mordred descriptors for a collection of SMILES strings.

    Parameters
    ----------
    data : iterable of str
        SMILES strings, one per molecule (e.g. a pandas Series).

    Returns
    -------
    pandas.DataFrame
        The table produced by ``Calculator.pandas`` for the parsed molecules,
        in input order.

    Raises
    ------
    ValueError
        If RDKit cannot parse one or more of the SMILES strings.
    """
    # NOTE(review): ignore_3D=False also registers the 3D descriptors, but molecules
    # built from SMILES presumably carry no conformer, so those columns may hold
    # missing/error values -- confirm, or embed 3D coordinates before calculating.
    calc = Calculator(descriptors, ignore_3D=False)

    smiles_list = list(data)
    mols = [Chem.MolFromSmiles(smi) for smi in smiles_list]

    # MolFromSmiles returns None when parsing fails; report the offending entries
    # up front rather than letting the descriptor calculation fail on them.
    invalid = [
        (position, smi)
        for position, (smi, mol) in enumerate(zip(smiles_list, mols))
        if mol is None
    ]
    if invalid:
        raise ValueError(
            f"{len(invalid)} SMILES could not be parsed; first few (position, SMILES): {invalid[:5]}"
        )

    # Named differently from the notebook-level `df` to avoid shadowing it.
    descriptor_table = calc.pandas(mols)
    return descriptor_table

In [None]:
# Compute the Mordred descriptors for every SMILES in the combined dataset.
# The recorded run took about 41 minutes for 3463 molecules.
df_descriptors = AllMordredDescriptors(df.loc[:, 'Smiles'])

  4%|▍         | 137/3463 [00:57<18:53,  2.93it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 16%|█▌        | 538/3463 [08:01<51:03:58, 62.85s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 3463/3463 [41:05<00:00,  1.40it/s]


In [None]:
# Preview the Mordred descriptor table (one row per molecule).
df_descriptors

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,36.871660,25.961217,1,0,60.964843,2.539247,4.958653,60.964843,1.325323,4.778552,...,10.851374,100.226083,637.214362,8.066005,8351,79,254.0,306.0,12.888889,9.888889
1,36.423773,25.622051,1,0,59.589557,2.539218,4.956426,59.589557,1.295425,4.764855,...,10.804523,98.372666,639.193627,8.301216,8437,78,248.0,295.0,14.388889,9.944444
2,34.900170,24.311455,1,0,57.883519,2.530083,4.940969,57.883519,1.315535,4.723315,...,10.761704,96.170286,607.223784,7.784920,7598,75,238.0,284.0,13.277778,9.527778
3,35.676226,25.257507,1,0,58.685792,2.539222,4.958165,58.685792,1.304129,4.745318,...,10.807321,97.331362,625.214362,8.015569,7899,78,244.0,292.0,14.138889,9.722222
4,38.285874,26.245294,1,0,63.880561,2.539229,4.963236,63.880561,1.330845,4.815478,...,10.873300,100.678434,665.245663,7.826420,9437,83,262.0,314.0,13.388889,10.388889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3458,22.005839,16.402493,0,0,36.903668,2.488989,4.977978,36.903668,1.317988,4.271395,...,10.351789,63.651485,374.163043,7.483261,2171,48,150.0,179.0,8.611111,6.166667
3459,25.622253,19.095691,0,1,43.511354,2.396265,4.769294,43.511354,1.318526,4.419778,...,10.276877,82.041476,447.227040,7.213339,3817,49,170.0,197.0,9.861111,7.361111
3460,28.410240,20.775155,0,1,47.527962,2.414949,4.787615,47.527962,1.320221,4.521408,...,10.452505,88.225903,502.178710,8.099657,4861,55,192.0,226.0,10.333333,7.888889
3461,27.593743,20.293268,0,1,46.663461,2.402144,4.774809,46.663461,1.333242,4.493924,...,10.408466,87.046354,488.163060,8.273950,4480,53,186.0,219.0,9.472222,7.722222


# **RDKit MACCS keys**

In [None]:
# Define a function to calculate the MACCS keys

def MACCSFingerprints(data):
    """Compute the MACCS keys fingerprint for every SMILES string in ``data``.

    Parameters
    ----------
    data : iterable of str
        SMILES strings, one per molecule.

    Returns
    -------
    numpy.ndarray
        One row per molecule, in input order; each row is the fingerprint
        converted to a NumPy array.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    maccs_fpts = []
    for position, smi in enumerate(data):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # MolFromSmiles returns None on a parse failure; name the offending
            # entry instead of failing inside the fingerprint call.
            raise ValueError(f"Invalid SMILES at position {position}: {smi!r}")
        fpts = AllChem.GetMACCSKeysFingerprint(mol)
        maccs_fpts.append(np.array(fpts))
    return np.array(maccs_fpts)

In [None]:
# Wrap the fingerprint matrix in a DataFrame, one row per molecule.
df_maccs = pd.DataFrame(data=MACCSFingerprints(df.loc[:, 'Smiles']))

In [None]:
# Give all three frames the same fresh 0..n-1 index so they line up row by row
# when joined column-wise. Rebinding (rather than inplace=True) keeps this cell
# safe to re-run.
df = df.reset_index(drop=True)
df_descriptors = df_descriptors.reset_index(drop=True)
df_maccs = df_maccs.reset_index(drop=True)

In [None]:
# Join the source columns, the Mordred descriptors and the MACCS bits side by side.
final_df_for_ml = pd.concat([df, df_descriptors, df_maccs], axis=1)

# A column-wise concat aligns on the index and pads any mismatch with NaN rows,
# so a row-count change here means the three frames were not aligned.
assert len(final_df_for_ml) == len(df) == len(df_descriptors) == len(df_maccs), (
    "Row counts differ between the dataset, descriptor and fingerprint tables"
)

In [None]:
# Preview the combined table: source columns, Mordred descriptors, then MACCS bits.
final_df_for_ml

Unnamed: 0.1,Unnamed: 0,Molecule.ChEMBL.ID,Smiles,Standard.Relation,Standard.Value,Document.ChEMBL.ID,Molar,pIC50,Bioactivity.Class,ABC,...,157,158,159,160,161,162,163,164,165,166
0,2102,CHEMBL210297,O=C(O)c1ccc2c(c1)nc(-c1ccc(OCc3cc(N4CCCC4=O)cc...,'=',83368118.46,CHEMBL3045038,8.336812e-02,1.079000,inactive,36.871660,...,1,1,1,0,1,1,1,1,1,0
1,3101,CHEMBL2296935,CC(=O)NC(=O)c1ccc(-c2ccc(Cl)cc2)c(COc2ccc(-c3n...,'=',71449632.61,CHEMBL3045038,7.144963e-02,1.146000,inactive,36.423773,...,1,1,1,1,1,1,1,1,1,0
2,4101,CHEMBL376688,CN(C)C(=O)c1ccc(-c2ccc(Cl)cc2)c(COc2ccc(-c3nc4...,'=',66680676.92,CHEMBL3045038,6.668068e-02,1.176000,inactive,34.900170,...,1,1,1,1,1,1,1,1,1,0
3,5101,CHEMBL374567,CC(=O)N(C)c1ccc(-c2ccc(Cl)cc2)c(COc2ccc(-c3nc4...,'=',62517269.28,CHEMBL3045038,6.251727e-02,1.204000,inactive,35.676226,...,1,1,1,1,1,1,1,1,1,0
4,6101,CHEMBL411712,O=C(O)c1ccc2c(c1)nc(-c1ccc(OCc3cc(C(=O)N4CCCCC...,'=',55590425.73,CHEMBL3045038,5.559043e-02,1.255000,inactive,38.285874,...,1,1,1,0,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3458,966,CHEMBL4763198,COc1cccc([C@@H](C)NC(=O)c2ccc3c(c2)C(C)Oc2cncc...,'=',0.49,CHEMBL4665758,4.900000e-10,9.309804,active,22.005839,...,1,1,1,1,1,1,1,1,1,0
3459,967,CHEMBL3949136,COc1cc(C(=O)NCc2cccc(C(=O)NC3CCN(C)CC3)c2)ccc1...,'=',0.43,CHEMBL3886981,4.300000e-10,9.366532,active,25.622253,...,1,1,1,1,1,1,1,1,1,0
3460,968,CHEMBL3961538,COc1cc(C(=O)NCc2cccc(C(=O)Nc3nc4c(s3)CN(C)CC4)...,'=',0.39,CHEMBL3886981,3.900000e-10,9.408935,active,28.410240,...,1,1,1,1,1,1,1,1,1,0
3461,969,CHEMBL3902471,COc1cc(C(=O)NCc2cccc(C(=O)Nc3nc4c(s3)CNCC4)c2)...,'=',0.26,CHEMBL3886981,2.600000e-10,9.585027,active,27.593743,...,1,1,1,1,1,1,1,1,1,0


# **Google Drive export**

In [None]:
# Colab helper module used to access Google Drive.
from google.colab import drive

# Mount Google Drive at /content/drive so the CSV export can write to it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# to_csv does not create missing folders, so make sure the target directory exists
# first; otherwise the export fails after the long descriptor calculation.
output_path = Path('/content/drive/MyDrive/Mordred & RDKit files/final_dataset_for_ml.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

final_df_for_ml.to_csv(output_path, index=False)