In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors as rdmd, GraphDescriptors, Descriptors, FindMolChiralCenters

In [2]:
# Load the raw table from disk and preview the first rows. The preview shows the
# columns used later: molecule_chembl_id, canonical_smiles, standard_value, class.
df_data = pd.read_csv('data/raw.csv')
df_data.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,class
0,CHEMBL169028,C/C(=C\CCCC#N)[C@H]1CC[C@]2(C)[C@@H]([C@H](C)C...,7730.0,inactive
1,CHEMBL422548,CC(C)CCC[C@@H](C)[C@H]1CC[C@H]2[C@@H]3CC(=O)O[...,4050.0,inactive
2,CHEMBL1094636,NC(=O)c1cccc2cn(-c3ccc([C@@H]4CCCNC4)cc3)nc12,2000.0,inactive
3,CHEMBL172446,CN(C)CCNC(=O)c1nc(NC(=O)c2nc(NC(=O)c3ccc(N(CCC...,33700.0,inactive
4,CHEMBL4782235,Nc1cccc(-c2nc3sccn3c2-c2ccnc(NCCCCNS(=O)(=O)c3...,52430.0,inactive


In [3]:
def descriptors(smiles_list):
    """Compute a table of RDKit molecular descriptors, one row per SMILES string.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings to describe (e.g. a list or a pandas Series).

    Returns
    -------
    pandas.DataFrame
        One row per input, in input order, with a fresh default index
        (the index of an input Series is NOT carried over). Columns:
        'Molecular Weight', 'Number of Rotatable Bonds', 'Number of Atoms',
        'Number of Bonds', 'Count of Chiral Centers', 'Number of Rings',
        'Number of Aromatic Rings', 'Number of Hydrogen Bond Donors',
        'Number of Hydrogen Bond Acceptors', 'Balaban J Index',
        'Wiener Index', 'LogP', 'TPSA'.

    Raises
    ------
    ValueError
        If RDKit cannot parse one or more of the SMILES strings. The message
        lists the position and value of the first few offending entries.
    """
    # Materialise once so the inputs can be reported back in error messages
    # even when the caller passed a one-shot iterator.
    smiles_seq = list(smiles_list)
    mol_list = [Chem.MolFromSmiles(smiles) for smiles in smiles_seq]

    # MolFromSmiles signals a parse failure by returning None; fail early with
    # a message that names the bad inputs instead of crashing later inside an
    # unrelated RDKit call.
    invalid = [(pos, smiles_seq[pos]) for pos, mol in enumerate(mol_list) if mol is None]
    if invalid:
        preview = ', '.join(
            '{}: {!r}'.format(pos, smiles) for pos, smiles in invalid[:5]
        )
        raise ValueError(
            'Could not parse {} SMILES string(s); first offending entries '
            '(position: value): {}'.format(len(invalid), preview)
        )

    # Wiener index = sum of topological distances over all unordered atom
    # pairs. The distance matrix is symmetric with a zero diagonal, so that
    # equals half of the sum over the whole matrix.
    wiener_res = [Chem.GetDistanceMatrix(mol).sum() / 2 for mol in mol_list]

    descriptor_data = {
        # NOTE: computed with ExactMolWt (RDKit's isotope-specific exact mass),
        # not the average weight from Descriptors.MolWt.
        'Molecular Weight': [Descriptors.ExactMolWt(mol) for mol in mol_list],
        'Number of Rotatable Bonds': [Descriptors.NumRotatableBonds(mol) for mol in mol_list],
        # Atom/bond counts are taken from the parsed molecule as-is.
        'Number of Atoms': [mol.GetNumAtoms() for mol in mol_list],
        'Number of Bonds': [mol.GetNumBonds() for mol in mol_list],
        # includeUnassigned=True also counts centers without assigned chirality.
        'Count of Chiral Centers': [len(FindMolChiralCenters(mol, includeUnassigned=True)) for mol in mol_list],
        'Number of Rings': [rdmd.CalcNumRings(mol) for mol in mol_list],
        'Number of Aromatic Rings': [rdmd.CalcNumAromaticRings(mol) for mol in mol_list],
        'Number of Hydrogen Bond Donors': [rdmd.CalcNumHBD(mol) for mol in mol_list],
        'Number of Hydrogen Bond Acceptors': [rdmd.CalcNumHBA(mol) for mol in mol_list],
        'Balaban J Index': [GraphDescriptors.BalabanJ(mol) for mol in mol_list],
        'Wiener Index': wiener_res,
        # CalcCrippenDescriptors returns a tuple; element 0 is used as LogP.
        'LogP': [rdmd.CalcCrippenDescriptors(mol)[0] for mol in mol_list],
        'TPSA': [rdmd.CalcTPSA(mol) for mol in mol_list],
    }

    descriptor_values = pd.DataFrame(descriptor_data)

    return descriptor_values

In [4]:
# Compute one descriptor row per entry of the canonical_smiles column, then
# preview the resulting descriptor table.
df_descriptors = descriptors(df_data.canonical_smiles)
df_descriptors.head()

Unnamed: 0,Molecular Weight,Number of Rotatable Bonds,Number of Atoms,Number of Bonds,Count of Chiral Centers,Number of Rings,Number of Aromatic Rings,Number of Hydrogen Bond Donors,Number of Hydrogen Bond Acceptors,Balaban J Index,Wiener Index,LogP,TPSA
0,415.34503,11,30,31,6,2,0,1,2,2.127864,2622.0,7.62238,61.09
1,415.34503,9,30,32,7,3,0,0,3,1.693417,2552.0,7.29708,50.09
2,320.163711,3,24,27,1,4,3,2,4,1.67003,1433.0,2.5914,72.94
3,577.208341,13,39,41,0,3,3,3,9,1.628615,6303.0,2.2336,129.42
4,587.1385,10,40,44,0,5,5,3,9,1.190365,6631.0,5.2914,127.3


In [5]:
# 0-based position of the column after which the descriptor columns are
# spliced in; with the frame previewed above, position 1 is canonical_smiles.
insert_after_column = 1

# Cut df_data into the columns up to and including that position and the
# columns that follow it.
df_data_before, df_data_after = (
    df_data.iloc[:, :insert_after_column + 1],
    df_data.iloc[:, insert_after_column + 1:],
)

# Side-by-side concatenation matches rows by index label, so this relies on
# df_data and df_descriptors sharing the same row index.
df = pd.concat(
    [df_data_before, df_descriptors, df_data_after],
    axis='columns',
)
df.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,Molecular Weight,Number of Rotatable Bonds,Number of Atoms,Number of Bonds,Count of Chiral Centers,Number of Rings,Number of Aromatic Rings,Number of Hydrogen Bond Donors,Number of Hydrogen Bond Acceptors,Balaban J Index,Wiener Index,LogP,TPSA,standard_value,class
0,CHEMBL169028,C/C(=C\CCCC#N)[C@H]1CC[C@]2(C)[C@@H]([C@H](C)C...,415.34503,11,30,31,6,2,0,1,2,2.127864,2622.0,7.62238,61.09,7730.0,inactive
1,CHEMBL422548,CC(C)CCC[C@@H](C)[C@H]1CC[C@H]2[C@@H]3CC(=O)O[...,415.34503,9,30,32,7,3,0,0,3,1.693417,2552.0,7.29708,50.09,4050.0,inactive
2,CHEMBL1094636,NC(=O)c1cccc2cn(-c3ccc([C@@H]4CCCNC4)cc3)nc12,320.163711,3,24,27,1,4,3,2,4,1.67003,1433.0,2.5914,72.94,2000.0,inactive
3,CHEMBL172446,CN(C)CCNC(=O)c1nc(NC(=O)c2nc(NC(=O)c3ccc(N(CCC...,577.208341,13,39,41,0,3,3,3,9,1.628615,6303.0,2.2336,129.42,33700.0,inactive
4,CHEMBL4782235,Nc1cccc(-c2nc3sccn3c2-c2ccnc(NCCCCNS(=O)(=O)c3...,587.1385,10,40,44,0,5,5,3,9,1.190365,6631.0,5.2914,127.3,52430.0,inactive


In [6]:
# Write the merged table to disk; index=False leaves the row index out of the file.
df.to_csv('data/preprocessed.csv', index=False)