In [7]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors as rdmd, GraphDescriptors, Descriptors, FindMolChiralCenters

In [8]:
# Read the preprocessed EGFR table (ChEMBL id, canonical SMILES, standard value,
# activity class) from disk and preview its first five rows.
df_data = pd.read_csv(filepath_or_buffer='CSV Files/EGFR_Data_Preprocessed.csv')
df_data.head(5)

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,class
0,CHEMBL68920,Cc1cc(C)c(/C=C2\C(=O)Nc3ncnc(Nc4ccc(F)c(Cl)c4)...,41.0,active
1,CHEMBL137635,CN(c1ccccc1)c1ncnc2ccc(N/N=N/Cc3ccccn3)cc12,9300.0,inactive
2,CHEMBL306988,CC(=C(C#N)C#N)c1ccc(NC(=O)CCC(=O)O)cc1,500000.0,inactive
3,CHEMBL66879,O=C(O)/C=C/c1ccc(O)cc1,3000000.0,inactive
4,CHEMBL77085,N#CC(C#N)=Cc1cc(O)ccc1[N+](=O)[O-],96000.0,inactive


In [9]:
def descriptors(smiles_list):
    """Compute a table of RDKit descriptors for a collection of SMILES strings.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings, one per molecule (a list or a pandas Series both work).

    Returns
    -------
    pandas.DataFrame
        One row per input SMILES, in input order, with a default 0..n-1 index
        and the columns 'Molecular Weight', 'Number of Rotatable Bonds',
        'Number of Atoms', 'Number of Bonds', 'Count of Chiral Centers',
        'Number of Rings', 'Number of Aromatic Rings',
        'Number of Hydrogen Bond Donors', 'Number of Hydrogen Bond Acceptors',
        'Balaban J Index', 'Wiener Index', 'LogP' and 'TPSA'.

    Raises
    ------
    ValueError
        If any entry is not a string or cannot be parsed into a molecule.
        Failing early keeps the result row-aligned with the input and avoids
        an opaque RDKit error further down.
    """
    smiles_values = list(smiles_list)

    # MolFromSmiles returns None (instead of raising) for unparseable SMILES;
    # non-string entries such as NaN are treated as invalid as well.
    mol_list = [
        Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        for smiles in smiles_values
    ]

    invalid = [
        (position, smiles)
        for position, (smiles, mol) in enumerate(zip(smiles_values, mol_list))
        if mol is None
    ]
    if invalid:
        preview = ', '.join(f'{position}: {smiles!r}' for position, smiles in invalid[:5])
        raise ValueError(
            f'{len(invalid)} SMILES could not be parsed into molecules '
            f'(position: value, first {min(len(invalid), 5)} shown): {preview}'
        )

    # Wiener index = sum of topological distances over all unordered atom pairs.
    # The distance matrix is symmetric with a zero diagonal, so this equals half
    # of the full matrix sum; no explicit Python loop over atom pairs is needed.
    wiener_res = [float(Chem.GetDistanceMatrix(mol).sum() / 2) for mol in mol_list]

    descriptor_data = {
        # NOTE: ExactMolWt is the monoisotopic (exact) mass, not the average
        # molecular weight (Descriptors.MolWt). Kept as-is so that previously
        # generated feature files remain comparable.
        'Molecular Weight': [Descriptors.ExactMolWt(mol) for mol in mol_list],
        'Number of Rotatable Bonds': [Descriptors.NumRotatableBonds(mol) for mol in mol_list],
        # Atom/bond counts are taken on the parsed molecule as-is (no explicit AddHs call).
        'Number of Atoms': [mol.GetNumAtoms() for mol in mol_list],
        'Number of Bonds': [mol.GetNumBonds() for mol in mol_list],
        # includeUnassigned=True also counts stereocentres without a specified configuration.
        'Count of Chiral Centers': [len(FindMolChiralCenters(mol, includeUnassigned=True)) for mol in mol_list],
        'Number of Rings': [rdmd.CalcNumRings(mol) for mol in mol_list],
        'Number of Aromatic Rings': [rdmd.CalcNumAromaticRings(mol) for mol in mol_list],
        'Number of Hydrogen Bond Donors': [rdmd.CalcNumHBD(mol) for mol in mol_list],
        'Number of Hydrogen Bond Acceptors': [rdmd.CalcNumHBA(mol) for mol in mol_list],
        'Balaban J Index': [GraphDescriptors.BalabanJ(mol) for mol in mol_list],
        'Wiener Index': wiener_res,
        # CalcCrippenDescriptors returns a (logP, MR) pair; only logP is kept.
        'LogP': [rdmd.CalcCrippenDescriptors(mol)[0] for mol in mol_list],
        'TPSA': [rdmd.CalcTPSA(mol) for mol in mol_list],
    }

    return pd.DataFrame(descriptor_data)

In [10]:
# Compute the descriptor table from the SMILES column and preview it.
df_descriptors = descriptors(df_data['canonical_smiles'])
df_descriptors.head(5)

Unnamed: 0,Molecular Weight,Number of Rotatable Bonds,Number of Atoms,Number of Bonds,Count of Chiral Centers,Number of Rings,Number of Aromatic Rings,Number of Hydrogen Bond Donors,Number of Hydrogen Bond Acceptors,Balaban J Index,Wiener Index,LogP,TPSA
0,383.094916,3,27,30,0,4,3,3,4,1.789174,1832.0,4.45034,82.7
1,369.170194,6,28,31,0,4,4,1,6,1.558763,2326.0,4.772,78.66
2,283.095691,5,21,21,0,1,1,2,4,2.78202,1108.0,2.31056,113.98
3,164.047344,2,12,12,0,1,1,2,2,2.772026,223.0,1.49,57.53
4,215.033091,2,16,16,0,1,1,1,5,3.386653,441.0,1.73096,110.95


In [11]:
# Splice the descriptor columns into df_data directly after column position 1,
# so the leading columns stay on the left and the remaining columns follow
# the descriptors on the right.
insert_after_column = 1

df_data_before = df_data.iloc[:, :insert_after_column + 1]
df_data_after = df_data.iloc[:, insert_after_column + 1:]

# Column-wise concatenation aligns the three frames on their row index.
df = pd.concat(
    [df_data_before, df_descriptors, df_data_after],
    axis='columns',
)
df.head(5)

Unnamed: 0,molecule_chembl_id,canonical_smiles,Molecular Weight,Number of Rotatable Bonds,Number of Atoms,Number of Bonds,Count of Chiral Centers,Number of Rings,Number of Aromatic Rings,Number of Hydrogen Bond Donors,Number of Hydrogen Bond Acceptors,Balaban J Index,Wiener Index,LogP,TPSA,standard_value,class
0,CHEMBL68920,Cc1cc(C)c(/C=C2\C(=O)Nc3ncnc(Nc4ccc(F)c(Cl)c4)...,383.094916,3,27,30,0,4,3,3,4,1.789174,1832.0,4.45034,82.7,41.0,active
1,CHEMBL137635,CN(c1ccccc1)c1ncnc2ccc(N/N=N/Cc3ccccn3)cc12,369.170194,6,28,31,0,4,4,1,6,1.558763,2326.0,4.772,78.66,9300.0,inactive
2,CHEMBL306988,CC(=C(C#N)C#N)c1ccc(NC(=O)CCC(=O)O)cc1,283.095691,5,21,21,0,1,1,2,4,2.78202,1108.0,2.31056,113.98,500000.0,inactive
3,CHEMBL66879,O=C(O)/C=C/c1ccc(O)cc1,164.047344,2,12,12,0,1,1,2,2,2.772026,223.0,1.49,57.53,3000000.0,inactive
4,CHEMBL77085,N#CC(C#N)=Cc1cc(O)ccc1[N+](=O)[O-],215.033091,2,16,16,0,1,1,1,5,3.386653,441.0,1.73096,110.95,96000.0,inactive


In [12]:
# Persist the combined table; the row index is not written to the file.
df.to_csv(path_or_buf='CSV Files/EGFR_Feature_Extraction.csv', index=False)