In [None]:
from rdkit import Chem
from rdkit.Chem import Descriptors

import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the filtered Enamine primary-amine SDF and collect the molecules.
# Entries that the supplier yields as None are skipped, so `mols` may be
# shorter than the number of records in the file.
supplier = Chem.SDMolSupplier('filtered_enamine_primary_amines.sdf')

mols = []
for mol in supplier:
    if mol is None:
        continue
    mols.append(mol)

In [3]:
# Build the base table: one row per molecule, holding its catalog number and
# the SMILES string RDKit generates for it (no fingerprints are computed here).
# GetProp raises KeyError if a record lacks the 'Catalog_ID' property.
catalog_ids = []
smiles = []
for mol in mols:
    catalog_ids.append(mol.GetProp('Catalog_ID'))
    smiles.append(Chem.MolToSmiles(mol))

df = pd.DataFrame({'Catalog_ID': catalog_ids, 'SMILES': smiles})

In [4]:
# Compute the RDKit descriptor set for every molecule, keyed by catalog number.
# If two molecules share a Catalog_ID, the later one replaces the earlier entry.
descriptors_dict = {
    mol.GetProp('Catalog_ID'): Descriptors.CalcMolDescriptors(mol)
    for mol in mols
}

In [5]:
# Attach the descriptors to the identifier table.
#
# descriptors_dict maps Catalog_ID -> descriptor values; orient='index' turns
# each key into a row, and the index is then exposed as a 'Catalog_ID' column
# so it can serve as the join key.
desc_df = (
    pd.DataFrame.from_dict(descriptors_dict, orient='index')
    .rename_axis('Catalog_ID')
    .reset_index()
)

# Merge from the two identifier columns only, so this cell is safe to re-run:
# merging into a `df` that already carries descriptor columns would make pandas
# emit duplicated '<name>_x' / '<name>_y' columns. A left join keeps every row
# of the identifier table, in its original order.
df = df[['Catalog_ID', 'SMILES']].merge(desc_df, on='Catalog_ID', how='left')

In [6]:
# Drop count-style features in a single pass. A column is removed when its name
#   - starts with "fr"   (fragment counts),
#   - contains "Num"     (counts of various properties), or
#   - contains "Count"   (counts of various properties).
column_names = df.columns
is_count_like = (
    column_names.str.startswith('fr')
    | column_names.str.contains('Num')
    | column_names.str.contains('Count')
)
df = df.loc[:, ~is_count_like]

In [7]:
# Persist the filtered descriptor table; the row index is not written out.
df.to_csv(path_or_buf='enamine_primary_amines_with_descriptors.csv', index=False)