In [1]:
import os

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from PIL import Image

In [3]:
from RanDepict import RandomDepictor
import cv2

In [4]:
class CustomRandomDepictor(RandomDepictor):
    def __init__(self, *args, **kwargs):
        super(CustomRandomDepictor, self).__init__(*args, **kwargs)
        # Set background images
        self.PATH_BKG = self.HERE.joinpath("backgrounds/")
        self.BKGS = [
            'bg-1041.png',
            'bg-12.png',
            'bg-167.png',
            'bg-27.png',
            'bg-378.png',
            'bg-46.png',
            'bg-468.png',
            'bg-635.png',
            'bg-664.png',
            'bg-76.png',
            'bg-799.png',
        ] ## total: 11

    
    def to_depiction(self, smiles, shape=(299,299)):
        orig_styles = self._config.styles
        # TODO: add this to depiction feature fingerprint
        if self.random_choice([True] + [False] * 5):
            smiles = self._cdk_add_explicite_hydrogen_to_smiles(smiles)
            self._config.styles = [style for style in orig_styles if style != 'pikachu']
        depiction_functions = self.get_depiction_functions(smiles)
        self._config.styles = orig_styles
        for _ in range(3):
            if len(depiction_functions) != 0:
                # Pick random depiction function and call it
                depiction_function = self.random_choice(depiction_functions)
                depiction = depiction_function(smiles=smiles, shape=shape)
                if depiction is False or depiction is None:
                    depiction_functions.remove(depiction_function)
                else:
                    break
            else:
                return None
        return depiction

    
    def to_hand_written_batch(self, depiction, K):
        # Augment molecule image
        mol_aug = self.hand_drawn_augment(depiction)
        
        # Randomly select background image and use is as it is
        backgrounds_selected = np.random.choice(self.BKGS, size=K, replace=False)

        outputs = []
        for i, background_selected in enumerate(backgrounds_selected):
            bkg = cv2.imread(os.path.join(os.path.normpath(self.PATH_BKG), background_selected))
            bkg = cv2.resize(bkg, (256, 256))
            
            # Combine augmented molecule and augmented background
            p = 0.7
            depiction = cv2.addWeighted(mol_aug, p, bkg, 1 - p, gamma=0)
            outputs.append(depiction)
    
        return outputs
            
    
    def to_hand_written(self, depiction):
        # Augment molecule image
        mol_aug = self.hand_drawn_augment(depiction)

        # Randomly select background image and use is as it is
        backgroud_selected = self.random_choice(self.BKGS)
        bkg = cv2.imread(os.path.join(os.path.normpath(self.PATH_BKG), backgroud_selected))
        bkg = cv2.resize(bkg, (256, 256))
        # Combine augmented molecule and augmented background
        p = 0.7
        mol_bkg = cv2.addWeighted(mol_aug, p, bkg, 1 - p, gamma=0)

        # Degrade total image
        #depiction = self.degrade_img(mol_bkg)
        #return depiction
        return mol_bkg


    def hand_drawn_augment(self, img) -> np.array:
        """
        This function randomly applies different image augmentations with
        different probabilities to the input image.

        It has been modified from the original augment.py present on
        https://github.com/mtzgroup/ChemPixCH

        From the publication:
        https://pubs.rsc.org/en/content/articlelanding/2021/SC/D1SC02957F

        Args:
            img: the image to modify in array format.
        Returns:
            img: the augmented image.
        """
        # resize
        if self.random_choice(np.arange(0, 1, 0.01)) < 0.5:
            img = self.resize_hd(img)
        # blur
        #if self.random_choice(np.arange(0, 1, 0.01)) < 0.4:
        #    img = self.blur(img)
        # erode
        if self.random_choice(np.arange(0, 1, 0.01)) < 0.4:
            img = self.erode(img)
        # dilate
        #if self.random_choice(np.arange(0, 1, 0.01)) < 0.4:
        #    img = self.dilate(img)
        # aspect_ratio
        if self.random_choice(np.arange(0, 1, 0.01)) < 0.7:
            img = self.aspect_ratio(img, "mol")
        # affine
        if self.random_choice(np.arange(0, 1, 0.01)) < 0.7:
            img = self.affine(img, "mol")
        # distort
        #if self.random_choice(np.arange(0, 1, 0.01)) < 0.8:
        #    img = self.distort(img)
        if img.shape != (255, 255, 3):
            img = cv2.resize(img, (256, 256))
        return img

In [5]:
depictor = CustomRandomDepictor(seed=2025, hand_drawn=True)

In [6]:
filename = os.path.join('data', 'filtered_train_1m.csv')

In [7]:
df = pd.read_csv(filename)

In [8]:
df

Unnamed: 0,pubchem_cid,InChI,SMILES,num_atoms
0,7124385,InChI=1S/C17H18ClN3O4S/c1-2-21(15(23)7-8-16(24...,CCN(C1=NC(=CS1)CC(=O)NC2=CC=C(C=C2)Cl)C(=O)CCC...,26
1,10224292,InChI=1S/C23H17Cl2NO3/c24-18-12-17(15-6-7-20-1...,C1=CC=C(C=C1)C[C@H](C(=O)O)OC2=C(C=C(C=C2Cl)C3...,29
2,8237725,InChI=1S/C21H18N2O2/c24-21-18-10-4-5-11-19(18)...,C1=CC=C(C=C1)/C=C/[C@H]2NC3=CC=CC=C3C(=O)N2CC4...,25
3,10447381,InChI=1S/C14H18O7/c1-7(16)8-2-4-9(5-3-8)20-14-...,CC(=O)C1=CC=C(C=C1)O[C@@H]2[C@H]([C@@H]([C@H](...,21
4,12198725,InChI=1S/C12H14N2O3S/c1-10-12(15)17-13-14(10)8...,CC1=[N+](NOC1=O)CCCS(=O)C2=CC=CC=C2,18
...,...,...,...,...
958719,8232987,InChI=1S/C20H22F3N3O/c1-2-25-11-13-26(14-12-25...,CCN1CCN(CC1)C2=CC=CC=C2NC(=O)C3=CC=C(C=C3)C(F)...,27
958720,8334469,InChI=1S/C13H9NO6S/c15-12-9(3-4-19-12)20-13(16...,C1COC(=O)[C@H]1OC(=O)C2=CC3=C(S2)C=CC(=C3)[N+]...,21
958721,4525575,InChI=1S/C30H32FN3O3/c1-2-17-33(30(36)22-37-26...,CCCN(CC(=O)N(CCC1=CNC2=CC=CC=C21)CC3=CC=C(C=C3...,37
958722,10722434,InChI=1S/C33H41NO10/c35-31(36)13-17-40-19-21-4...,C1C[C@H](N(C1)C(=O)OCC2C3=CC=CC=C3C4=CC=CC=C24...,44


In [9]:
output_dir = os.path.join('data', 'filtered_train_1m_randepict')

In [10]:
for cid, smiles in tqdm(df.loc[:,['pubchem_cid', 'SMILES']].values):
    ## load
    arr = depictor.to_depiction(smiles)
    ## randepict
    arrs2 = depictor.to_hand_written_batch(arr, 10)
    for i, arr2 in enumerate(arrs2):
        img2 = Image.fromarray(arr2)
        img2.save(os.path.join(output_dir, f'{cid}_{i}.png'))

  0%|                                                                                                                                                                            | 445/958724 [00:32<19:42:31, 13.51it/s]



  0%|▎                                                                                                                                                                          | 1613/958724 [02:03<28:43:55,  9.25it/s]org.openscience.cdk.stereo.StereoElementFactory WARN: Ignoring inverted up wedge bond connected to atom idx=6
  0%|▍                                                                                                                                                                          | 2325/958724 [03:03<17:18:36, 15.35it/s]org.openscience.cdk.stereo.StereoElementFactory WARN: Ignoring inverted up wedge bond connected to atom idx=7
org.openscience.cdk.stereo.StereoElementFactory WARN: Ignoring inverted up wedge bond connected to atom idx=34
  0%|▍                                                                                                                                                                          | 2565/958724 [03:24<16:10:08, 16.43it/s]org.openscience.cd

StructureError: aromaticity