In [1]:
import os
import pandas as pd

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from PIL import Image

In [3]:
from RanDepict import RandomDepictor
import cv2

AttributeError: `np.sctypes` was removed in the NumPy 2.0 release. Access dtypes explicitly instead.

In [None]:
import warnings
# Suppress every Python warning from here on. The filter is installed before
# the rdkit import below, so it is already active when that import runs.
warnings.filterwarnings('ignore')

from rdkit import RDLogger
# Turn off RDKit log output for the loggers matching 'rdApp.*'.
RDLogger.DisableLog('rdApp.*')

In [None]:
class CustomRandomDepictor(RandomDepictor):
    """RandomDepictor that overlays augmented depictions on a fixed set of backgrounds.

    Every public method augments a depiction with ``hand_drawn_augment`` and
    alpha-blends the result onto one or more of the background images listed
    in ``self.BKGS`` (looked up inside ``self.PATH_BKG``).
    """

    # (width, height) handed to cv2.resize for both molecule and background.
    _CANVAS_SIZE = (256, 256)
    # Weight of the molecule image in the blend; the background gets 1 - weight.
    _MOL_WEIGHT = 0.7

    def __init__(self, *args, **kwargs):
        """Forward all arguments to RandomDepictor and register the backgrounds."""
        super().__init__(*args, **kwargs)
        # Set background images
        self.PATH_BKG = self.HERE.joinpath("backgrounds/")
        self.BKGS = [
            'bg-1041.png',
            'bg-12.png',
            'bg-167.png',
            'bg-27.png',
            'bg-378.png',
            'bg-46.png',
            'bg-468.png',
            'bg-635.png',
            'bg-664.png',
            'bg-76.png',
            'bg-799.png',
        ]  # total: 11

    def _check_batch_size(self, K):
        """Raise ValueError unless K backgrounds can be drawn without replacement."""
        if not 0 <= K <= len(self.BKGS):
            raise ValueError(
                f"K must be between 0 and {len(self.BKGS)} "
                f"(number of available backgrounds), got {K}"
            )

    def _load_background(self, filename):
        """Read one background image from PATH_BKG and resize it to the canvas size.

        Raises:
            FileNotFoundError: if OpenCV cannot read the file (cv2.imread
                returns None instead of raising on its own).
        """
        path = os.path.join(os.path.normpath(self.PATH_BKG), filename)
        bkg = cv2.imread(path)
        if bkg is None:
            raise FileNotFoundError(f"Background image could not be read: {path}")
        return cv2.resize(bkg, self._CANVAS_SIZE)

    def _blend(self, mol_aug, filename):
        """Alpha-blend an augmented molecule image onto the named background."""
        bkg = self._load_background(filename)
        weight = self._MOL_WEIGHT
        return cv2.addWeighted(mol_aug, weight, bkg, 1 - weight, gamma=0)

    def to_hand_written_batch(self, depiction, K):
        """Augment ``depiction`` once and blend it onto K distinct backgrounds.

        Args:
            depiction: molecule image as an array.
            K: number of backgrounds to use; they are sampled without
                replacement, so K may not exceed ``len(self.BKGS)``.
        Returns:
            list with K blended images, one per sampled background.
        Raises:
            ValueError: if K is negative or larger than the number of backgrounds.
        """
        self._check_batch_size(K)

        # Augment molecule image
        mol_aug = self.hand_drawn_augment(depiction)

        # Randomly select the background images and use them as they are.
        # NOTE(review): this draws from NumPy's global random state, not from
        # self.random_choice, so it is presumably not governed by the
        # depictor's seed -- confirm if reproducibility matters.
        backgrounds_selected = np.random.choice(self.BKGS, size=K, replace=False)

        return [self._blend(mol_aug, name) for name in backgrounds_selected]

    def to_hand_written(self, depiction):
        """Augment ``depiction`` and blend it onto one randomly chosen background.

        Args:
            depiction: molecule image as an array.
        Returns:
            the blended image (no further degradation step is applied).
        """
        # Augment molecule image
        mol_aug = self.hand_drawn_augment(depiction)

        # Randomly select background image and use it as it is
        background_selected = self.random_choice(self.BKGS)
        return self._blend(mol_aug, background_selected)

    def hand_drawn_augment(self, img) -> np.ndarray:
        """
        This function randomly applies different image augmentations with
        different probabilities to the input image.

        It has been modified from the original augment.py present on
        https://github.com/mtzgroup/ChemPixCH

        From the publication:
        https://pubs.rsc.org/en/content/articlelanding/2021/SC/D1SC02957F

        Only resize, erode, aspect-ratio and affine are active here; the blur,
        dilate and distort steps are intentionally not applied.

        Args:
            img: the image to modify in array format.
        Returns:
            img: the augmented image, resized to the canvas size if needed.
        """
        # resize
        if self.random_choice(np.arange(0, 1, 0.01)) < 0.5:
            img = self.resize_hd(img)
        # erode
        if self.random_choice(np.arange(0, 1, 0.01)) < 0.4:
            img = self.erode(img)
        # aspect_ratio
        if self.random_choice(np.arange(0, 1, 0.01)) < 0.7:
            img = self.aspect_ratio(img, "mol")
        # affine
        if self.random_choice(np.arange(0, 1, 0.01)) < 0.7:
            img = self.affine(img, "mol")
        # Bring the image back to the canvas size so it can be blended with a
        # background. The expected shape must agree with the resize target
        # (the previous check against 255 could never match it).
        width, height = self._CANVAS_SIZE
        if img.shape != (height, width, 3):
            img = cv2.resize(img, self._CANVAS_SIZE)
        return img

    def to_hand_written_batch_from_smiles(self, smiles, K):
        """Depict ``smiles`` and return K background-blended augmentations.

        Args:
            smiles: SMILES string passed to the depictor call ``self(smiles)``.
            K: number of backgrounds to use (see ``to_hand_written_batch``).
        Returns:
            list of K blended images, or None if depiction, augmentation or
            blending failed for this molecule (best-effort: the caller is
            expected to skip None results).
        Raises:
            ValueError: if K is negative or larger than the number of
                backgrounds. This is checked outside the best-effort guard so
                that a misconfigured K does not silently turn every molecule
                into None.
        """
        self._check_batch_size(K)
        try:
            depiction = self(smiles)
            return self.to_hand_written_batch(depiction, K)
        except Exception:
            # Deliberately broad: any per-molecule failure means "no image".
            return None

In [None]:
# Single shared depictor instance used for every molecule below.
depictor = CustomRandomDepictor(
    hand_drawn=True,
    seed=2025,
)

In [None]:
# Task identifier; it is also the name of the task's folder under data/Classification.
TASK_NAME = 'clintox+FDA_APPROVED'

# Both splits sit side by side in the task folder (path is relative to the
# current working directory).
_task_dir = os.path.join('..', '..', 'data', 'Classification', TASK_NAME)

train_csv_path = os.path.join(_task_dir, 'train.csv')
test_csv_path = os.path.join(_task_dir, 'test.csv')

train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [None]:
# Folder that receives the generated images and the per-split CSV files.
_output_parts = ['..', '..', 'data', 'Classification', TASK_NAME, 'Randepict']
output_dir = os.path.join(*_output_parts)
os.makedirs(output_dir, exist_ok=True)

: 

In [None]:
# Create the images directory.
images_dir = os.path.join(output_dir, 'images')
os.makedirs(images_dir, exist_ok=True)

# Number of augmented images per molecule. It does not change between rows,
# so it is set once here instead of on every loop iteration.
k = 1

# Process each dataset split.
for dataset_name, df in [('train', train), ('test', test)]:
    print(f"Processing {dataset_name} dataset...")
    results = []
    n_failed = 0  # molecules for which no image could be generated
    target_col = df.columns[-1]  # the label is taken from the last column

    # Generate the image(s) for each SMILES.
    for _, row in tqdm(df.iterrows(), total=len(df), desc=f"Processing {dataset_name}"):
        smiles = row['SMILES_deepchem']
        image_id = row['IDs_deepchem']

        augmented_images = depictor.to_hand_written_batch_from_smiles(smiles, K=k)
        if augmented_images is None:
            # The depictor signals a per-molecule failure with None; skip the
            # row but keep count so the loss is visible afterwards.
            n_failed += 1
            continue

        for i, arr in enumerate(augmented_images):
            # With a single augmentation the bare ID is used; otherwise each
            # image (and its ID in the CSV) gets an "_<index>" suffix.
            if k == 1:
                image_filename = f'{image_id}.png'
                result_id = image_id
            else:
                image_filename = f'{image_id}_{i}.png'
                result_id = f"{image_id}_{i}"

            Image.fromarray(arr).save(os.path.join(images_dir, image_filename))

            # NOTE(review): this recorded path starts with a single "..",
            # while images_dir above is built from "../.." -- presumably the
            # consumer of the CSV runs from a different directory; confirm.
            relative_path = "../data/Classification/{}/Randepict/images/{}".format(TASK_NAME, image_filename)

            # One result row per saved image.
            results.append({
                'IDs_deepchem': result_id,
                'file_path': relative_path,
                target_col: row[target_col],
                'IDs_decimer': row.get('IDs_decimer', ''),
                'SMILES_deepchem': smiles,
                'SMILES_decimer': row.get('SMILES_decimer', row.get('SMILES', ''))
            })

    if n_failed:
        print(f"{dataset_name}: skipped {n_failed} of {len(df)} molecules (image generation failed)")

    results_df = pd.DataFrame(results)
    csv_output_path = os.path.join(output_dir, f'{dataset_name}_dataset.csv')
    results_df.to_csv(csv_output_path, index=False)

Processing train dataset...


Processing train:   0%|          | 0/1463 [00:00<?, ?it/s]

: 

: 