<a href="https://colab.research.google.com/github/jyryu3161/DrugDiscovery/blob/main/Lec12_chemprop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 데이터셋 확보

In [None]:
# Download the BBBP dataset archive.
# The URL must be quoted: unquoted, the shell treats `&` as "run in background" and the
# query string is cut off before the `dl=...` parameter. `dl=1` asks Dropbox for the file itself.
!wget -O bbbp.zip "https://www.dropbox.com/scl/fi/6kh2l30lxqh9l5y9kg9uj/bbbp.zip?rlkey=9aksfwk9tuem1jtq4cl4f9cfw&dl=1"
# Extract the archive; -o overwrites existing files so re-running this cell does not stop at a prompt.
!unzip -o bbbp.zip

--2025-06-19 04:08:03--  https://www.dropbox.com/scl/fi/6kh2l30lxqh9l5y9kg9uj/bbbp.zip?rlkey=9aksfwk9tuem1jtq4cl4f9cfw
Resolving www.dropbox.com (www.dropbox.com)... 162.125.3.18, 2620:100:6018:18::a27d:312
Connecting to www.dropbox.com (www.dropbox.com)|162.125.3.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://uc973dc2396812d292129e4f3785.dl.dropboxusercontent.com/cd/0/inline/Cr6lSpJXrNY1cuLjX824txU0Y7sZ3jHs0_8A9k_sf_5J0KdxQcBV4-G8sX307aTtNYHamI3H5UiUZ8nHILFAGFHIGQqtFeBSoP90uTJjWTkyiGXuVBmAcGg4OkiCzzAFs6FgpOfUtPsU6E4FdXMhHjJi/file# [following]
--2025-06-19 04:08:03--  https://uc973dc2396812d292129e4f3785.dl.dropboxusercontent.com/cd/0/inline/Cr6lSpJXrNY1cuLjX824txU0Y7sZ3jHs0_8A9k_sf_5J0KdxQcBV4-G8sX307aTtNYHamI3H5UiUZ8nHILFAGFHIGQqtFeBSoP90uTJjWTkyiGXuVBmAcGg4OkiCzzAFs6FgpOfUtPsU6E4FdXMhHjJi/file
Resolving uc973dc2396812d292129e4f3785.dl.dropboxusercontent.com (uc973dc2396812d292129e4f3785.dl.dropboxusercontent.com)... 162.125.3.15, 2620:100:6

### 관련 패키지 설치

In [None]:
# Install into the running kernel's environment (%pip rather than !pip).
# chemprop is constrained to the 2.x series because the `chemprop train` / `chemprop predict`
# sub-command CLI used later in this notebook is the v2 interface; molvs is pinned to the
# version this notebook was last run with.
%pip install -q "chemprop>=2,<3" "molvs==0.1.1"

Collecting molvs
  Downloading MolVS-0.1.1.tar.gz (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.9/61.9 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: molvs
  Building wheel for molvs (setup.py) ... [?25l[?25hdone
  Created wheel for molvs: filename=MolVS-0.1.1-py3-none-any.whl size=32374 sha256=b32a278711be55120291cabd106de9e1c471d2661de6b920922c11a70ac0db88
  Stored in directory: /root/.cache/pip/wheels/26/62/a4/147e264c789d9a4e8495f2c3d60f702425761f875bcfdf44e0
Successfully built molvs
Installing collected packages: molvs
Successfully installed molvs-0.1.1


### Preprocessing

In [None]:
import pandas as pd
from rdkit import Chem
from molvs import Standardizer
from molvs.charge import Reionizer
from molvs.tautomer import TautomerCanonicalizer

from rdkit import RDLogger
import time

# Set RDKit's logger level to ERROR so that lower-severity messages
# (such as deprecation warnings) are hidden; error messages are still shown.
RDLogger.logger().setLevel(RDLogger.ERROR)

def _molvs_tools():
    """Return a shared ``(Standardizer, Reionizer)`` pair, creating it on first use."""
    tools = getattr(_molvs_tools, "_cached", None)
    if tools is None:
        tools = (Standardizer(), Reionizer())
        _molvs_tools._cached = tools
    return tools


def preprocess_smiles(smiles):
    """
    Standardize a single SMILES string.

    The string is parsed with RDKit, passed through the MolVS standardizer and
    reionizer, and written back out as a SMILES string.

    :param smiles: SMILES string to preprocess
    :return: the preprocessed SMILES string, or None when the input is not a
             non-empty string or RDKit cannot parse it
    """
    # Missing CSV cells arrive as float NaN (or None); RDKit would raise on them,
    # and an empty string would parse to an empty molecule. Treat all as invalid.
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    # Convert to an RDKit molecule object
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None  # invalid SMILES

    # Reuse one Standardizer/Reionizer instead of rebuilding them for every row
    standardizer, reionizer = _molvs_tools()

    # Apply MolVS standardization, then reionization
    mol = standardizer.standardize(mol)
    mol = reionizer.reionize(mol)

    # Convert back to SMILES
    return Chem.MolToSmiles(mol)

def preprocess_dataset(input_file, output_file):
    """
    Preprocess the SMILES of a CSV file and save the surviving rows to a new CSV.

    The input must contain a ``smiles`` column. Every value in it is passed through
    ``preprocess_smiles`` and the result is stored in a ``processed_smiles`` column;
    rows whose result is missing are dropped before the frame is written out
    (without the index).

    :param input_file: path of the CSV file to read
    :param output_file: path of the CSV file to write
    :raises ValueError: if the input file has no ``smiles`` column
    """
    raw = pd.read_csv(input_file)

    # Guard clause: nothing can be done without the SMILES column
    if 'smiles' not in raw.columns:
        raise ValueError("CSV 파일에 'smiles' 열이 없습니다.")

    cleaned = (
        raw
        .assign(processed_smiles=raw['smiles'].apply(preprocess_smiles))
        .dropna(subset=['processed_smiles'])  # discard rows with invalid SMILES
    )

    cleaned.to_csv(output_file, index=False)

# Run the preprocessing for every dataset split
datasets = ['train', 'valid', 'test']

for dataset in datasets:
    print(f"Processing {dataset} dataset...")
    # Read ./bbbp/<split>.csv and write <split>_processed.csv to the working directory
    preprocess_dataset(
        input_file=f"./bbbp/{dataset}.csv",
        output_file=f"{dataset}_processed.csv",
    )
    print(f"{dataset} dataset processing completed.")

print("All datasets have been processed.")

Processing train dataset...


[04:20:56] Explicit valence for atom # 1 N, 4, is greater than permitted
[04:20:56] Explicit valence for atom # 6 N, 4, is greater than permitted
[04:20:57] Explicit valence for atom # 6 N, 4, is greater than permitted
[04:20:57] Explicit valence for atom # 11 N, 4, is greater than permitted
[04:20:58] Explicit valence for atom # 5 N, 4, is greater than permitted
[04:21:01] Can't kekulize mol.  Unkekulized atoms: 12 14


train dataset processing completed.
Processing valid dataset...
valid dataset processing completed.
Processing test dataset...


[04:21:02] Explicit valence for atom # 12 N, 4, is greater than permitted
[04:21:02] Explicit valence for atom # 5 N, 4, is greater than permitted


test dataset processing completed.
All datasets have been processed.


## Chemprop

https://chemprop.readthedocs.io/en/main/index.html

In [None]:

def _to_chemprop_frame(df, split_name):
    """
    Build the two-column (``smiles``, ``label``) frame written for chemprop.

    The ``smiles`` column of the result holds the *preprocessed* SMILES
    (``processed_smiles``), not the raw input strings, so the standardization done
    earlier is what the model is trained and evaluated on.

    :param df: frame read from a ``*_processed.csv`` file
    :param split_name: name of the split, used only in the error message
    :return: new frame with columns ``smiles`` and ``label``
    :raises ValueError: if ``processed_smiles`` or the label column is missing
    """
    # The label column may be called 'p_np' in the source data; normalize it to 'label'
    if 'p_np' in df.columns:
        df = df.rename(columns={'p_np': 'label'})

    missing = [col for col in ('processed_smiles', 'label') if col not in df.columns]
    if missing:
        raise ValueError(
            f"{split_name}_processed.csv is missing required column(s): {missing}"
        )

    return df[['processed_smiles', 'label']].rename(columns={'processed_smiles': 'smiles'})


# Read the preprocessed CSV files and reduce them to the columns chemprop needs
train_dataset = _to_chemprop_frame(pd.read_csv('train_processed.csv'), 'train')
valid_dataset = _to_chemprop_frame(pd.read_csv('valid_processed.csv'), 'valid')
test_dataset = _to_chemprop_frame(pd.read_csv('test_processed.csv'), 'test')

# Save the chemprop-ready data back to CSV files
train_dataset.to_csv('train_processed_chemprop.csv', index=False)
valid_dataset.to_csv('valid_processed_chemprop.csv', index=False)
test_dataset.to_csv('test_processed_chemprop.csv', index=False)


In [None]:
# Train a chemprop classification model on train_processed_chemprop.csv,
# with chemprop_outputs as the output directory.
# NOTE(review): only the training CSV is passed here; the valid/test CSVs written earlier are
# not given to this command, so how validation data is held out depends on chemprop's defaults — confirm.
!chemprop train --data-path train_processed_chemprop.csv \
                --task-type classification \
                --output-dir chemprop_outputs


2025-06-19T04:26:17 - INFO:chemprop.cli.main - Running in mode 'train' with args: {'smiles_columns': None, 'reaction_columns': None, 'no_header_row': False, 'num_workers': 0, 'batch_size': 64, 'accelerator': 'auto', 'devices': 'auto', 'rxn_mode': 'REAC_DIFF', 'multi_hot_atom_featurizer_mode': 'V2', 'keep_h': False, 'add_h': False, 'ignore_stereo': False, 'reorder_atoms': False, 'molecule_featurizers': None, 'descriptors_path': None, 'no_descriptor_scaling': False, 'no_atom_feature_scaling': False, 'no_atom_descriptor_scaling': False, 'no_bond_feature_scaling': False, 'no_bond_descriptor_scaling': False, 'atom_features_path': None, 'atom_descriptors_path': None, 'bond_features_path': None, 'bond_descriptors_path': None, 'constraints_path': None, 'constraints_to_targets': None, 'config_path': None, 'data_path': PosixPath('train_processed_chemprop.csv'), 'output_dir': PosixPath('chemprop_outputs'), 'remove_checkpoints': False, 'checkpoint': None, 'freeze_encoder': False, 'model_frzn': Non

In [None]:
# Run prediction on the test set with the trained model:
# reads test_processed_chemprop.csv, loads the checkpoint ./chemprop_outputs/model_0/best.pt,
# and writes the predictions to chemprop_predictions.csv.
!chemprop predict --test-path test_processed_chemprop.csv \
                  --preds-path  chemprop_predictions.csv \
                  --model-path ./chemprop_outputs/model_0/best.pt



2025-06-19T04:33:20 - INFO:chemprop.cli.main - Running in mode 'predict' with args: {'smiles_columns': None, 'reaction_columns': None, 'no_header_row': False, 'num_workers': 0, 'batch_size': 64, 'accelerator': 'auto', 'devices': 'auto', 'rxn_mode': 'REAC_DIFF', 'multi_hot_atom_featurizer_mode': 'V2', 'keep_h': False, 'add_h': False, 'ignore_stereo': False, 'reorder_atoms': False, 'molecule_featurizers': None, 'descriptors_path': None, 'no_descriptor_scaling': False, 'no_atom_feature_scaling': False, 'no_atom_descriptor_scaling': False, 'no_bond_feature_scaling': False, 'no_bond_descriptor_scaling': False, 'atom_features_path': None, 'atom_descriptors_path': None, 'bond_features_path': None, 'bond_descriptors_path': None, 'constraints_path': None, 'constraints_to_targets': None, 'test_path': PosixPath('test_processed_chemprop.csv'), 'output': PosixPath('chemprop_predictions.csv'), 'drop_extra_columns': False, 'model_paths': [PosixPath('chemprop_outputs/model_0/best.pt')], 'cal_path': No

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score, matthews_corrcoef, precision_score, recall_score

def evaluate_model(y, y_pred_proba, threshold=0.5):
    """
    Compute binary-classification metrics from true labels and predicted probabilities.

    :param y: true binary labels (any array-like: list, Series, ndarray)
    :param y_pred_proba: predicted probabilities of the positive class, same length as ``y``
    :param threshold: a probability strictly greater than this is predicted as class 1
                      (default 0.5)
    :return: dict with keys ``ACC``, ``AUROC``, ``MCC``, ``Precision`` and ``Recall``
    """
    # Coerce to arrays so plain lists and pandas Series are accepted as well
    y_true = np.asarray(y)
    proba = np.asarray(y_pred_proba, dtype=float)

    # Turn probabilities into hard 0/1 predictions
    y_pred = (proba > threshold).astype(int)

    return {
        "ACC": accuracy_score(y_true, y_pred),
        "AUROC": roc_auc_score(y_true, proba),  # ROC AUC is computed from the probabilities
        "MCC": matthews_corrcoef(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred),
    }

# Load the ground-truth test set and the predictions written by `chemprop predict`
ground_truth_df = pd.read_csv('test_processed_chemprop.csv')
pred_df = pd.read_csv('chemprop_predictions.csv')

# The two files are paired row by row; a differing row count would attach predictions
# to the wrong molecules, so stop with a clear message instead.
if len(pred_df) != len(ground_truth_df):
    raise ValueError(
        f"Row count mismatch: {len(ground_truth_df)} ground-truth rows vs "
        f"{len(pred_df)} prediction rows"
    )

# In both files the column is named 'label': the ground-truth file holds the true class,
# the predictions file holds the predicted probability.
y = ground_truth_df['label'].values
y_pred_proba = pred_df['label'].values

# Keep the predicted probability next to the true label for inspection
ground_truth_df['pred'] = y_pred_proba

print(evaluate_model(y, y_pred_proba))

ground_truth_df

{'ACC': 0.8385416666666666, 'AUROC': np.float64(0.8950819672131147), 'MCC': np.float64(0.6486876995404717), 'Precision': 0.8181818181818182, 'Recall': 0.9590163934426229}


Unnamed: 0,smiles,label,pred
0,C(Cl)Cl,1,0.879012
1,c1cc2c(cc(CC3=CNC(=NC3=O)NCCSCc3oc(cc3)CN(C)C)...,0,0.677595
2,CCOC(=O)c1cncn1C(C)c2ccccc2,1,0.784569
3,CN(C)c1cc(C2=NC(N)=NN2)ccn1,0,0.909329
4,N1(Cc2cc(OCCCNc3oc4ccccc4n3)ccc2)CCCCC1,1,0.966866
...,...,...,...
187,C1=C([S](N)(=O)=O)C=CC(=C1C(NCC2N(CCC2)CCC)=O)OC,1,0.856452
188,C4=C(C(C3CCN(CCC1=C(N=C2N(C1=O)CCS2)C)CC3)=O)C...,1,0.953386
189,[C@]14([C@](OC(=O)CC)([C@@H](CC1C3[C@@](F)(C2(...,1,0.995810
190,C1=CC=C2C(=C1)SC3=C(N2CC(C)CN(C)C)C=C(C=C3)C(F...,1,0.997778
