In [1]:
import os
import pickle
import pandas as pd
import re

# Compiled once instead of on every call. The pattern is a raw string so that
# regex escapes such as \[ and \( are not first interpreted as (invalid)
# Python string escapes, which newer interpreters warn about.
# Alternatives, in order: bracket atoms ([nH], [O-], ...), B/Br, C/Cl, the
# remaining single-letter atoms, branch/bond/punctuation symbols,
# two-digit %NN ring closures, and single digits.
_SMILES_TOKEN_RE = re.compile(
    r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|_|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
)


def tokenizer(smile):
    """Split a SMILES string into a list of tokens.

    Parameters
    ----------
    smile : str
        SMILES string to tokenize.

    Returns
    -------
    list of str
        The tokens in order of appearance. Joining them reproduces ``smile``
        exactly. An empty string yields an empty list.

    Raises
    ------
    AssertionError
        If the joined tokens differ from ``smile``, i.e. the input contains
        characters that no alternative of the pattern matches.
    """
    tokens = _SMILES_TOKEN_RE.findall(smile)
    # Raised explicitly rather than via an `assert` statement so the
    # round-trip check still runs when Python is started with -O.
    if smile != ''.join(tokens):
        raise AssertionError("{} could not be joined".format(smile))
    return tokens

In [2]:
def build_vocab_from_csv(csv_path, vocab_path, input_col='SMILES'):
    """
    Build a SMILES token vocabulary from a single CSV file and pickle it.

    Every non-null value of ``input_col`` is tokenized with ``tokenizer``.
    Each distinct token receives the next integer index, starting at 1, in
    order of first appearance. Rows that fail to tokenize are reported on
    stdout and skipped.

    Parameters:
    -----------
    csv_path : str
        Path to the CSV file (raw data containing all SMILES).
    vocab_path : str
        Path the vocabulary dictionary is pickled to. Missing parent
        directories are created.
    input_col : str
        Name of the SMILES column (default: 'SMILES').

    Returns:
    --------
    dict
        Mapping from token (str) to index (int, starting at 1).

    Raises:
    -------
    FileNotFoundError
        If ``csv_path`` does not exist.
    ValueError
        If ``input_col`` is not a column of the CSV.
    """
    print(f'Building vocabulary dictionary from CSV file: {csv_path}')

    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"CSV file not found: {csv_path}")

    # Read the CSV file.
    df = pd.read_csv(csv_path)
    if input_col not in df.columns:
        raise ValueError(f"Input column '{input_col}' not found in CSV. Available columns: {df.columns.tolist()}")

    # dropna() already removes missing entries, so no per-row NaN check is
    # needed in the loop below.
    smiles_list = df[input_col].dropna().tolist()
    print(f'  Loaded {len(smiles_list)} SMILES from {csv_path}')

    # Build the vocabulary.
    char_dict = {}

    for smi in smiles_list:
        try:
            tokens = tokenizer(smi)
        except Exception as e:
            # Best effort: report the offending row and keep going. This also
            # covers non-string cells (e.g. a numeric column).
            print(f'  Error tokenizing SMILES: {smi}, error: {e}')
            continue
        for tok in tokens:
            # The next free index is always len(char_dict) + 1 because indices
            # are handed out consecutively from 1; existing tokens are kept.
            char_dict.setdefault(tok, len(char_dict) + 1)

    # Save the vocabulary. os.path.dirname() returns '' for a bare file name
    # and os.makedirs('') raises, so only create a directory when one is named.
    vocab_dir = os.path.dirname(vocab_path)
    if vocab_dir:
        os.makedirs(vocab_dir, exist_ok=True)
    with open(vocab_path, 'wb') as f:
        pickle.dump(char_dict, f)

    print(f'Vocabulary saved to {vocab_path}')
    print(f'Vocabulary size: {len(char_dict)}')
    return char_dict

In [3]:
# --- Configuration -----------------------------------------------------------
# Location of the raw data CSV.
raw_csv_path = '/home/rlawlsgurjh/hdd/work/ChEMBLv2/data/selectivity_processed/Ki_selectivity.csv'

dataset_name = 'selectivity'
task_name = 'Ki'
# Where the pickled vocabulary is stored.
vocab_path = f'./data/{dataset_name}/{task_name}/smiles_char_dict.pkl'
# Name of the SMILES column in the CSV.
input_col = 'SMILES'

# --- Banner ------------------------------------------------------------------
print(
    '=' * 60,
    'Building Vocabulary from Raw Data',
    '=' * 60,
    f'Input CSV: {raw_csv_path}',
    f'Output vocabulary: {vocab_path}',
    '=' * 60,
    sep='\n',
)

# --- Build the vocabulary, or reuse the pickle from an earlier run -----------
if not os.path.exists(vocab_path):
    smilesVoc = build_vocab_from_csv(
        csv_path=raw_csv_path,
        vocab_path=vocab_path,
        input_col=input_col,
    )
    print('\nVocabulary created successfully!')
    print(f'Vocabulary: {smilesVoc}')
else:
    # An existing pickle is reused as-is; it is not checked against the CSV.
    print(f'\nVocabulary already exists at {vocab_path}')
    print('Loading existing vocabulary...')
    with open(vocab_path, 'rb') as f:
        smilesVoc = pickle.load(f)
    print(f'Vocabulary size: {len(smilesVoc)}')

Building Vocabulary from Raw Data
Input CSV: /home/rlawlsgurjh/hdd/work/ChEMBLv2/data/selectivity_processed/Ki_selectivity.csv
Output vocabulary: ./data/selectivity/Ki/smiles_char_dict.pkl

Vocabulary already exists at ./data/selectivity/Ki/smiles_char_dict.pkl
Loading existing vocabulary...
Vocabulary size: 39


In [4]:
# Optional: inspect the vocabulary (skipped when `smilesVoc` is not defined).
if 'smilesVoc' in locals():
    print(
        '\nVocabulary Statistics:',
        f'Total vocabulary size: {len(smilesVoc)}',
        '\nFirst 20 tokens:',
        sep='\n',
    )
    # Order (token, index) pairs by their assigned index.
    sorted_vocab = sorted(smilesVoc.items(), key=lambda pair: pair[1])
    for token, idx in sorted_vocab[:20]:
        print(f'  {token}: {idx}')


Vocabulary Statistics:
Total vocabulary size: 39

First 20 tokens:
  C: 1
  [S+]: 2
  (: 3
  [O-]: 4
  ): 5
  c: 6
  1: 7
  -: 8
  2: 9
  n: 10
  3: 11
  F: 12
  [nH]: 13
  Cl: 14
  N: 15
  =: 16
  O: 17
  I: 18
  #: 19
  4: 20
