In [1]:
import os
import pickle
import pandas as pd
import re
import json

# Compiled once at definition time. One alternative per SMILES token class:
# bracket atoms, one/two-letter organic-subset atoms, aromatic atoms, branch
# and bond symbols, two-digit ring closures (%NN) and single-digit ring closures.
# A raw string is used so the regex escapes are not parsed as (invalid) Python
# string escapes; `\\` here matches exactly one literal backslash.
_SMILES_TOKEN_RE = re.compile(
    r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|/|_|:|~|@|\?|>|\*|\$|%[0-9]{2}|[0-9])"
)


def tokenizer(smile):
    """Tokenizes SMILES string.

    Parameters
    ----------
    smile : str
        SMILES string to split into tokens.

    Returns
    -------
    list of str
        Tokens in order of appearance; joining them reproduces ``smile``.

    Raises
    ------
    AssertionError
        If ``smile`` contains characters that no token pattern matches, i.e.
        the concatenated tokens do not reproduce the input.
    """
    tokens = _SMILES_TOKEN_RE.findall(smile)
    # Raised explicitly (rather than via `assert`) so the check still runs
    # when Python is started with -O; the exception type is unchanged.
    if smile != ''.join(tokens):
        raise AssertionError("{} could not be joined".format(smile))
    return tokens

In [2]:
def build_vocab_from_csv(csv_path, pkl_path, json_path, input_col='SMILES'):
    """
    Generate vocabulary dictionary from csv

    Reads the SMILES column of a CSV file, tokenizes every entry and assigns
    each distinct token an integer index in order of first appearance. The
    digits '1'-'9' and the token 'unk' are appended if they were not seen.
    The resulting dictionary is saved as both a pickle and a JSON file.

    Parameters:
    -----------
    csv_path : str
        Path of the CSV file to read.
    pkl_path : str
        Destination of the pickled vocabulary; parent directory is created
        if needed.
    json_path : str
        Destination of the JSON vocabulary; parent directory is created
        if needed.
    input_col : str
        Name of the CSV column holding the SMILES strings.

    Returns:
    --------
    dict
        Mapping from token to integer index (indices start at 1).

    Raises:
    -------
    FileNotFoundError
        If ``csv_path`` does not exist.
    ValueError
        If ``input_col`` is not a column of the CSV.
    """
    print(f'Building vocabulary dictionary from CSV file: {csv_path}')

    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"CSV file not found: {csv_path}")

    df = pd.read_csv(csv_path)
    if input_col not in df.columns:
        raise ValueError(f"Input column '{input_col}' not found in CSV. Available columns: {df.columns.tolist()}")

    # dropna() already removes missing entries, so no per-item NaN check is needed.
    smiles_list = df[input_col].dropna().tolist()
    print(f'  Loaded {len(smiles_list)} SMILES from {csv_path}')

    char_dict = {}
    # Indices start at 1; 0 is never assigned (presumably reserved for padding
    # by the consumer of this vocabulary -- TODO confirm).
    char_idx = 1

    for smi in smiles_list:
        try:
            tokens = tokenizer(smi)
        except Exception as e:
            # Best effort: report entries that cannot be tokenized and move on.
            print(f'  Error tokenizing SMILES: {smi}, error: {e}')
            continue
        for tok in tokens:
            if tok not in char_dict:
                char_dict[tok] = char_idx
                char_idx += 1

    # Make sure every single-digit ring-closure label 1-9 has an index even
    # if it never occurred in this dataset.
    for num in '123456789':
        if num not in char_dict:
            char_dict[num] = char_idx
            char_idx += 1
            print(f'  Added number {num} to vocabulary')

    if 'unk' not in char_dict:
        char_dict['unk'] = char_idx
        char_idx += 1
        print(f'  Added unk token to vocabulary (idx: {char_idx-1})')

    # os.path.dirname() returns '' for a bare file name and os.makedirs('')
    # raises FileNotFoundError, so only create a directory when there is one.
    for out_path in (pkl_path, json_path):
        parent_dir = os.path.dirname(out_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    with open(pkl_path, 'wb') as f:
        pickle.dump(char_dict, f)

    with open(json_path, 'w') as f:
        json.dump(char_dict, f, indent=4)

    print(f'Vocabulary size: {len(char_dict)}')
    return char_dict

In [3]:
# NOTE(review): absolute, machine-specific path -- the build branch below only
# works where this file exists; consider a path relative to a data directory.
raw_csv_path = '/home/rlawlsgurjh/hdd/work/ChEMBLv2/data/selectivity_processed/Ki_selectivity.csv'

dataset_name = 'selectivity'
task_name = 'Ki'
pkl_path = f'./data/{dataset_name}/{task_name}/smiles_char_dict.pkl'
json_path = f'./data/{dataset_name}/{task_name}/smiles_char_dict.json'
input_col = 'SMILES'

# Reuse a previously saved vocabulary when one exists; otherwise build it from
# the raw CSV. A loaded vocabulary is topped up with the digits 1-9 and 'unk'
# (appended after the current highest index) and re-saved if anything was added.
if os.path.exists(pkl_path):
    print(f'\nVocabulary already exists at {pkl_path}')
    print('Loading existing vocabulary...')
    # NOTE: pickle.load can execute arbitrary code; only load vocabulary
    # files produced by this notebook or another trusted source.
    with open(pkl_path, 'rb') as f:
        smilesVoc = pickle.load(f)
    print(f'Vocabulary size: {len(smilesVoc)}')

    max_idx = max(smilesVoc.values()) if smilesVoc else 0
    updated = False
    for num in '123456789':
        if num not in smilesVoc:
            max_idx += 1
            smilesVoc[num] = max_idx
            updated = True
            print(f'  Added number {num} to vocabulary (idx: {max_idx})')

    if 'unk' not in smilesVoc:
        max_idx += 1
        smilesVoc['unk'] = max_idx
        updated = True
        print(f'  Added unk token to vocabulary (idx: {max_idx})')

    if updated:
        print('\nUpdating vocabulary file with missing numbers...')
        # Guard against an empty dirname (bare file name), for which
        # os.makedirs would raise.
        for out_path in (pkl_path, json_path):
            parent_dir = os.path.dirname(out_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
        with open(pkl_path, 'wb') as f:
            pickle.dump(smilesVoc, f)
        with open(json_path, 'w') as f:
            # indent=4 matches the JSON layout written by build_vocab_from_csv,
            # so the file format does not depend on which branch produced it.
            json.dump(smilesVoc, f, indent=4)
        print(f'Vocabulary updated! New size: {len(smilesVoc)}')
else:
    smilesVoc = build_vocab_from_csv(
        csv_path=raw_csv_path,
        pkl_path=pkl_path,
        json_path=json_path,
        input_col=input_col
    )
    print('\nVocabulary created successfully!')
    print(f'Vocabulary: {smilesVoc}')


Vocabulary already exists at ./data/selectivity/Ki/smiles_char_dict.pkl
Loading existing vocabulary...
Vocabulary size: 42


In [4]:
# Print a short summary of the vocabulary: its size and the 20 tokens with
# the lowest indices. Skipped entirely when no vocabulary has been loaded.
if 'smilesVoc' in locals():
    print('\nVocabulary Statistics:')
    print(f'Total vocabulary size: {len(smilesVoc)}')
    print(f'\nFirst 20 tokens:')

    def _index_of(entry):
        # entry is a (token, index) pair; order by the index.
        return entry[1]

    sorted_vocab = sorted(smilesVoc.items(), key=_index_of)
    head_of_vocab = sorted_vocab[:20]
    for token, idx in head_of_vocab:
        print(f'  {token}: {idx}')


Vocabulary Statistics:
Total vocabulary size: 42

First 20 tokens:
  C: 1
  [S+]: 2
  (: 3
  [O-]: 4
  ): 5
  c: 6
  1: 7
  -: 8
  2: 9
  n: 10
  3: 11
  F: 12
  [nH]: 13
  Cl: 14
  N: 15
  =: 16
  O: 17
  I: 18
  #: 19
  4: 20
