In [1]:
import os
import sys
import pandas as pd

from tqdm import tqdm
from multiprocessing import Pool, cpu_count

import pubchempy as pcp
from pubchempy import PubChemHTTPError

In [2]:
ROOT_DIR = os.path.join('/home', 'rlawlsgurjh', 'hdd', 'work', 'RxnFlow')

DATA_DIR = os.path.join(ROOT_DIR, 'data', 'building_blocks')

SMI_PATH = os.path.join(DATA_DIR, 'enamine_catalog.smi')

CHECKPOINT_PATH = os.path.join(DATA_DIR, 'iupac_results.csv')

In [3]:
def fetch_iupac(smi):
    try:
        mol = pcp.get_compounds(smi, "smiles")
        if mol and mol[0].iupac_name:
            return ("success", smi, mol[0].iupac_name)
        return ("error", smi, "No result from PubChem")
    except PubChemHTTPError as e:
        if e.msg.startswith('PUGREST.NotAuthorized'):
            return ("http_401", smi, str(e))
        return ("error", smi, str(e))
    except Exception as e:
        return ("error", smi, str(e))

In [4]:
def save_checkpoint(success_list, error_list, success_path, error_path):
    if success_list:
        success_df = pd.DataFrame(success_list)
        success_df.to_csv(success_path, mode='a', index=False, header=not os.path.exists(success_path))
        success_list.clear()

    if error_list:
        error_df = pd.DataFrame(error_list)
        error_df.to_csv(error_path, mode='a', index=False, header=not os.path.exists(error_path))
        error_list.clear()

In [5]:
def check_iupac_parallel(
    smi_path, 
    success_path="iupac_success.csv",
    error_path="iupac_error.csv", 
    checkpoint_interval=1000, 
    num_cpus=None
    ):
    """
    Reads a .smi file and processes it in parallel to fetch IUPAC names.
    Stops all processes immediately if a 401 Unauthorized error is received.
    """
    if num_cpus is None:
        # Default to using all available CPU cores
        num_cpus = cpu_count()
    print(f"Using {num_cpus} CPU cores.")

    # Parse SMILES strings to be processed from the input file
    smiles_to_process = []
    try:
        with open(smi_path, "r") as f:
            for line in f:
                parts = line.strip().split()
                if parts and parts[0]:
                    smiles_to_process.append(parts[0])
    except FileNotFoundError:
        print(f"Error: File not found at {smi_path}")
        return

    # Buffers for checkpointing and counters for tracking progress
    success_buffer = []
    error_buffer = []
    total_success_count = 0
    total_error_count = 0
    processed_count = 0

    # Create a pool of worker processes
    with Pool(processes=num_cpus) as pool:
        # Use imap_unordered for efficient, non-blocking iteration
        results_iterator = pool.imap_unordered(fetch_iupac, smiles_to_process)
        
        try:
            for result in tqdm(results_iterator, total=len(smiles_to_process), desc="Processing SMILES -> IUPAC"):
                processed_count += 1
                status, smi, data = result

                # Detect and handle 401 Unauthorized error
                if status == "http_401":
                    # Calculate final counts before exiting
                    current_successes = total_success_count + len(success_buffer)
                    # The current 401 error itself is counted as a failure
                    current_errors = total_error_count + len(error_buffer) + 1
                    
                    print("\n" + "="*60, file=sys.stderr)
                    print("FATAL ERROR: Received 401 Unauthorized from PubChem API.", file=sys.stderr)
                    print("This could be due to a temporary IP ban or API access issues.", file=sys.stderr)
                    print("Stopping all processes to prevent further issues.", file=sys.stderr)
                    print("-" * 60, file=sys.stderr)
                    print("PROCESS SUMMARY:", file=sys.stderr)
                    print(f"  Total molecules processed before stop: {processed_count}", file=sys.stderr)
                    print(f"  Successful lookups: {current_successes}", file=sys.stderr)
                    print(f"  Failed lookups: {current_errors}", file=sys.stderr)
                    print("="*60, file=sys.stderr)
                    
                    # Add the molecule that caused the 401 error to the error buffer
                    error_buffer.append({'smiles': smi, 'reason': f'FATAL_401_ERROR: {data}'})
                    
                    # Save the last checkpoint before exiting
                    print("\nSaving last checkpoint before exiting...")
                    save_checkpoint(success_buffer, error_buffer, success_path, error_path)
                    
                    # Immediately terminate all worker processes
                    pool.terminate()
                    return
                # 2. 'ServerBusy' 오류를 감지하는 새로운 조건 추가
                elif status == "error" and 'ServerBusy' in str(data):
                    # 최종 카운트 계산
                    current_successes = total_success_count + len(success_buffer)
                    current_errors = total_error_count + len(error_buffer) + 1 # 현재 오류 포함
                    
                    print("\n" + "="*60, file=sys.stderr)
                    print("API LIMIT ERROR: Received 'ServerBusy' from PubChem API.", file=sys.stderr)
                    print("This is likely due to sending too many requests too quickly.", file=sys.stderr)
                    print("Stopping all processes to avoid a potential IP ban.", file=sys.stderr)
                    print("-" * 60, file=sys.stderr)
                    print("PROCESS SUMMARY:", file=sys.stderr)
                    print(f"  Total molecules processed before stop: {processed_count}", file=sys.stderr)
                    print(f"  Successful lookups: {current_successes}", file=sys.stderr)
                    print(f"  Failed lookups: {current_errors}", file=sys.stderr)
                    print("="*60, file=sys.stderr)
                    
                    error_buffer.append({'smiles': smi, 'reason': f'API_LIMIT_ERROR: {data}'})
                    
                    print("\nSaving last checkpoint before exiting...")
                    save_checkpoint(success_buffer, error_buffer, success_path, error_path)
                    
                    pool.terminate()
                    return
                elif status == "success":
                    success_buffer.append({'smiles': smi, 'iupac': data})
                else: # 'error' status
                    error_buffer.append({'smiles': smi, 'reason': data})

                # Save to file when the buffer reaches the checkpoint interval
                if len(success_buffer) + len(error_buffer) >= checkpoint_interval:
                    total_success_count += len(success_buffer)
                    total_error_count += len(error_buffer)
                    save_checkpoint(success_buffer, error_buffer, success_path, error_path)
        
        except KeyboardInterrupt:
            print("\nProcess interrupted by user. Terminating pool and saving progress...")
            pool.terminate() # Stop worker processes
            pool.join()      # Wait for them to terminate
            save_checkpoint(success_buffer, error_buffer, success_path, error_path)
            print("Progress saved. Exiting.")
            return

    # After the loop finishes, save any remaining data in the buffers
    save_checkpoint(success_buffer, error_buffer, success_path, error_path)

    print("\nProcessing complete.")
    print(f"Success results saved to: {success_path}")
    print(f"Error logs saved to: {error_path}")

In [6]:
check_iupac_parallel(
    smi_path=SMI_PATH,
    success_path=os.path.join(DATA_DIR, "success.csv"),
    error_path=os.path.join(DATA_DIR, "error.csv"),
    checkpoint_interval=100,
    num_cpus=4
)

Using 4 CPU cores.


Processing SMILES -> IUPAC:   0%|          | 0/1499332 [00:00<?, ?it/s]
API LIMIT ERROR: Received 'ServerBusy' from PubChem API.
This is likely due to sending too many requests too quickly.
Stopping all processes to avoid a potential IP ban.
------------------------------------------------------------
PROCESS SUMMARY:
  Total molecules processed before stop: 1
  Successful lookups: 0
  Failed lookups: 1
Processing SMILES -> IUPAC:   0%|          | 0/1499332 [00:00<?, ?it/s]


Saving last checkpoint before exiting...





In [7]:
print("\n--- Success Results ---")
if os.path.exists(os.path.join(DATA_DIR, "test_success.csv")):
    print(pd.read_csv(os.path.join(DATA_DIR, "test_success.csv")))


--- Success Results ---
                                            smiles  \
0           COc1cc(C=O)ccc1OCC(=O)c1ccc(Cl)c(Cl)c1   
1                 CCCCn1c(S)nc2cc(C(=O)OC)ccc2c1=O   
2                  CCOc1cc(C=O)ccc1OC(C)C(=O)N(C)C   
3      C=CCn1c(S)nnc1-c1cc(-c2ccc(C)cc2)nc2ccccc12   
4                             COc1cc(Cl)ccc1C(=O)O   
...                                            ...   
89768                [N-]=[N+]=NCCc1ccc2cc[nH]c2c1   
89769                                CCSCC(C)(C)CO   
89770                          Cn1cnc(CC2(N)CC2)c1   
89771                       CC(C)Oc1ccc(C(C)ON)cc1   
89772                         CC(C)Oc1ccc(CCON)cc1   

                                                   iupac  
0      4-[2-(3,4-dichlorophenyl)-2-oxoethoxy]-3-metho...  
1      methyl 3-butyl-4-oxo-2-sulfanylidene-1H-quinaz...  
2      2-(2-ethoxy-4-formylphenoxy)-N,N-dimethylpropa...  
3      3-[2-(4-methylphenyl)quinolin-4-yl]-4-prop-2-e...  
4                         4-chl

In [8]:
print("\n--- Error Results ---")
if os.path.exists(os.path.join(DATA_DIR, "test_error.csv")):
    print(pd.read_csv(os.path.join(DATA_DIR, "test_error.csv")))


--- Error Results ---
                                  smiles                  reason
0        O=c1[nH][nH]cc2c3cc(F)ccc3nc1-2  No result from PubChem
1       O=c1[nH][nH]cc2c3cc(Cl)ccc3nc1-2  No result from PubChem
2            O=C(/C=C/[O-])c1nc2ccccc2s1  No result from PubChem
3           S=c1[nH][nH]cc2c3ccccc3nc1-2  No result from PubChem
4                      O=C(C=C[O-])C1CC1  No result from PubChem
...                                  ...                     ...
256426       Cc1ccc(CC(C)C(=O)O)c(C)c1Br    'PUGREST.ServerBusy'
256427           Cc1ccc(C(C)CCN)c(C)c1Br    'PUGREST.ServerBusy'
256428             Cc1ccc(CCC#N)c(C)c1Br    'PUGREST.ServerBusy'
256429                 CC(CBr)c1ccn(C)n1    'PUGREST.ServerBusy'
256430                 Cn1ccc(C(O)CBr)n1    'PUGREST.ServerBusy'

[256431 rows x 2 columns]


In [27]:
success_df = pd.read_csv(os.path.join(DATA_DIR, "test_success.csv"))
error_df = pd.read_csv(os.path.join(DATA_DIR, "test_error.csv"))

In [28]:
print(len(success_df['iupac'].unique()))
print(len(error_df['reason'].unique()))

88380
4


In [29]:
print(error_df['reason'].unique())

['No result from PubChem' "'PUGREST.ServerBusy'" "'Proxy Error'"
 '<urlopen error [Errno 101] Network is unreachable>']


In [35]:
print(success_df.shape)
print(f"{len(success_df['iupac'].unique())/len(success_df):.3f}%")

(89773, 2)
0.984%


In [34]:
unique_success_df = success_df['iupac'].unique()

filter_error_df = error_df[~error_df['reason'].str.contains('ServerBusy')]

total_df = pd.concat([success_df, filter_error_df])

ratio = len(success_df['iupac'].unique())/len(total_df)

print(error_df.shape)
print(filter_error_df.shape)
print(total_df.shape)
print(f"{ratio:.3f}%")

(256431, 2)
(1947, 2)
(91720, 3)
0.964%


In [33]:
filter_error_df['reason'].value_counts()

reason
No result from PubChem                                1938
<urlopen error [Errno 101] Network is unreachable>       5
'Proxy Error'                                            4
Name: count, dtype: int64