In [1]:
from rdkit import Chem
import numpy as np
import pandas as pd
import os
import time

def preprocess_smiles(smiles_list, canonical=True, show_progress=True):
    """
    Process a list of SMILES strings and return valid ones

    Each string is parsed with RDKit; strings that fail to parse are counted
    as invalid and dropped. Input order is preserved for the valid entries.

    Args:
        smiles_list: List of SMILES strings to process (may be empty)
        canonical: Whether to convert to canonical SMILES; when False the
            original string is kept unchanged for valid molecules
        show_progress: Whether to show progress updates and final statistics

    Returns:
        List of valid SMILES strings
    """
    clean_smiles = []
    invalid_count = 0
    total = len(smiles_list)

    def percent_of_total(count):
        # Guard against an empty input list so the summary never divides by zero.
        return count / total * 100 if total else 0.0

    start_time = time.time()

    if show_progress:
        print(f"Processing {total} SMILES strings...")

    # Process in batches of 5000 for progress reporting
    batch_size = 5000
    for i in range(0, total, batch_size):
        for smi in smiles_list[i:i + batch_size]:
            mol = Chem.MolFromSmiles(smi)
            # MolFromSmiles signals a parse failure by returning None.
            if mol is None:
                invalid_count += 1
                continue
            if canonical:
                clean_smiles.append(Chem.MolToSmiles(mol, canonical=True))
            else:
                clean_smiles.append(smi)

        # The first batch (i == 0) is not reported, matching the original output.
        if show_progress and i > 0:
            # Clamp so the final, partial batch does not report more than `total`.
            processed = min(i + batch_size, total)
            progress = percent_of_total(processed)
            elapsed = time.time() - start_time
            # Linear extrapolation: assume the remaining items take as long
            # per item as the ones processed so far.
            est_total = elapsed / progress * 100
            remaining = est_total - elapsed

            print(f"Progress: {progress:.1f}% ({processed}/{total}) | "
                  f"Valid: {len(clean_smiles)} | Invalid: {invalid_count} | "
                  f"Time remaining: {remaining:.1f}s")

    # Final stats
    if show_progress:
        valid_count = len(clean_smiles)
        print(f"Completed in {time.time() - start_time:.1f}s")
        print(f"Total SMILES: {total}")
        print(f"Valid SMILES: {valid_count} ({percent_of_total(valid_count):.1f}%)")
        print(f"Invalid SMILES: {invalid_count} ({percent_of_total(invalid_count):.1f}%)")

    return clean_smiles

def process_smi_file(file_path, output_path=None):
    """Process a .smi file and save valid SMILES

    Every non-blank line of the input is treated as one SMILES entry, except
    lines whose first non-whitespace characters are '//', which are skipped
    as comments. The entries are validated with preprocess_smiles() and the
    surviving strings are written one per line.

    Args:
        file_path: Path of the .smi file to read
        output_path: Where to write the cleaned SMILES. When None, the output
            is placed next to the input with "_cleaned" inserted before the
            extension (e.g. "a.smi" -> "a_cleaned.smi")

    Returns:
        List of valid SMILES strings that were written to the output file
    """
    # Read the file
    with open(file_path, 'r') as f:
        # Strip first, then filter, so indented '//' comment lines are also
        # recognised instead of being passed on as SMILES.
        stripped_lines = (line.strip() for line in f)
        smiles_list = [
            entry for entry in stripped_lines
            if entry and not entry.startswith('//')
        ]

    print(f"Loaded {len(smiles_list)} SMILES from {file_path}")

    # Process SMILES
    clean_smiles = preprocess_smiles(smiles_list)

    # Save results with "cleaned" in filename
    if output_path is None:
        base_name, ext = os.path.splitext(file_path)
        output_path = f"{base_name}_cleaned{ext}"

    with open(output_path, 'w') as f:
        # One buffered writelines call instead of one write per SMILES.
        f.writelines(f"{smi}\n" for smi in clean_smiles)

    print(f"Saved {len(clean_smiles)} valid SMILES to {output_path}")
    return clean_smiles

# Clean the sample file. No output_path is given, so the valid SMILES are
# written to ./data/chembl.mini_cleaned.smi; the returned list is the cell's
# last expression and is therefore echoed in the notebook output.
process_smi_file('./data/chembl.mini.smi')

Loaded 171548 SMILES from ./data/chembl.mini.smi
Processing 171548 SMILES strings...
Progress: 5.8% (10000/171548) | Valid: 10000 | Invalid: 0 | Time remaining: 54.3s
Progress: 8.7% (15000/171548) | Valid: 15000 | Invalid: 0 | Time remaining: 52.2s
Progress: 11.7% (20000/171548) | Valid: 20000 | Invalid: 0 | Time remaining: 50.4s
Progress: 14.6% (25000/171548) | Valid: 25000 | Invalid: 0 | Time remaining: 48.7s
Progress: 17.5% (30000/171548) | Valid: 30000 | Invalid: 0 | Time remaining: 46.9s
Progress: 20.4% (35000/171548) | Valid: 35000 | Invalid: 0 | Time remaining: 45.2s
Progress: 23.3% (40000/171548) | Valid: 40000 | Invalid: 0 | Time remaining: 43.6s
Progress: 26.2% (45000/171548) | Valid: 45000 | Invalid: 0 | Time remaining: 41.9s
Progress: 29.1% (50000/171548) | Valid: 50000 | Invalid: 0 | Time remaining: 40.2s
Progress: 32.1% (55000/171548) | Valid: 55000 | Invalid: 0 | Time remaining: 38.6s
Progress: 35.0% (60000/171548) | Valid: 60000 | Invalid: 0 | Time remaining: 36.9s
Prog

['CC1(COC(C(=O)Nc2cccc(Cl)c2)=C(C=N)N2CCN(S(=O)(=O)NCc3nc4cc(Cl)ccc4o3)CC2)CC1',
 'CC(=O)C(NC(=O)C(CCCCNC(C)=S)NC(=O)C(CC(=O)O)NC(=O)C(N)CO)C(O)NC(C)C(=O)O',
 'Cc1cc(S(=O)(=O)NC(C)c2nnc3ccccn23)ccc1Br',
 'Cc1cc(=NC(=O)c2ccc3c(-c4nc5ccccc5[nH]4)[nH]nc3c2)[nH][nH]1',
 'O=C1CC(c2ccc(Br)cc2)Nc2c(Br)cc(Br)cc21',
 'CCCCC1=NC2(CCN(C(C)=O)CC2)C(=O)N1Cc1ccc(-c2ccccc2C(=O)O)cc1',
 'CCOC(Cc1ccc(OCC=C(C)c2ccc(-c3ccc(C(C)=CCOc4ccc(CC(OCC)C(=O)O)cc4)cc3)cc2)cc1)C(=O)O',
 'Cc1ccc(C(=O)CSc2ncc3c(n2)-c2ccccc2N(Cc2ccc(Cl)cc2)S3(=O)=O)cc1',
 'CC(NCC1OC(CO)C(O)C1O)c1ccc(Cl)cc1',
 'O=C(N=c1[nH]cc(Cc2ccccc2)s1)c1ccccc1',
 'CC(C)c1cc2c(cc1O)C1(C)CCCC(C)(C)C1CC2CC1=CCC(O)(C(C)C)CC1',
 'O=C(Cn1cnc(-c2ccccc2)cc1=O)N=c1cc[nH]cc1',
 'Clc1ccc(CN2CCN(CCCOc3cccc4cccnc34)CC2)cc1',
 'O=C(O)C=CC1CC=NCC1',
 'N=c1nc2ccnc(-c3ccccc3)n2[nH]1',
 'O=C(N=c1cc[nH]cc1)C(=O)c1cn(-c2cccnc2)c2ccccc12',
 'CCON=C1CCN(c2c(F)cc3c(=O)c(C(=O)O)cn(CCF)c3c2F)CC1(C)CN',
 'c1coc(CN(Cc2cccs2)Cc2nnnn2C2CCCCC2)c1',
 'O=C(NC(=S)NC1CCCCC1)c1cccc(