In [None]:
from rdkit import Chem
import numpy as np
import pandas as pd
import os
import time

def preprocess_smiles(smiles_list, canonical=True, show_progress=True):
    """
    Parse a list of SMILES strings with RDKit and return the valid ones.

    Args:
        smiles_list: List of SMILES strings to process.
        canonical: If True, each valid entry is replaced by the SMILES that
            RDKit writes back out for it; if False, the original string is
            kept unchanged.
        show_progress: Whether to print progress updates and a final summary.

    Returns:
        List of valid SMILES strings in input order. Entries that
        ``Chem.MolFromSmiles`` cannot parse are dropped (and counted in the
        printed summary when ``show_progress`` is True).
    """
    clean_smiles = []
    invalid_count = 0
    total = len(smiles_list)

    start_time = time.time()

    if show_progress:
        print(f"Processing {total} SMILES strings...")

    # Batching does not affect the result; it only sets how often a progress
    # line is printed.
    batch_size = 5000
    for i in range(0, total, batch_size):
        for smi in smiles_list[i:i + batch_size]:
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                invalid_count += 1
            elif canonical:
                clean_smiles.append(Chem.MolToSmiles(mol, canonical=True))
            else:
                clean_smiles.append(smi)

        if show_progress:
            # Clamp to total so the last (partial) batch does not report more
            # items than exist. total > 0 inside the loop, so progress > 0.
            processed = min(i + batch_size, total)
            progress = processed / total * 100
            elapsed = time.time() - start_time
            # Linear extrapolation from the time spent so far.
            est_total = elapsed / progress * 100
            remaining = est_total - elapsed

            print(f"Progress: {progress:.1f}% ({processed}/{total}) | "
                  f"Valid: {len(clean_smiles)} | Invalid: {invalid_count} | "
                  f"Time remaining: {remaining:.1f}s")

    if show_progress:
        valid_count = len(clean_smiles)
        # Guard the percentages so an empty input does not divide by zero.
        valid_pct = valid_count / total * 100 if total else 0.0
        invalid_pct = invalid_count / total * 100 if total else 0.0
        print(f"Completed in {time.time() - start_time:.1f}s")
        print(f"Total SMILES: {total}")
        print(f"Valid SMILES: {valid_count} ({valid_pct:.1f}%)")
        print(f"Invalid SMILES: {invalid_count} ({invalid_pct:.1f}%)")

    return clean_smiles

def process_smi_file(file_path, output_path=None):
    """Clean a .smi file and save the valid SMILES to a new file.

    Each non-empty line of ``file_path`` that does not start with ``//``
    (after stripping surrounding whitespace) is passed to
    ``preprocess_smiles``; the surviving SMILES are written one per line.

    Args:
        file_path: Path of the input file to read.
        output_path: Path of the file to write. If None, ``_cleaned`` is
            inserted before the input file's extension
            (e.g. ``a.smi`` -> ``a_cleaned.smi``).

    Returns:
        The list of valid SMILES returned by ``preprocess_smiles``.
    """
    with open(file_path, 'r') as f:
        # Strip once and filter on the stripped text, so that an indented
        # '//' comment line is skipped just like an unindented one.
        stripped_lines = (line.strip() for line in f)
        smiles_list = [s for s in stripped_lines if s and not s.startswith('//')]

    print(f"Loaded {len(smiles_list)} SMILES from {file_path}")
    clean_smiles = preprocess_smiles(smiles_list)

    if output_path is None:
        base_name, ext = os.path.splitext(file_path)
        output_path = f"{base_name}_cleaned{ext}"

    with open(output_path, 'w') as f:
        f.writelines(f"{smi}\n" for smi in clean_smiles)

    print(f"Saved {len(clean_smiles)} valid SMILES to {output_path}")
    return clean_smiles

# Bind the result to a name: as a bare final expression the notebook would
# echo the entire returned list into the cell output.
clean_smiles = process_smi_file('./data/chembl.raw.smi')

Loaded 888657 SMILES from ./data/chembl.raw.smi
Processing 888657 SMILES strings...


[17:21:48] SMILES Parse Error: extra close parentheses while parsing: N3C2C(=O)[O-])cc(CCC(=O)O)[n+]1CCc1cccs1
[17:21:48] SMILES Parse Error: check for mistakes around position 14:
[17:21:48] N3C2C(=O)[O-])cc(CCC(=O)O)[n+]1CCc1cccs1
[17:21:48] ~~~~~~~~~~~~~^
[17:21:48] SMILES Parse Error: Failed parsing SMILES ' for input: '


Progress: 1.1% (10000/888657) | Valid: 9999 | Invalid: 1 | Time remaining: 296.3s
Progress: 1.7% (15000/888657) | Valid: 14999 | Invalid: 1 | Time remaining: 293.3s
Progress: 2.3% (20000/888657) | Valid: 19999 | Invalid: 1 | Time remaining: 290.6s
Progress: 2.8% (25000/888657) | Valid: 24999 | Invalid: 1 | Time remaining: 288.6s
Progress: 3.4% (30000/888657) | Valid: 29999 | Invalid: 1 | Time remaining: 286.5s
Progress: 3.9% (35000/888657) | Valid: 34999 | Invalid: 1 | Time remaining: 284.7s
Progress: 4.5% (40000/888657) | Valid: 39999 | Invalid: 1 | Time remaining: 282.7s
Progress: 5.1% (45000/888657) | Valid: 44999 | Invalid: 1 | Time remaining: 280.9s
Progress: 5.6% (50000/888657) | Valid: 49999 | Invalid: 1 | Time remaining: 279.0s
Progress: 6.2% (55000/888657) | Valid: 54999 | Invalid: 1 | Time remaining: 277.3s
Progress: 6.8% (60000/888657) | Valid: 59999 | Invalid: 1 | Time remaining: 275.7s
Progress: 7.3% (65000/888657) | Valid: 64999 | Invalid: 1 | Time remaining: 274.1s
Progr

[17:22:51] SMILES Parse Error: extra open parentheses while parsing: Cc1ccc(NC(=O)C2CCCN(S(=O)(=O)c3cccs3)
[17:22:51] SMILES Parse Error: check for mistakes around position 7:
[17:22:51] Cc1ccc(NC(=O)C2CCCN(S(=O)(=O)c3cccs3)
[17:22:51] ~~~~~~^
[17:22:51] SMILES Parse Error: Failed parsing SMILES 'Cc1ccc(NC(=O)C2CCCN(S(=O)(=O)c3cccs3) for input: 'Cc1ccc(NC(=O)C2CCCN(S(=O)(=O)c3cccs3)


Progress: 21.4% (190000/888657) | Valid: 189998 | Invalid: 2 | Time remaining: 232.8s
Progress: 21.9% (195000/888657) | Valid: 194998 | Invalid: 2 | Time remaining: 231.1s
Progress: 22.5% (200000/888657) | Valid: 199998 | Invalid: 2 | Time remaining: 229.4s
Progress: 23.1% (205000/888657) | Valid: 204998 | Invalid: 2 | Time remaining: 227.7s
Progress: 23.6% (210000/888657) | Valid: 209998 | Invalid: 2 | Time remaining: 226.0s
Progress: 24.2% (215000/888657) | Valid: 214998 | Invalid: 2 | Time remaining: 224.3s
Progress: 24.8% (220000/888657) | Valid: 219998 | Invalid: 2 | Time remaining: 222.6s
Progress: 25.3% (225000/888657) | Valid: 224998 | Invalid: 2 | Time remaining: 221.0s
Progress: 25.9% (230000/888657) | Valid: 229998 | Invalid: 2 | Time remaining: 219.3s
Progress: 26.4% (235000/888657) | Valid: 234998 | Invalid: 2 | Time remaining: 217.6s
Progress: 27.0% (240000/888657) | Valid: 239998 | Invalid: 2 | Time remaining: 215.9s
Progress: 27.6% (245000/888657) | Valid: 244998 | Inva

[17:24:16] SMILES Parse Error: extra open parentheses while parsing: COc1ccc(NC(=O)c2c
[17:24:16] SMILES Parse Error: check for mistakes around position 8:
[17:24:16] COc1ccc(NC(=O)c2c
[17:24:16] ~~~~~~~^
[17:24:16] SMILES Parse Error: Failed parsing SMILES 'COc1ccc(NC(=O)c2c for input: 'COc1ccc(NC(=O)c2c


Progress: 50.1% (445000/888657) | Valid: 444997 | Invalid: 3 | Time remaining: 147.7s
Progress: 50.6% (450000/888657) | Valid: 449997 | Invalid: 3 | Time remaining: 146.0s
Progress: 51.2% (455000/888657) | Valid: 454997 | Invalid: 3 | Time remaining: 144.4s
Progress: 51.8% (460000/888657) | Valid: 459997 | Invalid: 3 | Time remaining: 142.7s
Progress: 52.3% (465000/888657) | Valid: 464997 | Invalid: 3 | Time remaining: 141.0s
Progress: 52.9% (470000/888657) | Valid: 469997 | Invalid: 3 | Time remaining: 139.4s
Progress: 53.5% (475000/888657) | Valid: 474997 | Invalid: 3 | Time remaining: 137.7s
Progress: 54.0% (480000/888657) | Valid: 479997 | Invalid: 3 | Time remaining: 136.1s
Progress: 54.6% (485000/888657) | Valid: 484997 | Invalid: 3 | Time remaining: 134.4s
Progress: 55.1% (490000/888657) | Valid: 489997 | Invalid: 3 | Time remaining: 132.7s
Progress: 55.7% (495000/888657) | Valid: 494997 | Invalid: 3 | Time remaining: 131.1s
Progress: 56.3% (500000/888657) | Valid: 499997 | Inva

[17:25:31] SMILES Parse Error: syntax error while parsing: COc1cc(-c2ccc3[nH]cc(C(C)=O)c(=
[17:25:31] SMILES Parse Error: check for mistakes around position 32:
[17:25:31] cc3[nH]cc(C(C)=O)c(=
[17:25:31] ~~~~~~~~~~~~~~~~~~~~^
[17:25:31] SMILES Parse Error: Failed parsing SMILES 'COc1cc(-c2ccc3[nH]cc(C(C)=O)c(= for input: 'COc1cc(-c2ccc3[nH]cc(C(C)=O)c(=


Progress: 76.0% (675000/888657) | Valid: 674996 | Invalid: 4 | Time remaining: 71.1s
Progress: 76.5% (680000/888657) | Valid: 679996 | Invalid: 4 | Time remaining: 69.4s
Progress: 77.1% (685000/888657) | Valid: 684996 | Invalid: 4 | Time remaining: 67.8s
Progress: 77.6% (690000/888657) | Valid: 689996 | Invalid: 4 | Time remaining: 66.1s
Progress: 78.2% (695000/888657) | Valid: 694996 | Invalid: 4 | Time remaining: 64.5s
Progress: 78.8% (700000/888657) | Valid: 699996 | Invalid: 4 | Time remaining: 62.8s
Progress: 79.3% (705000/888657) | Valid: 704996 | Invalid: 4 | Time remaining: 61.1s
Progress: 79.9% (710000/888657) | Valid: 709996 | Invalid: 4 | Time remaining: 59.5s
Progress: 80.5% (715000/888657) | Valid: 714996 | Invalid: 4 | Time remaining: 57.8s
Progress: 81.0% (720000/888657) | Valid: 719996 | Invalid: 4 | Time remaining: 56.1s
Progress: 81.6% (725000/888657) | Valid: 724996 | Invalid: 4 | Time remaining: 54.5s
Progress: 82.1% (730000/888657) | Valid: 729996 | Invalid: 4 | Ti

[17:26:00] SMILES Parse Error: extra open parentheses while parsing: COc1ccc(C2(c3ccc(OC)cc3)NC(=O)N(CC(=O)Nc3ccccc3C(F)(F)F
[17:26:00] SMILES Parse Error: check for mistakes around position 8:
[17:26:00] COc1ccc(C2(c3ccc(OC)cc3)NC(=O)N(CC(=O)Nc3
[17:26:00] ~~~~~~~^
[17:26:00] SMILES Parse Error: extra open parentheses while parsing: COc1ccc(C2(c3ccc(OC)cc3)NC(=O)N(CC(=O)Nc3ccccc3C(F)(F)F
[17:26:00] SMILES Parse Error: check for mistakes around position 32:
[17:26:00] c3ccc(OC)cc3)NC(=O)N(CC(=O)Nc3ccccc3C(F)(
[17:26:00] ~~~~~~~~~~~~~~~~~~~~^
[17:26:00] SMILES Parse Error: Failed parsing SMILES 'COc1ccc(C2(c3ccc(OC)cc3)NC(=O)N(CC(=O)Nc3ccccc3C(F)(F)F for input: 'COc1ccc(C2(c3ccc(OC)cc3)NC(=O)N(CC(=O)Nc3ccccc3C(F)(F)F


Progress: 85.5% (760000/888657) | Valid: 759995 | Invalid: 5 | Time remaining: 42.8s
Progress: 86.1% (765000/888657) | Valid: 764995 | Invalid: 5 | Time remaining: 41.2s
Progress: 86.6% (770000/888657) | Valid: 769995 | Invalid: 5 | Time remaining: 39.5s
Progress: 87.2% (775000/888657) | Valid: 774995 | Invalid: 5 | Time remaining: 37.8s
Progress: 87.8% (780000/888657) | Valid: 779995 | Invalid: 5 | Time remaining: 36.2s
Progress: 88.3% (785000/888657) | Valid: 784995 | Invalid: 5 | Time remaining: 34.5s
Progress: 88.9% (790000/888657) | Valid: 789995 | Invalid: 5 | Time remaining: 32.8s
Progress: 89.5% (795000/888657) | Valid: 794995 | Invalid: 5 | Time remaining: 31.2s
Progress: 90.0% (800000/888657) | Valid: 799995 | Invalid: 5 | Time remaining: 29.5s
Progress: 90.6% (805000/888657) | Valid: 804995 | Invalid: 5 | Time remaining: 27.8s
Progress: 91.1% (810000/888657) | Valid: 809995 | Invalid: 5 | Time remaining: 26.2s
Progress: 91.7% (815000/888657) | Valid: 814995 | Invalid: 5 | Ti

[17:26:36] SMILES Parse Error: syntax error while parsing: CN1CCN(c2cc(=NCc3cccnc3)n3[nH]cc(
[17:26:36] SMILES Parse Error: check for mistakes around position 34:
[17:26:36] NCc3cccnc3)n3[nH]cc(
[17:26:36] ~~~~~~~~~~~~~~~~~~~~^
[17:26:36] SMILES Parse Error: Failed parsing SMILES 'CN1CCN(c2cc(=NCc3cccnc3)n3[nH]cc( for input: 'CN1CCN(c2cc(=NCc3cccnc3)n3[nH]cc(


Progress: 97.3% (865000/888657) | Valid: 864994 | Invalid: 6 | Time remaining: 7.9s
Progress: 97.9% (870000/888657) | Valid: 869994 | Invalid: 6 | Time remaining: 6.2s
Progress: 98.5% (875000/888657) | Valid: 874994 | Invalid: 6 | Time remaining: 4.5s
Progress: 99.0% (880000/888657) | Valid: 879994 | Invalid: 6 | Time remaining: 2.9s
Progress: 99.6% (885000/888657) | Valid: 884994 | Invalid: 6 | Time remaining: 1.2s
Progress: 100.0% (890000/888657) | Valid: 888651 | Invalid: 6 | Time remaining: 0.0s
Completed in 295.7s
Total SMILES: 888657
Valid SMILES: 888651 (100.0%)
Invalid SMILES: 6 (0.0%)
Saved 888651 valid SMILES to ./data/chembl.raw_cleaned.smi


['NCc1cn(S(=O)(=O)c2ccc(N)cc2)c2ccccc12',
 'FC(F)(F)c1nc2ccccc2[nH]c1=NN=Cc1ccccc1',
 'CCN(CC)CC(=O)N(C)c1nc2cc3nc(N(C)C(=O)CN(CC)CC)sc3cc2s1',
 'CCOc1cc2[nH]c(=O)n(CCc3ccc(OC)c(OC)c3)c(=O)c2cc1OCC',
 'COc1cccc(NC(=O)c2[nH]nc3ccccc23)c1',
 'CC(=NN=c1[nH]c(C)c(N=Nc2ccc(Cl)cc2)s1)c1[nH]c(-c2ccccc2)nc1C',
 'O=c1cc(CCc2ccc(C(F)(F)F)cc2)[nH][nH]c1=O',
 'CC(C)(C)C(NC(=O)CC(N)C(=O)N1CCCC1C#N)c1ccccc1',
 'CC(C)c1ccc(-c2cc(-c3ccc4c(c3)CCCC4)nc3[nH][nH]c(=N)c23)cc1',
 'O=C1COc2ccc(NC(=O)C3CCN(c4cccc(Br)c4)CC3)cc2N1',
 'CCN(CC)C(O)c1cc(CC(=O)NCCN2CCOCC2)c(=O)n(CCC2=CCCCC2)c1C',
 'C=c1sc2ccc(O)c3c(O)c4c(O)cccc4c1c23',
 'CC(CCC(C)C)=NNC(=O)COc1ccccc1-c1ccccc1',
 'Cc1c[nH]nc1C1CCCCN1C(=O)Cn1cc(Cl)cn1',
 'COC(=O)CC(NC(=O)C(C)NC(=O)C(NC(=O)CCCCCNC(=O)CCCCC1SCC2NC(=O)NC21)C(C)C)C(=O)CF',
 'CN1c2ccc(NS(=O)(=O)c3cccc(C(F)(F)F)c3)cc2N=C(c2ccc(C(=O)O)cc2)c2cc3c(cc21)C(C)(C)CCC3(C)C',
 'CCOC(=O)CNC(=O)C(C)NC(=O)C(NC(=O)OCc1ccccc1)C(C)C',
 'O=C1c2c(c3c4ccc(O)cc4n(OC4OC(CO)C(O)C(O)C4O)c3c3[nH]c4cc(O)ccc4c23)C