# AqSolDB prep
Load the raw AqSolDB dataset, canonicalize the SMILES strings, perform a scaffold split, and save the resulting artifacts for downstream models.


In [2]:
import os, sys
from pathlib import Path

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold

# Add the sibling ../src directory to the import path before importing the
# project helpers (presumably where chem_utils lives — confirm). The path is
# relative, so it is resolved against the working directory at run time.
sys.path.append(os.path.join("..", "src"))
from chem_utils import canonicalize_and_clean, load_aqsoldb, save_split_indices, scaffold_or_random_split

# Raw AqSolDB CSV handed to load_aqsoldb below (relative path).
DATA = Path("../data/aqsoldb.csv")
# Seed forwarded to scaffold_or_random_split.
SEED = 42


In [3]:
# Read the raw table and pass it straight through the cleaning helper, then
# report the remaining row count and write the cleaned frame to disk
# (index column omitted).
df = canonicalize_and_clean(load_aqsoldb(DATA))
print("N after cleaning:", len(df))
df.to_csv("../data/aqsoldb_clean.csv", index=False)




N after cleaning: 6110




In [4]:
# Scaffold split (falls back to random if needed), seeded with SEED.
train_idx, val_idx, test_idx = scaffold_or_random_split(df, seed=SEED)

# Persist the raw index arrays first, then one CSV per partition.
save_split_indices(train_idx, val_idx, test_idx, "../artifacts")

# NOTE(review): .loc selects rows by index *label*; confirm the split helper
# returns labels of df's index rather than positional offsets.
split_outputs = (
    (train_idx, "../data/train.csv"),
    (val_idx, "../data/val.csv"),
    (test_idx, "../data/test.csv"),
)
for split_rows, out_path in split_outputs:
    df.loc[split_rows].to_csv(out_path, index=False)
print("Saved splits.")




Saved splits.


