# 1. Import Libraries and Tools and Create Helper Functions

In [2]:
from chembl_webresource_client.new_client import new_client
import pandas as pd
import numpy as np
from typing import List, Optional, Tuple, Dict
from tqdm.auto import tqdm
from rdkit import Chem, DataStructs
from rdkit.Chem import Descriptors, rdMolDescriptors, AllChem, MACCSkeys, DataStructs
from rdkit.Chem.Descriptors import MolLogP

In [3]:
def find_hiv_targets(client, search_term="protease"):
    """
    Search ChEMBL for HIV related targets.

    The ChEMBL target endpoint is queried with ``search_term``; every hit is
    flattened to one row and then kept only if "hiv" / "human immunodeficiency"
    occurs in its component text or organism, or if "hiv" or the search term
    itself occurs in its preferred name. All matching is case-insensitive.

    Parameters
    ----------
    client : chembl_webresource_client.new_client
        A ChEMBL new_client instance (anything exposing ``client.target.search``).
    search_term : str, optional
        Broad search term (default: "protease"). It is matched literally,
        so regex metacharacters in it have no special meaning.

    Returns
    -------
    candidates : pd.DataFrame
        Filtered DataFrame of likely HIV targets with the columns
        chembl_id, pref_name, organism, and component_text. Empty (but with
        the same columns) when the search returns nothing.
    """
    import re  # local import: only needed to escape the user-supplied term

    columns = ['chembl_id', 'pref_name', 'organism', 'component_text']

    rows = []
    for hit in client.target.search(search_term):
        # Collect component descriptions and synonyms; each piece is prefixed
        # with a single space when joined.
        text_parts = []
        for comp in hit.get('target_components') or []:
            text_parts.append(comp.get('component_description') or '')
            for syn in comp.get('target_component_synonyms') or []:
                text_parts.append(syn.get('component_synonym') or '')

        rows.append({
            'chembl_id': hit.get('target_chembl_id'),
            'pref_name': hit.get('pref_name') or '',
            'organism': hit.get('organism') or '',
            'component_text': ''.join(' ' + part for part in text_parts),
        })

    # columns= keeps the schema stable even when the search returned no hits
    targets_df = pd.DataFrame(rows, columns=columns).drop_duplicates().reset_index(drop=True)

    if targets_df.empty:
        candidates = targets_df
    else:
        hiv_pattern = 'hiv|human immunodeficiency'
        # Escape the term so it is matched literally; skip it when empty,
        # otherwise the empty alternative would match every row.
        name_terms = ['hiv']
        if search_term:
            name_terms.append(re.escape(search_term))
        name_pattern = '|'.join(name_terms)

        # filter for HIV / search-term related targets
        mask = (
            targets_df['component_text'].str.contains(hiv_pattern, case=False, na=False) |
            targets_df['pref_name'].str.contains(name_pattern, case=False, na=False) |
            targets_df['organism'].str.contains(hiv_pattern, case=False, na=False)
        )
        candidates = targets_df[mask].reset_index(drop=True)

    print(f"Found {len(candidates)} candidate target(s) mentioning 'HIV' or '{search_term}'.")
    if len(candidates) == 0:
        print(f"No obvious HIV hits found. Try search terms like 'HIV-1 {search_term}' or by known accession.")
    return candidates[columns]

# 2. Curate Data for HIV-1 integrase inhibitors

In [4]:
# ==============================================================================
# Search targets and show likely HIV integrase candidates
# ==============================================================================
# find_hiv_targets prints a one-line hit count and returns the filtered table,
# which is shown below so the target IDs can be picked by eye.
candidates = find_hiv_targets(new_client, search_term="integrase")
display(candidates)

Found 9 candidate target(s) mentioning 'HIV' or 'integrasee'.


Unnamed: 0,chembl_id,pref_name,organism,component_text
0,CHEMBL3463,Human immunodeficiency virus type 2 integrase,Human immunodeficiency virus 2,Integrase pol Integrase
1,CHEMBL2366505,Integrase,Human immunodeficiency virus 1,Gag-Pol polyprotein Gag-Pol polyprotein Pr160...
2,CHEMBL4296304,Integrase,Human immunodeficiency virus 1,Gag-Pol polyprotein int Gag-Pol polyprotein P...
3,CHEMBL3471,Human immunodeficiency virus type 1 integrase,Human immunodeficiency virus 1,Gag-Pol polyprotein pol Gag-Pol polyprotein P...
4,CHEMBL5823,Gag-Pol polyprotein,Human immunodeficiency virus type 1 group M su...,Gag-Pol polyprotein gag-pol Gag-Pol polyprote...
5,CHEMBL3638326,Gag-Pol polyprotein,Human immunodeficiency virus type 1 group M su...,Gag-Pol polyprotein gag-pol Gag-Pol polyprote...
6,CHEMBL3638331,Gag-Pol polyprotein,Human immunodeficiency virus type 1 group M su...,Gag-Pol polyprotein gag-pol Gag-Pol polyprote...
7,CHEMBL3638352,Gag-Pol polyprotein,Human immunodeficiency virus type 1 group M su...,Gag-Pol polyprotein gag-pol Gag-Pol polyprote...
8,CHEMBL3638360,Gag-Pol polyprotein,Human immunodeficiency virus type 1 group M su...,Gag-Pol polyprotein gag-pol Gag-Pol polyprote...


In [5]:
# ================================================
# Fetch IC50 activities for chosen target(s)
# ================================================
activity_client = new_client.activity

# Three ChEMBL target entries for HIV-1 integrase (one named
# "Human immunodeficiency virus type 1 integrase", two named "Integrase")
target_ids = ['CHEMBL3471', 'CHEMBL2366505', 'CHEMBL4296304']

all_acts = []
for tid in target_ids:
    print("Downloading activities for", tid)
    # fetch activities filtered by standard_type=IC50 (iterable)
    acts_iter = activity_client.filter(target_chembl_id=tid, standard_type='IC50')
    acts = list(acts_iter)  # convert to list (this is the step that pulls every record)
    print(" -> returned", len(acts), "rows")
    # remember which target ID each record was requested under
    for a in acts:
        a['_queried_target'] = tid
    all_acts.extend(acts)

print("Total activity rows collected:", len(all_acts))
acts_df = pd.DataFrame(all_acts)
# quick glance
display(acts_df.head())


Downloading activities for CHEMBL3471
 -> returned 8000 rows
Downloading activities for CHEMBL2366505
 -> returned 1 rows
Downloading activities for CHEMBL4296304
 -> returned 0 rows
Total activity rows collected: 8001


Unnamed: 0,action_type,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,...,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value,_queried_target
0,,,32538,[],CHEMBL701719,"Inhibition of HIV-1 integrase, under 1 uM for ...",B,,,BAO_0000190,...,Human immunodeficiency virus type 1 integrase,11676,,,IC50,uM,UO_0000065,,9.0,CHEMBL3471
1,,,34946,[],CHEMBL701719,"Inhibition of HIV-1 integrase, under 1 uM for ...",B,,,BAO_0000190,...,Human immunodeficiency virus type 1 integrase,11676,,,IC50,uM,UO_0000065,,1.4,CHEMBL3471
2,,,34947,[],CHEMBL701720,"Tested for inhibition of HIV-1 integrase, unde...",B,,,BAO_0000190,...,Human immunodeficiency virus type 1 integrase,11676,,,IC50,uM,UO_0000065,,1.0,CHEMBL3471
3,,,34948,[],CHEMBL701719,"Inhibition of HIV-1 integrase, under 1 uM for ...",B,,,BAO_0000190,...,Human immunodeficiency virus type 1 integrase,11676,,,IC50,uM,UO_0000065,,1.7,CHEMBL3471
4,,,34949,[],CHEMBL701720,"Tested for inhibition of HIV-1 integrase, unde...",B,,,BAO_0000190,...,Human immunodeficiency virus type 1 integrase,11676,,,IC50,uM,UO_0000065,,1.0,CHEMBL3471


In [6]:
# List every field present in the downloaded activity records
acts_df.columns

Index(['action_type', 'activity_comment', 'activity_id', 'activity_properties',
       'assay_chembl_id', 'assay_description', 'assay_type',
       'assay_variant_accession', 'assay_variant_mutation', 'bao_endpoint',
       'bao_format', 'bao_label', 'canonical_smiles', 'data_validity_comment',
       'data_validity_description', 'document_chembl_id', 'document_journal',
       'document_year', 'ligand_efficiency', 'molecule_chembl_id',
       'molecule_pref_name', 'parent_molecule_chembl_id', 'pchembl_value',
       'potential_duplicate', 'qudt_units', 'record_id', 'relation', 'src_id',
       'standard_flag', 'standard_relation', 'standard_text_value',
       'standard_type', 'standard_units', 'standard_upper_value',
       'standard_value', 'target_chembl_id', 'target_organism',
       'target_pref_name', 'target_tax_id', 'text_value', 'toid', 'type',
       'units', 'uo_units', 'upper_value', 'value', '_queried_target'],
      dtype='object')

In [7]:
# Keep only the fields used for curation: compound identity and structure,
# the standardized activity, assay/target context, and the data-quality flags.
cols = ['molecule_chembl_id', 'canonical_smiles', 'standard_type', 'standard_value',
        'standard_units', 'pchembl_value', 'assay_chembl_id', 'assay_description',
        'target_chembl_id', 'target_pref_name', 'target_organism', 'standard_flag',
        'potential_duplicate', 'data_validity_comment', 'data_validity_description']
ii_df = acts_df[cols]
ii_df.head()

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_type,standard_value,standard_units,pchembl_value,assay_chembl_id,assay_description,target_chembl_id,target_pref_name,target_organism,standard_flag,potential_duplicate,data_validity_comment,data_validity_description
0,CHEMBL324842,O=C(/C=C/c1ccc(O)c(O)c1)O[C@H](Cc1ccc(O)c(O)c1...,IC50,9000.0,nM,5.05,CHEMBL701719,"Inhibition of HIV-1 integrase, under 1 uM for ...",CHEMBL3471,Human immunodeficiency virus type 1 integrase,Human immunodeficiency virus 1,1,1,,
1,CHEMBL304722,O=C(CC(O)CCc1ccc(O)c(O)c1)O[C@H]1Cc2cc(O)c(O)c...,IC50,1400.0,nM,5.85,CHEMBL701719,"Inhibition of HIV-1 integrase, under 1 uM for ...",CHEMBL3471,Human immunodeficiency virus type 1 integrase,Human immunodeficiency virus 1,1,0,,
2,CHEMBL304722,O=C(CC(O)CCc1ccc(O)c(O)c1)O[C@H]1Cc2cc(O)c(O)c...,IC50,1000.0,nM,6.0,CHEMBL701720,"Tested for inhibition of HIV-1 integrase, unde...",CHEMBL3471,Human immunodeficiency virus type 1 integrase,Human immunodeficiency virus 1,1,0,,
3,CHEMBL67076,O=C(CCc1ccc(O)c(O)c1)c1ccc(O)c(O)c1O,IC50,1700.0,nM,5.77,CHEMBL701719,"Inhibition of HIV-1 integrase, under 1 uM for ...",CHEMBL3471,Human immunodeficiency virus type 1 integrase,Human immunodeficiency virus 1,1,0,,
4,CHEMBL67076,O=C(CCc1ccc(O)c(O)c1)c1ccc(O)c(O)c1O,IC50,1000.0,nM,6.0,CHEMBL701720,"Tested for inhibition of HIV-1 integrase, unde...",CHEMBL3471,Human immunodeficiency virus type 1 integrase,Human immunodeficiency virus 1,1,0,,


In [8]:
# Remove rows where standard_value is not a number
# (to_numeric with errors='coerce' turns anything unparseable, including
# missing values, into NaN, so isna() flags every unusable row)
non_numeric_mask = pd.to_numeric(ii_df['standard_value'], errors='coerce').isna()
non_numeric_rows = ii_df[non_numeric_mask]

# Show what is about to be discarded before discarding it
print(f"Non-numeric rows: {len(non_numeric_rows)}")
display(non_numeric_rows.head())

# Now drop them
ii_df = ii_df[~non_numeric_mask].reset_index(drop=True)

Non-numeric rows: 115


Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_type,standard_value,standard_units,pchembl_value,assay_chembl_id,assay_description,target_chembl_id,target_pref_name,target_organism,standard_flag,potential_duplicate,data_validity_comment,data_validity_description
782,CHEMBL169404,CC(=O)NS(=O)(=O)c1ccc(NC(=O)c2ccccc2SC(=O)CCCC...,IC50,,nM,,CHEMBL702427,50% Inhibitory activity against Integrase; No ...,CHEMBL3471,Human immunodeficiency virus type 1 integrase,Human immunodeficiency virus 1,0,0,,
783,CHEMBL169259,O=C(CCCC[n+]1ccccc1)Sc1ccccc1C(=O)Nc1ccc(S(=O)...,IC50,,nM,,CHEMBL702427,50% Inhibitory activity against Integrase; No ...,CHEMBL3471,Human immunodeficiency virus type 1 integrase,Human immunodeficiency virus 1,0,0,,
784,CHEMBL169529,CC(=O)NS(=O)(=O)c1ccc(NC(=O)c2ccccc2SC(=O)CCCC...,IC50,,nM,,CHEMBL702427,50% Inhibitory activity against Integrase; No ...,CHEMBL3471,Human immunodeficiency virus type 1 integrase,Human immunodeficiency virus 1,0,0,,
961,CHEMBL96959,Oc1cccc2ccc(/C=C/c3ccc4cccc(O)c4n3)nc12,IC50,,nM,,CHEMBL702102,In vitro anti-HIV integrase activity against i...,CHEMBL3471,Human immunodeficiency virus type 1 integrase,Human immunodeficiency virus 1,0,0,,
968,CHEMBL97679,Oc1ccc(/C=C/c2ccc3ccccc3n2)cc1O,IC50,,nM,,CHEMBL879162,In vitro anti-HIV integrase activity against i...,CHEMBL3471,Human immunodeficiency virus type 1 integrase,Human immunodeficiency virus 1,0,0,,


In [9]:
# Which units are present? Anything other than nM needs conversion later.
ii_df['standard_units'].value_counts()

Unnamed: 0_level_0,count
standard_units,Unnamed: 1_level_1
nM,7819
ug.mL-1,67


In [10]:
# Tally the data-quality flags before filtering on them
print(ii_df['data_validity_comment'].value_counts())
print(ii_df['data_validity_description'].value_counts())
print(ii_df['potential_duplicate'].value_counts())

data_validity_comment
Outside typical range            1126
Potential transcription error       6
Name: count, dtype: int64
data_validity_description
Values for this activity type are unusually large/small, so may not be accurate                            1126
Values appear to be an order of magnitude different from previously reported, so units may be incorrect       6
Name: count, dtype: int64
potential_duplicate
0    7501
1     385
Name: count, dtype: int64


In [11]:
# keep rows where potential_duplicate is not True
# (the flag holds 0/1 here and 1 == True, so this drops the flagged rows)
ii_df = ii_df[ii_df['potential_duplicate'] != True]

# keep rows where data_validity_comment is null/NaN
# (i.e. discard every record that carries a validity comment)
ii_df = ii_df[ii_df['data_validity_comment'].isna()]

ii_df.shape

(6392, 15)

In [12]:
# Re-tally the flags to confirm the filter left no flagged rows behind
print(ii_df['data_validity_comment'].value_counts())
print(ii_df['data_validity_description'].value_counts())
print(ii_df['potential_duplicate'].value_counts())

Series([], Name: count, dtype: int64)
Series([], Name: count, dtype: int64)
potential_duplicate
0    6392
Name: count, dtype: int64


In [13]:
# Checkpoint the filtered activities; section 3 reloads this file
ii_df.to_csv('ii_df.csv', index=False)

# 3. Standardize IC50 and Deduplicate the data

In [14]:
# normalize unit strings
def norm_unit(u):
    """Map the ChEMBL spelling 'ug.mL-1' to 'ug/mL'.

    Missing values (anything pd.isna reports as missing) and every other
    unit string are returned unchanged.
    """
    if pd.isna(u):
        return u
    return 'ug/mL' if u == 'ug.mL-1' else u


# compute mol weight for each unique canonical_smiles
def molwt_from_smiles(smi):
    """Return the RDKit molecular weight (g/mol) for a SMILES string.

    Returns np.nan when the SMILES is missing, cannot be parsed, or RDKit
    raises while processing it.
    """
    if pd.isna(smi):
        return np.nan
    try:
        m = Chem.MolFromSmiles(smi)
        if m is None:
            return np.nan
        return Descriptors.MolWt(m)  # g/mol
    except Exception:
        # Only ordinary errors are treated as "no weight available";
        # KeyboardInterrupt / SystemExit must still stop the run.
        return np.nan


# function to convert a row to nM
def value_to_nM(row):
    """Convert one activity row to a concentration in nM.

    Reads 'standard_value' and 'standard_units_norm' from ``row`` (and
    'mol_wt_g_per_mol' for mass-based units).

    Returns
    -------
    float
        The value in nM, or np.nan when the value is not numeric, the unit is
        missing or unsupported, or a mass unit has no usable molecular weight.
    """
    val = row['standard_value']
    unit = row['standard_units_norm']
    # ensure numeric; only conversion failures count as "not a number"
    try:
        v = float(val)
    except (TypeError, ValueError, OverflowError):
        return np.nan
    if pd.isna(unit):
        return np.nan

    if unit == 'nM':
        return v

    elif unit == 'ug/mL':
        mw = row.get('mol_wt_g_per_mol', np.nan)
        if pd.isna(mw) or mw == 0:
            return np.nan
        # ug/mL == mg/L, so mol/L = v * 1e-3 / MW and nM = v * 1e6 / MW
        return float(v) * 1e6 / float(mw)

    # add more conversions if you see additional units
    return np.nan


# compute pIC50
def to_pIC50(nM):
    """Return pIC50 = 9 - log10(IC50 in nM); np.nan for missing or non-positive input."""
    usable = not (pd.isna(nM) or nM <= 0)
    return 9.0 - np.log10(nM) if usable else np.nan

In [24]:
# Reload the checkpoint written at the end of section 2
ii_df = pd.read_csv('ii_df.csv')

In [25]:
# Sanity check: row/column count of the reloaded table
print(ii_df.shape)

(6392, 15)


In [26]:
# Normalise unit labels. No astype(str) here: it would turn a missing unit into
# the literal string 'nan', which norm_unit / value_to_nM / isna() checks
# would then treat as a real (unknown) unit instead of a missing one.
ii_df['standard_units_norm'] = ii_df['standard_units'].apply(norm_unit)

# compute MW once per unique SMILES and map back onto every row
unique_smiles = ii_df['canonical_smiles'].dropna().unique().tolist()
mw_map = {s: molwt_from_smiles(s) for s in unique_smiles}

# add MW column
ii_df['mol_wt_g_per_mol'] = ii_df['canonical_smiles'].map(mw_map)

# convert every activity to nM, then to pIC50
ii_df['IC50_nM'] = ii_df.apply(value_to_nM, axis=1)

ii_df['pIC50'] = ii_df['IC50_nM'].apply(to_pIC50)

# Quick diagnostics: show counts by unit and how many converted successfully
print("Unit value counts:")
print(ii_df['standard_units_norm'].value_counts(dropna=False))
print("\nConversion success counts (IC50_nM not null):")
print(ii_df.groupby('standard_units_norm')['IC50_nM'].apply(lambda s: s.notna().sum()))

# Flag rows where conversion failed (mass units but no MW)
mask_mass_units = ii_df['standard_units_norm'].str.contains('ug|mg', na=False)
failed = ii_df[mask_mass_units & ii_df['IC50_nM'].isna()]
print("\nRows with mass units that could not be converted (need manual inspection):", len(failed))
display(failed.head(10))

Unit value counts:
standard_units_norm
nM       6353
ug/mL      39
Name: count, dtype: int64

Conversion success counts (IC50_nM not null):
standard_units_norm
nM       6353
ug/mL      39
Name: IC50_nM, dtype: int64

Rows with mass units that could not be converted (need manual inspection): 0


Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_type,standard_value,standard_units,pchembl_value,assay_chembl_id,assay_description,target_chembl_id,target_pref_name,target_organism,standard_flag,potential_duplicate,data_validity_comment,data_validity_description,standard_units_norm,mol_wt_g_per_mol,IC50_nM,pIC50


In [27]:
# Unit labels after normalisation
ii_df['standard_units_norm'].value_counts()

Unnamed: 0_level_0,count
standard_units_norm,Unnamed: 1_level_1
nM,6353
ug/mL,39


In [28]:
# Drop rows where standard_units_norm is missing. A missing unit can show up as
# a real NaN or, if the column was cast to str, as the text 'nan' / 'None',
# so all of those are excluded. .copy() gives an independent frame that later
# cells can modify without touching ii_df.
units = ii_df['standard_units_norm']
has_unit = units.notna() & ~units.isin(['None', 'nan', ''])
ii_df_clean = ii_df[has_unit].copy()
ii_df_clean['standard_units_norm'].value_counts()

Unnamed: 0_level_0,count
standard_units_norm,Unnamed: 1_level_1
nM,6353
ug/mL,39


In [29]:
# Remove the raw-unit, quality-flag and molecular-weight helper columns.
# Reassigning (instead of inplace=True) avoids mutating a filtered frame, and
# errors='ignore' lets the cell be re-run after the columns are already gone.
ii_df_clean = ii_df_clean.drop(
    columns=['standard_units', 'standard_flag', 'potential_duplicate',
             'data_validity_comment', 'data_validity_description', 'mol_wt_g_per_mol'],
    errors='ignore',
)

print(ii_df_clean.shape)

(6392, 13)


In [30]:
# Checkpoint the standardized table; section 4 reloads this file
ii_df_clean.to_csv('ii_df_clean.csv', index=False)

# 4. Compute Descriptors and Fingerprints

In [31]:
# ------------------------------------------------------------------------------
# Utilities: SMILES -> Mol
# ------------------------------------------------------------------------------
def mol_from_smiles_safe(smi: str) -> Optional[Chem.Mol]:
    """Parse a SMILES string into an RDKit Mol.

    Returns None when the input is None or a float NaN, when RDKit returns
    None for it, or when parsing raises an exception.
    """
    is_missing = smi is None or (isinstance(smi, float) and np.isnan(smi))
    if is_missing:
        return None
    try:
        return Chem.MolFromSmiles(str(smi))
    except Exception:
        return None

def add_rdkit_mols(df: pd.DataFrame, smiles_col: str = 'canonical_smiles', mol_col: str = 'rdkit_mol',
                   show_progress: bool = False) -> pd.DataFrame:
    """Return a copy of ``df`` with ``mol_col`` holding the parsed molecule for each SMILES.

    Each value of ``df[smiles_col]`` is passed through mol_from_smiles_safe;
    the input frame itself is not modified. A tqdm progress bar is shown
    when ``show_progress`` is True.
    """
    out = df.copy()
    smiles_values = out[smiles_col].tolist()
    source = tqdm(smiles_values, desc='Parsing SMILES') if show_progress else smiles_values
    out[mol_col] = [mol_from_smiles_safe(smiles) for smiles in source]
    return out

# ------------------------------------------------------------------------------
# Descriptor calculation
# ------------------------------------------------------------------------------
# Names (and column order) of the descriptors produced by
# compute_descriptors_for_mol; also the default selection in compute_descriptors_df.
DEFAULT_DESC_LIST = [
    'MolWt', 'MolLogP', 'MolMR', 'TPSA', 'NumHDonors', 'NumHAcceptors',
    'NumRotatableBonds', 'NumAromaticRings', 'HeavyAtomCount', 'FractionCSP3'
]

def compute_descriptors_for_mol(m: Chem.Mol) -> Dict[str, float]:
    """Return the DEFAULT_DESC_LIST descriptors for one RDKit Mol as a dict.

    Every descriptor is np.nan when ``m`` is None or when any RDKit call
    raises; a partial result is never returned.
    """
    all_nan = {name: np.nan for name in DEFAULT_DESC_LIST}
    if m is None:
        return all_nan
    try:
        # Crippen logP and molar refractivity come from a single call
        crippen_logp, crippen_mr = rdMolDescriptors.CalcCrippenDescriptors(m)
        return {
            'MolWt': float(Descriptors.MolWt(m)),
            'MolLogP': float(crippen_logp),
            'MolMR': float(crippen_mr),
            'TPSA': float(rdMolDescriptors.CalcTPSA(m)),
            'NumHDonors': int(rdMolDescriptors.CalcNumHBD(m)),
            'NumHAcceptors': int(rdMolDescriptors.CalcNumHBA(m)),
            'NumRotatableBonds': int(rdMolDescriptors.CalcNumRotatableBonds(m)),
            'NumAromaticRings': int(rdMolDescriptors.CalcNumAromaticRings(m)),
            'HeavyAtomCount': int(Descriptors.HeavyAtomCount(m)),
            'FractionCSP3': float(rdMolDescriptors.CalcFractionCSP3(m)),
        }
    except Exception:
        return all_nan

def compute_descriptors_df(df: pd.DataFrame, mol_col: str = 'rdkit_mol',
                           desc_names: Optional[List[str]] = None,
                           show_progress: bool = False) -> pd.DataFrame:
    """Compute descriptors for all molecules in df[mol_col].

    Parameters
    ----------
    df : DataFrame holding the molecules in ``mol_col``.
    desc_names : subset/order of descriptor columns to return
        (default: DEFAULT_DESC_LIST).
    show_progress : show a tqdm progress bar while computing.

    Returns
    -------
    DataFrame with one column per requested descriptor, sharing ``df``'s index.
    An empty ``df`` yields an empty frame that still has the requested columns.
    """
    if desc_names is None:
        desc_names = DEFAULT_DESC_LIST
    mols = df[mol_col].tolist()
    if not mols:
        # No rows means no dict keys to infer columns from, so build the
        # empty frame explicitly instead of failing on column selection.
        return pd.DataFrame(index=df.index, columns=list(desc_names), dtype=float)
    iterable = tqdm(mols, desc='Computing descriptors') if show_progress else mols
    rows = [compute_descriptors_for_mol(m) for m in iterable]
    return pd.DataFrame(rows, index=df.index)[desc_names]

# ------------------------------------------------------------------------------
# Fingerprint calculations
# ------------------------------------------------------------------------------
# Fingerprint generators are reusable, so keep one per (radius, nBits)
_MORGAN_GENERATORS = {}


def _morgan_bitvect(m, radius, nBits):
    """Return the Morgan bit vector for ``m``, preferring the generator API."""
    try:
        from rdkit.Chem import rdFingerprintGenerator
    except ImportError:
        # Older RDKit builds without the generator module: use the legacy call
        return AllChem.GetMorganFingerprintAsBitVect(m, radius, nBits=nBits)
    key = (radius, nBits)
    if key not in _MORGAN_GENERATORS:
        _MORGAN_GENERATORS[key] = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=nBits)
    return _MORGAN_GENERATORS[key].GetFingerprint(m)


def morgan_fp_array(m: Chem.Mol, radius: int = 2, nBits: int = 2048) -> np.ndarray:
    """Return numpy array (0/1, uint8, length nBits) of the Morgan fingerprint for one RDKit Mol.

    An all-zero array is returned when ``m`` is None or fingerprinting raises,
    so callers always get a vector of the expected length.

    Uses rdFingerprintGenerator rather than the deprecated
    AllChem.GetMorganFingerprintAsBitVect, which emits a deprecation
    warning on every call in recent RDKit releases.
    """
    if m is None:
        return np.zeros((nBits,), dtype=np.uint8)
    try:
        bitvect = _morgan_bitvect(m, radius, nBits)
        arr = np.zeros((nBits,), dtype=np.uint8)
        DataStructs.ConvertToNumpyArray(bitvect, arr)
        return arr
    except Exception:
        return np.zeros((nBits,), dtype=np.uint8)

def compute_fingerprints_arrays(df: pd.DataFrame, mol_col: str = 'rdkit_mol',
                                morgan_radius: int = 2, morgan_nbits: int = 512,
                                show_progress: bool = False
                               ) -> np.ndarray:
    """Compute the Morgan fingerprint matrix for df[mol_col].

    Returns a single uint8 array of shape (n_samples, morgan_nbits), with one
    row per molecule in ``df`` order. Only Morgan fingerprints are computed
    (no MACCS keys). An empty ``df`` yields an array of shape (0, morgan_nbits).
    """
    mols = df[mol_col].tolist()
    if not mols:
        # np.vstack refuses an empty list, so return the empty matrix directly
        return np.zeros((0, morgan_nbits), dtype=np.uint8)
    mols_iter = tqdm(mols, desc='Computing fingerprints') if show_progress else mols
    morgan_list = [morgan_fp_array(m, radius=morgan_radius, nBits=morgan_nbits) for m in mols_iter]
    return np.vstack(morgan_list).astype(np.uint8)

# ------------------------------------------------------------------------------
# Helpers to convert fp arrays -> DataFrame with bit columns
# ------------------------------------------------------------------------------
def fp_array_to_df(fp_arr: np.ndarray, prefix: str = 'FP') -> pd.DataFrame:
    """Wrap a 2D fingerprint array (n_samples, nBits) in a DataFrame.

    Columns are named ``{prefix}_0`` .. ``{prefix}_{nBits-1}`` and the index
    runs 0 .. n_samples-1.
    """
    n_rows, n_bits = fp_arr.shape[0], fp_arr.shape[1]
    bit_names = [prefix + '_' + str(bit) for bit in range(n_bits)]
    return pd.DataFrame(fp_arr, index=range(n_rows), columns=bit_names)

# ------------------------------------------------------------------------------
# Build feature DataFrame
# ------------------------------------------------------------------------------
def build_feature_dataframe(df: pd.DataFrame,
                            smiles_col: str = 'canonical_smiles',
                            mol_col: str = 'rdkit_mol',
                            desc_names: Optional[List[str]] = None,
                            morgan_radius: int = 2,
                            morgan_nbits: int = 512,
                            show_progress: bool = False
                           ) -> Tuple[pd.DataFrame, np.ndarray]:
    """Given a DataFrame with SMILES, return a 2-tuple:
       - feature_df: descriptors + fingerprint bit columns (pandas DataFrame),
         re-indexed 0..n-1 in the same row order as ``df``
       - morgan_array: (n_samples, nBits) numpy array

       Molecules are parsed from ``smiles_col`` only when ``mol_col`` is not
       already a column of ``df``. Fingerprint columns are named
       ``Morgan_{morgan_nbits}_{i}``.
    """
    # 1) ensure rdkit mols exist
    if mol_col not in df.columns:
        df = add_rdkit_mols(df, smiles_col=smiles_col, mol_col=mol_col, show_progress=show_progress)
    # 2) descriptors
    desc_df = compute_descriptors_df(df, mol_col=mol_col, desc_names=desc_names, show_progress=show_progress)
    # 3) fingerprints arrays
    morgan_arr= compute_fingerprints_arrays(df, mol_col=mol_col,
                                            morgan_radius=morgan_radius, morgan_nbits=morgan_nbits,
                                            show_progress=show_progress)
    # 4) convert fp arrays to DataFrame
    morgan_df = fp_array_to_df(morgan_arr, prefix=f'Morgan_{morgan_nbits}')
    # both halves are reset to a 0..n-1 index so concat pairs rows by position
    feature_df = pd.concat([desc_df.reset_index(drop=True), morgan_df.reset_index(drop=True)], axis=1)
    return feature_df, morgan_arr

# ------------------------------------------------------------------------------
# Save/load helpers
# ------------------------------------------------------------------------------
def save_feature_arrays(prefix: str,
                        feature_df: pd.DataFrame,
                        morgan_arr: np.ndarray,
                        ) -> Dict[str,str]:
    """Write the feature table to ``{prefix}_features.csv`` (no index column)
       and the fingerprint array to ``{prefix}_morgan.npy``.
       Returns a dict with the two paths under 'features_csv' and 'morgan_npy'.
    """
    saved = {
        'features_csv': f'{prefix}_features.csv',
        'morgan_npy': f'{prefix}_morgan.npy',
    }
    pd.DataFrame(feature_df).to_csv(saved['features_csv'], index=False)
    np.save(saved['morgan_npy'], morgan_arr)
    return saved

def load_feature_arrays(prefix: str) -> Tuple[pd.DataFrame, np.ndarray]:
    """Load previously saved files (expects same naming used in save_feature_arrays).

    Reads ``{prefix}_features.csv`` and ``{prefix}_morgan.npy`` and returns
    them as the 2-tuple (feature_df, morgan_arr).
    """
    feature_path = f'{prefix}_features.csv'
    morgan_path = f'{prefix}_morgan.npy'
    feature_df = pd.read_csv(feature_path)
    morgan_arr = np.load(morgan_path)
    return feature_df, morgan_arr

# ------------------------------------------------------------------------------
# Example quick diagnostics
# ------------------------------------------------------------------------------
def fingerprint_density_stats(morgan_arr: np.ndarray) -> Dict[str, float]:
    """Summarise how densely a fingerprint matrix is populated.

    Returns a dict with 'mean_bits_set_per_sample' (average row sum) and
    'fraction_bits_active' (share of columns that are non-zero in at least
    one row); an empty dict when ``morgan_arr`` is None.
    """
    if morgan_arr is None:
        return {}
    per_sample_counts = morgan_arr.sum(axis=1)
    per_bit_counts = morgan_arr.sum(axis=0)
    return {
        'mean_bits_set_per_sample': float(per_sample_counts.mean()),
        'fraction_bits_active': float((per_bit_counts > 0).mean()),
    }

# ------------------------------------------------------------------------------
# Convenience: aggregate duplicates by SMILES (median target)
# ------------------------------------------------------------------------------
def aggregate_by_smiles(df: pd.DataFrame, smiles_col: str = 'canonical_smiles',
                        value_col: str = 'pIC50') -> pd.DataFrame:
    """Aggregate rows by canonical SMILES.

    Returns one row per unique SMILES with the columns:
      - 'can_smiles': the SMILES string (rows with a missing SMILES are dropped
        by the group-by)
      - 'median_{value_col}': median of ``value_col`` within the group
      - 'molecule_chembl_id': list of the unique, non-null IDs in the group,
        as strings. Only present when ``df`` has a 'molecule_chembl_id' column.

    The input frame is not modified.
    """
    work = df.copy()
    work['can_smiles'] = work[smiles_col]

    agg_spec = {value_col: 'median'}
    # The ID column is optional: aggregate it only when the caller supplied it
    if 'molecule_chembl_id' in work.columns:
        agg_spec['molecule_chembl_id'] = lambda ids: ids.dropna().astype(str).unique().tolist()

    return (
        work.groupby('can_smiles')
        .agg(agg_spec)
        .rename(columns={value_col: f'median_{value_col}'})
        .reset_index()
    )

In [32]:
# Reload the standardized table written at the end of section 3
ii_df = pd.read_csv('ii_df_clean.csv')

In [33]:
# Work on a copy so ii_df stays as loaded
integrase = ii_df.copy()

# NOTE(review): in the cells visible here the table is never passed through
# aggregate_by_smiles, so a compound measured several times contributes
# several rows to the feature matrix. Confirm this is intended before modelling.

# Add RDKit mols (safe)
integrase = add_rdkit_mols(integrase, smiles_col='canonical_smiles', mol_col='rdkit_mol', show_progress=True)

# Drop rows where SMILES failed
# (reset_index keeps row positions aligned with the feature table built below)
integrase = integrase[integrase['rdkit_mol'].notna()].reset_index(drop=True)

# Build features
# ('rdkit_mol' already exists, so build_feature_dataframe does not re-parse the SMILES)
feature_inte, morgan_arr = build_feature_dataframe(integrase, smiles_col='canonical_smiles',
                                                morgan_radius=2, morgan_nbits=512,
                                                show_progress=True)

# attach pIC50 to features for modelling
# (.values assigns by position; both frames are in the same row order)
feature_inte['pIC50'] = integrase['pIC50'].values

# Save
paths = save_feature_arrays('integrase_qsar', feature_inte, morgan_arr)
print(paths)

Parsing SMILES:   0%|          | 0/6392 [00:00<?, ?it/s]

Computing descriptors:   0%|          | 0/6386 [00:00<?, ?it/s]

Computing fingerprints:   0%|          | 0/6386 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


{'features_csv': 'integrase_qsar_features.csv', 'morgan_npy': 'integrase_qsar_morgan.npy'}
