In [16]:
# ==================================================
# Import
# ==================================================

import pandas as pd
import numpy as np
import os
import json
from sklearn.metrics import root_mean_squared_error, make_scorer
from sklearn.model_selection import KFold, cross_validate, GridSearchCV
from sklearn.base import clone
import optuna
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
import torch
from torch.utils.data import DataLoader
from argparse import Namespace
from sklearn.linear_model import Ridge
import joblib

# Feature Extractors

In [5]:
# ==================================================
# Feature Extractor: Baseline Descriptors
# ==================================================

def DescriptorFarm(df: pd.DataFrame) -> pd.DataFrame:
    from rdkit import Chem
    from rdkit.Chem import AllChem, Descriptors
    from rdkit.Chem.Descriptors3D import descList as _desc3d_list
    from rdkit.Chem.Scaffolds import MurckoScaffold

    # 2D Descriptors
    desc2d = [(n, f) for n, f in Descriptors._descList if not n.startswith('fr_')]
    names2d = {n for n, _ in desc2d}

    # 3D Descriptors
    desc3d = [(n, f) for n, f in _desc3d_list if not n.startswith('fr_') and n not in names2d]

    # Significant SMARTS Patterns
    smarts_defs = {
        'Imidazole': ['c1ncnc1'],
        'Pyrazole': ['n1nccc1'],
        'Thiazole': ['c1cscn1'],
        'Triazole': ['n1nncc1', 'c1ncnn1'],
        'Toluene': ['Cc1ccccc1'],
        'N-Ethyllformamide': ['O=CNCC[aR]'],
        'Amino_Arylmethane': ['[aR]CN'],
        'N-Phenethylformamide': ['O=CNCCc1ccccc1'],
        'Carbamate': ['OC(=O)N'],
        'Benzodioxole': ['c1cc2OCOc2cc1'],
        'Furan': ['c1ccoc1'],
        'Terminal_Alkyne': ['[C]#[CH]'],
        'Primary_Amine': ['[CX4][NH2]'],
        'Sec_Cyclic_Amine': ['[CX4;R][NH1;R][CX4;R]'],
        'Cyclopropyl_Amine': [
        'C(C)(C)(C)N(C1CC1)C(C)(C)(C)',
           'C(=C)(C)N(C1CC1)C(=C)(C)',
           'C(#C)N(C1CC1)C(#C)',
           '[H]N(C1CC1)[H]'
        ],
        'Hydroquinone':[
            'Oc1ccc(O)cc1',
            'Oc1ccccc1(O)',
            'O=c1ccc(=O)cc1',
            'O=c1ccccc1(=O)'
        ],
        'Epoxide': ['C1OC1'],
        'Tertiary_Amine': ['N1([CX4])CCN(CC1)[CX4]'],
        'Alkylphenol': [
            '[OH]c1ccc(C([C,H])([C,H])[C,H])cc1',
            '[OH]c1ccccc1(C([C,H])([C,H])[C,H])'
        ],
        'Alkylaromatic_Ether': [
            'c1c([CX4])cc[c](O[CX4])c1',
            'c1cc[c](O[CX4])c([CX4])c1'
        ],
        'Arenes':[
            'c1ccc2c(c1)CCCO2',
            '[C,H]N(C(c1ccccc1)c2ccccc2)[C,H]',
            'c1ccccc1[CX4]C=C(C)(C)',
            'c1ccccc1[CX4]C=C(=C)'
        ],
        'Alkoxybenzene': ['c1ccc(OC)cc1'],
    }
    smarts_patterns = {
        name: [Chem.MolFromSmarts(s) for s in smarts_defs[name]]
        for name in smarts_defs
    }

    # Row-Wise Computation
    records = []
    for smi in df['Canonical_Smiles']:
        rec = {}
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            for name, _ in desc2d + desc3d:
                rec[name] = None
            for col in [
                'Murcko_num_rings',
                'Murcko_max_ring_size',
                'Murcko_min_ring_size',
                'Murcko_mean_ring_size',
                'Murcko_heavy_atom_count',
                'Murcko_heteroatom_count',
                'Murcko_mol_wt'
            ]:
                rec[col] = None
            for name in smarts_patterns:
                rec[name] = None
        else:
            # 2D Descriptors
            for name, func in desc2d:
                try:
                    rec[name] = func(mol)
                except:
                    rec[name] = None

            # 3D Descriptors
            mol3d = Chem.AddHs(mol)
            try:
                AllChem.EmbedMolecule(mol3d, AllChem.ETKDG())
                AllChem.MMFFOptimizeMolecule(mol3d)
                for name, func in desc3d:
                    try:
                        rec[name] = func(mol3d)
                    except:
                        rec[name] = None
            except:
                for name, _ in desc3d:
                    rec[name] = None

            # Murcko Scaffold
            try:
                scaffold = MurckoScaffold.GetScaffoldForMol(mol)
                ri = scaffold.GetRingInfo()
                ring_sizes = [len(r) for r in ri.AtomRings()]
                num_rings = len(ring_sizes)
                rec['Murcko_num_rings'] = num_rings
                rec['Murcko_max_ring_size'] = max(ring_sizes) if ring_sizes else 0
                rec['Murcko_min_ring_size'] = min(ring_sizes) if ring_sizes else 0
                rec['Murcko_mean_ring_size'] = sum(ring_sizes)/num_rings if num_rings else 0
                rec['Murcko_heavy_atom_count'] = scaffold.GetNumHeavyAtoms()
                heteros = sum(1 for atom in scaffold.GetAtoms()
                             if atom.GetAtomicNum() not in (6,1))
                rec['Murcko_heteroatom_count'] = heteros
                rec['Murcko_mol_wt'] = Descriptors.MolWt(scaffold)
            except Exception:
                for col in ['Murcko_num_rings',
                            'Murcko_max_ring_size',
                            'Murcko_min_ring_size',
                            'Murcko_mean_ring_size',
                            'Murcko_heavy_atom_count',
                            'Murcko_heteroatom_count',
                            'Murcko_mol_wt']:
                    rec[col] = None

            # Matching SMARTS Patterns
            for name, patterns in smarts_patterns.items():
                try:
                    rec[name] = any(mol.HasSubstructMatch(pat) for pat in patterns)
                except:
                    rec[name] = None

        records.append(rec)

    # Result
    desc_df = pd.DataFrame(records, index = df.index)

    return pd.concat([df, desc_df], axis = 1)

In [6]:
# ==================================================
# Feature Extractor: Fingerprints
# ==================================================

def FingerprintFarm(
        df: pd.DataFrame,
        smiles_col: str = 'Canonical_Smiles',
        fp: str | None = None
) -> pd.DataFrame:
    import os
    from rdkit import Chem, RDConfig
    from rdkit.Chem import rdMolDescriptors, rdFingerprintGenerator
    from rdkit.Chem.rdmolops import PatternFingerprint
    from rdkit.Chem.Pharm2D.SigFactory import SigFactory
    from rdkit.Chem.Pharm2D import Generate as Pharm2DGen
    from rdkit.Chem.rdReducedGraphs import GetErGFingerprint
    from rdkit.Chem.rdMHFPFingerprint import MHFPEncoder
    from rdkit.Chem import ChemicalFeatures

    # Fingerprint Generator
    fp_name = None
    fp_gen = None
    if fp:
        nm = fp.lower()
        if nm == 'rdkit':
            fp_name, fp_gen = 'rdkit', rdFingerprintGenerator.GetRDKitFPGenerator(fpSize = 2048)
        elif nm == 'atompairs':
            fp_name, fp_gen = 'atompairs', rdFingerprintGenerator.GetAtomPairGenerator(fpSize = 2048)
        elif nm == 'topologicaltorsions':
            fp_name, fp_gen = 'topologicaltorsions', rdFingerprintGenerator.GetTopologicalTorsionGenerator(fpSize = 2048)
        elif nm == 'ecfp4':
            fp_name, fp_gen = 'ecfp4', rdFingerprintGenerator.GetMorganGenerator(radius = 2, fpSize = 2048)
        elif nm == 'ecfp6':
            fp_name, fp_gen = 'ecfp6', rdFingerprintGenerator.GetMorganGenerator(radius = 3, fpSize = 2048)
        elif nm == 'maccs':
            fp_name = 'maccs'
        elif nm == 'pattern':
            fp_name = 'pattern'
        elif nm == '2dpharmacophore':
            fdef = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
            featFactory = ChemicalFeatures.BuildFeatureFactory(fdef)
            sigFactory = SigFactory(featFactory, minPointCount = 2, maxPointCount = 3, trianglePruneBins = False)
            sigFactory.SetBins([(0, 2), (2, 5), (5, 8)])
            sigFactory.Init()
            fp_name, fp_gen = '2dpharmacophore', sigFactory
        elif nm == 'erg':
            fp_name = 'erg'
        elif nm == 'mhfp':
            fp_name, fp_gen = 'mhfp', MHFPEncoder()
        elif nm == 'secfp':
            fp_name, fp_gen = 'secfp', MHFPEncoder()

    # Row-Wise Computation
    records = []
    for smi in df[smiles_col]:
        rec = {}
        mol = Chem.MolFromSmiles(smi)
        if fp_name is None or mol is None:
            if fp_name:
                for i in range(2048):
                    rec[f'{fp_name}_{i}'] = None
        else:
            if fp_name in ('rdkit', 'atompairs', 'topologicaltorsions', 'ecfp4', 'ecfp6'):
                bv = fp_gen.GetFingerprint(mol)
                bits = [int(b) for b in bv.ToBitString()]
            elif fp_name == 'maccs':
                bv = rdMolDescriptors.GetMACCSKeysFingerprint(mol)
                bits = [int(b) for b in bv.ToBitString()]
            elif fp_name == 'pattern':
                bv = PatternFingerprint(mol)
                bits = [int(b) for b in bv.ToBitString()]
            elif fp_name == '2dpharmacophore':
                bv = Pharm2DGen.Gen2DFingerprint(mol, fp_gen)
                bits = [int(b) for b in bv.ToBitString()]
            elif fp_name == 'erg':
                try:
                    bv = GetErGFingerprint(mol)
                    bits = list(bv)
                except KeyError:
                    bits = [None] * 2048
            elif fp_name == 'mhfp':
                bv = fp_gen.EncodeMol(mol)
                bits = list(bv)
            elif fp_name == 'secfp':
                bv = fp_gen.EncodeSECFPMol(mol)
                bits = list(bv)
            else:
                bits = [0] * 2048
            for i, bit in enumerate(bits):
                rec[f'{fp_name}_{i}'] = bit
        records.append(rec)

    fp_df = pd.DataFrame(records, index = df.index)
    return pd.concat([df, fp_df], axis = 1)

# CV Settings

In [7]:
# ==================================================
# CV Settings
# ==================================================

# Evalutation Score
def ScoreFarm_(y_true, y_pred):
    nrmse = root_mean_squared_error(y_true, y_pred) / (np.max(y_true) - np.min(y_true))
    pearson = np.clip(np.corrcoef(y_true, y_pred)[0, 1], 0, 1)
    return 0.5 * (1 - np.minimum(nrmse, 1) + pearson)

ScoreFarm = make_scorer(ScoreFarm_, greater_is_better = True)

# 10-Fold CV
cv = KFold(n_splits = 10, shuffle = True, random_state = 42)

# LGBM

In [8]:
# ==================================================
# LGBM: Preprocessing
# ==================================================

# Baseline Descriptors
train = pd.read_csv('train.csv')
train = DescriptorFarm(train)

# Fingerprints List
fp_list = [
    'rdkit', 'atompairs', 'topologicaltorsions', 'ecfp4', 'ecfp6',
    'maccs', 'pattern', '2dpharmacophore', 'erg', 'mhfp', 'secfp'
]

# Inputs Dictionary
x = {'base': train.drop(['ID', 'Canonical_Smiles', 'Inhibition'], axis = 1)}
for i in fp_list:
    tmp = FingerprintFarm(train, fp = i)
    tmp = tmp.drop(['ID', 'Canonical_Smiles', 'Inhibition'], axis = 1)
    x[i] = tmp

# Train Output
y = train['Inhibition']

# Constant Column Removal
for i in x.keys():
    nonconst_cols = x[i].columns[x[i].nunique() > 1]
    x[i] = x[i][nonconst_cols]

# ==================================================
# LGBM: Model Definition
# ==================================================

# Base Estimator
lgbm = LGBMRegressor(
    objective = 'regression',
    subsample_freq = 1,
    random_state = 42,
    device = 'gpu',
    verbose = -1
)

# Models Dictionary
md = {}
for i in ['base'] + fp_list:
    md[i] = clone(lgbm)

# Dense Cases (n > p)
dense_list = ['base', 'maccs', 'erg']

# Sparse Cases (n < p)
sparse_list = [
    'rdkit', 'atompairs', 'topologicaltorsions', 'ecfp4',
    'ecfp6', 'pattern', '2dpharmacophore', 'mhfp', 'secfp'
]

In [None]:
# ==================================================
# LGBM: Model Selection for Dense Cases
# ==================================================

# Dense Cases (n > p)
for i in dense_list:
    # Optuna Objective
    def objective(trial):
        params = {
            'num_leaves': trial.suggest_int('num_leaves', 64, 128),
            'max_depth': trial.suggest_int('max_depth', 8, 16),
            'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.03, log = True), 
            'n_estimators': trial.suggest_int('n_estimators', 500, 3000),
            'min_child_samples': trial.suggest_int('min_child_samples', 30, 100),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 0.7),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.1, 10.0, log = True),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 10.0, log = True)
        }

        optuna_model = clone(lgbm).set_params(**params)

        scores = cross_validate(
            estimator = optuna_model, X = x[i], y = y,
            scoring = ScoreFarm,
            cv = cv,
            n_jobs = 4,
            verbose = 1
        )
        
        return scores['test_score'].mean()
    
    # Optimization
    os.environ['PYTHONHASHSEED'] = str(42)
    study = optuna.create_study(
        direction = 'maximize',
        sampler = optuna.samplers.TPESampler(seed = 42)
    )
    study.optimize(objective, n_trials = 30, n_jobs = 3, show_progress_bar = True)

    # Result
    print(f'\nBest Parameters (x[\'{i}\']):')
    print(study.best_params)
    print(f'Best CV Score (x[\'{i}\']):')
    print(study.best_trial.value)
    print('\n========================================\n')

    # Save Parameters & CV Scores
    with open(f'params/params_{i}.json', 'w') as f:
        json.dump(study.best_trial.params, f, indent = 4)
    with open(f'cv_scores/cv_score_{i}.txt', 'w') as f:
        f.write(str(study.best_trial.value))

In [None]:
# ==================================================
# LGBM : Model Selection for Sparse Cases
# ==================================================

# Sparse Cases (n < p)
for i in sparse_list:
    # Optuna Objective
    def objective(trial):
        params = {
            'num_leaves': trial.suggest_int('num_leaves', 8, 32),
            'max_depth': trial.suggest_int('max_depth', 4, 12),
            'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.03, log = True), 
            'n_estimators': trial.suggest_int('n_estimators', 500, 3000),
            'min_child_samples': trial.suggest_int('min_child_samples', 50, 200),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.6),
            'reg_alpha': trial.suggest_float('reg_alpha', 1.0, 10.0, log = True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1.0, 10.0, log = True)
        }

        optuna_model = clone(lgbm).set_params(**params)

        scores = cross_validate(
            estimator = optuna_model, X = x[i], y = y,
            scoring = ScoreFarm,
            cv = cv,
            n_jobs = 4,
            verbose = 1
        )
        
        return scores['test_score'].mean()
    
    # Optimization
    os.environ['PYTHONHASHSEED'] = str(42)
    study = optuna.create_study(
        direction = 'maximize',
        sampler = optuna.samplers.TPESampler(seed = 42)
    )
    study.optimize(objective, n_trials = 30, n_jobs = 3, show_progress_bar = True)

    # Result
    print(f'\nBest Parameters (x[\'{i}\']):')
    print(study.best_params)
    print(f'Best CV Score (x[\'{i}\']):')
    print(study.best_trial.value)
    print('\n========================================\n')

    # Save
    with open(f'params/params_{i}.json', 'w') as f:
        json.dump(study.best_trial.params, f, indent = 4)
    with open(f'cv_scores/cv_score_{i}.txt', 'w') as f:
        f.write(str(study.best_trial.value))

# RF

In [9]:
# ==================================================
# RF (with Top 5 Fingerprints): Preprocessing
# ==================================================

# Top 5 CV Scores
cv_scores = {}
for i in md.keys():
    with open(f'cv_scores/cv_score_{i}.txt', 'r') as f:
        cv_scores[i] = float(f.readline())
cv_scores = pd.Series(
    list(cv_scores.values()),
    index = list(cv_scores.keys())
).sort_values(ascending = False)
top5_list = list(cv_scores.index[:5])

# ==================================================
# RF: Model Definition
# ==================================================

# Base Estimator
rf = RandomForestRegressor(
    n_jobs = 10,
    random_state = 42
)

# Inputs & Models Dictionary Update
additional_list = [i + '_additional' for i in top5_list]
for i in additional_list:
    i_tmp = i.replace('_additional', '')
    x[i] = x[i_tmp].copy()
    md[i] = clone(rf)

In [None]:
# ==================================================
# RF: Model Selection
# ==================================================

for i in top5_list:
    # Optuna Objective
    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 300, 1500, step = 100),
            'max_features': trial.suggest_float('max_features', 0.3, 0.6),
            'max_samples': trial.suggest_float('max_samples', 0.6, 1.0),
        }

        optuna_model = clone(rf).set_params(**params)

        scores = cross_validate(
            estimator = optuna_model, X = x[i], y = y,
            scoring = ScoreFarm,
            cv = cv,
            verbose = 1
        )
        
        return scores['test_score'].mean()
    
    # Optimization
    os.environ['PYTHONHASHSEED'] = str(42)
    study = optuna.create_study(
        direction = 'maximize',
        sampler = optuna.samplers.TPESampler(seed = 42)
    )
    study.optimize(objective, n_trials = 30, show_progress_bar = True)

    # Result
    print(f'\nBest Parameters (x[\'{i}\']):')
    print(study.best_params)
    print(f'Best CV Score (x[\'{i}\']):')
    print(study.best_trial.value)
    print('\n========================================\n')

    # Save
    with open(f'params/params_{i}_additional.json', 'w') as f:
        json.dump(study.best_trial.params, f, indent = 4)
    with open(f'cv_scores/cv_score_{i}_additional.txt', 'w') as f:
        f.write(str(study.best_trial.value))

# GROVER

In [None]:
# ==================================================
# GROVER (https://github.com/tencent-ailab/grover): Preprocessing
# ==================================================

from grover.model.models import GroverFpGeneration
from grover.data.molgraph import MolCollator

# `MolCollator`-Compatible Class
smiles_list = pd.read_csv('train.csv')['Canonical_Smiles'].to_list()

class Record:
    __slots__ = ("smiles", "features", "targets")
    def __init__(self, s):
        self.smiles, self.features, self.targets = s, None, [None]

records = [Record(s) for s in smiles_list]

collator = MolCollator({}, Namespace(bond_drop_rate = 0, no_cache = True))

# Data Loader
loader = DataLoader(
    records,
    batch_size = 128,
    shuffle = False,
    collate_fn = collator
)

# ==================================================
# GROVER: Model Definition 
# ==================================================

# Pretrained GROVER
grover_large = torch.load(
    'grover/grover_large.pt', map_location = 'cpu', weights_only = False
)
grover_state = grover_large['state_dict']

# Additional Arguments Required
grover_args = grover_large['args']
grover_args.cuda = torch.cuda.is_available()
grover_args.dropout = 0.1
grover_args.fingerprint_source = 'both'

# Model Definition
grover = GroverFpGeneration(grover_args)
grover.load_state_dict(grover_state, strict = False)

_IncompatibleKeys(missing_keys=['readout.cached_zero_vector'], unexpected_keys=[])

In [None]:
# ==================================================
# GROVER + Ridge: Model Selection (Head Only)
# ==================================================

# Device Setting
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Embedding Extraction
grover.to(device).eval()
emb_list = []

with torch.no_grad():
    for _, graph_components, *_ in loader:
        graph_components = tuple(
            t.to(device) if torch.is_tensor(t) else t
            for t in graph_components
        )
        
        emb = grover(graph_components, [None]).cpu().numpy()
        emb_list.append(emb)
        
        # Resource Optimization
        del emb, graph_components
        loader.collate_fn.shared_dict = {}
        torch.cuda.empty_cache()

grover_train = pd.DataFrame(np.concatenate(emb_list, axis = 0))

# Inputs Dictionary Update
x['grover'] = grover_train



In [12]:
# Replacement of SGD with Ridge Regression
ridge = Ridge(random_state = 42)

# Models Dictionary Update
md['grover'] = ridge

# Regularization Parameter Grid
alpha_grid = {
    'alpha': np.logspace(1, 3, 40)
}

# Regularization Parameter Selection
grover_cv = GridSearchCV(
    estimator = ridge,
    param_grid = alpha_grid,
    scoring = ScoreFarm,
    n_jobs = -1,
    cv = cv
)

grover_cv = grover_cv.fit(x['grover'], y)

# Result
print('Best Alpha:')
print(grover_cv.best_params_)
print('CV Score of GROVER:')
print(grover_cv.best_score_)

# Save
with open('params/params_grover.json', 'w') as f:
    json.dump(grover_cv.best_params_, f, indent = 4)
with open('cv_scores/cv_score_grover.txt', 'w') as f:
    f.write(str(grover_cv.best_score_))

Best Alpha:
{'alpha': np.float64(437.54793750741845)}
CV Score of GROVER:
0.5939004399450267


# Ensemble Model

In [13]:
# ==================================================
# Ensemble: Preprocessing
# ==================================================

# Parameters Setting
for i in md.keys():
    with open(f'params/params_{i}.json', 'r') as f:
        params_tmp = json.load(f)
    
    md[i] = md[i].set_params(
        **params_tmp
    )

# Meta Features of Each Fold for CV Score
ensemble_split_list = []

for k, (train_idx, valid_idx) in enumerate(cv.split(train, y)):
    y_train = y.iloc[train_idx]
    y_valid = y.iloc[valid_idx]

    # Untrained Individual Models
    md_tmp = {i: clone(j) for i, j in md.items()}
    
    meta_x_train = {}
    meta_x_valid = {}

    # Train & Prediction
    for i in md_tmp.keys():
        x_train = x[i].iloc[train_idx]
        x_valid = x[i].iloc[valid_idx]

        md_tmp[i] = md_tmp[i].fit(x_train, y_train)

        meta_x_train[i] = md_tmp[i].predict(x_train)
        meta_x_valid[i] = md_tmp[i].predict(x_valid)

    meta_x_train = pd.DataFrame(meta_x_train)
    meta_x_valid = pd.DataFrame(meta_x_valid)

    # List of Tuples
    ensemble_split_list.append((meta_x_train, meta_x_valid, y_train, y_valid))
    
    print(f'Fold {k + 1} done!')

Fold 1 done!
Fold 2 done!
Fold 3 done!
Fold 4 done!
Fold 5 done!
Fold 6 done!
Fold 7 done!
Fold 8 done!
Fold 9 done!
Fold 10 done!


In [14]:
# ==================================================
# Model Selection: Weighted Average
# ==================================================

# Optuna Objective
def objective(trial):
    params = {
        'base': trial.suggest_int('base', 0, 100),
        'rdkit': trial.suggest_int('rdkit', 0, 100),
        'atompairs': trial.suggest_int('atompairs', 0, 100),
        'topologicaltorsions': trial.suggest_int('topologicaltorsions', 0, 100),
        'ecfp4': trial.suggest_int('ecfp4', 0, 100),
        'ecfp6': trial.suggest_int('ecfp6', 0, 100),
        'maccs': trial.suggest_int('maccs', 0, 100),
        'pattern': trial.suggest_int('pattern', 0, 100),
        '2dpharmacophore': trial.suggest_int('2dpharmacophore', 0, 100),
        'erg': trial.suggest_int('erg', 0, 100),
        'mhfp': trial.suggest_int('mhfp', 0, 100),
        'secfp': trial.suggest_int('secfp', 0, 100),
        'ecfp4_additional': trial.suggest_int('ecfp4_additional', 0, 100),
        'ecfp6_additional': trial.suggest_int('ecfp6_additional', 0, 100),
        'atompairs_additional': trial.suggest_int('atompairs_additional', 0, 100),
        'maccs_additional': trial.suggest_int('maccs_additional', 0, 100),
        'topologicaltorsions_additional': trial.suggest_int('topologicaltorsions_additional', 0, 100),
        'grover': trial.suggest_int('grover', 0, 100),
    }

    w = np.array(list(params.values()))

    scores = []

    for _, meta_x_valid, _, y_valid in ensemble_split_list:

        y_pred = meta_x_valid.mul(w, axis = 1).sum(axis = 1) / w.sum()
        scores.append(ScoreFarm_(y_valid, y_pred))

    scores = np.array(scores)
    return scores.mean()

# Optimization
os.environ['PYTHONHASHSEED'] = str(42)
study = optuna.create_study(
    direction = 'maximize',
    sampler = optuna.samplers.TPESampler(seed = 42)
)
study.optimize(objective, n_trials = 1000, n_jobs = 10, show_progress_bar = True)

# Result
print('Best Weights:')
print(study.best_params)
print('CV Score of Weighted Average Ensemble:')
print(study.best_trial.value)

# Save
with open('params/params_ens.json', 'w') as f:
    json.dump(study.best_trial.params, f, indent = 4)
with open('cv_scores/cv_score_ens.txt', 'w') as f:
    f.write(str(study.best_trial.value))

[I 2025-07-29 16:23:58,165] A new study created in memory with name: no-name-f5e6e3de-2706-4fe9-9184-2b2f06f1d9bb


  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2025-07-29 16:23:58,275] Trial 1 finished with value: 0.6143339733301776 and parameters: {'base': 6, 'rdkit': 31, 'atompairs': 74, 'topologicaltorsions': 38, 'ecfp4': 33, 'ecfp6': 8, 'maccs': 85, 'pattern': 11, '2dpharmacophore': 26, 'erg': 60, 'mhfp': 24, 'secfp': 86, 'ecfp4_additional': 78, 'ecfp6_additional': 68, 'atompairs_additional': 54, 'maccs_additional': 18, 'topologicaltorsions_additional': 23, 'grover': 88}. Best is trial 1 with value: 0.6143339733301776.
[I 2025-07-29 16:23:58,343] Trial 7 finished with value: 0.6154977857760671 and parameters: {'base': 80, 'rdkit': 39, 'atompairs': 34, 'topologicaltorsions': 35, 'ecfp4': 73, 'ecfp6': 28, 'maccs': 0, 'pattern': 21, '2dpharmacophore': 20, 'erg': 10, 'mhfp': 82, 'secfp': 71, 'ecfp4_additional': 22, 'ecfp6_additional': 12, 'atompairs_additional': 24, 'maccs_additional': 90, 'topologicaltorsions_additional': 79, 'grover': 99}. Best is trial 7 with value: 0.6154977857760671.
[I 2025-07-29 16:23:58,359] Trial 8 finished with v

In [17]:
# Save the Whole Trained Models
for i in md.keys():
    md[i] = md[i].fit(x[i], y)

joblib.dump(md, 'model_dict.joblib')

['model_dict.joblib']