# Load models

In [1]:
import pickle
import glob

models = []
for model_path in glob.glob('models/XGBRegressor_20250913_234623/*.pkl'):
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)
        models.append(model)

for model_path in glob.glob('models/LGBMRegressor_20250913_234633/*.pkl'):
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)
        models.append(model)

print(f'Loaded {len(models)} models')

configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html

for more details about differences between saving model and serializing.

  model = pickle.load(model_file)


Loaded 50 models


# Get importance of new features

In [2]:
import numpy as np
from collections import defaultdict
from glob import glob

# from side_chain_features import SIDECHAIN_BACKBONE_FEATURE_NAMES
# new_feature_names = SIDECHAIN_BACKBONE_FEATURE_NAMES

# from gemini_features import ALL_GEMINI_FEATURE_NAMES
# new_feature_names = ALL_GEMINI_FEATURE_NAMES

new_feature_names = [f'polyBERT_{index}' for index in range(600)]

# feature_prediction_model_paths = glob('simulations/models/*.joblib')
# new_feature_names = [path.split('/')[-1].replace('.joblib', '') for path in feature_prediction_model_paths]

# --------------------------------------------------------------
# helper: extract {feature_name: importance_weight} for any model
# --------------------------------------------------------------
def get_feature_importances(model) -> dict[str, float]:
    """
    Return a mapping {feature_name: importance_weight} that works for:
    • xgboost.XGBRegressor or its Booster
    • lightgbm.LGBMRegressor or its Booster
    """
    # ---- XGBoost -------------------------------------------------
    if hasattr(model, "get_booster"):              # XGBRegressor wrapper
        return model.get_booster().get_score(importance_type="weight")

    # ---- LightGBM -----------------------------------------------
    # scikit wrapper stores the LightGBM Booster in .booster_
    if hasattr(model, "booster_"):                 # LGBMRegressor
        booster = model.booster_
    else:
        booster = model                            # raw Booster

    if hasattr(booster, "feature_name") and hasattr(booster, "feature_importance"):
        names = booster.feature_name()
        values = booster.feature_importance(importance_type="split")
        return dict(zip(names, values))

    raise TypeError(f"Unsupported model type: {type(model)}")


# --------------------------------------------------------------
# rank aggregation
# --------------------------------------------------------------
feature_names_to_importance_ranks: dict[str, list[int]] = defaultdict(list)
total_feats: int = len(new_feature_names)

all_unique_names = set()
for model in models:
    importance_dict = get_feature_importances(model)

    # sort by descending weight, build rank list (1 = best)
    ranked_names = [
        name for name, _ in sorted(importance_dict.items(), key=lambda kv: kv[1], reverse=True)
    ]
    all_unique_names.update(ranked_names)

    for feature_name in new_feature_names:
        if feature_name in ranked_names:
            rank = ranked_names.index(feature_name)
            feature_names_to_importance_ranks[feature_name].append(rank)
        else:
            rank = len(ranked_names)
            feature_names_to_importance_ranks[feature_name].append(rank)
            # print(f'WARNING: {feature_name} is missing from model')

# mean rank across all models
average_ranks = {feat: np.mean(ranks) for feat, ranks in feature_names_to_importance_ranks.items()}

# features ordered from most to least important (lower avg. rank = better)
features_by_average_rank = sorted(average_ranks, key=average_ranks.get)

# Display top features
features_by_average_rank[:10]


['polyBERT_169',
 'polyBERT_308',
 'polyBERT_361',
 'polyBERT_246',
 'polyBERT_56',
 'polyBERT_177',
 'polyBERT_10',
 'polyBERT_434',
 'polyBERT_534',
 'polyBERT_294']

In [3]:
features_by_average_rank[10:]

['polyBERT_93',
 'polyBERT_295',
 'polyBERT_14',
 'polyBERT_52',
 'polyBERT_389',
 'polyBERT_597',
 'polyBERT_406',
 'polyBERT_38',
 'polyBERT_166',
 'polyBERT_544',
 'polyBERT_342',
 'polyBERT_375',
 'polyBERT_32',
 'polyBERT_594',
 'polyBERT_459',
 'polyBERT_218',
 'polyBERT_510',
 'polyBERT_134',
 'polyBERT_242',
 'polyBERT_62',
 'polyBERT_396',
 'polyBERT_102',
 'polyBERT_57',
 'polyBERT_379',
 'polyBERT_157',
 'polyBERT_297',
 'polyBERT_37',
 'polyBERT_27',
 'polyBERT_349',
 'polyBERT_572',
 'polyBERT_141',
 'polyBERT_207',
 'polyBERT_193',
 'polyBERT_309',
 'polyBERT_74',
 'polyBERT_384',
 'polyBERT_554',
 'polyBERT_408',
 'polyBERT_1',
 'polyBERT_525',
 'polyBERT_435',
 'polyBERT_39',
 'polyBERT_370',
 'polyBERT_479',
 'polyBERT_289',
 'polyBERT_383',
 'polyBERT_138',
 'polyBERT_108',
 'polyBERT_142',
 'polyBERT_335',
 'polyBERT_280',
 'polyBERT_198',
 'polyBERT_489',
 'polyBERT_36',
 'polyBERT_549',
 'polyBERT_583',
 'polyBERT_70',
 'polyBERT_352',
 'polyBERT_300',
 'polyBERT_1

In [4]:
print(features_by_average_rank)

['polyBERT_169', 'polyBERT_308', 'polyBERT_361', 'polyBERT_246', 'polyBERT_56', 'polyBERT_177', 'polyBERT_10', 'polyBERT_434', 'polyBERT_534', 'polyBERT_294', 'polyBERT_93', 'polyBERT_295', 'polyBERT_14', 'polyBERT_52', 'polyBERT_389', 'polyBERT_597', 'polyBERT_406', 'polyBERT_38', 'polyBERT_166', 'polyBERT_544', 'polyBERT_342', 'polyBERT_375', 'polyBERT_32', 'polyBERT_594', 'polyBERT_459', 'polyBERT_218', 'polyBERT_510', 'polyBERT_134', 'polyBERT_242', 'polyBERT_62', 'polyBERT_396', 'polyBERT_102', 'polyBERT_57', 'polyBERT_379', 'polyBERT_157', 'polyBERT_297', 'polyBERT_37', 'polyBERT_27', 'polyBERT_349', 'polyBERT_572', 'polyBERT_141', 'polyBERT_207', 'polyBERT_193', 'polyBERT_309', 'polyBERT_74', 'polyBERT_384', 'polyBERT_554', 'polyBERT_408', 'polyBERT_1', 'polyBERT_525', 'polyBERT_435', 'polyBERT_39', 'polyBERT_370', 'polyBERT_479', 'polyBERT_289', 'polyBERT_383', 'polyBERT_138', 'polyBERT_108', 'polyBERT_142', 'polyBERT_335', 'polyBERT_280', 'polyBERT_198', 'polyBERT_489', 'polyB

In [5]:
print(all_unique_names)

{'mfp_1270', 'mfp_769', 'ap_1947', 'polyBERT_38', 'ap_2004', 'polyBERT_333', 'mfp_125', 'ap_478', 'tt_22', 'tt_595', 'tt_557', 'mfp_1905', 'polyBERT_380', 'mfp_1577', 'mfp_486', 'mfp_1298', 'tt_1655', 'mfp_1399', 'tt_619', 'ap_425', 'mfp_1790', 'polyBERT_230', 'tt_1826', 'mfp_1800', 'polyBERT_29', 'tt_369', 'tt_796', 'mfp_942', 'tt_1077', 'ap_1904', 'mfp_1306', 'ap_924', 'sidechain_NumBridgeheadAtoms', 'polyBERT_198', 'tt_1945', 'polyBERT_332', 'tt_2', 'ap_368', 'tt_1448', 'mfp_1060', 'tt_1253', 'tt_482', 'ap_1594', 'mfp_1219', 'polyBERT_181', 'tt_471', 'ap_1999', 'ap_121', 'ap_1126', 'ap_552', 'tt_227', 'polyBERT_28', 'tt_258', 'polyBERT_240', 'mfp_1444', 'tt_1768', 'ap_1796', 'tt_1', 'tt_624', 'polyBERT_111', 'tt_428', 'mfp_968', 'ap_40', 'ap_822', 'mfp_37', 'halogen_count', 'tt_984', 'ap_816', 'tt_1356', 'tt_2032', 'maccs_52', 'polyBERT_361', 'tt_1942', 'mfp_837', 'ap_1276', 'tt_250', 'ap_724', 'mfp_2016', 'tt_82', 'mfp_871', 'tt_397', 'gasteiger_charge_max_pos', 'ap_604', 'tt_252',