In [66]:
import pandas as pd

# Dataset label -> CSV of classification scores (paths under benchmarking/)
file_paths = dict(
    [
        ("ogbg-molbace", "benchmarking/classification-scores-ogbg-molbace.csv"),
        ("ogbg-molbbbp", "benchmarking/classification-scores-ogbg-molbbbp.csv"),
        ("ogbg-molhiv", "benchmarking/classification-scores-ogbg-molhiv.csv"),
    ]
)

# One DataFrame per score file, keyed by the same dataset label
datasets = {label: pd.read_csv(file_paths[label]) for label in file_paths}


## RandomForestClassifier

In [21]:
# Pool the RandomForestClassifier validation scores per descriptor over every
# loaded dataset, then average them.
score_column = "RandomForestClassifier_valid_mean"

# descriptor name -> list of non-missing scores, in dataset order then row order
descriptor_scores = {}

for df in datasets.values():
    # A file lacking the descriptor name or this model's score column contributes nothing
    if "desc_name" not in df.columns or score_column not in df.columns:
        continue

    # Iterate the two columns directly rather than building a Series per row
    # with iterrows(); rows whose score is missing are dropped up front
    has_score = df[score_column].notna()
    for descriptor, valid_score in zip(df.loc[has_score, "desc_name"], df.loc[has_score, score_column]):
        descriptor_scores.setdefault(descriptor, []).append(valid_score)

# Arithmetic mean of the pooled scores; a list only exists once it has an entry,
# so the division can never be by zero
avg_scores = {desc: sum(scores) / len(scores) for desc, scores in descriptor_scores.items()}


In [13]:
# Tabulate the {descriptor: average score} mapping and rank it, best first.
# NOTE(review): avg_scores is whatever the most recently run aggregation cell
# produced - confirm the intended classifier cell ran just before this one.
avg_scores_df = (
    pd.DataFrame(list(avg_scores.items()), columns=["Descriptor", "Average Validation Score"])
    .sort_values("Average Validation Score", ascending=False)
)

# Show the ten highest-scoring descriptors
print(avg_scores_df.head(10))


                   Descriptor  Average Validation Score
3            molecular_weight                  0.827784
14               wiener_index                  0.782365
2            heavy_atom_count                  0.657959
11            petitjean_index                  0.586257
8             balaban_j_index                  0.569239
5   number_of_rotatable_bonds                  0.565097
16            zagreb_index_m2                  0.525781
12            polarity_number                  0.525134
4             number_of_rings                  0.488963
15            zagreb_index_m1                  0.483250


## LogisticRegressionCV

In [22]:
# Pool the LogisticRegressionCV validation scores per descriptor over every
# loaded dataset, then average them.
score_column = "LogisticRegressionCV_valid_mean"

# descriptor name -> list of non-missing scores, in dataset order then row order
descriptor_scores = {}

for df in datasets.values():
    # A file lacking the descriptor name or this model's score column contributes nothing
    if "desc_name" not in df.columns or score_column not in df.columns:
        continue

    # Iterate the two columns directly rather than building a Series per row
    # with iterrows(); rows whose score is missing are dropped up front
    has_score = df[score_column].notna()
    for descriptor, valid_score in zip(df.loc[has_score, "desc_name"], df.loc[has_score, score_column]):
        descriptor_scores.setdefault(descriptor, []).append(valid_score)

# Arithmetic mean of the pooled scores; a list only exists once it has an entry,
# so the division can never be by zero
avg_scores = {desc: sum(scores) / len(scores) for desc, scores in descriptor_scores.items()}


In [20]:
# Tabulate the {descriptor: average score} mapping and rank it, best first.
# NOTE(review): avg_scores is whatever the most recently run aggregation cell
# produced - confirm the intended classifier cell ran just before this one.
avg_scores_df = (
    pd.DataFrame(list(avg_scores.items()), columns=["Descriptor", "Average Validation Score"])
    .sort_values("Average Validation Score", ascending=False)
)

# Show the fifteen highest-scoring descriptors
print(avg_scores_df.head(15))

                   Descriptor  Average Validation Score
13                     radius                  0.785841
4             number_of_rings                  0.766025
9                    diameter                  0.749548
8             balaban_j_index                  0.744723
12            polarity_number                  0.736545
3            molecular_weight                  0.726589
14               wiener_index                  0.720854
16            zagreb_index_m2                  0.719111
10       graph_distance_index                  0.711621
6            total_atom_count                  0.702861
1                  bond_count                  0.698654
7        average_wiener_index                  0.697082
15            zagreb_index_m1                  0.690814
2            heavy_atom_count                  0.674333
5   number_of_rotatable_bonds                  0.633306


## LGBMClassifier

In [23]:
# Pool the LGBMClassifier validation scores per descriptor over every loaded
# dataset, then average them.
score_column = "LGBMClassifier_valid_mean"

# descriptor name -> list of non-missing scores, in dataset order then row order
descriptor_scores = {}

for df in datasets.values():
    # A file lacking the descriptor name or this model's score column contributes nothing
    if "desc_name" not in df.columns or score_column not in df.columns:
        continue

    # Iterate the two columns directly rather than building a Series per row
    # with iterrows(); rows whose score is missing are dropped up front
    has_score = df[score_column].notna()
    for descriptor, valid_score in zip(df.loc[has_score, "desc_name"], df.loc[has_score, score_column]):
        descriptor_scores.setdefault(descriptor, []).append(valid_score)

# Arithmetic mean of the pooled scores; a list only exists once it has an entry,
# so the division can never be by zero
avg_scores = {desc: sum(scores) / len(scores) for desc, scores in descriptor_scores.items()}


In [17]:
# Tabulate the {descriptor: average score} mapping and rank it, best first.
# NOTE(review): avg_scores is whatever the most recently run aggregation cell
# produced - confirm the intended classifier cell ran just before this one.
avg_scores_df = (
    pd.DataFrame(list(avg_scores.items()), columns=["Descriptor", "Average Validation Score"])
    .sort_values("Average Validation Score", ascending=False)
)

# Show the ten highest-scoring descriptors
print(avg_scores_df.head(10))

              Descriptor  Average Validation Score
3       molecular_weight                  0.795904
2       heavy_atom_count                  0.790673
14          wiener_index                  0.730853
1             bond_count                  0.714108
8        balaban_j_index                  0.690009
10  graph_distance_index                  0.686016
6       total_atom_count                  0.656025
4        number_of_rings                  0.649885
15       zagreb_index_m1                  0.636063
16       zagreb_index_m2                  0.587504


In [67]:
import numpy as np

# Build one table of per-descriptor average validation scores, with a column
# for each of the three classifiers, pooled over every loaded dataset.

# Short key used inside descriptor_scores -> score column read from each CSV
score_columns = {
    "RF": "RandomForestClassifier_valid_mean",
    "LR": "LogisticRegressionCV_valid_mean",
    "LGBM": "LGBMClassifier_valid_mean",
}
# Short key -> column name used in the resulting table
output_labels = {
    "RF": "RandomForestClassifier",
    "LR": "LogisticRegressionCV",
    "LGBM": "LGBMClassifier",
}

# descriptor -> {"RF": [...], "LR": [...], "LGBM": [...]} of non-missing scores
descriptor_scores = {}

for df in datasets.values():
    if "desc_name" not in df.columns:
        continue  # nothing in this file is keyed by descriptor

    descriptors = df["desc_name"].tolist()

    # Register every descriptor seen, even when all of its scores are missing,
    # so it still shows up in the table with NaN averages
    for descriptor in descriptors:
        if descriptor not in descriptor_scores:
            descriptor_scores[descriptor] = {key: [] for key in score_columns}

    for key, score_column in score_columns.items():
        if score_column not in df.columns:
            continue  # a model absent from this file is treated as all-missing
        for descriptor, score in zip(descriptors, df[score_column].tolist()):
            if pd.notna(score):
                descriptor_scores[descriptor][key].append(score)

# Compute averages; a model with no recorded score for a descriptor yields NaN
avg_scores = {
    descriptor: {
        output_labels[key]: np.mean(scores[key]) if scores[key] else np.nan
        for key in score_columns
    }
    for descriptor, scores in descriptor_scores.items()
}

# Convert to DataFrame. Passing the columns explicitly keeps the four-column
# layout even when no descriptor was collected; the previous
# from_dict(...).reset_index() + positional rename raised a length-mismatch
# ValueError in that case.
avg_scores_df = pd.DataFrame(
    [{"Descriptor": descriptor, **means} for descriptor, means in avg_scores.items()],
    columns=["Descriptor"] + [output_labels[key] for key in score_columns],
)


avg_scores_df

Unnamed: 0,Descriptor,RandomForestClassifier,LogisticRegressionCV,LGBMClassifier
0,average_molecular_weight,0.417238,0.619626,0.496544
1,bond_count,0.460682,0.698654,0.714108
2,heavy_atom_count,0.657959,0.674333,0.790673
3,molecular_weight,0.827784,0.726589,0.795904
4,number_of_rings,0.488963,0.766025,0.649885
5,number_of_rotatable_bonds,0.565097,0.633306,0.46472
6,total_atom_count,0.417489,0.702861,0.656025
7,average_wiener_index,0.420735,0.697082,0.363988
8,balaban_j_index,0.569239,0.744723,0.690009
9,diameter,0.331777,0.749548,0.392146


In [68]:
# Minimum average score a row must strictly exceed for every classifier
min_avg_score = 0.55
classifier_columns = ["RandomForestClassifier", "LogisticRegressionCV", "LGBMClassifier"]

# Keep only the rows whose three averages are all above the threshold.
# A NaN average compares False, so such rows are excluded.
filtered_df = avg_scores_df[(avg_scores_df[classifier_columns] > min_avg_score).all(axis=1)]

filtered_df

Unnamed: 0,Descriptor,RandomForestClassifier,LogisticRegressionCV,LGBMClassifier
2,heavy_atom_count,0.657959,0.674333,0.790673
3,molecular_weight,0.827784,0.726589,0.795904
8,balaban_j_index,0.569239,0.744723,0.690009
14,wiener_index,0.782365,0.720854,0.730853


In [56]:
import pandas as pd

# Dataset label -> CSV of classification scores, resolved against the current
# working directory (no benchmarking/ prefix here).
# NOTE(review): this rebinds file_paths and datasets from the earlier load cell;
# the cell that follows looks for an fp_name column in these frames - confirm
# these are the fingerprint result files.
file_paths = dict(
    [
        ("ogbg-molbace", "classification-scores-ogbg-molbace.csv"),
        ("ogbg-molbbbp", "classification-scores-ogbg-molbbbp.csv"),
        ("ogbg-molhiv", "classification-scores-ogbg-molhiv.csv"),
    ]
)

# One DataFrame per score file, keyed by the same dataset label
datasets = {label: pd.read_csv(file_paths[label]) for label in file_paths}


In [63]:
import numpy as np
# Build one table of per-fingerprint average scores, with a column for each of
# the three classifiers, pooled over every loaded dataset.

# Short key used inside fingerprint_scores -> score column read from each CSV
score_columns = {
    "RF": "RandomForestClassifier_mean",
    "LR": "LogisticRegressionCV_mean",
    "LGBM": "LGBMClassifier_mean",
}
# Short key -> column name used in the resulting table
output_labels = {
    "RF": "RandomForestClassifier",
    "LR": "LogisticRegressionCV",
    "LGBM": "LGBMClassifier",
}

# fingerprint -> {"RF": [...], "LR": [...], "LGBM": [...]} of non-missing scores
fingerprint_scores = {}

for df in datasets.values():
    if "fp_name" not in df.columns:
        continue  # nothing in this file is keyed by fingerprint

    fingerprints = df["fp_name"].tolist()

    # Register every fingerprint seen, even when all of its scores are missing,
    # so it still shows up in the table with NaN averages
    for fingerprint in fingerprints:
        if fingerprint not in fingerprint_scores:
            fingerprint_scores[fingerprint] = {key: [] for key in score_columns}

    for key, score_column in score_columns.items():
        if score_column not in df.columns:
            continue  # a model absent from this file is treated as all-missing
        for fingerprint, score in zip(fingerprints, df[score_column].tolist()):
            if pd.notna(score):
                fingerprint_scores[fingerprint][key].append(score)

# Compute averages; a model with no recorded score for a fingerprint yields NaN
avg_scores = {
    fingerprint: {
        output_labels[key]: np.mean(scores[key]) if scores[key] else np.nan
        for key in score_columns
    }
    for fingerprint, scores in fingerprint_scores.items()
}

# Convert to DataFrame. Passing the columns explicitly keeps the four-column
# layout even when no fingerprint was collected; the previous
# from_dict(...).reset_index() + positional rename raised a length-mismatch
# ValueError in that case.
avg_scores_df = pd.DataFrame(
    [{"Fingerprint": fingerprint, **means} for fingerprint, means in avg_scores.items()],
    columns=["Fingerprint"] + [output_labels[key] for key in score_columns],
)


avg_scores_df

Unnamed: 0,Fingerprint,RandomForestClassifier,LogisticRegressionCV,LGBMClassifier
0,AtomPairFingerprint,0.712977,0.721293,0.622682
1,E3FPFingerprint,0.671047,0.650832,0.615121
2,TopologicalTorsionFingerprint,0.642039,0.658884,0.602166
