# Gradient Boosting

This notebook implements a gradient boosting baseline for the NeurIPS polymer competition. The approach uses AutoGluon to automatically select and tune ensemble methods on molecular features extracted from SMILES strings. Due to a small dataset, only gradient boosting (and limited Random Forest) models are used.

The baseline combines:
- RDKit molecular descriptors
- Morgan fingerprints (radius=2, 256 bits)
- MACCS keys fingerprints

Features are then pruned to reduce dimensionality and avoid overfitting on the small dataset.

In [None]:
import pandas as pd
import numpy as np
from rdkit.Chem import MolFromSmiles, MolToSmiles, Descriptors
from rdkit.Chem.rdMolDescriptors import GetMACCSKeysFingerprint
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
from typing import Union
from tqdm import tqdm
from autogluon.tabular import TabularPredictor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

In [None]:
def make_smile_canonical(smile: str) -> Union[str, float]:
    """Convert a SMILES string to its canonical form so duplicates can be detected.

    Args:
        smile: The SMILES string to canonicalise.

    Returns:
        The canonical SMILES string, or NaN (a float) if the input cannot be parsed.
    """
    try:
        mol = MolFromSmiles(smile)
        # RDKit reports an unparsable SMILES by returning None rather than raising.
        if mol is None:
            return np.nan
        return MolToSmiles(mol, canonical=True)
    except Exception:
        # Non-string input (e.g. a missing CSV cell) makes RDKit raise; treat it as
        # unparsable. Unlike a bare `except`, this lets KeyboardInterrupt propagate.
        return np.nan

In [None]:
def read_file(path: str) -> pd.DataFrame:
    """Read a CSV file and clean up its SMILES column.

    Every SMILES value is passed through make_smile_canonical; rows whose SMILES
    came back as NaN are dropped, and only the first row per SMILES is kept.
    """
    frame = pd.read_csv(path)
    frame["SMILES"] = frame["SMILES"].apply(make_smile_canonical)

    parsable = frame.dropna(subset=["SMILES"])
    return parsable.drop_duplicates(subset=["SMILES"])

In [None]:
def load_train_data() -> pd.DataFrame:
    """Load train.csv and merge in the supplementary datasets.

    Rows are matched on the SMILES column with an outer join, so molecules that appear
    only in a supplementary file are kept. When both sources provide a value for the
    same property, the train.csv value wins; supplementary values only fill gaps.

    Returns:
        The merged data with the temporary "_supp" columns and the "id" column removed.
    """
    primary_data = read_file("../data/train.csv")
    supplement_paths = [
        "../data/train_supplement/dataset1.csv",
        "../data/train_supplement/dataset3.csv",
        "../data/train_supplement/dataset4.csv",
    ]

    for path in supplement_paths:
        sec_data = read_file(path)
        if "TC_mean" in sec_data.columns:
            sec_data = sec_data.rename(columns={"TC_mean": "Tc"})

        primary_data = primary_data.merge(sec_data, on="SMILES", how="outer", suffixes=("", "_supp"))

        merged_supp_columns = []
        for col in sec_data.columns:
            supp_col = f"{col}_supp"
            # A "_supp" twin only exists when the column was present on both sides.
            if col == "SMILES" or supp_col not in primary_data.columns:
                continue
            primary_data[col] = primary_data[col].fillna(primary_data[supp_col])
            merged_supp_columns.append(supp_col)

        # Drop the temporary columns straight away: if they lingered, a later supplement
        # sharing the same property would produce a second "<col>_supp" and collide.
        primary_data = primary_data.drop(columns=merged_supp_columns)

    return primary_data.drop(columns=["id"])



### Feature extraction rationale

Features need to balance predictive power and dimensionality due to small datasets for some properties (~600). This is why, e.g., Mordred descriptors were not included.

- RDKit descriptors capture fundamental molecular properties (MW, logP, etc.)
- Morgan fingerprints encode local structural patterns around atoms
- MACCS keys express interpretable structural patterns

Low-variance features were removed, and within each group of highly correlated features all but one were removed, to prevent overfitting.

In [None]:
morgan_gen = GetMorganGenerator(radius=2, fpSize=256)

def extract_features(smile: str) -> pd.Series:
    """Build the baseline feature vector for one SMILES string.

    The vector is the concatenation, in this order, of every descriptor in
    Descriptors.descList, the MACCS keys fingerprint bits, and the Morgan
    fingerprint bits from the module-level generator (radius=2, fpSize=256).
    The feature set is kept deliberately small to limit overfitting.

    Returns:
        A Series of feature values in which infinite values and values whose
        magnitude exceeds 1e10 have been replaced by NaN.
    """
    molecule = MolFromSmiles(smile)

    descriptor_values = [compute(molecule) for _, compute in Descriptors.descList]
    maccs_bits = list(GetMACCSKeysFingerprint(molecule))
    morgan_bits = list(morgan_gen.GetFingerprint(molecule))
    raw = pd.Series(descriptor_values + maccs_bits + morgan_bits)

    # XGBoost does not perform optimally with infinite-like values, so blank them out.
    too_large = raw.abs() > 1e10
    bounded = raw.mask(too_large, np.nan)
    return bounded.replace([np.inf, -np.inf], np.nan)

In [None]:
def get_feature_mask(features: pd.DataFrame, variance_threshold: float = 0.01, correlation_threshold: float = 0.95) -> pd.Series:
    """Build a boolean mask selecting the non-redundant feature columns.

    A column is rejected when its variance is below ``variance_threshold``. Among the
    surviving columns, whenever two have an absolute correlation above
    ``correlation_threshold``, the one that appears later in the frame is rejected.

    Returns:
        A boolean Series indexed by the columns of ``features``; True means keep.
    """
    informative = features.var() >= variance_threshold
    keep = pd.Series(True, index=features.columns) & informative

    abs_corr = features.loc[:, keep].corr().abs()

    # Look only strictly above the diagonal so each pair is considered once
    # and a column is never compared with itself.
    above_diagonal = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    too_similar = above_diagonal & (abs_corr.to_numpy() > correlation_threshold)

    # The column position of each hit identifies the later member of the pair.
    _, later_positions = np.nonzero(too_similar)
    for label in abs_corr.columns[later_positions].unique():
        keep[label] = False

    return keep

In [None]:
def train_ag(target: str, data: pd.DataFrame, group: str, quality: str, time_limit: int) -> TabularPredictor:
    """Fit an AutoGluon predictor for one target, with settings adapted to small datasets.

    Only rows where ``target`` is present are used, and the other four target columns
    are removed before fitting. With fewer than 1000 usable rows, the "NN" and "FASTAI"
    model types are excluded and stacking is disabled.

    Args:
        target: the column to predict.
        data: the frame holding features and target columns.
        group: the folder where the model is saved (inside models/).
        quality: forwarded to ``fit`` as ``presets``.
        time_limit: forwarded to ``fit`` as ``time_limit``.

    Returns:
        The fitted predictor.
    """
    all_targets = ["Tg", "FFV", "Tc", "Density", "Rg"]
    labelled = data[data[target].notna()]
    columns_to_hide = [name for name in all_targets if name != target]

    if labelled.shape[0] < 1000:
        excluded, stack_levels = ["NN", "FASTAI"], 0
    else:
        excluded, stack_levels = [], 1

    predictor = TabularPredictor(
        label=target,
        eval_metric="mean_absolute_error",
        path=f"../models/{group}/{target}"
    )
    predictor.fit(
        labelled.drop(columns=columns_to_hide),
        presets=quality,
        time_limit=time_limit,
        excluded_model_types=excluded,
        num_stack_levels=stack_levels,
    )

    return predictor

### Evaluation and training

5-fold CV mean absolute error was used to evaluate models as it is a good proxy for the metric used by the competition. Out-of-fold predictions are captured which could assist meta-learning later (though this could lead to overfitting due to very small samples).

Once a satisfactory setup was reached, the model was retrained on all data. Breaking convention is justified by the small dataset size and need to maximise training data for final predictions.

In [None]:
# Load the merged training table, one row per molecule.
train_data = load_train_data()

# Featurise every molecule, then decide which feature columns are worth keeping.
features = train_data["SMILES"].apply(extract_features)
mask = get_feature_mask(features)

# Swap the raw SMILES column for the selected numeric features.
train_data = pd.concat(
    [train_data.drop(columns=["SMILES"]), features.loc[:, mask]],
    axis=1,
)

In [None]:
# Out-of-fold predictions: one column per target, aligned with train_data's index.
oof_preds = pd.DataFrame(index=train_data.index)
targets = ["Tg", "FFV", "Tc", "Density", "Rg"]

for target in targets:
    k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

    for train_index, test_index in tqdm(
        k_fold.split(train_data),
        total=k_fold.get_n_splits(),  # split() is a generator, so tqdm cannot infer the length
        desc=f"Evaluating {target} AutoGluon model"
    ):
        train_subset, eval_subset = train_data.iloc[train_index], train_data.iloc[test_index]
        predictor = train_ag(target, train_subset, "autogluon-eval", "medium_quality", 300)
        predictions = predictor.predict(eval_subset.drop(columns=targets))

        oof_preds.loc[eval_subset.index, target] = predictions

    # Most rows have no value for a given target, and mean_absolute_error rejects NaN
    # input, so score only the rows that actually carry a label.
    has_label = train_data[target].notna()
    mae = mean_absolute_error(train_data.loc[has_label, target], oof_preds.loc[has_label, target])
    print(f"Finished evaluating {target}: mae={mae}")