# Model Ensemble Optimisation

This notebook combines predictions from the gradient boosting and graph neural network approaches using weighted averaging. Optimal ensemble weights are determined through minimisation of mean absolute error on out-of-fold predictions.

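A simple convex combination (two non-negative weights per property that sum to 1) is used instead of a learned meta-model, to reduce the risk of overfitting to the out-of-fold predictions.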

In [1]:
import warnings

import numpy as np
import pandas as pd
from scipy.optimize import minimize, minimize_scalar
from sklearn.metrics import mean_absolute_error

In [None]:
# Read the GNN predictions, the training data (source of the true values) and
# the AG predictions, in that order.
gnn_preds, train_data, ag_preds = [
    pd.read_csv(csv_path)
    for csv_path in ('../data/gnn_preds.csv', '../data/train.csv', '../data/ag_preds.csv')
]

# The last five columns of the training data are treated as the target properties.
properties = train_data.columns[-5:]

In [None]:
def optimize_weights(gnn_pred: pd.Series, ag_pred: pd.Series, true_values: pd.Series) -> np.ndarray:
    """Find the convex weights that minimise the MAE of a two-model ensemble.

    The ensemble prediction is ``w * gnn_pred + (1 - w) * ag_pred`` with
    ``0 <= w <= 1``. Because the two weights must sum to 1, the search is
    one-dimensional, and the mean absolute error is a convex, piecewise-linear
    function of ``w``. A bounded scalar minimiser is therefore used instead of
    a gradient-based solver, which is a poor fit for a non-smooth objective.

    Args:
        gnn_pred: GNN predictions.
        ag_pred: AG predictions, in the same row order as ``gnn_pred``.
        true_values: Ground-truth targets, in the same row order.

    Returns:
        Array ``[gnn_weight, ag_weight]``; both weights lie in [0, 1] and
        sum to 1.

    Raises:
        ValueError: If the inputs are empty, differ in shape, or contain
            NaN or infinite values.
    """
    # Work on plain float arrays so rows are paired by position. Arithmetic on
    # pandas Series aligns on index labels, which yields NaN whenever the
    # indices of the inputs differ.
    gnn = np.asarray(gnn_pred, dtype=float)
    ag = np.asarray(ag_pred, dtype=float)
    truth = np.asarray(true_values, dtype=float)

    if not (gnn.shape == ag.shape == truth.shape):
        raise ValueError(
            "gnn_pred, ag_pred and true_values must have the same shape; "
            f"got {gnn.shape}, {ag.shape} and {truth.shape}"
        )
    if truth.size == 0:
        raise ValueError("cannot optimise ensemble weights on empty inputs")
    if not (np.isfinite(gnn).all() and np.isfinite(ag).all() and np.isfinite(truth).all()):
        raise ValueError("inputs must not contain NaN or infinite values")

    def mae_loss(gnn_weight: float) -> float:
        """Computes the mean absolute error for a given GNN weight."""
        ensemble_pred = gnn_weight * gnn + (1.0 - gnn_weight) * ag
        return float(np.mean(np.abs(truth - ensemble_pred)))

    result = minimize_scalar(mae_loss, bounds=(0.0, 1.0), method='bounded')
    if not result.success:
        warnings.warn(
            f"Ensemble weight optimisation did not converge: {result.message}",
            RuntimeWarning,
            stacklevel=2,
        )

    # The bounded search never evaluates the interval endpoints, so compare
    # them explicitly. The even split is listed first so that an exact tie in
    # the loss resolves to equal weights.
    candidates = (0.5, float(result.x), 0.0, 1.0)
    best_gnn_weight = min(candidates, key=mae_loss)
    return np.array([best_gnn_weight, 1.0 - best_gnn_weight])

In [None]:
# Fitted ensemble weights, keyed by property name.
optimal_weights = {}

for prop in properties:
    # Keep only the rows where both models and the training data have a value.
    # NOTE(review): assumes the three frames are row-aligned — confirm.
    mask = gnn_preds[prop].notna() & ag_preds[prop].notna() & train_data[prop].notna()

    if not mask.any():
        # No usable rows: this property gets no entry in optimal_weights.
        continue

    gnn_valid = gnn_preds.loc[mask, prop]
    ag_valid = ag_preds.loc[mask, prop]
    true_valid = train_data.loc[mask, prop]

    weights = optimize_weights(gnn_valid, ag_valid, true_valid)
    optimal_weights[prop] = weights

    print(f"{prop}: GNN weight = {weights[0]:.3f}, AG weight = {weights[1]:.3f}")