# Graph Neural Network Approach

This notebook implements a graph neural network (GNN) baseline using molecular graph representations. The GNN directly learns from atomic connectivity and bond information, augmented with Morgan fingerprints and MACCS keys for enhanced chemical feature representation.

Multi-task learning is employed to predict all five polymer properties simultaneously, leveraging shared molecular patterns across targets whilst accommodating the small dataset constraints.

In [None]:
import importnb
import joblib
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch_molecule.predictor.gnn import GNNMolecularPredictor
from tqdm import tqdm

In [None]:
# Property columns predicted jointly by the multi-task model; the order here defines
# the column order of every target matrix built from this list.
targets = ["Tg", "FFV", "Tc", "Density", "Rg"]

# Load the gradient-boosting notebook as a module so its data loader can be reused.
# NOTE(review): include_non_defs=False presumably restricts execution to definitions
# only - confirm against the importnb documentation.
grad_boosting = importnb.Notebook.load_file(
    "01_gradient_boosting.ipynb",
    include_non_defs=False,
)
train_data = grad_boosting.load_train_data()

In [None]:
def train_gnn(data: pd.DataFrame, validate: bool) -> tuple:
    """Trains a multi-task GNN-based predictor.

    Args:
        data: Training frame holding a "SMILES" column and one column per entry of the
            module-level `targets` list.
        validate: Whether to hold out 20% of `data` as a validation set that is passed to
            `fit` to control early stopping. When False, all rows are used for training
            and a shorter epoch/patience schedule is used.

    Returns:
        A tuple containing the trained GNNMolecularPredictor and the StandardScaler fitted
        on the training targets. Predictions are in scaled units; apply the scaler's
        `inverse_transform` to map them back to the original target units.
    """
    if validate:
        train_idx, val_idx = train_test_split(
            data.index,
            test_size=0.2,
            random_state=42
        )
        train_part, val_part = data.loc[train_idx], data.loc[val_idx]
    else:
        train_part, val_part = data, None

    # The scaler is fitted on the training rows only, so the validation targets are
    # transformed with training statistics rather than contributing to them.
    scaler = StandardScaler()
    fit_args = [list(train_part["SMILES"]), scaler.fit_transform(train_part[targets])]
    if val_part is not None:
        fit_args += [list(val_part["SMILES"]), scaler.transform(val_part[targets])]

    predictor = GNNMolecularPredictor(
        # Derived from the target list so the output size cannot drift from the labels.
        num_task=len(targets),
        num_layer=3,
        hidden_size=160,
        epochs=400 if validate else 140,
        batch_size=64,
        patience=120 if validate else 50,
        augmented_feature=["morgan", "maccs"],
        use_lr_scheduler=True
    )
    predictor.fit(*fit_args)

    return predictor, scaler

In [None]:
# Out-of-fold predictions: every row is predicted by a model that never saw it in training.
# Columns are pre-allocated as float so each fold can be written with one block assignment.
oof_preds = pd.DataFrame(index=train_data.index, columns=targets, dtype=float)
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in tqdm(
    k_fold.split(train_data),
    # split() is a generator, so tqdm needs the fold count to show progress and an ETA.
    total=k_fold.get_n_splits(),
    desc="Evaluating GNN model"
):
    train_subset, eval_subset = train_data.iloc[train_index], train_data.iloc[test_index]
    predictor, scaler = train_gnn(train_subset, validate=False)
    predictions = predictor.predict(list(eval_subset["SMILES"]))["prediction"]
    # The model was trained on standardised targets; map predictions back to original units.
    predictions = scaler.inverse_transform(predictions)

    # Prediction columns follow the order of `targets`, which is also the scaler's column order.
    oof_preds.loc[eval_subset.index, targets] = predictions

In [None]:
# Report the out-of-fold mean absolute error per target, scored only on rows
# where the ground-truth label is present.
for target in targets:
    not_nan = train_data[target].notna()
    # Arguments follow the documented (y_true, y_pred) order.
    mae = mean_absolute_error(train_data.loc[not_nan, target], oof_preds.loc[not_nan, target])
    print(f"{target} mae={mae}")

In [None]:
import os

# Create the output directory before training so a missing folder cannot cause the
# save step to fail after the full training run has completed.
os.makedirs("../models/gnn", exist_ok=True)

# Final model: trained on the full dataset with no validation hold-out.
predictor, scaler = train_gnn(train_data, validate=False)

# The scaler is persisted alongside the model because it is needed to map
# predictions back to the original target units.
predictor.save_to_local("../models/gnn/grin_predictor.pt")
joblib.dump(scaler, "../models/gnn/scaler.pkl")