In [42]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [44]:
import numpy as np
import logging
from polymetrix.datasets.curated_tg_dataset import CuratedGlassTempDataset
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, KFold
from mofdscribe.splitters.splitters import LOCOCV
from polymetrix.splitters.splitters import TgSplitter

In [45]:
# --- Configuration ---
VERSION = "v11"
URL = "https://zenodo.org/records/14945126/files/LAMALAB_CURATED_Tg_structured.csv?download=1"
RANDOM_STATE = 42  # seed reused by every splitter and model in this notebook

# Emit INFO-level records as "LEVEL: message".
logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)

# --- Dataset ---
dataset = CuratedGlassTempDataset(
    version=VERSION,
    url=URL,
    feature_levels=["sidechainlevel", "backbonelevel", "fullpolymerlevel"],
)

# --- Features and labels for every sample ---
all_idx = np.arange(len(dataset))
X = dataset.get_features(idx=all_idx)
# ravel() flattens the label array to 1-D.
y = dataset.get_labels(idx=all_idx, label_names=["labels.Exp_Tg(K)"]).ravel()

# --- Dataset summary ---
logging.info(f"Number of samples: {len(dataset)}")
logging.info(f"Feature columns: {dataset.available_features}")
logging.info(f"Active feature levels: {dataset.active_feature_levels}")

INFO: Number of samples: 7367
INFO: Feature columns: ['sidechainlevel.features.num_atoms_sidechainfeaturizer_sum', 'sidechainlevel.features.num_atoms_sidechainfeaturizer_mean', 'sidechainlevel.features.num_atoms_sidechainfeaturizer_max', 'sidechainlevel.features.num_atoms_sidechainfeaturizer_min', 'sidechainlevel.features.numsidechainfeaturizer', 'sidechainlevel.features.sidechainlength_to_star_attachment_distance_ratio_mean', 'sidechainlevel.features.sidechainlength_to_star_attachment_distance_ratio_min', 'sidechainlevel.features.sidechainlength_to_star_attachment_distance_ratio_max', 'sidechainlevel.features.sidechainlength_to_star_attachment_distance_ratio_sum', 'sidechainlevel.features.star_to_sidechain_min_distance_mean', 'sidechainlevel.features.star_to_sidechain_min_distance_min', 'sidechainlevel.features.star_to_sidechain_min_distance_max', 'sidechainlevel.features.star_to_sidechain_min_distance_sum', 'sidechainlevel.features.num_diverse_sidechains', 'sidechainlevel.features.ba

### Evaluation and modeling functions

In [49]:
def train_and_evaluate(X_train, X_test, y_train, y_test):
    """Fit a new gradient-boosting regressor and score it on held-out data.

    A fresh ``GradientBoostingRegressor`` seeded with ``RANDOM_STATE`` is
    fitted on every call; nothing is cached between calls.

    Args:
        X_train: Features used for fitting.
        X_test: Features to predict on.
        y_train: Targets corresponding to ``X_train``.
        y_test: Reference targets corresponding to ``X_test``.

    Returns:
        The mean absolute error between ``y_test`` and the predictions.
    """
    regressor = GradientBoostingRegressor(random_state=RANDOM_STATE).fit(X_train, y_train)
    return mean_absolute_error(y_test, regressor.predict(X_test))

def log_splits(X_train, X_valid, X_test):
    """Log the number of samples in each split at INFO level.

    Args:
        X_train: Sized collection of training samples.
        X_valid: Sized collection of validation samples, or ``None`` when
            there is no validation split (reported as 0 samples).
        X_test: Sized collection of test samples.
    """
    logging.info(f"Training set: {len(X_train)} samples")
    # A missing validation split is reported as zero rather than raising.
    n_valid = 0 if X_valid is None else len(X_valid)
    logging.info(f"Validation set: {n_valid} samples")
    logging.info(f"Test set: {len(X_test)} samples")

### 1: Random Split 
Traditional train/valid/test split

In [50]:
# Hold out 30 % of the data, then halve that hold-out into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, random_state=RANDOM_STATE, test_size=0.3
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, random_state=RANDOM_STATE, test_size=0.5
)

log_splits(X_train, X_valid, X_test)

# Score validation first, then test; each call fits its own model on X_train.
valid_mae, test_mae = (
    train_and_evaluate(X_train, X_eval, y_train, y_eval)
    for X_eval, y_eval in ((X_valid, y_valid), (X_test, y_test))
)
logging.info(f"Validation MAE: {valid_mae:.2f}, Test MAE: {test_mae:.2f}")

INFO: Training set: 5156 samples
INFO: Validation set: 1105 samples
INFO: Test set: 1106 samples
INFO: Validation MAE: 32.40, Test MAE: 34.29


### Random k-fold split

In [51]:
# Shuffled 5-fold cross-validation over all samples.
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

cv_scores = []
for fold, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
    X_fit, X_eval = X[train_idx], X[test_idx]
    y_fit, y_eval = y[train_idx], y[test_idx]
    fold_mae = train_and_evaluate(X_fit, X_eval, y_fit, y_eval)
    logging.info(f"Fold {fold} MAE: {fold_mae:.2f}")
    cv_scores.append(fold_mae)

# Mean and (population) standard deviation of the per-fold MAEs.
logging.info(f"CV MAE: {np.mean(cv_scores):.2f} ± {np.std(cv_scores):.2f}")


INFO: Fold 1 MAE: 33.48
INFO: Fold 2 MAE: 33.85
INFO: Fold 3 MAE: 32.67
INFO: Fold 4 MAE: 34.19
INFO: Fold 5 MAE: 32.32
INFO: CV MAE: 33.30 ± 0.70


### 2: Leave-cluster-out cross-validation

In [52]:
# Leave-cluster-out splitter over all available features.
loco = LOCOCV(
    ds=dataset,
    feature_names=dataset.available_features,
    scaled=True,
    n_pca_components=3,
    random_state=RANDOM_STATE,
)

# One train/valid/test partition, returned as index collections into X and y.
train_idx, valid_idx, test_idx = loco.train_valid_test_split()
X_fit, y_fit = X[train_idx], y[train_idx]
log_splits(X_fit, X[valid_idx], X[test_idx])

# Same training slice for both evaluations; each call fits a fresh model.
valid_mae = train_and_evaluate(X_fit, X[valid_idx], y_fit, y[valid_idx])
test_mae = train_and_evaluate(X_fit, X[test_idx], y_fit, y[test_idx])
logging.info(f"LOCOCV MAE: Valid {valid_mae:.2f}, Test {test_mae:.2f}")

INFO: Training set: 3194 samples
INFO: Validation set: 980 samples
INFO: Test set: 3193 samples
INFO: LOCOCV MAE: Valid 48.24, Test 42.85


### 2: Leave-cluster-out cross-validation
k-fold split based on clusters

In [53]:
# Leave-cluster-out 5-fold cross-validation.
# NOTE(review): the number of folds is set by `k_fold(k=5)` below;
# `n_pca_components` presumably only controls the PCA dimensionality used for
# clustering, so it need not equal the fold count. Confirm against the LOCOCV
# documentation.
loco_cv = LOCOCV(
    ds=dataset,
    feature_names=dataset.available_features,
    scaled=True,
    n_pca_components=5,
    random_state=RANDOM_STATE,
)

cv_scores = []
for fold, (train_idx, test_idx) in enumerate(loco_cv.k_fold(k=5), start=1):
    X_fit, X_eval = X[train_idx], X[test_idx]
    y_fit, y_eval = y[train_idx], y[test_idx]
    fold_mae = train_and_evaluate(X_fit, X_eval, y_fit, y_eval)
    logging.info(f"LOCOCV Fold {fold} MAE: {fold_mae:.2f}")
    cv_scores.append(fold_mae)

logging.info(f"LOCOCV 5-Fold MAE: {np.mean(cv_scores):.2f} ± {np.std(cv_scores):.2f}")


INFO: LOCOCV Fold 1 MAE: 35.52
INFO: LOCOCV Fold 2 MAE: 39.54
INFO: LOCOCV Fold 3 MAE: 37.36
INFO: LOCOCV Fold 4 MAE: 56.58
INFO: LOCOCV Fold 5 MAE: 37.71
INFO: LOCOCV 5-Fold MAE: 41.34 ± 7.73


### 3: TgSplitter

In [54]:
# Tg-based splitter; tg_q holds 5 evenly spaced quantile levels from 0 to 1.
tg_splitter = TgSplitter(
    ds=dataset,
    tg_q=np.linspace(0, 1, 5),
    random_state=RANDOM_STATE,
    shuffle=True,
)

# NOTE(review): 70 % train / 10 % valid is requested, but the sizes logged by
# a previous run of this cell were 3693 / 1842 / 1832 (about 50/25/25 %).
# Presumably whole Tg-quantile groups are assigned to a split — verify against
# the TgSplitter implementation.
train_idx, valid_idx, test_idx = tg_splitter.train_valid_test_split(
    frac_train=0.7,
    frac_valid=0.1,
)
X_fit, y_fit = X[train_idx], y[train_idx]
log_splits(X_fit, X[valid_idx], X[test_idx])

# Same training slice for both evaluations; each call fits a fresh model.
valid_mae = train_and_evaluate(X_fit, X[valid_idx], y_fit, y[valid_idx])
test_mae = train_and_evaluate(X_fit, X[test_idx], y_fit, y[test_idx])
logging.info(f"TgSplitter MAE: Valid {valid_mae:.2f}, Test {test_mae:.2f}")

INFO: Training set: 3693 samples
INFO: Validation set: 1842 samples
INFO: Test set: 1832 samples
INFO: TgSplitter MAE: Valid 34.13, Test 88.98


### 3: TgSplitter
k-fold split

In [55]:
# TgSplitter leave-one-group-out cross-validation.
# tg_q has 6 evenly spaced quantile levels, i.e. 5 intervals between them.
tg_splitter_cv = TgSplitter(
    ds=dataset,
    tg_q=np.linspace(0, 1, 6),
    random_state=RANDOM_STATE,
    shuffle=True,
)

# NOTE(review): `_get_groups` is a private method of the splitter; a public
# API (e.g. `k_fold`, as used for LOCOCV) may be preferable — confirm that it
# yields the same folds before switching.
groups = tg_splitter_cv._get_groups()
unique_groups = np.unique(groups)

# Each distinct group label serves as the test fold exactly once.
cv_scores = []
for fold, test_group in enumerate(unique_groups, start=1):
    test_mask = groups == test_group
    train_mask = groups != test_group

    fold_mae = train_and_evaluate(X[train_mask], X[test_mask], y[train_mask], y[test_mask])
    logging.info(f"TgSplitter Fold {fold} MAE: {fold_mae:.2f}")
    cv_scores.append(fold_mae)

logging.info(f"TgSplitter 5-Fold MAE: {np.mean(cv_scores):.2f} ± {np.std(cv_scores):.2f}")

INFO: TgSplitter Fold 1 MAE: 89.37
INFO: TgSplitter Fold 2 MAE: 33.59
INFO: TgSplitter Fold 3 MAE: 47.56
INFO: TgSplitter Fold 4 MAE: 45.61
INFO: TgSplitter Fold 5 MAE: 92.25
INFO: TgSplitter 5-Fold MAE: 61.68 ± 24.28
