Here we use DeepChem/ChemBERTa-77M-MLM as a baseline and uses https://github.com/kaiwenzha/Rank-N-Contrast as a loss

In [2]:
import os
import wandb
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    wandb_key = user_secrets.get_secret("wandb_key")
    wandb.login(key=wandb_key)
    wandb.init(entity='lacemaker', project='openadmet2026')
except:
    pass



In [3]:
from pathlib import Path
OUTPUTDIR = Path("../working")
OUTPUTDIR.mkdir(exist_ok=True)
WHEELDIR = (OUTPUTDIR / "wheels").as_posix()
REQUIREMENTS = (OUTPUTDIR/"requirements.txt").as_posix()


In [4]:
%%writefile $REQUIREMENTS
#scikit-learn
#sklearn-compat
#category-encoders
#cesium
einops
sentence-transformers # == 5.1.0
torch # == 2.6.0 --index-url https://download.pytorch.org/whl/cu128
tabpfn
#transformers
# rdkit

Overwriting ../working/requirements.txt


In [5]:
!nvidia-smi

Mon Jan 19 02:50:02 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.172.08             Driver Version: 570.172.08     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   40C    P0             26W /  250W |       0MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [6]:
!pip download --destination-directory $WHEELDIR -r $REQUIREMENTS
!pip wheel --wheel-dir $WHEELDIR -r $REQUIREMENTS
!pip install --upgrade --no-index --find-links=$WHEELDIR -r $REQUIREMENTS

Collecting einops (from -r ../working/requirements.txt (line 5))
  File was already downloaded /kaggle/working/wheels/einops-0.8.1-py3-none-any.whl
Collecting sentence-transformers (from -r ../working/requirements.txt (line 6))
  File was already downloaded /kaggle/working/wheels/sentence_transformers-5.2.0-py3-none-any.whl
Collecting torch (from -r ../working/requirements.txt (line 7))
  File was already downloaded /kaggle/working/wheels/torch-2.9.1-cp312-cp312-manylinux_2_28_x86_64.whl
Collecting tabpfn (from -r ../working/requirements.txt (line 8))
  File was already downloaded /kaggle/working/wheels/tabpfn-6.3.1-py3-none-any.whl
Collecting transformers<6.0.0,>=4.41.0 (from sentence-transformers->-r ../working/requirements.txt (line 6))
  File was already downloaded /kaggle/working/wheels/transformers-4.57.6-py3-none-any.whl
Collecting tqdm (from sentence-transformers->-r ../working/requirements.txt (line 6))
  File was already downloaded /kaggle/working/wheels/tqdm-4.67.1-py3-none-

In [7]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import shutil
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('../input'):
    for filename in filenames[:5]:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

../input/openadmet2026-data-split/train_folds.csv
../input/openadmet2026-data-split/test_with_augmentations.csv
../input/openadmet2026-data-split/__results__.html
../input/openadmet2026-data-split/__notebook__.ipynb
../input/openadmet2026-data-split/__output__.json


In [None]:
import sentence_transformers as st
st.__version__

In [None]:
import torch
torch.__version__

In [None]:
import os
import random
import numpy as np
import torch
from torch import Tensor


def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")

RANDOM_SEED = 3407  # 42  # 3407
set_seed(RANDOM_SEED)

## Data preparation

In [None]:
import kagglehub
openadmet_data_split_path = kagglehub.notebook_output_download('latticetower/openadmet2026-data-split')

In [1]:
train_df = pd.read_csv(Path(openadmet_data_split_path) / "train_folds.csv")
blind_test_df = pd.read_csv(Path(openadmet_data_split_path) / "test_with_augmentations.csv")
# fold_df = pd.read_csv(Path(latticetower_polymers_data_split_ext_path) / "train_folds.csv")
additional_smiles_columns = [col for col in train_df.columns if col.startswith('AUG_SMILES')]

merged_df = train_df
print(merged_df.shape)
merged_df.head()

NameError: name 'pd' is not defined

In [None]:
blind_test_df.head()

In [None]:
train_ids = merged_df.fold_name.isin(['train', 'test', 'val'])

train_df = merged_df.loc[train_ids].reset_index(drop=True)
val_df = merged_df.loc[merged_df.fold_name == 'val'].reset_index(drop=True)
test_df = merged_df.loc[merged_df.fold_name == 'test'].reset_index(drop=True)


In [None]:
TARGET_COLUMNS = [
    'LogD', 'KSOL', 'HLM CLint', 'MLM CLint',
    'Caco-2 Permeability Papp A>B', 'Caco-2 Permeability Efflux', 'MPPB',
    'MBPB', 'MGMB'
]

MODEL_NAME = "DeepChem/ChemBERTa-77M-MLM"

## Define model

In [2]:
%%writefile mixer_wrapper.py
from typing import Callable, Self

import torch
import torch.nn as nn
from torch import Tensor
import sentence_transformers as st
from sentence_transformers.models import Module
from einops.layers.torch import EinMix as Mix


# https://github.com/UKPLab/sentence-transformers/blob/master/sentence_transformers/models/Dense.py#L16-L105
# https://github.com/UKPLab/sentence-transformers/blob/v4.1.0/sentence_transformers/models/Dense.py#L15
class MixerWrapper(Module):
    config_keys: list[str] = [
        "in_features",
        "out_features",
        "n_channels",
        "activation_function",
    ]

    def __init__(
        self,
        in_features: int,
        out_features: int,
        n_channels=5,
        activation_function: Callable[[Tensor], Tensor] | None = nn.Tanh(),
        **kwargs
    ) -> None:

        super(MixerWrapper, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.n_channels = n_channels
        self.activation_function = nn.Identity() \
            if activation_function is None \
            else activation_function

        self.mixer_layer = Mix(
            "b e -> b o ch",
            weight_shape="e o ch",
            bias_shape="o ch",
            e=in_features,
            ch=n_channels,
            o=out_features
        )

    def forward(self, features: dict[str, torch.Tensor], **kwargs) -> dict[str, torch.Tensor]:
        features.update({
            "sentence_embedding": self.activation_function(
                self.mixer_layer(features["sentence_embedding"]))
        })
        return features

    def get_sentence_embedding_dimension(self) -> int:
        return self.out_features

    def get_config_dict(self):
        return {
            "in_features": self.in_features,
            "out_features": self.out_features,
            "n_channels": self.n_channels,
            "activation_function": st.util.misc.fullname(self.activation_function),
        }
    def save(self, output_path: str, *args, safe_serialization: bool = True, **kwargs) -> None:
        self.save_config(output_path)
        self.save_torch_weights(output_path, safe_serialization=safe_serialization)

    def __repr__(self):
        return f"MixerWrapper({self.get_config_dict()})"

    @classmethod
    def load(
        cls,
        model_name_or_path: str,
        subfolder: str = "",
        token: bool | str | None = None,
        cache_folder: str | None = None,
        revision: str | None = None,
        local_files_only: bool = False,
        **kwargs,
    ) -> Self:
        hub_kwargs = {
            "subfolder": subfolder,
            "token": token,
            "cache_folder": cache_folder,
            "revision": revision,
            "local_files_only": local_files_only,
        }
        config = cls.load_config(model_name_or_path=model_name_or_path, **hub_kwargs)
        config["activation_function"] = st.util.misc.import_from_string(config["activation_function"])()
        model = cls(**config)
        model = cls.load_torch_weights(
            model_name_or_path=model_name_or_path,
            model=model,
            **hub_kwargs
        )
        return model


Writing mixer_wrapper.py


In [None]:
from mixer_wrapper import MixerWrapper

In [None]:
transformer = st.models.Transformer(MODEL_NAME)
transformer.auto_model.embeddings.requires_grad_(False)
for param in transformer.auto_model.embeddings.parameters():
    param.requires_grad = False

transformer.auto_model.encoder.layer[:10].requires_grad_(False)
for param in transformer.auto_model.encoder.layer[:10].parameters():
    param.requires_grad = False
EMB_SIZE = transformer.get_word_embedding_dimension()

pooling = st.models.Pooling(EMB_SIZE, pooling_mode="mean")
normalization = st.models.Normalize()
mixer_layer = MixerWrapper(EMB_SIZE, EMB_SIZE)


model = st.SentenceTransformer(
    modules=[transformer, pooling, normalization, mixer_layer, normalization],
    device='cuda',
    model_card_data=st.SentenceTransformerModelCardData(
        language="en",
        license="apache-2.0",
        model_name="SentenceTransformer model based on kuelumbus/polyBERT to predict polymeric properties",
        generate_widget_examples=False
    )
)