Here we use DeepChem/ChemBERTa-77M-MLM as the baseline encoder and the Rank-N-Contrast loss (https://github.com/kaiwenzha/Rank-N-Contrast) as the training objective.

In [2]:
import os
import wandb
# Weights & Biases tracking is optional: it needs the Kaggle secret store and a
# stored "wandb_key" secret. When either is unavailable we report why tracking
# was skipped and carry on instead of silently hiding the failure.
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    wandb_key = user_secrets.get_secret("wandb_key")
    wandb.login(key=wandb_key)
    wandb.init(entity='lacemaker', project='openadmet2026')
except Exception as exc:
    # Catch `Exception` rather than using a bare `except`, so that
    # KeyboardInterrupt / SystemExit still propagate.
    print(f"W&B tracking disabled: {type(exc).__name__}: {exc}")



In [3]:
from pathlib import Path
# Everything this notebook writes goes under the working directory.
OUTPUTDIR = Path("..") / "working"
OUTPUTDIR.mkdir(exist_ok=True)
# The shell magics below interpolate these values, so keep them as POSIX strings.
WHEELDIR, REQUIREMENTS = (
    OUTPUTDIR.joinpath(leaf).as_posix() for leaf in ("wheels", "requirements.txt")
)


In [4]:
%%writefile $REQUIREMENTS
#scikit-learn
#sklearn-compat
#category-encoders
#cesium
einops
sentence-transformers # == 5.1.0
torch # == 2.6.0 --index-url https://download.pytorch.org/whl/cu128
tabpfn
#transformers
# rdkit

Overwriting ../working/requirements.txt


In [5]:
!nvidia-smi

Mon Jan 19 02:50:02 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.172.08             Driver Version: 570.172.08     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   40C    P0             26W /  250W |       0MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [6]:
!pip download --destination-directory $WHEELDIR -r $REQUIREMENTS
!pip wheel --wheel-dir $WHEELDIR -r $REQUIREMENTS
!pip install --upgrade --no-index --find-links=$WHEELDIR -r $REQUIREMENTS

Collecting einops (from -r ../working/requirements.txt (line 5))
  File was already downloaded /kaggle/working/wheels/einops-0.8.1-py3-none-any.whl
Collecting sentence-transformers (from -r ../working/requirements.txt (line 6))
  File was already downloaded /kaggle/working/wheels/sentence_transformers-5.2.0-py3-none-any.whl
Collecting torch (from -r ../working/requirements.txt (line 7))
  File was already downloaded /kaggle/working/wheels/torch-2.9.1-cp312-cp312-manylinux_2_28_x86_64.whl
Collecting tabpfn (from -r ../working/requirements.txt (line 8))
  File was already downloaded /kaggle/working/wheels/tabpfn-6.3.1-py3-none-any.whl
Collecting transformers<6.0.0,>=4.41.0 (from sentence-transformers->-r ../working/requirements.txt (line 6))
  File was already downloaded /kaggle/working/wheels/transformers-4.57.6-py3-none-any.whl
Collecting tqdm (from sentence-transformers->-r ../working/requirements.txt (line 6))
  File was already downloaded /kaggle/working/wheels/tqdm-4.67.1-py3-none-

In [7]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import shutil
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print at most five file paths per directory found under ../input.
for dirname, _, filenames in os.walk('../input'):
    for shown_path in (os.path.join(dirname, name) for name in filenames[:5]):
        print(shown_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

../input/openadmet2026-data-split/train_folds.csv
../input/openadmet2026-data-split/test_with_augmentations.csv
../input/openadmet2026-data-split/__results__.html
../input/openadmet2026-data-split/__notebook__.ipynb
../input/openadmet2026-data-split/__output__.json


In [None]:
import sentence_transformers as st
st.__version__

In [None]:
import torch
torch.__version__

In [None]:
import os
import random
import numpy as np
import torch
from torch import Tensor


def set_seed(seed: int = 42) -> None:
    """Seed the random number generators used in this notebook.

    Seeds NumPy, Python's ``random`` module and PyTorch (including the CUDA
    generator), switches cuDNN to deterministic mode, sets ``PYTHONHASHSEED``
    in the environment and prints a confirmation line.

    Args:
        seed: Value handed to every generator.
    """
    seeders = (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for seed_fn in seeders:
        seed_fn(seed)
    # When running on the cuDNN backend, two further options must be set
    cudnn = torch.backends.cudnn
    cudnn.deterministic, cudnn.benchmark = True, False
    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = f"{seed}"
    print(f"Random seed set as {seed}")

RANDOM_SEED = 3407  # 42  # 3407
set_seed(RANDOM_SEED)

## Data preparation

In [None]:
import kagglehub
openadmet_data_split_path = kagglehub.notebook_output_download('latticetower/openadmet2026-data-split')

In [1]:
# Read the pre-split training table and the blind test table from the
# downloaded notebook output.
split_dir = Path(openadmet_data_split_path)
train_df = pd.read_csv(split_dir / "train_folds.csv")
blind_test_df = pd.read_csv(split_dir / "test_with_augmentations.csv")
# Columns whose name starts with AUG_SMILES (presumably alternative SMILES
# strings of the same molecule — verify against the data-split notebook).
additional_smiles_columns = [
    column for column in train_df.columns if column.startswith('AUG_SMILES')
]

# `merged_df` keeps pointing at the full table; `train_df` is rebound further down.
merged_df = train_df
print(merged_df.shape)
merged_df.head()

NameError: name 'pd' is not defined

In [None]:
blind_test_df.head()

In [None]:
# NOTE(review): the training frame takes the rows of all three folds, so it
# also contains every row of `val_df` and `test_df` built below. Confirm this
# is intended — validation/test metrics are then computed on data that was
# available during training.
fold_names = merged_df.fold_name
train_ids = fold_names.isin(['train', 'test', 'val'])

train_df, val_df, test_df = (
    merged_df.loc[fold_mask].reset_index(drop=True)
    for fold_mask in (train_ids, fold_names == 'val', fold_names == 'test')
)


In [None]:
# Regression target columns. The order of this list is the column order of the
# label arrays that later cells build with `df.loc[..., TARGET_COLUMNS]`.
TARGET_COLUMNS = [
    'LogD', 'KSOL', 'HLM CLint', 'MLM CLint',
    'Caco-2 Permeability Papp A>B', 'Caco-2 Permeability Efflux', 'MPPB',
    'MBPB', 'MGMB'
]

# Name of the pretrained encoder handed to `st.models.Transformer` below.
MODEL_NAME = "DeepChem/ChemBERTa-77M-MLM"

## Define model

In [2]:
%%writefile mixer_wrapper.py
from typing import Callable, Self

import torch
import torch.nn as nn
from torch import Tensor
import sentence_transformers as st
from sentence_transformers.models import Module
from einops.layers.torch import EinMix as Mix


# https://github.com/UKPLab/sentence-transformers/blob/master/sentence_transformers/models/Dense.py#L16-L105
# https://github.com/UKPLab/sentence-transformers/blob/v4.1.0/sentence_transformers/models/Dense.py#L15
class MixerWrapper(Module):
    """Sentence-Transformers module that expands one embedding into per-channel embeddings.

    An einops ``EinMix`` layer maps the ``[batch, in_features]`` sentence
    embedding to ``[batch, out_features, n_channels]``; the result is passed
    through the activation function and written back under the
    ``"sentence_embedding"`` key.
    """

    config_keys: list[str] = [
        "in_features",
        "out_features",
        "n_channels",
        "activation_function",
    ]

    def __init__(
        self,
        in_features: int,
        out_features: int,
        n_channels=5,
        activation_function: Callable[[Tensor], Tensor] | None = nn.Tanh(),
        **kwargs
    ) -> None:
        """Create the mixing layer.

        Args:
            in_features: Size of the incoming sentence embedding.
            out_features: Size of each per-channel output embedding.
            n_channels: Number of output channels.
            activation_function: Applied to the mixer output; ``None`` means identity.
            **kwargs: Accepted and ignored.
        """
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.n_channels = n_channels
        if activation_function is None:
            activation_function = nn.Identity()
        self.activation_function = activation_function

        # Dense map "b e -> b o ch" with a separate weight slice and bias per channel.
        self.mixer_layer = Mix(
            "b e -> b o ch",
            weight_shape="e o ch",
            bias_shape="o ch",
            e=in_features,
            ch=n_channels,
            o=out_features
        )

    def forward(self, features: dict[str, torch.Tensor], **kwargs) -> dict[str, torch.Tensor]:
        """Replace ``features["sentence_embedding"]`` with its mixed, activated version."""
        mixed = self.mixer_layer(features["sentence_embedding"])
        features["sentence_embedding"] = self.activation_function(mixed)
        return features

    def get_sentence_embedding_dimension(self) -> int:
        """Return ``out_features``."""
        return self.out_features

    def get_config_dict(self):
        """Return the constructor arguments, with the activation given by its dotted name."""
        activation_name = st.util.misc.fullname(self.activation_function)
        return dict(
            in_features=self.in_features,
            out_features=self.out_features,
            n_channels=self.n_channels,
            activation_function=activation_name,
        )

    def save(self, output_path: str, *args, safe_serialization: bool = True, **kwargs) -> None:
        """Write the config and the layer weights to ``output_path``."""
        self.save_config(output_path)
        self.save_torch_weights(output_path, safe_serialization=safe_serialization)

    def __repr__(self):
        return f"MixerWrapper({self.get_config_dict()})"

    @classmethod
    def load(
        cls,
        model_name_or_path: str,
        subfolder: str = "",
        token: bool | str | None = None,
        cache_folder: str | None = None,
        revision: str | None = None,
        local_files_only: bool = False,
        **kwargs,
    ) -> Self:
        """Rebuild a module from a saved config and load its weights."""
        hub_kwargs = dict(
            subfolder=subfolder,
            token=token,
            cache_folder=cache_folder,
            revision=revision,
            local_files_only=local_files_only,
        )
        config = cls.load_config(model_name_or_path=model_name_or_path, **hub_kwargs)
        # The config stores the activation as a dotted class name; import and instantiate it.
        activation_cls = st.util.misc.import_from_string(config["activation_function"])
        config["activation_function"] = activation_cls()
        return cls.load_torch_weights(
            model_name_or_path=model_name_or_path,
            model=cls(**config),
            **hub_kwargs
        )


Writing mixer_wrapper.py


In [None]:
from mixer_wrapper import MixerWrapper

In [None]:
transformer = st.models.Transformer(MODEL_NAME)
# Freeze the token embeddings and the first 10 encoder layers.
# `requires_grad_(False)` already reaches every parameter of the sub-module, so
# no additional per-parameter loop is needed.
# NOTE(review): if the checkpoint has 10 or fewer encoder layers, this slice
# freezes the whole encoder — confirm against the model config.
transformer.auto_model.embeddings.requires_grad_(False)
transformer.auto_model.encoder.layer[:10].requires_grad_(False)
EMB_SIZE = transformer.get_word_embedding_dimension()

pooling = st.models.Pooling(EMB_SIZE, pooling_mode="mean")
normalization = st.models.Normalize()
# One output channel per regression target. With the default n_channels=5 the
# model emits fewer channels than there are label columns, so the loss silently
# skips the remaining targets and per-target indexing of the embeddings fails.
mixer_layer = MixerWrapper(EMB_SIZE, EMB_SIZE, n_channels=len(TARGET_COLUMNS))


model = st.SentenceTransformer(
    modules=[transformer, pooling, normalization, mixer_layer, normalization],
    device='cuda',
    model_card_data=st.SentenceTransformerModelCardData(
        language="en",
        license="apache-2.0",
        # Describe the encoder actually used instead of the one from an earlier project.
        model_name=f"SentenceTransformer model based on {MODEL_NAME} to predict ADMET properties",
        generate_widget_examples=False
    )
)

In [None]:
model.encode(train_df.SMILES.values[:4]).shape

In [None]:
merged_df.loc[:, TARGET_COLUMNS].describe().loc[['mean', 'std']]

In [None]:
# Per-target mean of the training labels, ordered like TARGET_COLUMNS.
target_means = train_df[TARGET_COLUMNS].mean().to_dict()
target_means_list = list(map(target_means.__getitem__, TARGET_COLUMNS))

TARGET_MEANS = np.array(target_means_list)
TARGET_MEANS

In [None]:
# Per-target standard deviation of the training labels, ordered like TARGET_COLUMNS.
target_deviations = train_df[TARGET_COLUMNS].std().to_dict()
target_deviations_list = list(map(target_deviations.__getitem__, TARGET_COLUMNS))

TARGET_DEVIATIONS = np.array(target_deviations_list)
TARGET_DEVIATIONS

In [None]:
# Standardise the target columns of every split with TARGET_MEANS / TARGET_DEVIATIONS.
# NOTE: the frames are rewritten in place, so running this cell twice
# standardises twice — rebuild the splits before re-running it.
for split_df in (train_df, val_df, test_df):
    split_df.loc[:, TARGET_COLUMNS] = (
        split_df.loc[:, TARGET_COLUMNS] - TARGET_MEANS
    ) / TARGET_DEVIATIONS

In [None]:
print('VAL')
print(val_df[TARGET_COLUMNS].mean(), '\n---\n', val_df[TARGET_COLUMNS].std())
print('\nTEST')
print(test_df[TARGET_COLUMNS].mean(), '\n---\n', test_df[TARGET_COLUMNS].std())

In [None]:
def get_augmented_data_from_index(
        df, index, n_repeats=1, # sample_random=False, 
        target_columns=TARGET_COLUMNS,
        random_state=None,
        smiles_column='SMILES',
        additional_smiles_columns=additional_smiles_columns,
    ):
    """Build paired SMILES inputs and label rows for the selected rows of ``df``.

    Args:
        df: Frame holding the SMILES columns and the target columns.
        index: Row labels (as accepted by ``df.loc``) to select.
        n_repeats: When greater than 1, every selected row is repeated this
            many times (consecutively) in all returned sequences.
        target_columns: Columns copied into ``'label'``.
        random_state: ``None`` selects the deterministic pairing: ``smiles1``
            comes from ``smiles_column`` and ``smiles2`` from the first
            additional column (or ``smiles_column`` again if there is none).
            Otherwise it seeds a private generator that draws two strings per
            row, with replacement, from the row's distinct SMILES values — the
            two members of a pair may therefore be identical.
        smiles_column: Name of the primary SMILES column.
        additional_smiles_columns: Names of the extra SMILES columns.

    Returns:
        Dict with keys ``'smiles1'``, ``'smiles2'`` and ``'label'``.
    """
    targets_list = df.loc[index, target_columns].values
    if n_repeats > 1:
        targets_list = np.repeat(targets_list, n_repeats, axis=0)
        repeated_index = np.repeat(index, n_repeats, axis=0)
    else:
        repeated_index = index

    if random_state is None:
        # use deterministic approach
        smiles_list1 = df.loc[repeated_index, smiles_column].values
        smiles_column2 = additional_smiles_columns[0] if len(additional_smiles_columns) > 0 else smiles_column
        smiles_list2 = df.loc[repeated_index, smiles_column2].values
    else:
        # A private RandomState yields the same draws as seeding the global
        # generator, but does not reset the global NumPy random state as a
        # hidden side effect for whatever the caller samples next.
        rng = np.random.RandomState(random_state)
        candidate_rows = df.loc[repeated_index, [smiles_column, *additional_smiles_columns]].values
        pairs = [rng.choice(sorted(np.unique(row)), 2) for row in candidate_rows]
        # Building the two sequences separately also works for an empty selection.
        smiles_list1 = tuple(pair[0] for pair in pairs)
        smiles_list2 = tuple(pair[1] for pair in pairs)
    return {
        'smiles1': smiles_list1,
        'smiles2': smiles_list2,
        'label': targets_list
    }

In [None]:
from datasets import Dataset

print("dataframe sizes:", train_df.shape[0], val_df.shape[0], test_df.shape[0])

np.random.seed(42)
TRAIN_SIZE = 10_000
VAL_SIZE = 5000
TEST_SIZE = 5000

NUM_TRAIN_REPEATS = 5


def sample_index_per_target(df, total_size, target_columns):
    """Draw an equal share of row labels for every target column.

    For each column, ``total_size // len(target_columns)`` row labels are drawn
    with replacement (global NumPy generator) from the rows where that column
    has a value, so every target is represented by labelled rows. Columns
    without any labelled row are skipped.

    Args:
        df: Frame to sample from.
        total_size: Approximate total number of row labels to draw.
        target_columns: Columns to balance the sample over.

    Returns:
        1-D array with the concatenated draws of all columns.
    """
    per_target = total_size // len(target_columns)
    draws = []
    for col in target_columns:
        # Sample rows that HAVE a value for this target; sampling the rows
        # where it is missing (the previous `isnull()` filter) contributes
        # nothing to that target's loss.
        labelled = df.index[df[col].notnull()]
        if len(labelled) == 0:
            continue
        draws.append(np.random.choice(labelled, (per_target,)))
    if not draws:
        return np.empty(0, dtype=df.index.dtype)
    return np.concatenate(draws)


train_index = sample_index_per_target(train_df, TRAIN_SIZE, TARGET_COLUMNS)
print(train_index.shape)
train_dict = get_augmented_data_from_index(
    train_df,
    train_index,
    n_repeats=NUM_TRAIN_REPEATS,
    random_state=42
)
train_dataset = Dataset.from_dict(train_dict)

val_index = sample_index_per_target(val_df, VAL_SIZE, TARGET_COLUMNS)
val_dict = get_augmented_data_from_index(val_df, val_index)
val_dataset = Dataset.from_dict(val_dict)

test_index = sample_index_per_target(test_df, TEST_SIZE, TARGET_COLUMNS)
test_dict = get_augmented_data_from_index(test_df, test_index)
test_dataset = Dataset.from_dict(test_dict)

print("selected index sizes:", train_index.shape, val_index.shape, test_index.shape)

print("final dataset sizes", train_dataset.num_rows, val_dataset.num_rows, test_dataset.num_rows)

In [None]:
import torch.nn.functional as F


def normalize_embeddings(embeddings):
    """Scale ``embeddings`` to unit L2 norm along dimension 1."""
    unit_norm = F.normalize(embeddings, dim=1, p=2)
    return unit_norm

_convert_to_tensor=st.util.tensor._convert_to_tensor


def pairwise_euclidean_distance(a: list | np.ndarray | Tensor, b: list | np.ndarray | Tensor):
    """Euclidean distance between ``a[i]`` and ``b[i]``, reduced over dimension 1."""
    lhs, rhs = _convert_to_tensor(a), _convert_to_tensor(b)
    squared_gap = (lhs - rhs) ** 2
    return torch.sqrt(squared_gap.sum(dim=1)).to_dense()

def pairwise_euclidean_sim(a: list | np.ndarray | Tensor, b: list | np.ndarray | Tensor):
    """Negated pairwise Euclidean distance, so that larger values mean closer pairs."""
    return -pairwise_euclidean_distance(a, b)

def pairwise_manhattan_sim(a: list | np.ndarray | Tensor, b: list | np.ndarray | Tensor):
    """Negated L1 distance between ``a[i]`` and ``b[i]``, reduced over dimension 1."""
    lhs, rhs = _convert_to_tensor(a), _convert_to_tensor(b)
    l1_distance = (lhs - rhs).abs().sum(dim=1)
    return -l1_distance.to_dense()

def pairwise_dot_score(a: Tensor, b: Tensor) -> Tensor:
    """Dot product between ``a[i]`` and ``b[i]``, reduced over dimension 1."""
    lhs, rhs = _convert_to_tensor(a), _convert_to_tensor(b)
    elementwise = torch.mul(lhs, rhs)
    return elementwise.sum(dim=1).to_dense()

def pairwise_cos_sim(a: Tensor, b: Tensor) -> Tensor:
    """Cosine similarity between ``a[i]`` and ``b[i]``.

    Both inputs are converted to tensors, normalised along dimension 1 with
    ``normalize_embeddings`` and then combined by ``pairwise_dot_score``.

    Args:
        a: First batch of vectors.
        b: Second batch of vectors, paired row by row with ``a``.

    Returns:
        Dense tensor with one similarity per row.
    """
    # Use the same converter alias as the sibling helpers in this cell.
    a = _convert_to_tensor(a)
    b = _convert_to_tensor(b)

    # The former sparse and dense branches evaluated the same expression
    # (normalise, multiply, sum over dim 1, densify), and `pairwise_dot_score`
    # already returns a dense tensor, so one code path suffices.
    return pairwise_dot_score(normalize_embeddings(a), normalize_embeddings(b))


In [None]:
# https://github.com/kaiwenzha/Rank-N-Contrast/blob/main/loss.py
# below is modified version to support multiple targets
from typing import Iterable, Any
import torch
import torch.nn as nn
import torch.nn.functional as F


class LabelDifference(nn.Module):
    """Pairwise distance between the label vectors of a batch.

    Only ``distance_type='l1'`` is implemented; any other value raises
    ``ValueError`` when the module is called.
    """

    def __init__(self, distance_type='l1'):
        super().__init__()
        self.distance_type = distance_type

    def forward(self, labels):
        """Map labels ``[bs, label_dim]`` to an L1 distance matrix ``[bs, bs]``."""
        if self.distance_type != 'l1':
            raise ValueError(self.distance_type)
        as_rows = labels[:, None, :]
        as_cols = labels[None, :, :]
        return (as_rows - as_cols).abs().sum(dim=-1)


class FeatureSimilarity(nn.Module):
    """Pairwise similarity between the feature vectors of a batch.

    Only ``similarity_type='l2'`` (negative Euclidean distance) is implemented;
    any other value raises ``ValueError`` when the module is called.
    """

    def __init__(self, similarity_type='l2'):
        super().__init__()
        self.similarity_type = similarity_type

    def forward(self, features):
        """Map features ``[bs, feat_dim]`` to a similarity matrix ``[bs, bs]``."""
        if self.similarity_type != 'l2':
            raise ValueError(self.similarity_type)
        pairwise_gap = features[:, None, :] - features[None, :, :]
        return -pairwise_gap.norm(2, dim=-1)


class MultilabelRnCLoss(nn.Module):
    """Rank-N-Contrast loss applied independently to every target channel.

    Each sentence transformer head returns a float tensor of shape
    [batch_size, out_embedding_size, n_channels]; there is a pair of them.
    Labels have shape [batch_size, n_channels] and might contain NaNs.

    For every channel, the rows whose label is NaN are dropped, the RnC loss
    is computed on the remaining rows, and the per-channel losses are averaged.
    """
    def __init__(
        self,
        model: "st.SentenceTransformer",
        n_channels: int=5,
        temperature=2, 
        label_diff='l1', 
        feature_sim='l2'
    ) -> None:
        """Store the model and build the label-distance / feature-similarity modules.

        Args:
            model: Model used to embed both inputs of a pair.
            n_channels: Number of target channels (stored for reference only).
            temperature: Divisor applied to the feature similarities.
            label_diff: Distance type passed to ``LabelDifference``.
            feature_sim: Similarity type passed to ``FeatureSimilarity``.
        """
        super().__init__()

        self.model = model
        self.n_channels = n_channels

        self.t = temperature
        self.label_diff_fn = LabelDifference(label_diff)
        self.feature_sim_fn = FeatureSimilarity(feature_sim)

    def forward(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor:
        """Embed both views of the batch and average the per-channel RnC losses.

        Args:
            sentence_features: The two tokenised views of the batch.
            labels: ``[batch_size, n_channels]`` targets, NaN where missing.

        Returns:
            Scalar loss. When no channel has a labelled row, a zero that is
            still attached to the graph is returned so that ``backward()`` works.

        Raises:
            ValueError: If the embeddings and the labels disagree on the
                number of channels.
        """
        f1, f2 = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features]
        if f1.shape[-1] != labels.shape[-1]:
            # Pairing channels with zip() would silently drop the surplus targets.
            raise ValueError(
                f"embedding has {f1.shape[-1]} channels but labels have "
                f"{labels.shape[-1]} columns"
            )
        features = torch.stack([f1, f2], dim=1)  # [bs, 2, feat_dim, n_channels]
        loss_parts = []
        for tgt_features, tgt_values in zip(torch.unbind(features, dim=-1), torch.split(labels, 1, dim=-1)):
            # tgt_features: [bs, 2, feat_dim]; tgt_values: [bs, 1]
            ids = ~torch.isnan(tgt_values).any(1)
            if ids.sum() < 1:
                continue
            loss_parts.append(self.compute_rnc_loss(tgt_features[ids], tgt_values[ids]))

        if not loss_parts:
            # torch.stack([]) raises; return a graph-connected zero instead.
            return features.sum() * 0.0
        return torch.stack(loss_parts).mean()

    def compute_rnc_loss(self, features, labels):
        """Rank-N-Contrast loss for one target.

        Args:
            features: ``[bs, 2, feat_dim]`` embeddings of the two views.
            labels: ``[bs, label_dim]`` targets without NaNs.

        Returns:
            Scalar loss tensor.
        """
        features = torch.cat([features[:, 0], features[:, 1]], dim=0)  # [2bs, feat_dim]
        labels = labels.repeat(2, 1)  # [2bs, label_dim]

        label_diffs = self.label_diff_fn(labels)
        logits = self.feature_sim_fn(features).div(self.t)
        # Subtract the row maximum before exponentiating, for numerical stability.
        logits_max, _ = torch.max(logits, dim=1, keepdim=True)
        logits = logits - logits_max.detach()
        exp_logits = logits.exp()

        n = logits.shape[0]  # n = 2bs

        # remove diagonal (the mask is built once and reused)
        off_diagonal = ~torch.eye(n, dtype=torch.bool, device=logits.device)
        logits = logits.masked_select(off_diagonal).view(n, n - 1)
        exp_logits = exp_logits.masked_select(off_diagonal).view(n, n - 1)
        label_diffs = label_diffs.masked_select(off_diagonal).view(n, n - 1)

        loss = 0.
        for k in range(n - 1):
            pos_logits = logits[:, k]  # 2bs
            pos_label_diffs = label_diffs[:, k]  # 2bs
            # Negatives of column k: samples at least as far away in label space.
            neg_mask = (label_diffs >= pos_label_diffs.view(-1, 1)).float()  # [2bs, 2bs - 1]
            pos_log_probs = pos_logits - torch.log((neg_mask * exp_logits).sum(dim=-1))  # 2bs
            loss += - (pos_log_probs / (n * (n - 1))).sum()

        return loss

    def get_config_dict(self) -> dict[str, Any]:
        """Return the temperature and the dotted names of the two helper modules."""
        return {
            'temperature': self.t,
            'label_diff_fn': st.util.misc.fullname(self.label_diff_fn),
            'feature_sim_fn': st.util.misc.fullname(self.feature_sim_fn),
        }


In [None]:
# Record the real number of targets instead of the constructor default of 5.
train_loss = MultilabelRnCLoss(model, n_channels=len(TARGET_COLUMNS))

In [None]:
from sentence_transformers.training_args import BatchSamplers


# Training configuration: 10 epochs, batch size 64 for training and evaluation,
# with evaluation, checkpointing and logging every 100 steps; at most two
# checkpoints are kept under `output_dir`.
# NOTE(review): `run_name` still says "polymers" — presumably carried over from
# an earlier project; confirm the intended run name.
args = st.SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="model_saves",
    # Optional training parameters:
    num_train_epochs=10,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=False,  # Set to True if you have a GPU that supports BF16
    # batch_sampler=BatchSamplers.NO_DUPLICATES,  # losses that use "in-batch negatives" benefit from no duplicates
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=100,
    run_name="polymers-contrastive",  # Will be used in W&B if `wandb` is installed
)

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
from scipy.stats import pearsonr, spearmanr
import csv

from sentence_transformers.similarity_functions import SimilarityFunction

class MultilabelSilhouetteEvaluator(st.evaluation.EmbeddingSimilarityEvaluator):
    """Score how well each embedding channel separates binned target values.

    For every target column the non-missing values are binned into
    ``num_classes`` equal-width classes, and the silhouette score of the
    matching embedding channel with respect to those classes is reported.
    """

    def __init__(self, sentences=None, scores=None, name='evaluator', num_classes=10):
        """Store the evaluation inputs.

        Args:
            sentences: Inputs to embed (defaults to an empty list).
            scores: Array-like ``[n_sentences, n_targets]``, NaN where missing
                (defaults to an empty list).
            name: Prefix used for the reported metric names.
            num_classes: Number of equal-width bins per target.
        """
        # None sentinels avoid sharing one mutable default list between instances.
        sentences = [] if sentences is None else sentences
        scores = [] if scores is None else scores
        super().__init__(sentences1=sentences, sentences2=sentences, scores=scores)
        self.sentences = sentences
        self.scores = scores
        self.name = name
        self.num_classes = num_classes

    def _silhouette_for_target(self, features, targets):
        """Silhouette score of ``features`` against binned ``targets``; NaN if undefined."""
        ids = ~np.isnan(targets)
        targets = targets[ids]
        if targets.size == 0:
            return float("nan")
        bins = np.histogram_bin_edges(targets, self.num_classes, range=(targets.min() - 1e-3, targets.max() + 1e-3))
        tgt_classes = np.digitize(targets, bins=bins)
        # silhouette_score needs between 2 and n_samples - 1 distinct labels and
        # raises otherwise; report NaN for such a target instead of crashing.
        n_labels = np.unique(tgt_classes).size
        if not 2 <= n_labels <= targets.size - 1:
            return float("nan")
        return silhouette_score(features[ids], tgt_classes)

    def __call__(
        self,
        model: st.SentenceTransformer,
        output_path: str | None = None,
        epoch: int = -1,
        steps: int = -1
    ) -> dict[str, float]:
        """Embed the stored sentences and compute one silhouette score per target.

        Args:
            model: Model to evaluate.
            output_path: Directory for the CSV log; nothing is written when it
                is ``None`` or CSV writing is disabled.
            epoch: Epoch number recorded in the CSV row.
            steps: Step number recorded in the CSV row.

        Returns:
            Metrics dict (names prefixed with the evaluator name) holding one
            ``silhouette_<column>`` entry per target and ``silhouette_mean``,
            the mean over the targets whose score is defined.
        """
        embeddings = self.embed_inputs(model, self.sentences)
        scores = np.asarray(self.scores, dtype=float)
        scores_list = [
            self._silhouette_for_target(embeddings[:, :, i], scores[:, i])
            for i in range(scores.shape[1])
        ]

        metrics = {}
        metric_column_names = []
        for column, score in zip(TARGET_COLUMNS, scores_list):
            metrics[f"silhouette_{column}"] = score
            metric_column_names.append(f"silhouette_{column}")

        # Average only over targets with a defined score.
        if np.isfinite(scores_list).any():
            metrics['silhouette_mean'] = np.nanmean(scores_list)
        else:
            metrics['silhouette_mean'] = float("nan")
        metric_column_names.append('silhouette_mean')

        if output_path is not None and self.write_csv:
            csv_path = os.path.join(output_path, self.csv_file)
            output_file_exists = os.path.isfile(csv_path)
            with open(csv_path, newline="", mode="a" if output_file_exists else "w", encoding="utf-8") as f:
                writer = csv.writer(f)
                if not output_file_exists:
                    writer.writerow(['epoch', 'steps'] + metric_column_names)

                writer.writerow([epoch, steps] + [metrics[column] for column in metric_column_names])

        self.primary_metric = 'silhouette_mean'

        metrics = self.prefix_name_to_metrics(metrics, self.name)
        self.store_metrics_in_model_card_data(model, metrics, epoch, steps)
        return metrics

In [None]:
# Score the model on the validation samples once before training starts.
val_labels = np.asarray(val_dataset['label'])
dev_evaluator = MultilabelSilhouetteEvaluator(
    name="val",
    sentences=val_dataset['smiles1'],
    scores=val_labels,
)
baseline_metrics = dev_evaluator(model)
print(baseline_metrics)

# 7. Create a trainer & train
trainer_kwargs = dict(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    loss=train_loss,
    evaluator=dev_evaluator,
)
trainer = st.SentenceTransformerTrainer(**trainer_kwargs)

In [None]:
trainer.train()

In [None]:
# Same silhouette evaluation as for validation, on the test samples.
test_labels = np.asarray(test_dataset['label'])
test_evaluator = MultilabelSilhouetteEvaluator(
    name="test",
    sentences=test_dataset["smiles1"],
    scores=test_labels,
)
test_evaluator(model)

In [None]:
# Directory the trained model is written to.
# NOTE(review): the name still says "polymers" — presumably carried over from
# an earlier project; confirm the intended directory name.
MODELSAVEDIR = "polymers-v1-model"
model.save_pretrained(MODELSAVEDIR)
# Put the source of the custom MixerWrapper module next to the saved model
# (presumably so the saved directory carries the module's code — verify).
shutil.copy("mixer_wrapper.py", MODELSAVEDIR)

# Drop the in-memory model; the next cell reloads it from MODELSAVEDIR.
del model

In [None]:
model = st.SentenceTransformer(MODELSAVEDIR)

In [None]:
embeddings = model.encode(merged_df.SMILES.values)

In [None]:
embeddings.shape

In [None]:
from sklearn.decomposition import PCA
# One 2-D PCA projection per target channel, fitted on the 'train' fold only
# and then applied to every row.
# The fit mask does not depend on the target, so it is built once up front.
train_ids = merged_df.fold_name.values == 'train'
embeddings2d = []
reducers_list = []
for i, target_column in enumerate(TARGET_COLUMNS):
    print(target_column)
    # A fixed random_state keeps the projection reproducible should
    # scikit-learn pick a randomized SVD solver.
    reducer = PCA(n_components=2, random_state=RANDOM_SEED)
    reducer.fit(embeddings[train_ids, :, i])
    embeddings2d.append(reducer.transform(embeddings[:, :, i]))
    reducers_list.append(reducer)

# [n_rows, 2, n_targets]
embeddings2d = np.stack(embeddings2d, axis=-1)

In [None]:
train_df.columns

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One scatter plot per target: the 2-D PCA projection of that target's
# embedding channel, coloured by the target value (rows without a value are
# left out).
for i, target_column in enumerate(TARGET_COLUMNS):
    sel_ids = merged_df[target_column].notnull().to_numpy()
    fig, ax = plt.subplots()
    sns.scatterplot(
        x=embeddings2d[sel_ids, 0, i],
        y=embeddings2d[sel_ids, 1, i],
        # Plain array, so x, y and hue are all matched by position.
        hue=merged_df.loc[sel_ids, target_column].to_numpy(),
        s=15,
        palette='Spectral',
        marker='.',
        alpha=0.5,
        legend=False,
        ax=ax,
    )
    ax.set(title=target_column, xlabel="PC 1", ylabel="PC 2")
    plt.show()
    # Release the figure so that the loop does not accumulate open figures.
    plt.close(fig)