# Install additional packages

In [None]:
# Install packages with pip (each line shells out from the notebook).
# NOTE(review): no versions are pinned here, so a fresh run may resolve different releases than the log below shows.
!pip install fair-esm
!pip install sentence-transformers scikit-learn numpy pandas
!pip install biopython
!pip install wandb

Collecting fair-esm
  Downloading fair_esm-2.0.0-py3-none-any.whl (93 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/93.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m92.2/93.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fair-esm
Successfully installed fair-esm-2.0.0
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Print the GPU / driver status reported by nvidia-smi for the current runtime.
!nvidia-smi

Mon Dec 25 23:38:29 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import wandb
from sklearn.decomposition import PCA
from datetime import datetime
import gc
from pathlib import Path
import json

from collections import defaultdict
from sentence_transformers import SentenceTransformer


def timestamp():
    """Return the current local time formatted as ``YYYY-MM-DD_HH:MM:SS``."""
    return f"{datetime.now():%Y-%m-%d_%H:%M:%S}"


# Evaluated once when this cell runs; the value is reused as a run identifier.
current_timestamp = timestamp()

In [None]:
import os
import random
import numpy as np
import torch


def set_seed(seed: int = 42) -> None:
    """Seed the Python, NumPy and PyTorch random generators and pin cuDNN settings.

    :param seed: value handed to every generator and written to ``PYTHONHASHSEED``
    """
    # Export the hash seed through the environment.
    os.environ["PYTHONHASHSEED"] = str(seed)

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # The cuDNN backend needs both flags: deterministic kernels on, autotuner off.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    print(f"Random seed set as {seed}")


RANDOM_SEED = 3407  # other value listed by the author: 42
set_seed(RANDOM_SEED)

Random seed set as 3407


# Mount Google Drive and locate the data folders

Mount Google Drive folder and define the directory where the model weights and additional csv files will be saved.

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the Drive paths used below are rooted there.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
from pathlib import Path

# Output root for this run: <experiment folder>/seed_<RANDOM_SEED>/<current_timestamp>
BASEDIR = Path(
    "/content/gdrive/MyDrive/hips/bgc_embeddings_task1_loco_new_domains_pairs",
    f"seed_{RANDOM_SEED}",
    current_timestamp,
)
BASEDIR.mkdir(parents=True, exist_ok=True)

# Data preparation

## Get MIBiG dataset

In [None]:
import os
# Copy the MIBiG folder from Drive into ./data once; skipped whenever ./data already exists.
if not os.path.exists("data"):
  !cp -r /content/gdrive/MyDrive/hips/MIBiG data


In [None]:
!ls data

domain_data.csv  dom_faa.zip  mibig_gbk.tar.gz	mibig_json.tar.gz  mibig_prot_seqs.fasta


In addition to the pre-downloaded MIBiG files, I have `domain_data.csv`, which contains the domain sequences annotated with Pfam. Later I'll add a script that processes the MIBiG files and produces this file.

In [None]:
import os
VERSION = "3.1"
if not os.path.exists("data"):
  !mkdir data
if not os.path.exists("data/mibig_json.tar.gz"):
  !wget "https://dl.secondarymetabolites.org/mibig/mibig_json_{VERSION}.tar.gz" -O data/mibig_json.tar.gz
  !wget "https://dl.secondarymetabolites.org/mibig/mibig_prot_seqs_{VERSION}.fasta" -O data/mibig_prot_seqs.fasta
if not os.path.exists("data/mibig_gbk.tar.gz"):
  !wget "https://dl.secondarymetabolites.org/mibig/mibig_gbk_{VERSION}.tar.gz" -O data/mibig_gbk.tar.gz

!ls -l data | head -n 10

total 145100
-rw------- 1 root root 30503894 Dec 25 23:39 domain_data.csv
-rw------- 1 root root 12187901 Dec 25 23:39 dom_faa.zip
-rw------- 1 root root 76459061 Dec 25 23:39 mibig_gbk.tar.gz
-rw------- 1 root root   839183 Dec 25 23:39 mibig_json.tar.gz
-rw------- 1 root root 28583652 Dec 25 23:39 mibig_prot_seqs.fasta


In [None]:
# Unpack each archive once, then rename the extracted folder to drop the version suffix
# so later cells can use the fixed paths data/mibig_json and data/mibig_gbk.
if not os.path.exists(f"data/mibig_json"):
  !(cd data && tar -xzf mibig_json.tar.gz && mv "mibig_json_{VERSION}" "mibig_json")
if not os.path.exists(f"data/mibig_gbk"):
  !(cd data && tar -xzf mibig_gbk.tar.gz && mv "mibig_gbk_{VERSION}" "mibig_gbk")

## Read data

In [None]:
JSONDIR = Path("data/mibig_json")


def read_mibig_file(path):
  """Load one MIBiG entry from a JSON file.

  :param path: path of the JSON file to read
  :return: the parsed JSON document
  """
  # Be explicit about the encoding so the result does not depend on the runtime locale.
  with open(path, encoding="utf-8") as f:
    return json.load(f)


# Entries keyed by file stem. Anything in the folder that is not a .json file is skipped
# instead of crashing the whole load.
mibig_data = {
    file.stem: read_mibig_file(file)
    for file in JSONDIR.iterdir()
    if file.suffix == ".json"
}

In [None]:

# One row per MIBiG entry: the fields of entry['cluster'] become columns, and the
# full cluster dict is also kept under the 'cluster' column.
cluster_records = []
for entry in mibig_data.values():
    record = dict(entry['cluster'])
    record['cluster'] = entry['cluster']
    cluster_records.append(record)

clusters_df = pd.DataFrame(cluster_records)
clusters_df["num_compounds"] = clusters_df["compounds"].map(len)
clusters_df['completeness'] = clusters_df['loci'].map(lambda locus: locus['completeness'])

In [None]:
# Domain table shipped with the data folder (domain sequences annotated with Pfam);
# the preview below shows its columns.
domains_df = pd.read_csv("data/domain_data.csv")

domains_df.head()

Unnamed: 0,bgcid,desc,domain_sequence,query name,accession,env_coord_from,env_coord_to,coord_from,coord_to
0,BGC0000001,BGC0000001|c1|1-1083|+|AEK75490.1|protein_meth...,LDRAFDAVPAPIYTHHERHGETVHRSAPESIRRELAALQVRAGDRV...,PCMT,PF01135.22,4,165,1,1083
1,BGC0000001,BGC0000001|c1|1887-2633|+|AEK75492.1|pathway-s...,RTITAPKVETLFATLLIRANHTVTTDELIAELWGENPPRHARTALH...,Trans_reg_C,PF00486.31,13,87,1887,2633
2,BGC0000001,BGC0000001|c1|1887-2633|+|AEK75492.1|pathway-s...,VDAVELQRMHALGRSLLVTDPEAALVPLRRAVGLFRGPVLAGIRNG...,BTAD,PF03704.20,94,238,1887,2633
3,BGC0000001,BGC0000001|c1|2646-3836|-|AEK75493.1|cytochrom...,TRTCPHQPPEGYAALRENGPLAQVRLVGDRTAWVVTDHDVARTLLV...,p450,PF00067.25,12,369,2646,3836
4,BGC0000001,BGC0000001|c1|3927-6596|-|AEK75494.1|LuxR_fami...,ATPADRLSQALARARSGRGGVVELVGEPGIGKTQALTELTRLARVA...,AAA_16,PF13191.9,21,95,3927,6596


In [None]:
# Keep only clusters annotated with exactly one biosynthetic class and map
# mibig_accession -> that single class name.
ids = clusters_df["biosyn_class"].map(len) == 1
clusters_one_class = clusters_df.loc[ids, ["mibig_accession", "biosyn_class"]].assign(
    bgc=lambda frame: frame["biosyn_class"].map(lambda classes: classes[0])
)
bgc2class = dict(zip(clusters_one_class["mibig_accession"], clusters_one_class["bgc"]))


In [None]:
# Label every domain with its cluster's class; clusters missing from bgc2class get None.
domains_df['bgc_class'] = domains_df['bgcid'].map(bgc2class.get)
domains_df['bgc_class'].value_counts()

Polyketide    37193
NRP           20218
Other          5874
RiPP           3977
Saccharide     3486
Terpene        1879
Alkaloid       1192
Name: bgc_class, dtype: int64

## Prepare the data for the Siamese network

In [None]:
from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold
NFOLDS = 5

# Each class gets its own fold id: its position in the sorted list of class names.
# Rows without a class are left without a fold value.
has_class = domains_df.bgc_class.notnull()
all_bgc_class_names = sorted(domains_df.loc[has_class, "bgc_class"].unique())
for ifold, bgc_class in enumerate(all_bgc_class_names):
  test_index = domains_df['bgc_class'].eq(bgc_class)
  domains_df.loc[test_index, 'fold'] = ifold


def split_train_val(subset_df, nfolds=NFOLDS, random_state=42, target_column="bgc_class", group_column="cluster_group"):
  """Yield ``(train_index, val_index)`` pairs of index labels from a stratified group k-fold.

  :param subset_df: frame to split; must contain ``target_column`` and ``group_column``
  :param nfolds: number of splits
  :param random_state: seed for the shuffled splitter
  :param target_column: column used for stratification
  :param group_column: column whose values are kept together within one side of a split
  """
  splitter = StratifiedGroupKFold(n_splits=nfolds, shuffle=True, random_state=random_state)
  labels = subset_df.index
  # The splitter returns positions; translate them back to the frame's index labels.
  for train_pos, val_pos in splitter.split(labels, subset_df[target_column], subset_df[group_column]):
    yield labels[train_pos], labels[val_pos]



In [None]:
DSDIR = BASEDIR / "datasets"
DSDIR.mkdir(parents=True, exist_ok=True)

# Keep the pre-sort row label as a column, then order domains by cluster and coordinates.
domains_df['index'] = domains_df.index
domains_df = domains_df.sort_values(
    by=["bgcid", "coord_from", "coord_to", "env_coord_from", "env_coord_to"],
    ascending=True,
)
domains_df.to_csv(DSDIR / "domains.csv", index=None)



In [None]:

# Pair every row of the sorted table with the row that follows it: p1_* is the
# earlier row, p2_* the next one.
df1 = domains_df.iloc[:-1].reset_index(drop=True).add_prefix("p1_")
df2 = domains_df.iloc[1:].reset_index(drop=True).add_prefix("p2_")
print(df1.shape, df2.shape)
df_merged = pd.concat([df1, df2], axis=1)

# Keep a pair only if both rows belong to the same cluster and the second row
# comes after the first in both coordinate systems.
same_cluster = df_merged.p1_bgcid == df_merged.p2_bgcid
env_ordered = (df_merged.p1_env_coord_from < df_merged.p2_env_coord_from) & (df_merged.p1_env_coord_to < df_merged.p2_env_coord_to)
coord_ordered = (df_merged.p1_coord_from <= df_merged.p2_coord_from) & (df_merged.p1_coord_to <= df_merged.p2_coord_to)
ids = same_cluster & env_ordered & coord_ordered

# Both members share cluster and class, so the p1_ copies become the pair-level
# columns and the p2_ copies are dropped.
df_merged = (
    df_merged[ids]
    .reset_index(drop=True)
    .rename(columns={"p1_bgc_class": "bgc_class", "p1_bgcid": "bgcid"})
    .drop(columns=["p1_bgcid", "p2_bgcid", "p1_bgc_class", "p2_bgc_class"], errors='ignore')
)
df_merged.to_csv(DSDIR / "domain_pairs.csv", index=None)


(100654, 12) (100654, 12)


In [None]:
# From here on `domains_df` holds the domain pairs that have a class label.
domains_df = df_merged.loc[df_merged.bgc_class.notnull()].reset_index(drop=True)

## Finetune

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from torch.utils.data import DataLoader


# Defaults for prepare_fold_data, which forwards them as `n` to extract_pos_neg:
# the number of pairs drawn per ordered (class, class) combination.
NTRAIN_SAMPLES = 6400  # 10000
# Used for the validation rows in prepare_fold_data.
NTEST_SAMPLES = 1000


In [None]:
def extract_pos_neg(train_df, n=100):
  """Sample same-class (positive) and cross-class (negative) pairs of index labels.

  For every ordered combination of classes found in ``train_df.bgc_class``, ``n``
  index labels are drawn with replacement from each of the two classes and zipped
  into pairs. Pairs whose two classes match are positives, all others negatives.
  Rows with a missing ``bgc_class`` are ignored.

  :param train_df: frame with a ``bgc_class`` column; its index labels are sampled
  :param n: number of pairs drawn per ordered class combination
  :return: ``(positive_samples, neg_samples)``, two lists of ``(label1, label2)``
      tuples. Negatives are resampled with replacement to the number of positives;
      the list is empty when no cross-class pair exists (fewer than two classes).
  """
  # A NaN class never compares equal to itself, so it would produce an empty index
  # pool and make np.random.choice raise; drop it up front.
  bgc_class_names = train_df.bgc_class.dropna().unique()
  bgc_indices = {
      bgc: np.asarray(train_df.index[train_df["bgc_class"] == bgc])
      for bgc in bgc_class_names
  }

  positive_samples = []
  negative_samples = []
  for bgc1 in bgc_class_names:
    for bgc2 in bgc_class_names:
      # Draw order (first class, then second) is kept so seeded runs stay reproducible.
      i1 = np.random.choice(bgc_indices[bgc1], n)
      i2 = np.random.choice(bgc_indices[bgc2], n)
      pairs = list(zip(i1, i2))
      if bgc1 == bgc2:
        positive_samples.extend(pairs)
      else:
        negative_samples.extend(pairs)

  npos = len(positive_samples)
  if negative_samples:
    # Balance the two sets: as many negatives as positives.
    neg_samples_ids = np.random.choice(len(negative_samples), npos)
    neg_samples = [negative_samples[i] for i in neg_samples_ids]
  else:
    neg_samples = []
  return positive_samples, neg_samples


def make_dataset(pos_samples, neg_samples, data_column, datadir, name="train"):
  """Turn sampled index pairs into a dataset and save the pairs to CSV.

  :param pos_samples: ``(i, j)`` index-label pairs labelled ``1.``
  :param neg_samples: ``(i, j)`` index-label pairs labelled ``-1.``
  :param data_column: a single Series of texts, or a tuple of Series (one per text
      column), looked up with ``.loc``
  :param datadir: directory (path object) that receives ``<name>_pairs_oversampled.csv``
  :param name: ``"train"`` produces ``InputExample`` objects, any other value
      produces plain tuples ``(*texts1, *texts2, label)``
  :return: list of examples, positives first, then negatives
  """
  multi_column = isinstance(data_column, tuple)

  def lookup(row_label):
    # Always return a list of texts. With a single Series the looked-up value is one
    # string, and unpacking it directly would split it into single characters.
    if multi_column:
      return [column.loc[row_label] for column in data_column]
    return [data_column.loc[row_label]]

  dataset = []
  records = []
  for samples, score, tag in ((pos_samples, 1., "positive"), (neg_samples, -1., "negative")):
    for (i, j) in tqdm(samples):
      s1 = lookup(i)
      s2 = lookup(j)
      if name == "train":
        dataset.append(InputExample(texts=[*s1, *s2], label=score))
      else:
        dataset.append((*s1, *s2, score))
      records.append({
          # The CSV keeps the original layout: a list per side for multi-column
          # input, the bare value for single-column input.
          "sequence1": s1 if multi_column else s1[0],
          "sequence2": s2 if multi_column else s2[0],
          "i1": i,
          "i2": j,
          "label": tag
      })
  pd.DataFrame(records).to_csv(datadir / f"{name}_pairs_oversampled.csv", index=None)
  return dataset




In [None]:

def prepare_fold_data(fasta_df, ifold=0, pair_columns=None, ntrain_samples=NTRAIN_SAMPLES, ntest_samples=NTEST_SAMPLES, return_indices=True):
    """Build and persist the train / validation / test data for one held-out fold.

    Rows with ``fold == ifold`` form the test set. The remaining rows are divided
    into train and validation using the first split yielded by ``split_train_val``.
    Pairs are sampled with ``extract_pos_neg`` and materialised with ``make_dataset``.
    All artefacts are written under ``DSDIR / fold_<ifold>_<class name>``.

    :param fasta_df: frame with ``fold``, ``bgcid`` and ``bgc_class`` columns, the
        columns ``split_train_val`` needs, and the sequence column(s)
    :param ifold: fold id to hold out; also indexes ``all_bgc_class_names`` for the
        folder name
    :param pair_columns: names of the sequence columns, used when the frame has no
        column labelled ``7``; ``None`` (default) means no columns
    :param ntrain_samples: ``n`` passed to ``extract_pos_neg`` for the training rows
    :param ntest_samples: ``n`` passed to ``extract_pos_neg`` for the validation rows
    :param return_indices: when true, also return the three sets of index labels
    :return: ``(train_dataset, val_dataset, test_sequences)``, followed by
        ``train_index, val_index, test_index`` when ``return_indices`` is true
    :raises AssertionError: if bgcids or index labels overlap between the splits
    """
    # None instead of a shared mutable default list.
    pair_columns = [] if pair_columns is None else list(pair_columns)

    fold_name = f"fold_{ifold}_{all_bgc_class_names[ifold]}"
    train_val_ids = fasta_df.fold != ifold
    test_ids = fasta_df.fold == ifold
    test_index = fasta_df.index[test_ids]

    train_index, val_index = next(split_train_val(fasta_df[train_val_ids]))
    train_bgcids = fasta_df.loc[train_index].bgcid.unique()
    val_bgcids = fasta_df.loc[val_index].bgcid.unique()
    test_bgcids = fasta_df.loc[test_ids].bgcid.unique()

    # Leakage checks: no cluster and no row may appear in more than one split.
    assert not np.isin(test_bgcids, val_bgcids).any(), "Test bgcids shouldn't be in val dataset"
    assert not np.isin(test_bgcids, train_bgcids).any(), "Test bgcids shouldn't be in train dataset"
    assert not np.isin(train_bgcids, val_bgcids).any(), "Train bgcids shouldn't be in val dataset"

    assert not np.isin(test_index, val_index).any(), "Test index shouldn't be in val index"
    assert not np.isin(test_index, train_index).any(), "Test index shouldn't be in train index"
    assert not np.isin(train_index, val_index).any(), "Train index shouldn't be in val index"

    # Boolean row masks aligned with fasta_df.
    train_ids = fasta_df.index.isin(train_index)
    val_ids = fasta_df.index.isin(val_index)

    fold_dir = DSDIR / fold_name
    fold_dir.mkdir(exist_ok=True, parents=True)

    pos_samples, neg_samples = extract_pos_neg(fasta_df[train_ids], n=ntrain_samples)
    pos_samples_val, neg_samples_val = extract_pos_neg(fasta_df[val_ids], n=ntest_samples)
    if 7 in fasta_df.columns:
        # Frame with a column labelled 7: that single column holds the sequences.
        train_dataset = make_dataset(pos_samples, neg_samples, fasta_df.loc[:, 7], fold_dir, name="train")
        val_dataset = make_dataset(pos_samples_val, neg_samples_val, fasta_df.loc[:, 7], fold_dir, name="val")
        test_sequences = fasta_df.loc[test_ids, 7]
    else:
        seq_data = tuple([fasta_df.loc[:, col] for col in pair_columns])
        train_dataset = make_dataset(pos_samples, neg_samples, seq_data, fold_dir, name="train")
        val_dataset = make_dataset(pos_samples_val, neg_samples_val, seq_data, fold_dir, name="val")
        test_sequences = fasta_df.loc[test_ids, pair_columns]

    test_sequences.to_csv(fold_dir / "test_sequences.csv", index=None)
    fasta_df.loc[test_ids, :].to_csv(fold_dir/"test_data.csv")
    fasta_df.loc[val_ids, :].to_csv(fold_dir/"val_data.csv")
    fasta_df.loc[train_ids, :].to_csv(fold_dir/"train_data.csv")
    np.savez_compressed(
        (fold_dir/"train_val_split.npz").as_posix(),
        train_index=train_index,
        val_index=val_index,
        test_index=test_index
    )
    np.savez_compressed(
        (fold_dir / "bgcid_split.npz").as_posix(),
        train_bgcids=train_bgcids,
        val_bgcids=val_bgcids,
        test_bgcids=test_bgcids
    )
    if return_indices:
        return train_dataset, val_dataset, test_sequences, train_index, val_index, test_index
    return train_dataset, val_dataset, test_sequences

In [None]:
# PROTEIN_MODEL = "ElnaggarLab/ankh-base"
# Identifier of the pretrained protein model to finetune.
# NOTE(review): presumably a Hugging Face hub id; its use is not visible in this section — confirm.
PROTEIN_MODEL = "facebook/esm2_t6_8M_UR50D"

In [None]:
# Duplicate bgcid under the integer column label 0, then keep only rows whose class
# is one of the known class names.
# NOTE(review): the integer-labelled copy presumably serves code written for an older frame layout — confirm.
domains_df[0] = domains_df['bgcid']
ids = domains_df['bgc_class'].isin(all_bgc_class_names)
domains_df = domains_df.loc[ids].reset_index(drop=True)

In [None]:
# Fold id = position of the row's class in all_bgc_class_names; -1 marks a row
# whose class is not in that list.
fold_by_class = {x: i for i, x in enumerate(all_bgc_class_names)}
domains_df['fold'] = domains_df.bgc_class.map(fold_by_class).fillna(-1).astype("int64")

# Group id = order of first appearance of each bgcid, the same numbering that
# enumerate(unique()) gives; a missing bgcid gets -1. factorize computes this in one
# pass instead of one full-column comparison per cluster.
bgc_cluster_names = domains_df.bgcid.unique()
domains_df['cluster_group'] = pd.factorize(domains_df.bgcid)[0]


In [None]:
import wandb


def wandb_callback(score, epoch, steps):
  """Log one evaluation result to Weights & Biases.

  :param score: value logged under ``val_loss``
  :param epoch: value logged under ``epoch``
  :param steps: value logged under ``step``
  """
  metrics = {"val_loss": score, "epoch": epoch, "step": steps}
  wandb.log(metrics)



In [None]:
# Sequence columns of the two members of each domain pair (the p1_/p2_ prefixes are
# added when the pairs are built).
pair_columns = ["p1_domain_sequence", "p2_domain_sequence"]

In [None]:
def draw_data(emb, bgc_indices, bgc_class_names, label=None, savepath=None):
    """Scatter-plot 2-D embeddings coloured by class index, with a labelled colour bar.

    :param emb: array read as ``emb[:, 0]`` (x) and ``emb[:, 1]`` (y)
    :param bgc_indices: per-point values passed as the scatter colours
    :param bgc_class_names: tick labels of the colour bar, one per class index
    :param label: figure title; a default title is used when ``None``
    :param savepath: when given, the figure is also saved there (150 dpi, tight box)
    """
    xs, ys = emb[:, 0], emb[:, 1]
    x_min, x_max = xs.min(), xs.max()
    y_min, y_max = ys.min(), ys.max()
    # Margin of one tenth of the data range on each side.
    x_offset = (x_max - x_min)/10
    y_offset = (y_max - y_min)/10

    fig, ax = plt.subplots(1, figsize=(14, 10))
    points = ax.scatter(*emb.T, s=2, c=bgc_indices, marker='x', cmap='Spectral', alpha=1.0)
    ax.set_xticks([])
    ax.set_yticks([])

    # One colour-bar segment per class, centred on the integer class index.
    nclasses = len(bgc_class_names)
    cbar = fig.colorbar(points, ax=ax, boundaries=np.arange(nclasses + 1)-0.5)
    cbar.set_ticks(np.arange(nclasses))
    cbar.set_ticklabels(bgc_class_names)

    ax.set_xlim(x_min-x_offset, x_max+x_offset)
    ax.set_ylim(y_min-y_offset, y_max+y_offset)

    title = label
    if title is None:
        title = 'Mean pretrained ESM embeddings for BGC trained without NRP via PCA'
    ax.set_title(title)

    if savepath is not None:
        fig.savefig(savepath, dpi=150, bbox_inches="tight")
    plt.show()

# Additional code
To use sentence_transformers effectively here, we need to define several classes that handle the training loss and the evaluation during the validation and test phases.

## Define custom loss

In [None]:
# Make custom loss
import torch
from torch import nn, Tensor
from typing import Iterable, Dict
from sentence_transformers import SentenceTransformer


class PairCosineSimilarityLoss(nn.Module):
    """
    Cosine-similarity loss for examples that consist of two *pairs* of texts.

    Every InputExample carries four texts ``[a1, a2, b1, b2]`` and a float label.
    The model embeds each text; the embeddings of ``a1`` and ``a2`` are concatenated
    into ``u``, those of ``b1`` and ``b2`` into ``v``, and the loss is
    ``loss_fct(cos_score_transformation(cosine_sim(u, v)), label)``.

    :param model: SentenceTransformer model
    :param loss_fct: pytorch loss comparing the transformed cosine similarity with the label. By default, MSE.
    :param cos_score_transformation: function applied on top of the cosine similarity. By default, the identity.

    Example::

            train_examples = [
                InputExample(texts=['pair1 sent1', 'pair1 sent2', 'pair2 sent1', 'pair2 sent2'], label=0.8),
                InputExample(texts=['pair3 sent1', 'pair3 sent2', 'pair4 sent1', 'pair4 sent2'], label=0.3)]
            train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=train_batch_size)
            train_loss = PairCosineSimilarityLoss(model=model)
    """
    def __init__(self, model: SentenceTransformer, loss_fct = nn.MSELoss(), cos_score_transformation=nn.Identity()):
        super().__init__()
        self.model = model
        self.loss_fct = loss_fct
        self.cos_score_transformation = cos_score_transformation

    def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
        """Embed the texts, join each pair along the last dimension and score the two joined vectors."""
        embeddings = []
        for features in sentence_features:
            embeddings.append(self.model(features)['sentence_embedding'])

        # Texts 0 and 1 form the first pair, texts 2 and 3 the second.
        first_pair = torch.cat((embeddings[0], embeddings[1]), dim=-1)
        second_pair = torch.cat((embeddings[2], embeddings[3]), dim=-1)

        similarity = torch.cosine_similarity(first_pair, second_pair)
        return self.loss_fct(self.cos_score_transformation(similarity), labels.view(-1))


## Define evaluators

In [None]:
from sentence_transformers.evaluation import SentenceEvaluator, SimilarityFunction
from sentence_transformers.evaluation import SequentialEvaluator
import os
import csv
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr, spearmanr
import numpy as np
from typing import List, Tuple
from sentence_transformers.readers import InputExample
from sklearn.metrics import silhouette_score
import sklearn.metrics as skmetrics
import sklearn.metrics.pairwise as pmetrics

class PairEmbeddingSimilarityEvaluator(SentenceEvaluator):
    """
    Evaluate a model on examples made of two pairs of texts.

    The embeddings of texts 1 and 2 are concatenated, likewise those of texts 3 and 4,
    and the two concatenated vectors are compared with cosine similarity, negative
    Manhattan distance, negative Euclidean distance and dot product. For each measure
    the Pearson and Spearman correlation with the gold scores is computed.

    The returned score is the Spearman correlation of the selected measure, or the
    best of the four when no measure is selected.

    The results are written in a CSV. If a CSV already exists, then values are appended.
    """
    def __init__(self, sentences: Tuple[List[str]], scores: List[float], batch_size: int = 16, main_similarity: SimilarityFunction = None, name: str = '', show_progress_bar: bool = False, write_csv: bool = True):
        """
        Constructs an evaluator for the dataset

        :param sentences: four equally long lists of texts; entries 1 and 2 form the first pair, entries 3 and 4 the second
        :param scores: gold similarity score of each example
        :param batch_size: batch size passed to ``model.encode``
        :param main_similarity: measure whose Spearman correlation is returned; ``None`` returns the maximum
        :param name: optional tag inserted into the CSV file name
        :param show_progress_bar: passed to ``model.encode``; ``None`` is treated as ``True``
        :param write_csv: Write results to a CSV file
        """
        self.sentences1, self.sentences2, self.sentences3, self.sentences4 = sentences
        self.scores = scores
        self.write_csv = write_csv

        # Every text column must line up with the scores.
        for column in (self.sentences1, self.sentences2, self.sentences3, self.sentences4):
            assert len(column) == len(self.scores)

        self.main_similarity = main_similarity
        self.name = name
        self.batch_size = batch_size
        self.show_progress_bar = True if show_progress_bar is None else show_progress_bar

        name_part = "_" + name if name else ''
        self.csv_file = f"similarity_evaluation{name_part}_results.csv"
        self.csv_headers = ["epoch", "steps", "cosine_pearson", "cosine_spearman", "euclidean_pearson", "euclidean_spearman", "manhattan_pearson", "manhattan_spearman", "dot_pearson", "dot_spearman"]

    @classmethod
    def from_input_examples(cls, examples: List[InputExample], **kwargs):
        """Build the evaluator from InputExamples that each carry four texts and a label."""
        columns = ([], [], [], [])
        scores = []
        for example in examples:
            for position, column in enumerate(columns):
                column.append(example.texts[position])
            scores.append(example.label)
        return cls(columns, scores, **kwargs)

    def _encode(self, model, texts):
        """Encode one column of texts to a numpy array with this evaluator's settings."""
        return model.encode(texts, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True)

    def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
        first, second, third, fourth = (
            self._encode(model, texts)
            for texts in (self.sentences1, self.sentences2, self.sentences3, self.sentences4)
        )

        # One vector per pair: the two members' embeddings side by side.
        embeddings1 = np.hstack([first, second])
        embeddings2 = np.hstack([third, fourth])
        labels = self.scores

        # Distances are negated so that larger always means more similar.
        cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))
        manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2)
        euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2)
        dot_products = [np.dot(left, right) for left, right in zip(embeddings1, embeddings2)]

        def correlate(values):
            return pearsonr(labels, values)[0], spearmanr(labels, values)[0]

        eval_pearson_cosine, eval_spearman_cosine = correlate(cosine_scores)
        eval_pearson_manhattan, eval_spearman_manhattan = correlate(manhattan_distances)
        eval_pearson_euclidean, eval_spearman_euclidean = correlate(euclidean_distances)
        eval_pearson_dot, eval_spearman_dot = correlate(dot_products)

        if output_path is not None and self.write_csv:
            csv_path = os.path.join(output_path, self.csv_file)
            output_file_exists = os.path.isfile(csv_path)
            # Append to an existing file; otherwise create it and write the header first.
            with open(csv_path, newline='', mode="a" if output_file_exists else 'w', encoding="utf-8") as f:
                writer = csv.writer(f)
                if not output_file_exists:
                    writer.writerow(self.csv_headers)
                writer.writerow([
                    epoch, steps,
                    eval_pearson_cosine, eval_spearman_cosine,
                    eval_pearson_euclidean, eval_spearman_euclidean,
                    eval_pearson_manhattan, eval_spearman_manhattan,
                    eval_pearson_dot, eval_spearman_dot,
                ])

        if self.main_similarity is None:
            return max(eval_spearman_cosine, eval_spearman_manhattan, eval_spearman_euclidean, eval_spearman_dot)
        if self.main_similarity == SimilarityFunction.COSINE:
            return eval_spearman_cosine
        if self.main_similarity == SimilarityFunction.EUCLIDEAN:
            return eval_spearman_euclidean
        if self.main_similarity == SimilarityFunction.MANHATTAN:
            return eval_spearman_manhattan
        if self.main_similarity == SimilarityFunction.DOT_PRODUCT:
            return eval_spearman_dot
        raise ValueError("Unknown main_similarity value")


class PairEmbeddingSilhouetteEvaluator(SentenceEvaluator):
    """
    Evaluate how well per-cluster mean embeddings separate by class.

    Each sample is encoded column by column and the per-column embeddings are
    concatenated. Samples sharing a ``bgcid`` are averaged into one vector, and the
    silhouette score (cosine metric) of these mean vectors against their class labels
    is computed. Pearson and Spearman correlations are also computed between a
    same-class indicator (+1 / -1 over all pairs of clusters) and the pairwise cosine
    similarity, negative Manhattan distance and negative Euclidean distance.

    The silhouette score and the cosine Pearson correlation are logged to wandb on
    every call. The returned score is the silhouette score unless ``main_similarity``
    selects one of the Spearman correlations.

    The results are written in a CSV. If a CSV already exists, then values are appended.
    """
    def __init__(self, sentences: Tuple[List[str]], classes: List[int], class_names: List[str], bgcid_list=None,
                 batch_size: int = 16, main_similarity: SimilarityFunction = None, name: str = '',
                 show_progress_bar: bool = False, write_csv: bool = True, savedir=None):
        """
        Constructs an evaluator for the dataset

        :param sentences: tuple of text columns; every column holds one text per sample
        :param classes: class index of each sample, in ``[0, len(class_names))``
        :param class_names: names of the classes
        :param bgcid_list: cluster id of each sample; samples with the same id are averaged.
            By default every sample is its own cluster.
        :param batch_size: batch size passed to ``model.encode``
        :param main_similarity: measure whose Spearman correlation is returned; ``None`` returns the silhouette score
        :param name: optional tag inserted into the CSV file name
        :param show_progress_bar: passed to ``model.encode``; ``None`` is treated as ``True``
        :param write_csv: Write results to a CSV file
        :param savedir: stored on the instance; not used by this class itself
        """
        self.sentences = sentences
        self.classes = classes
        self.class_names = class_names
        self.bgcid_list = bgcid_list
        if bgcid_list is None:
            # One id per sample. len(sentences) would be the number of text columns,
            # which silently restricted the evaluation to the first few samples.
            self.bgcid_list = np.arange(len(classes))
        bgcid2indices = defaultdict(list)
        for i, bgcid in enumerate(self.bgcid_list):
            bgcid2indices[bgcid].append(i)
        self.bgcid2indices = {k: np.asarray(v) for k, v in bgcid2indices.items()}
        # A cluster takes the class of its first sample.
        self.bgcid2classes = {k: self.classes[v[0]] for k, v in self.bgcid2indices.items()}
        self.bgcids = np.asarray(sorted(self.bgcid2indices))
        assert min(classes) >= 0
        assert max(classes) < len(class_names)
        self.write_csv = write_csv

        self.main_similarity = main_similarity
        self.name = name

        self.batch_size = batch_size
        # Resolve None before storing it; the fallback used to be applied to the local
        # variable only, after the attribute had already been set.
        if show_progress_bar is None:
            show_progress_bar = True
        self.show_progress_bar = show_progress_bar

        # NOTE(review): same file-name pattern as PairEmbeddingSimilarityEvaluator but with
        # different headers; give the two evaluators distinct names when they share an output_path.
        self.csv_file = "similarity_evaluation"+("_"+name if name else '')+"_results.csv"
        self.csv_headers = [
            "epoch", "steps", "silhouette_cos_dist",
            "cosine_pearson", "cosine_spearman", "euclidean_pearson", "euclidean_spearman",
            "manhattan_pearson", "manhattan_spearman"
        ]
        self.savedir = savedir

    @classmethod
    def from_input_examples(cls, examples: List[InputExample], **kwargs):
        """Build the evaluator from InputExamples with two texts and a class-index label.

        ``class_names`` is required by the constructor and must be supplied through ``kwargs``.
        """
        sentences1 = []
        sentences2 = []
        classes = []

        for example in examples:
            sentences1.append(example.texts[0])
            sentences2.append(example.texts[1])
            classes.append(example.label)
        # Pass the collected labels; the undefined name `scores` raised a NameError here.
        return cls((sentences1, sentences2), classes, **kwargs)

    def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
        all_embeddings = [
            model.encode(sentences, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True)
            for sentences in self.sentences
        ]
        # One vector per sample: the embeddings of all text columns side by side.
        if len(all_embeddings) == 1:
            embeddings = all_embeddings[0]
        else:
            embeddings = np.hstack(all_embeddings)

        # Average the samples of every cluster into a single vector.
        mean_embeddings = np.stack([embeddings[self.bgcid2indices[bgcid]].mean(0) for bgcid in self.bgcids])

        test_labels = np.asarray([self.bgcid2classes[bgcid] for bgcid in self.bgcids])
        # +1 for cluster pairs of the same class, -1 otherwise, flattened row by row.
        labels = np.where(test_labels.reshape(-1, 1) == test_labels, 1, -1).flatten()

        cosine_scores = cosine_similarity(mean_embeddings, mean_embeddings).flatten()
        score_cos_dist = silhouette_score(mean_embeddings, test_labels, metric="cosine")

        # Distances are negated so that larger always means more similar.
        manhattan_distances = -pmetrics.manhattan_distances(mean_embeddings, mean_embeddings).flatten()
        euclidean_distances = -pmetrics.euclidean_distances(mean_embeddings, mean_embeddings).flatten()

        eval_pearson_cosine, _ = pearsonr(labels, cosine_scores)
        eval_spearman_cosine, _ = spearmanr(labels, cosine_scores)

        eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances)
        eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances)

        eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances)
        eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances)

        wandb.log({
            "silhouette_val": score_cos_dist,
            "pearson_cosine": eval_pearson_cosine
        })

        if output_path is not None and self.write_csv:
            csv_path = os.path.join(output_path, self.csv_file)
            output_file_exists = os.path.isfile(csv_path)
            # Append to an existing file; otherwise create it and write the header first.
            with open(csv_path, newline='', mode="a" if output_file_exists else 'w', encoding="utf-8") as f:
                writer = csv.writer(f)
                if not output_file_exists:
                    writer.writerow(self.csv_headers)

                writer.writerow([epoch, steps, score_cos_dist,
                                 eval_pearson_cosine, eval_spearman_cosine, eval_pearson_euclidean,
                                 eval_spearman_euclidean, eval_pearson_manhattan, eval_spearman_manhattan])

        if self.main_similarity == SimilarityFunction.COSINE:
            return eval_spearman_cosine
        elif self.main_similarity == SimilarityFunction.EUCLIDEAN:
            return eval_spearman_euclidean
        elif self.main_similarity == SimilarityFunction.MANHATTAN:
            return eval_spearman_manhattan
        elif self.main_similarity is None:
            return score_cos_dist
        else:
            raise ValueError("Unknown main_similarity value")


class TestSilhouetteEvaluator(SentenceEvaluator):
    """Cluster-quality evaluator that reports how well the test BGCs are embedded.

    Every row selected by ``all_index`` is encoded once per column in ``columns``;
    the per-column embeddings are concatenated feature-wise and then averaged over
    all rows that share a ``bgcid``, giving one vector per BGC.  Silhouette samples
    (cosine metric) and the Calinski-Harabasz score are computed over *all* BGCs
    using ``target_column`` as the cluster label, and the mean silhouette of the
    test BGCs is returned as the evaluator score.

    Side effects of a call: one ``wandb.log`` entry and, when ``write_csv`` is set
    and an ``output_path`` is given, one row appended to ``self.csv_file``.

    ``class_names``, ``test_fold``, ``main_similarity`` and ``savedir`` are only
    stored on the instance; nothing in this class reads them.
    """

    def __init__(self, df, all_index: List[int], test_index: List[int], class_names: List[str],
                 columns: List[str],
                 test_fold:int,
                 batch_size: int = 16, main_similarity: SimilarityFunction = None, name: str = '',
                 show_progress_bar: bool = False, write_csv: bool = True, savedir=None, target_column="biosyn_class_index"):
        """Cache the sequences, targets and BGC grouping needed at evaluation time.

        :param df: frame holding the sequence columns, ``bgcid`` and ``target_column``.
        :param all_index: index labels of ``df`` for every row to embed.
        :param test_index: index labels of ``df`` for the test rows.
        :param class_names: stored as-is.
        :param columns: one column name or a list of them; each is encoded separately.
        :param test_fold: stored as-is.
        :param batch_size: batch size passed to ``model.encode``.
        :param main_similarity: stored as-is.
        :param name: optional suffix for the CSV file name.
        :param show_progress_bar: whether ``model.encode`` shows a progress bar
            (``None`` is treated as ``False``).
        :param write_csv: append results to a CSV file when an output path is given.
        :param savedir: stored as-is.
        :param target_column: column of ``df`` holding the cluster label.
        """
        self.df = df
        self.all_index = np.asarray(all_index).flatten()
        self.test_index = np.asarray(test_index).flatten()
        self.class_names = class_names
        if isinstance(columns, str):
            columns = [columns]
        self.columns = columns
        self.test_fold = test_fold
        self.batch_size = batch_size
        self.main_similarity = main_similarity
        self.name = name
        # Honour the caller's choice; previously any non-None value forced the bar on.
        self.show_progress_bar = bool(show_progress_bar)
        self.write_csv = write_csv
        self.savedir = savedir

        # All lookups go through the flattened index arrays so nested inputs work too.
        self.eval_sequences = [df.loc[self.all_index, col].values for col in columns]
        self.eval_targets = df.loc[self.all_index, target_column].values

        row_bgcids = df.loc[self.all_index, 'bgcid'].values

        # bgcid -> positions (within the all_index ordering) of the rows belonging to it.
        bgc2indices = defaultdict(list)
        for position, bgcid in enumerate(row_bgcids):
            bgc2indices[bgcid].append(position)
        self.bgc2indices = {bgcid: np.asarray(positions) for bgcid, positions in bgc2indices.items()}

        # bgcid -> target; if rows of one BGC disagree, the last row wins.
        self.bgc2targets = dict(zip(row_bgcids, self.eval_targets))

        # Sorted unique BGC ids; this ordering defines the rows of the BGC-level matrices.
        self.all_bgcid = np.unique(row_bgcids)
        self.test_bgcid = np.unique(df.loc[self.test_index, 'bgcid'].values)
        # BGC-level mask over self.all_bgcid selecting the test BGCs (fixed for the
        # lifetime of the evaluator, so it is computed once here).
        self.test_ids = np.isin(self.all_bgcid, self.test_bgcid)

        self.csv_file = "test_evaluation"+("_"+name if name else '')+"_results.csv"
        self.csv_headers = [
            "epoch", "steps",
            "silhouette_test",
            "silhouette_full",
            "calinsky_score"
        ]

    def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
        """Embed all BGCs with ``model``, log the clustering metrics and return the test silhouette.

        :param model: object exposing ``encode(sequences, batch_size=..., show_progress_bar=...,
            convert_to_numpy=True)``.
        :param output_path: directory for the CSV file; nothing is written when ``None``.
        :param epoch: written to the CSV row as-is.
        :param steps: written to the CSV row as-is.
        :return: mean cosine silhouette sample value over the test BGCs.
        """
        column_embeddings = [
            model.encode(sequences, batch_size=self.batch_size,
                         show_progress_bar=self.show_progress_bar, convert_to_numpy=True)
            for sequences in self.eval_sequences
        ]
        # One row per input row; the embeddings of the different columns sit side by side.
        domain_embeddings = np.hstack(column_embeddings)

        # Average the row embeddings of each BGC, then order them like self.all_bgcid.
        bgc2embeddings = {
            bgcid: domain_embeddings[indices].mean(0)
            for bgcid, indices in self.bgc2indices.items()
        }
        all_embeddings = np.stack([bgc2embeddings[bgcid] for bgcid in self.all_bgcid])
        all_targets = np.asarray([self.bgc2targets[bgcid] for bgcid in self.all_bgcid])

        test_embeddings = all_embeddings[self.test_ids]
        test_targets = all_targets[self.test_ids]
        # Pairwise labels over the test BGCs: +1 for a same-class pair, -1 otherwise.
        test_labels = np.where(test_targets.reshape(-1, 1) == test_targets, 1, -1).flatten()
        flat_cosine_distances = (1. - cosine_similarity(test_embeddings, test_embeddings)).flatten()

        # Silhouettes are computed against every BGC so that test points are scored
        # relative to the full set of clusters, not only against each other.
        eval_silhouette_samples = skmetrics.silhouette_samples(all_embeddings, all_targets, metric="cosine")
        test_silhouette_score = eval_silhouette_samples[self.test_ids].mean()
        full_silhouette_score = eval_silhouette_samples.mean()
        calinsky_score = skmetrics.calinski_harabasz_score(all_embeddings, all_targets)

        # Correlation between the pairwise labels and cosine *distances* (1 - similarity).
        # NOTE(review): when all test BGCs share one class the labels are constant and
        # pearsonr yields NaN; the key below is also the one logged by the validation
        # evaluator visible earlier in this notebook - confirm that sharing it is intended.
        eval_pearson_cosine, _ = pearsonr(test_labels, flat_cosine_distances)

        wandb.log({
            "silhouette_full": full_silhouette_score,
            "silhouette_test": test_silhouette_score,
            "pearson_cosine": eval_pearson_cosine,
            "calinsky_score": calinsky_score
        })
        if output_path is not None and self.write_csv:
            csv_path = os.path.join(output_path, self.csv_file)
            output_file_exists = os.path.isfile(csv_path)
            with open(csv_path, newline='', mode="a" if output_file_exists else 'w', encoding="utf-8") as f:
                writer = csv.writer(f)
                if not output_file_exists:
                    writer.writerow(self.csv_headers)

                writer.writerow([
                    epoch, steps,
                    test_silhouette_score,
                    full_silhouette_score,
                    calinsky_score
                ])

        return test_silhouette_score

In [None]:
# Tiny subset for quick experiments: the first 3 BGCs of every fold,
# keeping at most 2 rows for each (fold, BGC) pair.
tiny_bgcids = (
    domains_df[["fold", "bgcid"]]
    .drop_duplicates()
    .groupby("fold", as_index=False)
    .head(3)["bgcid"]
    .unique()
)
domains_tiny_df = (
    domains_df.loc[domains_df["bgcid"].isin(tiny_bgcids)]
    .groupby(["fold", "bgcid"])
    .head(2)
    .reset_index(drop=True)
)
domains_tiny_df.shape

(41, 25)

In [None]:
# Biosynthetic classes in alphabetical order; a class's position is its integer label.
all_bgc_class_names = ['NRP', 'Polyketide', 'Other', 'Saccharide', 'Alkaloid', 'RiPP', 'Terpene']
all_bgc_class_names.sort()
biosyn_class2index = dict(zip(all_bgc_class_names, range(len(all_bgc_class_names))))

# Rows whose bgc_class is not in the mapping are labelled -1.
domains_df["biosyn_class_index"] = domains_df["bgc_class"].apply(
    lambda bgc_class: biosyn_class2index.get(bgc_class, -1)
)

In [None]:
from sentence_transformers import models as st_models

def prepare_st_model(freeze=True, n_frozen_layers=5):
    """Build the SentenceTransformer for ``PROTEIN_MODEL``, optionally partly frozen.

    :param freeze: when false, return ``SentenceTransformer(PROTEIN_MODEL)`` with
        nothing frozen.  When true, the token-embedding block and the first
        ``n_frozen_layers`` encoder layers have ``requires_grad`` switched off.
    :param n_frozen_layers: number of leading encoder layers to freeze; ignored
        when ``freeze`` is false.  The default of 5 matches the previous hard-coded value.
    :return: a ``SentenceTransformer``; in the frozen case it is assembled from the
        transformer module plus a ``Pooling`` module built with default settings.
    :raises ValueError: if ``n_frozen_layers`` is negative.
    """
    if not freeze:
        return SentenceTransformer(PROTEIN_MODEL)
    if n_frozen_layers < 0:
        raise ValueError("n_frozen_layers must be non-negative")

    word_embedding_model = st_models.Transformer(PROTEIN_MODEL)
    backbone = word_embedding_model.auto_model

    # Module.requires_grad_(False) already flips every parameter of the module,
    # so no explicit loop over .parameters() is needed.
    backbone.embeddings.requires_grad_(False)
    backbone.encoder.layer[:n_frozen_layers].requires_grad_(False)

    pooling_model = st_models.Pooling(word_embedding_model.get_word_embedding_dimension())
    return SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Fine-tuning

In [None]:
BATCH_SIZE = 16
NEPOCHS = 20
# Swap in domains_tiny_df for a quick smoke test (verify first that it carries the
# "biosyn_class_index" column used below).
current_df = domains_df
ntrain_samples = NTRAIN_SAMPLES
ntest_samples = NTEST_SAMPLES
# Class held out as the test fold; one of all_bgc_class_names
# ("Terpene", "Saccharide", "Alkaloid", "RiPP", "NRP", "Polyketide", ...).
TEST_BIOSYN_CLASS_NAME = "NRP"

for ifold, bgc_class in enumerate(all_bgc_class_names):
    # Only the fold whose held-out class was selected above is trained.
    if bgc_class != TEST_BIOSYN_CLASS_NAME:
        continue
    wandb.init(
        project="task1-pair_domains_v2",
        # Explicit run name (otherwise wandb assigns a random one).
        name=f"loco_fixed_pairs_{bgc_class}_seed_{RANDOM_SEED}_" + current_timestamp,
        # Track hyperparameters and run metadata
        config={
            "architecture": "esm2_8m",
            "epochs": NEPOCHS,
            "batch_size": BATCH_SIZE,
            "ntrain_samples": ntrain_samples,
            "ntest_samples": ntest_samples,
            'random_seed': RANDOM_SEED,
            'biosyn_class': bgc_class
        }
    )
    # Bound before the try so the cleanup below is valid even if model creation fails.
    model = None
    run_failed = True
    try:
        train_dataset, val_dataset, test_sequences, train_index, val_index, test_index = prepare_fold_data(
            current_df, ifold=ifold, pair_columns=pair_columns, ntrain_samples=ntrain_samples,
            ntest_samples=ntest_samples
        )
        assert np.isin(current_df.loc[test_index, "bgc_class"].unique(), [bgc_class]).all(), "Biosynthetic class in test is unexpected"

        val_ids = current_df.index.isin(val_index)
        val_sequences = [current_df.loc[val_ids, col].values for col in pair_columns]
        val_targets = current_df.loc[val_ids, "biosyn_class_index"].values
        val_bgcid_list = current_df.loc[val_ids, "bgcid"].values

        # Every row that takes part in this fold, in the frame's own index order.
        all_index = current_df.index[current_df.index.isin(train_index) | current_df.index.isin(val_index) | current_df.index.isin(test_index)]

        train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
        NSTEPS = len(train_dataset) // BATCH_SIZE
        print("nsteps=", NSTEPS)

        model = prepare_st_model(freeze=True)
        train_loss = PairCosineSimilarityLoss(model)
        val_data = list(zip(*val_dataset))
        sentences = val_data[:4]
        scores = val_data[4]

        name = PROTEIN_MODEL.replace("/", "_")
        name = f"{name}_{ifold}_{bgc_class}"
        SAVEDIR = BASEDIR / name
        SAVEDIR.mkdir(exist_ok=True, parents=True)
        BESTDIR = BASEDIR / (name + "_best")
        IMGDIR = SAVEDIR / "images"
        IMGDIR.mkdir(exist_ok=True)

        val_evaluator = PairEmbeddingSilhouetteEvaluator(
            val_sequences, val_targets, all_bgc_class_names, write_csv=True, bgcid_list=val_bgcid_list,
            savedir=IMGDIR
        )
        test_evaluator = TestSilhouetteEvaluator(
            current_df, all_index, test_index, all_bgc_class_names, columns=pair_columns, test_fold=ifold, write_csv=True,
            savedir=IMGDIR
        )
        evaluator = SequentialEvaluator([test_evaluator, val_evaluator])

        # steps_per_epoch is NSTEPS//2, so each "epoch" covers about half of train_dataset;
        # a checkpoint is written once per such epoch.
        model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=NEPOCHS, warmup_steps=100,
            output_path=BESTDIR.as_posix(),
            use_amp=True,
            checkpoint_path=SAVEDIR.as_posix(),
            checkpoint_save_total_limit=10,
            checkpoint_save_steps=NSTEPS//2,
            steps_per_epoch=NSTEPS//2,
            evaluator=evaluator,
            evaluation_steps= min(1000, NSTEPS//2)
        )
        run_failed = False
    finally:
        # Always release the model and close the wandb run, even when the fold fails.
        del model
        train_dataloader = None
        test_dataloader = None
        # Collect unreachable objects first so the memory they held can actually be
        # returned by empty_cache().
        gc.collect()
        torch.cuda.empty_cache()
        if run_failed:
            # Mark the run as failed instead of leaving it open.
            wandb.finish(exit_code=1)
        else:
            wandb.finish()


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  0%|          | 0/38400 [00:00<?, ?it/s]

  0%|          | 0/38400 [00:00<?, ?it/s]

  0%|          | 0/6000 [00:00<?, ?it/s]

  0%|          | 0/6000 [00:00<?, ?it/s]

nsteps= 4800


config.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/31.4M [00:00<?, ?B/s]

Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['esm.pooler.dense.weight', 'esm.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/93.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2400 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Iteration:   0%|          | 0/2400 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Iteration:   0%|          | 0/2400 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Iteration:   0%|          | 0/2400 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Iteration:   0%|          | 0/2400 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Iteration:   0%|          | 0/2400 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Iteration:   0%|          | 0/2400 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Iteration:   0%|          | 0/2400 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Iteration:   0%|          | 0/2400 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Iteration:   0%|          | 0/2400 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Iteration:   0%|          | 0/2400 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Iteration:   0%|          | 0/2400 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Iteration:   0%|          | 0/2400 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Iteration:   0%|          | 0/2400 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Iteration:   0%|          | 0/2400 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Iteration:   0%|          | 0/2400 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Iteration:   0%|          | 0/2400 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Iteration:   0%|          | 0/2400 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Iteration:   0%|          | 0/2400 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Iteration:   0%|          | 0/2400 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



Batches:   0%|          | 0/3020 [00:00<?, ?it/s]

Batches:   0%|          | 0/3020 [00:00<?, ?it/s]



VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
calinsky_score,▁▂▃▃▃▃▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇█████████████
pearson_cosine,▁ ▂ ▄ ▄ ▆ ▇ ▆ █ ▇ ██ █ █ ▇ ▇ ▇ █ █ █ ██
silhouette_full,▁▃▃▄▅▅▅▅▆▆▇▇▅▆▇▇▇▇▇▇███▇██▇█████████████
silhouette_test,▁▁▅▅▆▇▅▅▆█▄▆▅▆▅▆▇▆▃▅▃▅▅▃▅▃▃▃▄▃▃▂▄▃▃▃▂▂▃▃
silhouette_val,▁▃▄▄▆▅▆▅▆▆▇▇▅▆▆▇▇▇▇▇█████▇██████████████

0,1
calinsky_score,311.21182
pearson_cosine,0.56545
silhouette_full,0.20155
silhouette_test,0.20665
silhouette_val,0.21495


In [None]:
# del model
# data_loader=None
# # del model
# import gc
# torch.cuda.empty_cache()
# gc.collect()
