In [1]:
import os
import random
import toml
from sys import argv
from types import SimpleNamespace

from utils.collate import MultiencoderTokenizedDataset, TokenizedCollator
from utils.model_utils import get_sentence_embedding_dimension, load_encoder
from utils.utils import *
from utils.streaming_utils import load_streaming_embeddings, process_batch
from datasets import load_from_disk

import warnings

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=Warning)

In [2]:
# Set TOKENIZERS_PARALLELISM in the process environment (presumably to turn off
# the tokenizers library's internal parallelism; confirm "0" is an accepted value).
os.environ["TOKENIZERS_PARALLELISM"] = "0"

# Build one flat settings namespace:
#   1. every section of the TOML file is merged in file order, so a key repeated
#      in a later section overwrites the earlier one;
#   2. the values parsed from argv are applied on top and win on conflicts.
# NOTE(review): inside a notebook kernel, argv holds the kernel's own launch
# arguments; confirm read_args tolerates or ignores them.
toml_sections = toml.load('configs/unsupervised.toml')
unknown_cfg = read_args(argv)

flat_settings = {}
for section in toml_sections.values():
    flat_settings.update(section.items())

cfg = SimpleNamespace(**{**flat_settings, **unknown_cfg})

In [3]:
# Resolve the optional mixed-precision setting once; None is passed through
# when cfg carries no `mixed_precision` attribute.
encoder_precision = getattr(cfg, 'mixed_precision', None)

# Each mapping holds a single encoder, keyed by the same cfg value that is
# handed to load_encoder.
sup_encs = {cfg.sup_emb: load_encoder(cfg.sup_emb, mixed_precision=encoder_precision)}
unsup_enc = {cfg.unsup_emb: load_encoder(cfg.unsup_emb, mixed_precision=encoder_precision)}

No sentence-transformers model found with name infgrad/stella-base-en-v2. Creating a new one with mean pooling.


In [4]:
# Use at most 8 workers, whatever get_num_proc() reports.
num_workers = min(get_num_proc(), 8)

# Hard-coded notebook override. Because the attribute is always set here, the
# `num_points` branch below is always taken and the `unsup_points` branch is
# never reached from this cell.
cfg.num_points = 10

dset = load_streaming_embeddings(cfg.dataset)
print(f"Using {num_workers} workers and {len(dset)} datapoints")

# Hold out a validation split, then shuffle what remains for training.
dset_dict = dset.train_test_split(test_size=cfg.val_size, seed=cfg.val_dataset_seed)
valset = dset_dict["test"]
dset = dset_dict["train"].shuffle(seed=cfg.train_dataset_seed)

if hasattr(cfg, 'num_points'):
    # Two disjoint, equally sized slices: rows [0, n) and rows [n, 2n).
    assert 0 < cfg.num_points <= len(dset) // 2
    supset = dset.select(range(cfg.num_points))
    unsupset = dset.select(range(cfg.num_points, cfg.num_points * 2))
elif hasattr(cfg, 'unsup_points'):
    # The first `unsup_points` rows (capped at the dataset size) go to unsupset.
    n_unsup = min(cfg.unsup_points, len(dset))
    unsupset = dset.select(range(n_unsup))
    # NOTE(review): the stop index drops the last len(unsupset) rows as well,
    # so supset is empty once unsupset covers half the data; confirm intended.
    supset = dset.select(range(n_unsup, len(dset) - len(unsupset)))

Using 8 workers and 5332023 datapoints


In [5]:
# Keyword arguments that both wrappers receive unchanged.
shared_kwargs = dict(
    batch_size=cfg.bs,
    max_length=cfg.max_seq_length,
    seed=cfg.sampling_seed,
)

# NOTE: both names are rebound from the raw split to its wrapper, so re-running
# this cell on its own would pass the wrapper back in as `dataset`.
supset = MultiencoderTokenizedDataset(
    dataset=supset,
    encoders=sup_encs,
    n_embs_per_batch=cfg.n_embs_per_batch,
    **shared_kwargs,
)
# The second wrapper always uses n_embs_per_batch=1 rather than the cfg value.
unsupset = MultiencoderTokenizedDataset(
    dataset=unsupset,
    encoders=unsup_enc,
    n_embs_per_batch=1,
    **shared_kwargs,
)

In [6]:
# Sanity check: show the first row of the dataset held by the `supset` wrapper.
first_sup_record = supset.dataset[0]
print(first_sup_record)

{'text': "got chokeslammed onto the stage, but Viscera saved her. Stratus would then form a short-lived alliance with Viscera, who was ordered to protect her. In May 2005, Stratus was sidelined with the Women's Championship after suffering a herniated disc, with the storyline explanation that Viscera had injured her at Backlash after she insulted him for losing to Kane. This left the company without a Women's Champion for four months, as Stratus remained the champion during the time of her injury, thus ignoring the 30 day clause stating that a champion must defend their respective title at least once every 30"}


In [7]:
# Sanity check: show the first row of the dataset held by the `unsupset` wrapper.
first_unsup_record = unsupset.dataset[0]
print(first_unsup_record)

{'text': 'census record 20,271 people living at DB; Over 50% of them are non-Chinese and DB is a sizeable community of expatriates from over fifty countries. DB is located 2 km west of Hong Kong Disneyland Resort and approximately 12 km west from the nearest point on Hong Kong Island. As of April 2018, Discovery Bay consists of 15 residential development phases with properties ranging from garden houses to low-, mid- and high-rise. The development also features a 400-metre-long privately owned beach (accessible to the public), four private membership clubs including a golf club and a marina club and a public'}
