In [40]:
import torch
import numpy as np
import datasets
import os
import umap
import evaluate
import accelerate
from pathlib import Path
from itertools import product
from IPython.core.debugger import set_trace
from datasets import Dataset, DatasetDict
from torch import nn
from torch.nn import functional as F
from sentence_transformers import SentenceTransformer
from nltk import sent_tokenize
from IPython.core.debugger import Pdb
from matplotlib import pyplot as plt
from transformers import AutoModel, AutoTokenizer
from pprint import pprint
from scipy.stats import spearmanr

# mltk stuff
import mltoolkit as mltk
from mltoolkit.utils import files, display
os.chdir(files.project_root())
display.note(f'changed directory to \'{os.getcwd()}\'')

datasets.disable_caching()

# Set this to whatever you want
seed = 10

torch.manual_seed(seed)
np.random.seed(seed)

%load_ext autoreload
%autoreload 2
%matplotlib inline

[NOTE] changed directory to '/data/john/projects/mltoolkit'
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load TransformerAE Model and Tokenizer

In [30]:
from mltoolkit.tasks.transformer_ae.model import TransformerAE
from mltoolkit import cfg_reader

# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

proot = files.project_root()

# Hugging Face checkpoint name; used here for the tokenizer and reused later
# to build the SentenceTransformer baseline.
enc = 'sentence-transformers/all-mpnet-base-v2'
tokenizer = AutoTokenizer.from_pretrained(enc)
cfg, kw = cfg_reader.load(f'{proot}/cfg/transformer_ae.yaml')

# The vocabulary size is taken from the tokenizer; all remaining
# hyper-parameters come from the YAML config.
model = TransformerAE(
    n_vocab=len(tokenizer),
    **cfg.params,
)

# Load the trained weights into the freshly constructed model, then move it
# to the selected device.
accelerate.load_checkpoint_in_model(model, f'{proot}/results/transformer_ae/20240227-113144/ae-best_model')
model = model.to(device)

# The model is only used for evaluation in this notebook. Switch to eval mode
# so the Dropout layers shown in the printed architecture are disabled and the
# embeddings are deterministic.
# NOTE(review): harmless if TransformerAE.encode already does this -- confirm.
model.eval()
print(model)

TransformerAE(
  (encoder): MPNetModel(
    (embeddings): MPNetEmbeddings(
      (word_embeddings): Embedding(30527, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): MPNetEncoder(
      (layer): ModuleList(
        (0-11): 12 x MPNetLayer(
          (attention): MPNetAttention(
            (attn): MPNetSelfAttention(
              (q): Linear(in_features=768, out_features=768, bias=True)
              (k): Linear(in_features=768, out_features=768, bias=True)
              (v): Linear(in_features=768, out_features=768, bias=True)
              (o): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
    

# Load the STS-B dataset

In [35]:
# Download the STS benchmark (all splits), then ask `datasets` to hand values
# back in NumPy format.
sts_raw = datasets.load_dataset('mteb/stsbenchmark-sts')
ds = sts_raw.with_format('numpy')
print(ds)

DatasetDict({
    train: Dataset({
        features: ['split', 'genre', 'dataset', 'year', 'sid', 'score', 'sentence1', 'sentence2'],
        num_rows: 5749
    })
    validation: Dataset({
        features: ['split', 'genre', 'dataset', 'year', 'sid', 'score', 'sentence1', 'sentence2'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['split', 'genre', 'dataset', 'year', 'sid', 'score', 'sentence1', 'sentence2'],
        num_rows: 1379
    })
})


# Evaluate on Base Model

In [41]:
# Baseline: the off-the-shelf sentence-transformers checkpoint named by `enc`.
baseline = SentenceTransformer(enc)

# Encode both sides of every test pair. The columns are converted with
# .tolist() so the baseline receives plain Python lists, exactly like the
# TransformerAE evaluation cell, instead of NumPy string arrays.
emb_a = baseline.encode(ds['test']['sentence1'].tolist(), convert_to_tensor=True)
emb_b = baseline.encode(ds['test']['sentence2'].tolist(), convert_to_tensor=True)

# Cosine similarity of each (sentence1, sentence2) pair, moved to CPU and
# converted to NumPy for SciPy.
scores = F.cosine_similarity(emb_a, emb_b).cpu().numpy()
refs = ds['test']['score']

# Spearman rank correlation between the predicted similarities and the
# reference `score` column.
print(spearmanr(scores, refs))


SignificanceResult(statistic=0.8342190973012376, pvalue=0.0)


# Evaluate on TransformerAE

In [47]:
# Embed both sentence columns of the test split with the TransformerAE model
# (sentence1 first, then sentence2), passing plain Python lists.
test_split = ds['test']
emb_a, emb_b = (
    model.encode(test_split[column].tolist())
    for column in ('sentence1', 'sentence2')
)

# Pairwise cosine similarity, brought back to CPU as a NumPy array.
pair_similarity = F.cosine_similarity(emb_a, emb_b)
scores = pair_similarity.cpu().numpy()
refs = test_split['score']

# Rank correlation between predicted similarities and the reference scores.
print(spearmanr(scores, refs))

SignificanceResult(statistic=0.8275457735646709, pvalue=0.0)
