# Setup Environment

In [1]:
# External Imports
import time, os, json
import numpy as np
import matplotlib.pyplot as plt
import datasets
import torch
import random
import senteval
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from sentence_transformers import models as st_models
from datasets import Dataset
from pprint import pprint
from scipy.stats import spearmanr
from torch.nn import functional as F


# Internal Imports
import mltoolkit as mltk
from mltoolkit import (
    cfg_reader,
    models,
)
from mltoolkit.utils import (
    strings,
    files,
    display,
)

%load_ext autoreload
%autoreload 2


# Load Config, Tokenizer and Model Checkpoint

In [10]:
# select checkpoint
ckpt_choice  = 9
cfg_choice   = 7

# template the file paths
base_path_str = files.project_root() + '/checkpoints/{run}/best_model.pt'
base_cfg_str = files.project_root() + '/cfg/nlp/text_autoencoding/{name}'

checkpoints = [
    base_path_str.format(run='20231012-223600-text-autoencoding'), # no masking, 1-layer enc-dec
    base_path_str.format(run='20231017-144245-text-autoencoding'), # masking with 10%, 1-layer enc-dec
    base_path_str.format(run='20231019-165420-text-autoencoding'), # masking with 15%, 1-layer enc-dec
    base_path_str.format(run='20231017-221729-text-autoencoding'), # masking with 15%, 3-layer enc-dec
    base_path_str.format(run='20231021-034122-text-autoencoding'), # masking with 15%, 6-layer enc-dec
    base_path_str.format(run='20231025-134613-text-autoencoding'), # masking with 60%, 1-layer enc-dec
    base_path_str.format(run='20231025-144308-text-autoencoding'), # masking with 15%, 1-layer enc-dec, no relu after conv
    base_path_str.format(run='20231031-121119-text-autoencoding'), # masking with 15%, 1-layer enc-dec, no relu after linear
    base_path_str.format(run='20231107-181044-text-autoencoding'), # no masking, 1-layer enc-dec, no relu after linear, bert-base frozen
    base_path_str.format(run='20231107-181044-text-autoencoding'), # masking with 10%, 1-layer enc-dec, no relu after linear, bert-base frozen
    base_path_str.format(run='20231110-074442-text-autoencoding'), # masking with 15%, 1-layer enc-dec, no relu after linear, bert-base frozen
]

configs = [
    base_cfg_str.format(name='1l_15percent.yaml'),
    base_cfg_str.format(name='1l_15percent_no_relu.yaml'),
    base_cfg_str.format(name='dev_config2.yaml'),
    base_cfg_str.format(name='1l_60percent.yaml'),
    base_cfg_str.format(name='6l_20percent.yaml'),
    base_cfg_str.format(name='1l_15percent_linear.yaml'),
    base_cfg_str.format(name='1l_15percent_linear_bert.yaml'),
    base_cfg_str.format(name='1l_0percent_linear_bert.yaml'),
]

ckpt_cfg_pairs = {
    checkpoints[2]: configs[0],
    checkpoints[3]: configs[1],
    checkpoints[5]: configs[2],
}

def load_model(ckpt_choice, cfg_choice):
    
    ckpt_path = checkpoints[ckpt_choice]
    cfg_path = configs[cfg_choice]
    
    # read config
    cfg, keywords = cfg_reader.load(cfg_path)
    
    # load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(cfg.data['tokenizer_name'])
    
    # assign config values for model
    cfg.model['vocab_size'] = len(tokenizer)
    cfg.model['pad_token_id'] = tokenizer.pad_token_id
    
    # load model
    model = models.TextAutoencoder(cfg).to('cuda:3')
    model.load_state_dict(torch.load(ckpt_path))
    model.eval()

    return model, tokenizer
    
model, tokenizer = load_model(ckpt_choice, cfg_choice)

# display model info
print(f'device is: {model.dec_embeddings.weight.device}')
print()
print(model)

device is: cuda:3

TextAutoencoder(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), ep

# Load Text Data and Encode Some Text

In [31]:
sample_size = 10

ds = Dataset.from_csv('/data/john/projects/mltoolkit/data/synthetic_overlap/synthetic_overlap_data.csv')
#ds = datasets.load_dataset('ptb_text_only')['test']
pprint(ds)

sample_indices = np.random.choice(
    np.arange(len(ds)), 
    (sample_size,)
)

sentences = ds[sample_indices]['s1']
enc = model.encode(sentences)

print(enc)
print(enc.shape)

Dataset({
    features: ['overlap', 's1', 's2'],
    num_rows: 39263
})
tensor([[-0.0656,  0.6106,  0.3526,  ...,  0.1219,  0.0516,  0.3051],
        [-0.0215,  0.6969, -0.2822,  ..., -0.0304,  0.0458, -0.4675],
        [ 0.1349,  0.9058,  0.1267,  ..., -0.1653,  0.0622, -0.2261],
        ...,
        [-0.0060,  1.2549,  0.4610,  ...,  0.2104,  0.2673, -0.2909],
        [-0.0241,  0.3344,  0.0157,  ...,  0.4011, -0.1932,  0.0690],
        [ 0.0350, -0.0371,  0.2508,  ..., -0.1800,  0.0606, -1.1357]],
       device='cuda:3')
torch.Size([10, 768])


# Decode the Encodings

In [32]:
dec = model.decode(enc)

for i, s in enumerate(dec):
    print(f'\nx:\t{sentences[i]}\nx_hat:\t{s}')




x:	The youngest Prince prepared for the South Pole by living in a giant freezer for 24 hours, rubbing his hands for warmth while the group practised using cross trainers inside MIRA's environmental test chambers.
x_hat:	the group of the freezer was for three hours, holding a giant freezer while his son worked in the ice freezer, living in the freeze of the sun's freezer while he hunted for ice cream.

x:	Dominic Young, a tech entrepreneur, described the event as "amazing" and "legendary," and its power to bring together individuals from various disciplines for serendipitous collaborations is seen as a force for innovation.
x_hat:	the event of a thousand - year - old, ryans, is a successful and successful, a media of technology to connect it as a media for it to be connected to the event.

x:	During the hearing, Bond Commissioner Paul Maselek revealed that the 43-year-old former state trooper, Laporto, is a self-employed real estate agent undergoing treatment for a back problem.
x_hat:

# Test on My Own Writing

In [71]:
my_sents = [
    'can you tell me how to find the gas station?',
    'i really cannot stand washing the dishes.',
]

my_enc = model.encode(my_sents)
my_dec = model.decode(my_enc)

print(my_dec)

['can you tell me how to find the gas station?', 'i really cannot stand washing the dishes.']


# Test on Overlap Dataset

In [7]:
# read in dataset
overlap_ds = Dataset.from_csv('/data/john/projects/mltoolkit/data/synthetic_overlap/synthetic_overlap_data.csv')

# get encodings
s1_enc = model.encode(overlap_ds[:10]['s1'])
s2_enc = model.encode(overlap_ds[:10]['s2'])
overlap_enc = model.encode(overlap_ds[:10]['overlap'])

# get intermediate encodings and decodings
middle_enc = (s1_enc + s2_enc)/2
middle_dec = model.decode(middle_enc)

# get euclidean distances between middle encodings and overlap encodings
l2_dist = torch.sum((middle_enc * overlap_enc)**2, dim=-1)
print(f'l2: {l2_dist}')

# compare cosine similarity
overlap_cos_sim = F.cosine_similarity(middle_enc, overlap_enc, dim=-1)
print(f'cosine similarities: {overlap_cos_sim}')

# compare decodings with overlap
for i, s in enumerate(middle_dec):
    print(f'\ny:\t{overlap_ds[i]["overlap"]}\ny_hat:\t{s}')



l2: tensor([0.0018, 0.0027, 0.0083, 0.0195, 0.0030, 0.0047, 0.0044, 0.0053, 0.0025,
        0.0036], device='cuda:3')
cosine similarities: tensor([0.4710, 0.4694, 0.5500, 0.6115, 0.5745, 0.4361, 0.5709, 0.5175, 0.5073,
        0.4880], device='cuda:3')

y:	She said: 'Mike and I have a great sex life.
y_hat:	he found then have a soul sex life comes drove. curring for ` to all crazy jokes each other and have a new ev life.

y:	Travellers will learn about the ship in a series of lectures in addition to the dive .
y_hat:	during the coopererspping japanese gondo : these jeremy crouched including trip along side fi throne path to save liam the ship of actively initiation in texas anchors.

y:	The crew ordered an evacuation after the Boeing 737-800 came to a stop, and passengers slid down the plane’s inflatable slides to safety.
y_hat:	primarily kat canyon @ - @ spinning along guards and eight crew metro connected training star with others having the 9th division's on the ground 00own von anl

# What Does $dec(enc(dec(\frac{enc(s_1) + enc(s_2)}{2})))$ Look Like?

In [73]:
double_enc = model.encode(middle_dec)
double_dec = model.decode(double_enc)

double_cos_sim = F.cosine_similarity(double_enc, middle_enc, dim=-1)

print(f'cos_sim(double_enc, middle_enc): {double_cos_sim}')

# compare double decodings with original decodings
for i, s in enumerate(middle_dec):
    print(f'\nx:\t{double_dec[i]}\nx_hat:\t{s}')

cos_sim(double_enc, middle_enc): tensor([0.8966, 0.8378, 0.6703, 0.8678, 0.9239, 0.9024, 0.8873, 0.8749, 0.8379,
        0.9515], device='cuda:3')

x:	he found i have a soul any mind during hold easter tibbed for at rid all crazy of each other and have a great legal life.
x_hat:	he found i have a soul any mind during hold easter tibbed for at rid all crazy of each other and have a great legal life.

x:	during the cooperbaloto team went unnoticed : those take accordingly he stage along the east poker path on save david the passenger propeller de la pickup truck and biplane.
x_hat:	during the cooperbal seekers team went unnoticed : those take accordingly he stage along the east poker path on save david the passenger propeller de bain pickup truck.

x:	someplace laid hurtafltream lee gracefully messages as eight crew hence 30 paths, with citizens being the maxwell'ammunition in an's academylaws in 130 @ - @ ground forces in the region.
x_hat:	someplace laid hurtafltream lee gracefully mes

# Evaluate Embeddings On STS

In [4]:

# load dataset
sts = datasets.load_dataset('mteb/stsbenchmark-sts').with_format('torch')
a = sts['test']['sentence1']
b = sts['test']['sentence2']
labels = sts['test']['score']

# encode sentences
enc_a = model.encode(a)
enc_b = model.encode(b)

# compute similarity scores
sim_scores = F.cosine_similarity(enc_a, enc_b, dim=-1)
dist_scores = torch.sum((enc_a-enc_b)**2, dim=-1)

# compute spearman correlation for cosine similarity scores and negative distance scores
rho_cos = spearmanr(sim_scores.cpu().numpy(), labels)
rho_dist = spearmanr(-dist_scores.cpu().numpy(), labels)

print(f'spearman score (cos): {rho_cos.statistic:.2f}')
print(f'spearman score (dist): {rho_dist.statistic:.2f}')

spearman score (cos): 0.59
spearman score (dist): 0.55


In [74]:
for ss, ds, l in zip(sim_scores, dist_scores, labels):
    print(f'sim score: {ss:.2f}, dist score: {ds:.2f}, label: {l}')

sim score: 1.00, dist score: 0.01, label: 3.0
sim score: 0.64, dist score: 0.62, label: 3.0
sim score: 0.99, dist score: 0.02, label: 0.0
sim score: 0.95, dist score: 0.26, label: 0.0
sim score: 0.93, dist score: 0.07, label: 2.0
sim score: 0.99, dist score: 0.04, label: 0.0
sim score: 1.00, dist score: 0.01, label: 5.0
sim score: 1.00, dist score: 0.03, label: 3.0
sim score: 1.00, dist score: 0.42, label: 2.0
sim score: 0.92, dist score: 0.48, label: 0.0
sim score: 0.92, dist score: 0.23, label: 2.0
sim score: 0.67, dist score: 0.27, label: 5.0
sim score: 0.99, dist score: 0.03, label: 1.0
sim score: 0.99, dist score: 0.07, label: 4.0
sim score: 1.00, dist score: 0.00, label: 4.0
sim score: 0.99, dist score: 0.09, label: 1.0
sim score: 0.68, dist score: 1.52, label: 1.0
sim score: 0.85, dist score: 0.35, label: 1.0
sim score: 0.73, dist score: 0.19, label: 4.0
sim score: 0.88, dist score: 0.58, label: 1.0
sim score: 1.00, dist score: 0.01, label: 5.0
sim score: 0.63, dist score: 0.27,

# Test Multihead Attention

In [42]:

# define matrix parameters
n = 3 # number of samples
s = 4 # sequence length
d = 6 # dimensionality
h = 3 # number of attention heads

# create multihead attention module
mha = torch.nn.MultiheadAttention(
    d, 
    h, 
    batch_first=True, 
    dtype=torch.float64
)

# create padding mask
pad_mask = torch.ones(n, s, dtype=torch.bool)
pad_mask[:, 0] = False

# create attention mask
attn_mask = torch.triu(torch.ones(s, s, dtype=torch.bool), diagonal=1)

q = torch.rand(n, s, d, dtype=torch.float64)
k = torch.rand(n, s, d, dtype=torch.float64)
v = torch.rand(n, s, d, dtype=torch.float64)

attn_out = mha(
    q,
    k,
    v,
    key_padding_mask=pad_mask,
    attn_mask=attn_mask,
    need_weights=False
)
print(attn_out)

(tensor([[[-0.0749,  0.0794, -0.0309, -0.0813,  0.1625, -0.0400]],

        [[-0.2123, -0.0965,  0.1663, -0.3127,  0.2242, -0.2399]],

        [[-0.1470, -0.0740,  0.1244, -0.1583,  0.1471, -0.1761]]],
       dtype=torch.float64, grad_fn=<TransposeBackward0>), None)


# Evaluate on SentEval

In [11]:


# set device
dev = 'cuda:0'

# load tsdae model
tsdae_ckpt = files.project_root() + '/evaluation_scripts/text_autoencoding/output/tsdae-model'
tsdae_model = SentenceTransformer(tsdae_ckpt)
#tsdae_model = SentenceTransformer('kwang2049/TSDAE-askubuntu2nli_stsb')
#tsdae_model[1] = st_models.Pooling(tsdae_model[0].get_word_embedding_dimension(), pooling_mode='cls')  # Note this model uses CLS-pooling

# load bert model
bert_model = AutoModel.from_pretrained('bert-base-uncased').to(dev)
bert_model = bert_model.eval()
bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

params = {
    'task_path': '/data/john/projects/SentEval/data/',
    'usepytorch': True, 
    'kfold': 10,
    'classifier': {
        'nhid': 0,
        'optim': 'adam',
        'batch_size': 64,
        'tenacity': 5,
        'epoch_size': 4,
        #'dropout': .05,
    }
}
def prepare(params, samples):
    pass

def my_batcher(params, batch):
    batch = [' '.join(s) for s in batch]
    
    outputs = model.encode(batch)
    outputs = outputs.cpu().numpy()
    return outputs

def tsdae_batcher(params, batch):
    batch = [' '.join(s) for s in batch]
    outputs = tsdae_model.encode(
        batch,
        convert_to_numpy=True,
    )
    return outputs

def bert_batcher(params, batch):
    batch = [' '.join(s) for s in batch]
    tokens = bert_tokenizer(
        batch,
        max_length=256,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    ).to(dev)

    with torch.no_grad():
        outputs = bert_model(tokens['input_ids'], tokens['attention_mask'])

    mask = (tokens['attention_mask'] == 0)
    # ['last_hidden_state', 'pooler_output']
    #mean
    outputs = outputs['last_hidden_state']
    outputs.masked_fill_(mask[..., None], 0)
    outputs = torch.mean(outputs, dim=-2)
    
    # cls
    """
    outputs = outputs['pooler_output']
    """
    return outputs.cpu().numpy()

transfer_tasks = [
    'CR', 
    'MR', 
    'MPQA', 
    'SUBJ', 
    'SST2', 
    'SST5', 
    'TREC', 
    'MRPC', 
    #'SNLI',
    'STSBenchmark',
]


se = senteval.engine.SE(
    params, 
    my_batcher,
    #tsdae_batcher,
    #bert_batcher,
    prepare
)

results = {}
for task in transfer_tasks:
    try:
        results[task] = se.eval([task])[task]
    except Exception as e:
        display.error(f'failed to execute task: {task}')
        print(e)
        
import json
pprint(results)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'CR': {'acc': 87.47, 'devacc': 87.85, 'ndev': 3775, 'ntest': 3775},
 'MPQA': {'acc': 87.4, 'devacc': 87.81, 'ndev': 10606, 'ntest': 10606},
 'MR': {'acc': 80.85, 'devacc': 81.45, 'ndev': 10662, 'ntest': 10662},
 'MRPC': {'acc': 72.06,
          'devacc': 74.31,
          'f1': 79.65,
          'ndev': 4076,
          'ntest': 1725},
 'SST2': {'acc': 85.23, 'devacc': 86.7, 'ndev': 872, 'ntest': 1821},
 'SST5': {'acc': 45.16, 'devacc': 46.41, 'ndev': 1101, 'ntest': 2210},
 'STSBenchmark': {'devpearson': 0.6196517516207767,
                  'mse': 1.5295099531708745,
                  'ndev': 1500,
                  'ntest': 1379,
                  'pearson': 0.6138308267842943,
                  'spearman': 0.6154312115082687,
                  'yhat': array([1.16614271, 1.0169503 , 1.15880967, ..., 4.31235803, 3.9458686 ,
       3.77113428])},
 'SUBJ': {'acc': 95.27, 'devacc': 95.63, 'ndev': 10000, 'ntest': 10000},
 'TREC': {'acc': 90.6, 'devacc': 86.35, 'ndev': 5452, 'ntest': 500}}


In [27]:
roberta = AutoModel.from_pretrained('roberta-base')
roberta_tokenizer = AutoTokenizer.from_pretrained('roberta-base')
sentences = ['hey there', 'hello there', 'how do you do']
tokens = roberta_tokenizer(
    sentences,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)

print(tokens.keys())


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


dict_keys(['input_ids', 'attention_mask'])
