In [1]:
import torch
from scipy.spatial.distance import cosine
from transformers import AutoModel, AutoTokenizer

In [2]:
tokenizer = AutoTokenizer.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")
model = AutoModel.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")

# Tokenize input texts
texts = [
    "the object is safe to eat",
    "the object is heated until its insides turn solid",
    "the object are stirred"
]
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Get the embeddings
with torch.no_grad():
    embeddings = model(**inputs, output_hidden_states=True, return_dict=True).last_hidden_state[:,0,:]

# Calculate cosine similarities
# Cosine similarities are in [-1, 1]. Higher means more similar
cosine_sim_0_1 = 1 - cosine(embeddings[0], embeddings[1])
cosine_sim_0_2 = 1 - cosine(embeddings[0], embeddings[2])

print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[0], texts[1], cosine_sim_0_1))
print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[0], texts[2], cosine_sim_0_2))

Cosine similarity between "the object is safe to eat" and "the object is heated until its insides turn solid" is: 0.720
Cosine similarity between "the object is safe to eat" and "the object are stirred" is: 0.672


In [23]:
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-distilroberta-v1")
model = AutoModel.from_pretrained("sentence-transformers/all-distilroberta-v1")

# Tokenize input texts
texts = [
    "the object is safe to eat",
    "the object is heated until its insides turn solid",
    "the object are stirred"
]
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Get the embeddings
with torch.no_grad():
    embeddings = model(**inputs, output_hidden_states=True, return_dict=True).last_hidden_state[:,0,:]

# Calculate cosine similarities
# Cosine similarities are in [-1, 1]. Higher means more similar
cosine_sim_0_1 = 1 - cosine(embeddings[0], embeddings[1])
cosine_sim_0_2 = 1 - cosine(embeddings[0], embeddings[2])

print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[0], texts[1], cosine_sim_0_1))
print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[0], texts[2], cosine_sim_0_2))

Cosine similarity between "the object is safe to eat" and "the object is heated until its insides turn solid" is: 0.349
Cosine similarity between "the object is safe to eat" and "the object are stirred" is: 0.305


In [23]:
model(**inputs, output_hidden_states=True, return_dict=True).last_hidden_state[:,0,:]

tensor([[ 0.1606,  0.1054, -0.1212,  ...,  0.3991, -0.1608, -0.0174],
        [-0.1362,  0.3166,  0.0053,  ...,  0.1815,  0.2212,  0.1173],
        [ 0.0852,  0.4877, -0.0801,  ...,  0.1335,  0.3019, -0.1676]],
       grad_fn=<SliceBackward0>)

In [20]:
print(tokenizer)

PreTrainedTokenizerFast(name_or_path='princeton-nlp/sup-simcse-bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


In [7]:
print(model)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

## Try head

In [33]:
import torch.nn as nn
from transformers import AutoModel
class EffectEncModel(nn.Module):
    def __init__(self):
        super(EffectEncModel, self).__init__()
        
        self.base_model = AutoModel.from_pretrained('princeton-nlp/sup-simcse-bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.effect_head = nn.Linear(768, 768)
        self.object_head = nn.Linear(768, 768)
        
        
    def forward(self, input):
        hidden_state = self.base_model(**input).pooler_output
        hidden_state = self.dropout(hidden_state)
        effect_embed = self.effect_head(hidden_state)
        obj_embed = self.object_head(hidden_state)
        
        return effect_embed, obj_embed

model = EffectEncModel()

In [34]:
model(inputs)

(tensor([[ 0.0283,  0.1184,  0.1922,  ...,  0.0519, -0.0726, -0.1049],
         [ 0.1163,  0.1601,  0.1766,  ...,  0.1713, -0.2295, -0.0506],
         [ 0.1106,  0.1368,  0.1750,  ..., -0.0098, -0.1111, -0.0361]],
        grad_fn=<AddmmBackward0>),
 tensor([[-0.1355, -0.0773,  0.0082,  ..., -0.1016,  0.0389, -0.0758],
         [ 0.0469,  0.0519,  0.0534,  ..., -0.0736, -0.0787,  0.0126],
         [-0.0992, -0.0362,  0.0147,  ..., -0.0286, -0.0357,  0.0845]],
        grad_fn=<AddmmBackward0>))

In [39]:
from sentence_transformers import SentenceTransformer, models

In [18]:
modelPath = 'tune_results/sentence-transformers'

In [51]:
model = SentenceTransformer('sentence-transformers/all-distilroberta-v1')

In [52]:
model._modules["1"].pooling_mode_mean_tokens = False
model._modules["1"].pooling_mode_cls_token = True

In [44]:
print(model)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 64, 'do_lower_case': False}) with Transformer model: RobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)


In [30]:
print(model)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 64, 'do_lower_case': False}) with Transformer model: RobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)


In [53]:
texts = [
    "the object is safe to eat",
    "the object is heated until its insides turn solid",
    "the object are stirred"
]
embeddings = [model.encode(text) for text in texts]

# Calculate cosine similarities
# Cosine similarities are in [-1, 1]. Higher means more similar
cosine_sim_0_1 = 1 - cosine(embeddings[0], embeddings[1])
cosine_sim_0_2 = 1 - cosine(embeddings[0], embeddings[2])

print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[0], texts[1], cosine_sim_0_1))
print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[0], texts[2], cosine_sim_0_2))

Cosine similarity between "the object is safe to eat" and "the object is heated until its insides turn solid" is: 0.349
Cosine similarity between "the object is safe to eat" and "the object are stirred" is: 0.305


## Try roberta


In [37]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained("roberta-base")

texts = [
    "the egg is safe to eat",
    "the egg is heated until its insides turn solid",
    "the egg are stirred"
]
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Get the embeddings
with torch.no_grad():
    embeddings = model(**inputs, output_hidden_states=True, return_dict=True)

# Calculate cosine similarities
# Cosine similarities are in [-1, 1]. Higher means more similar
cosine_sim_0_1 = 1 - cosine(embeddings[0], embeddings[1])
cosine_sim_0_2 = 1 - cosine(embeddings[0], embeddings[2])

print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[0], texts[1], cosine_sim_0_1))
print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[0], texts[2], cosine_sim_0_2))

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

ValueError: Input vector should be 1-D.