### Tensors

In [2]:
import torch

embed = torch.nn.Embedding(6, 3)

# 10 words in the vocabulary, 3 dimensional embeddings
print(embed.weight)

Parameter containing:
tensor([[-0.8146, -0.3438, -1.1104],
        [ 0.3662, -0.3272, -1.1627],
        [ 0.9963,  1.1391,  0.0909],
        [-1.7906, -1.0357,  0.9291],
        [-0.0374, -1.8921,  0.1530],
        [ 0.3227,  0.9455,  1.0787]], requires_grad=True)


In [6]:
sentence = torch.tensor([[1, 4, 5], [4, 3, 2]])

embedded_sentence = embed(sentence)

In [5]:
token_list = torch.tensor([1, 2, 3, 4], dtype=torch.long)

print(token_list[:3])

tensor([1, 2, 3])


In [16]:
input_embeddings = torch.tensor([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]], [[7, 8, 9], [10, 11, 12]], [[7, 8, 9], [10, 11, 12]]], dtype=torch.float32)
vocab_embeddings = torch.tensor([[1, 0, 0], [0,0,1]], dtype=torch.float32)

print(input_embeddings.shape)
print(vocab_embeddings.shape)

dot = torch.einsum('ntd,td->nt', input_embeddings, vocab_embeddings)


torch.Size([4, 2, 3])
torch.Size([2, 3])
torch.Size([4, 2])
tensor([[1.],
        [7.],
        [7.],
        [7.]])


### Test FlexVocab

In [6]:
%load_ext autoreload
%autoreload 2

from src.adv_sample.vocab import *
import logging
#set logger 
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
main()

DEBUG:src.adv_sample.vocab:FlexibleVocab: __init__(): initialized with 5 words/phrases
DEBUG:src.adv_sample.vocab:FlexibleVocab: compare_strict(): input_embeddings shape: torch.Size([2, 5])
DEBUG:src.adv_sample.vocab:FlexibleVocab: compare_strict(): vocab_embeddings shape: torch.Size([3, 2, 5])
DEBUG:src.adv_sample.vocab:FlexibleVocab: compare_strict(): dot_products shape: torch.Size([3])
DEBUG:src.adv_sample.vocab:FlexibleVocab: compare_strict_batch(): dot_products shape: torch.Size([2, 3])


Tokenized list: [[0], [1], [2, 3], [4, 5], [6, 7]]
Input tokens: [2, 3] (New York)
Similarities: tensor([10.9933, -2.2089, -3.1947])
Token list: [(2, 3), (4, 5), (6, 7)]
Token list (string): [['New', 'York'], ['Los', 'Angeles'], ['San', 'Francisco']]
Input tokens (batch): [[2, 3], [2, 1]] (New York, New world)
Batch similarities: tensor([[10.9933, -2.2089, -3.1947],
        [ 4.5514, -3.9152, -1.1237]])
Token list: [(2, 3), (4, 5), (6, 7)]
Token list (string): [['New', 'York'], ['Los', 'Angeles'], ['San', 'Francisco']]


### Semantic search

In [7]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.48.0-py3-none-any.whl.metadata (44 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.6.1-cp312-cp312-macosx_10_13_x86_64.whl.metadata (31 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.15.1-cp312-cp312-macosx_10_13_x86_64.whl.metadata (61 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.27.1-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading regex-2024.11.6-cp312-cp312-macosx_10_13_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-macosx_10_12_x86_64.whl.metadata (6.7 kB)
Collectin

In [None]:
from sentence_transformers.util import semantic_search
#this will work when we have single embeddings for each sentence

### Projection

In [None]:
top_k = 1
query_embedding = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.float32)

In [19]:
import torch

# Example data
mask = torch.tensor([[0, 1, 0, 2, 2], [0, 1, 0, 1, 1]])  # Shape: [batch, sentence length]
batch_emb = torch.rand(2, 5, 8)  # Shape: [batch, sentence length, emb dim]

batch_emb_old = batch_emb.clone()
# Example perturbation function
def function_x(embeddings,x):
    return embeddings + x*0.1  # Adds 0.1 to all embeddings as a simple example

# Iterate over unique values in mask
for i in torch.unique(mask):
    # Get the indices where mask == i
    indices = mask == i

    # Expand indices to match the embedding dimensions
    expanded_indices = indices.unsqueeze(-1).expand_as(batch_emb)

    # Select embeddings where mask == i
    selected_embeddings = batch_emb[expanded_indices].view(-1, batch_emb.size(-1))

    # Apply the perturbation function
    perturbed_embeddings = function_x(selected_embeddings, i)
    print(i)
    print(perturbed_embeddings.shape)
    # Update the embeddings in batch_emb
    batch_emb[expanded_indices] = perturbed_embeddings.view(-1)

# Output updated batch_emb
print(batch_emb-batch_emb_old)


tensor(0)
torch.Size([4, 8])
tensor(1)
torch.Size([4, 8])
tensor(2)
torch.Size([2, 8])
tensor([[[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000],
         [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000]],

        [[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000],
         [0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000]]])


In [46]:
%load_ext autoreload
%autoreload 2

from src.adv_sample.vocab import FlexibleVocab
from src.adv_sample.projection import *
import logging
#set logger 
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [50]:
# Dummy embedding matrix (8 tokens, 5 dimensions)
embedding_matrix = torch.randn(8, 5)

# Tokenized vocabulary (words/phrases)
vocab_string = [["hello"], ["world"], ["New", "York"], ["Los", "Angeles"], ["San", "Francisco"]]
words_to_ids = {"hello": 0, "world": 1, "New": 2, "York": 3, "Los": 4, "Angeles": 5, "San": 6, "Francisco": 7}
ids_to_words = {v: k for k, v in words_to_ids.items()}

vocab_tokens = [[words_to_ids[word] for word in token] for token in vocab_string]

print("Tokenized list:", vocab_tokens)

# Create FlexibleVocab object
flex_vocab = FlexibleVocab(vocab_tokens, embedding_matrix)

sentences = [["hello world New York"], ["Los Angeles San Francisco"]]
print(f"Sentences: {sentences}")
tokenized_sentences = torch.tensor([[words_to_ids[word] for word in sentence[0].split()] for sentence in sentences])
print(f"Tokenized sentences: {tokenized_sentences}")
embedded_sentences = flex_vocab.embedding_matrix[tokenized_sentences.view(-1)].view(tokenized_sentences.shape[0], tokenized_sentences.shape[1], -1)
print(f"Embedded sentences shape: {embedded_sentences.shape}")

#add random noise to the embeddings
perturbed_embeddings = embedded_sentences + torch.randn(embedded_sentences.shape)*0.0001

batch_emb, batch_tokens  = project_embeddings(sample_embeddings = perturbed_embeddings,
                       sample_tokens = torch.tensor(tokenized_sentences),
                       vocab = flex_vocab,
                       mask = torch.ones_like(tokenized_sentences)+torch.ones_like(tokenized_sentences),
                       method = 'strict')

print('embedding_matrix:', flex_vocab.embedding_matrix)
print("Batch tokens:", batch_tokens)
print("Batch embeddings:", batch_emb)


DEBUG:src.adv_sample.vocab:FlexibleVocab: __init__(): initialized with 5 words/phrases
  sample_tokens = torch.tensor(tokenized_sentences),
DEBUG:src.adv_sample.projection:project_embeddings: batch_emb.shape: torch.Size([2, 4, 5])
DEBUG:src.adv_sample.projection:project_embeddings: i: 2
DEBUG:src.adv_sample.projection:project_embeddings: expanded_indices.shape: torch.Size([2, 4, 5])
DEBUG:src.adv_sample.vocab:FlexibleVocab: compare_strict_batch(): dot_products shape: torch.Size([4, 3])
DEBUG:src.adv_sample.projection:project_embeddings: embedding_closest.shape: torch.Size([40])
DEBUG:src.adv_sample.projection:project_embeddings: batch_emb[expanded_indices] shape: torch.Size([40])
DEBUG:src.adv_sample.projection:project_embeddings: batch_emb.shape: torch.Size([2, 4, 5])
DEBUG:src.adv_sample.projection:project_embeddings: batch_tokens.shape: torch.Size([2, 4])


Tokenized list: [[0], [1], [2, 3], [4, 5], [6, 7]]
Sentences: [['hello world New York'], ['Los Angeles San Francisco']]
Tokenized sentences: tensor([[0, 1, 2, 3],
        [4, 5, 6, 7]])
Embedded sentences shape: torch.Size([2, 4, 5])
embedding_matrix: tensor([[ 0.0153, -0.4999, -0.3092,  0.7763,  0.4409],
        [ 1.1743, -0.6821, -0.1023, -0.8058,  1.2322],
        [-1.6690, -0.9466,  1.6276, -0.4205,  1.0767],
        [-0.1436,  0.7899,  1.3661, -0.3493, -0.0877],
        [ 0.3316,  0.4878,  0.3246,  0.0042, -0.0951],
        [-1.0491,  0.0913, -0.2221,  1.3203,  0.0719],
        [-1.6340, -0.0591, -0.5550, -0.6465, -2.0284],
        [-1.6568,  0.0786, -0.7852, -0.5429, -1.2553]])
Batch tokens: tensor([[2, 3, 2, 3],
        [4, 5, 6, 7]])
Batch embeddings: tensor([[[-1.6690, -0.9466,  1.6276, -0.4205,  1.0767],
         [-0.1436,  0.7899,  1.3661, -0.3493, -0.0877],
         [-1.6690, -0.9466,  1.6276, -0.4205,  1.0767],
         [-0.1436,  0.7899,  1.3661, -0.3493, -0.0877]],

    