<a href="https://colab.research.google.com/github/kjahan/semantic_similarity/blob/main/examples/hesemanticsearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Encryption Semantic Search using Homomorphic Encryption

This is a tutorial for implementing encrypted semantic search using Homomorphic Encryption.

https://medium.com/@ronantech/encrypted-semantic-search-through-homomorphic-encryption-841e0758ae1d

https://arxiv.org/pdf/2310.06816

https://youtu.be/Lxg9YyFJ8s0?si=zaOHn4zfVt2KWD3o

# Install requirements



In [1]:
pip install tenseal sentence_transformers

Collecting tenseal
  Downloading tenseal-0.3.16-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-m

In [2]:
import tenseal as ts # pip install tenseal
from sentence_transformers import SentenceTransformer

#Setup Embedding Model

In [3]:
model = SentenceTransformer('all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "I watched the sunset over the ocean.",
    "Artificial intelligence is transforming industries.",
    "The library was quiet and smelled like old books.",
    "He dreamed of traveling to distant galaxies.",
    "Innovation drives progress in the tech world.",
    "The chef prepared a delicious meal for the guests.",
    "Climate change poses a significant threat to global biodiversity.",
    "The athlete trained rigorously for the upcoming marathon.",
    "Music has the power to evoke deep emotional responses."
]

# Get and Store Sentence Embeddings

In [5]:
embeddings = model.encode(sentences)
print(embeddings)

[[ 0.04393358  0.05893443  0.04817836 ...  0.05216279  0.05610649
   0.10206392]
 [ 0.02740658  0.04301414  0.03891344 ... -0.01506041 -0.08190967
   0.0356679 ]
 [ 0.03608519 -0.02447892  0.05895455 ... -0.0826278   0.07626036
  -0.02810952]
 ...
 [-0.04316907  0.12324493  0.09774671 ... -0.13736211 -0.05429137
  -0.02212726]
 [-0.01195206  0.08340717 -0.03579247 ... -0.00149681  0.00464321
  -0.00069311]
 [ 0.05495491 -0.01101491  0.0960835  ...  0.05075207  0.01033212
  -0.03917955]]


# Get Query Embedding

In [6]:
query_text = "fox"
query_embedding = model.encode([query_text])
print(query_embedding)

[[ 1.10053308e-02 -2.71984152e-02 -4.23476696e-02  2.68690754e-02
   1.24752689e-02 -8.15260876e-03  4.80137318e-02 -2.42797304e-02
  -2.33289646e-03 -5.91702163e-02 -2.49941032e-02  1.63972266e-02
   2.21033860e-03  3.52101251e-02 -6.22175112e-02 -2.45897975e-02
  -4.41654958e-02 -9.53475833e-02  7.82252382e-03 -5.37793748e-02
  -1.33924574e-01 -2.80652829e-02  3.92268412e-02  2.02563144e-02
   1.93844158e-02 -1.97539199e-02 -1.83801111e-02  1.11274635e-02
   4.36563864e-02 -1.61914930e-01  3.24179120e-02  7.76521042e-02
   1.56489126e-02  1.42848641e-02 -1.09123997e-01 -3.61411199e-02
   4.34143431e-02 -3.68130319e-02 -2.02566441e-02  6.56031668e-02
  -1.88652668e-02 -4.23035249e-02 -1.20622488e-02  4.05694209e-02
  -6.23182654e-02 -3.29615958e-02 -4.26614732e-02 -1.31064337e-02
   6.30795807e-02  5.93835935e-02 -3.97166423e-02 -1.18213650e-02
  -5.47252297e-02  6.31510392e-02  6.42525852e-02  8.26243218e-03
  -4.80825314e-03 -1.55517664e-02 -1.95879787e-02  2.20050588e-02
   3.07974

# Setup CKKS Homomorphic Encryption

In [7]:
context = ts.context(
            ts.SCHEME_TYPE.CKKS,
            poly_modulus_degree = 8192,
            coeff_mod_bit_sizes = [60, 40, 40, 60]
          )

context.generate_galois_keys()
context.global_scale = 2**40

In [8]:
secret_context = context.serialize(save_secret_key = True)

In [9]:
context.make_context_public() #drop the secret_key from the context
public_context = context.serialize()

In [10]:
context = ts.context_from(secret_context)

# Setup Vector Database

In [11]:
class VectorStore:
    def __init__(self):
        self.vector_data = {}  # A dictionary to store vectors
        self.vector_index = {}  # An indexing structure for retrieval

    def add_vector(self, vector_id, vector):
        """
        Add a vector to the store.

        Args:
            vector_id (str or int): A unique identifier for the vector.
            vector (numpy.ndarray): The vector data to be stored.
        """
        self.vector_data[vector_id] = vector
        self._update_index(vector_id, vector)

    def get_vector(self, vector_id):
        """
        Retrieve a vector from the store.

        Args:
            vector_id (str or int): The identifier of the vector to retrieve.

        Returns:
            numpy.ndarray: The vector data if found, or None if not found.
        """
        return self.vector_data.get(vector_id)

    def _update_index(self, vector_id, vector):
        """
        Update the index with the new vector.

        Args:
            vector_id (str or int): The identifier of the vector.
            vector (numpy.ndarray): The vector data.
        """
        # In this simple example, we use brute-force cosine similarity for indexing
        for existing_id, existing_vector in self.vector_data.items():
            similarity = np.dot(vector, existing_vector) / (np.linalg.norm(vector) * np.linalg.norm(existing_vector))
            if existing_id not in self.vector_index:
                self.vector_index[existing_id] = {}
            self.vector_index[existing_id][vector_id] = similarity

    def find_similar_vectors(self, query_vector, num_results=5):
        """
        Find similar vectors to the query vector using brute-force search.

        Args:
            query_vector (numpy.ndarray): The query vector for similarity search.
            num_results (int): The number of similar vectors to return.

        Returns:
            list: A list of (vector_id, similarity_score) tuples for the most similar vectors.
        """
        results = []
        for vector_id, vector in self.vector_data.items():
            similarity = np.dot(query_vector, vector) / (np.linalg.norm(query_vector) * np.linalg.norm(vector))
            results.append((vector_id, similarity))

        # Sort by similarity in descending order
        results.sort(key=lambda x: x[1], reverse=True)

        # Return the top N results
        return results[:num_results]

In [12]:
vector_store = VectorStore()

# Run Encrypted Search Calculations

Encrypt the query embedding as an encrypted vector

In [13]:
enc_queryvec = ts.ckks_vector(context, query_embedding[0].tolist())

Measure Cosine Similarity

In [14]:
cosine_similarity_ranking = []
for i in range(len(sentences)):
    enc_sentence = ts.ckks_vector(context, embeddings[i].tolist())
    dot_product = enc_queryvec.dot(enc_sentence)
    cosine_similarity = dot_product.decrypt()[0]
    cosine_similarity_ranking.append({sentences[i]: abs(cosine_similarity)})

# Present Results

In [15]:
search_results = sorted(cosine_similarity_ranking, key=lambda x: list(x.values())[0], reverse=True)
for item in search_results:
    print(item)
    print()


{'The quick brown fox jumps over the lazy dog.': 0.48099331832474596}

{'I watched the sunset over the ocean.': 0.09014474117989793}

{'The athlete trained rigorously for the upcoming marathon.': 0.07812404354629308}

{'Artificial intelligence is transforming industries.': 0.07767134256925286}

{'He dreamed of traveling to distant galaxies.': 0.050605523226931196}

{'The library was quiet and smelled like old books.': 0.02383649699729636}

{'The chef prepared a delicious meal for the guests.': 0.02240555187805931}

{'Climate change poses a significant threat to global biodiversity.': 0.01684595588643951}

{'Innovation drives progress in the tech world.': 0.006422471301399394}

{'Music has the power to evoke deep emotional responses.': 0.0036509296590212016}



## Run wo encryption

In [16]:
cosine_similarity_ranking_2 = []
for i in range(len(sentences)):
    cosine_similarity = query_embedding.dot(embeddings[i])
    cosine_similarity_ranking_2.append({sentences[i]: abs(cosine_similarity)})


search_results_2= sorted(cosine_similarity_ranking_2, key=lambda x: list(x.values())[0], reverse=True)
for item in search_results_2:
    print(item)
    print()

{'The quick brown fox jumps over the lazy dog.': array([0.48099548], dtype=float32)}

{'I watched the sunset over the ocean.': array([0.0901469], dtype=float32)}

{'The athlete trained rigorously for the upcoming marathon.': array([0.07812604], dtype=float32)}

{'Artificial intelligence is transforming industries.': array([0.07767335], dtype=float32)}

{'He dreamed of traveling to distant galaxies.': array([0.05060757], dtype=float32)}

{'The library was quiet and smelled like old books.': array([0.02383857], dtype=float32)}

{'The chef prepared a delicious meal for the guests.': array([0.02240771], dtype=float32)}

{'Climate change poses a significant threat to global biodiversity.': array([0.01684396], dtype=float32)}

{'Innovation drives progress in the tech world.': array([0.00642471], dtype=float32)}

{'Music has the power to evoke deep emotional responses.': array([0.00365298], dtype=float32)}



## TenSeal librray

https://github.com/OpenMined/TenSEAL

`A library for doing homomorphic encryption operations on tensors`


In [18]:
import tenseal as ts

# Setup TenSEAL context
context = ts.context(
            ts.SCHEME_TYPE.CKKS,
            poly_modulus_degree=8192,
            coeff_mod_bit_sizes=[60, 40, 40, 60]
          )
context.generate_galois_keys()
context.global_scale = 2**40

v1 = [0, 1, 2, 3, 4]
v2 = [4, 3, 2, 1, 0]

# encrypted vectors
enc_v1 = ts.ckks_vector(context, v1)
enc_v2 = ts.ckks_vector(context, v2)

result = enc_v1 + enc_v2
result.decrypt() # ~ [4, 4, 4, 4, 4]



[4.000000008453453,
 4.000000005898446,
 3.999999998718605,
 4.000000000515795,
 4.000000000237454]

In [19]:
result = enc_v1.dot(enc_v2)
result.decrypt() # ~ [10]

[10.00000071755919]

In [20]:
matrix = [
  [73, 0.5, 8],
  [81, -5, 66],
  [-100, -78, -2],
  [0, 9, 17],
  [69, 11 , 10],
]
result = enc_v1.matmul(matrix)
result.decrypt() # ~ [157, -90, 153]

[157.00002152658183, -90.00001201204175, 153.00002053010746]