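# Code Similarity Search

This notebook checks how a code-search embedding model, used through LangChain's `EmbeddingsFilter`, scores pairs of snippets that are unrelated, loosely related, and nearly identical, and which similarity thresholds separate them.

## Setup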

In [2]:
%pip install -qU --no-cache-dir langchain transformers

Note: you may need to restart the kernel to use updated packages.


In [7]:
# Print the installed langchain package metadata (version, location, dependencies).
%pip show langchain

Name: langchain
Version: 0.0.285
Summary: Building applications with LLMs through composability
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: /media/limcheekin/My Passport/langchain-playground/venv/lib/python3.10/site-packages
Requires: aiohttp, async-timeout, dataclasses-json, langsmith, numexpr, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: langchain-experimental
Note: you may need to restart the kernel to use updated packages.


In [8]:
# Print the installed transformers package metadata (version, location, dependencies).
%pip show transformers 

Name: transformers
Version: 4.33.1
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /media/limcheekin/My Passport/langchain-playground/venv/lib/python3.10/site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: sentence-transformers
Note: you may need to restart the kernel to use updated packages.


In [7]:
# Snippet pairs for the similarity checks further down.
# In each pair, element 0 is used as the query and element 1 as the document text.

# Plain text versus a function definition.
different_codes = ["Hello world!!", "def my_sum(a, b): return a+b"]

# Two functions with the same parameters but different names and operators.
not_so_similar_codes = [
    "def sum(a, b): return a+b",
    "def minus(a, b): return a-b",
]

# Two functions with identical bodies that differ only in name.
similar_codes = [
    "def sum(a, b): return a+b",
    "def add(a, b): return a+b",
]

In [8]:
from langchain.embeddings import HuggingFaceEmbeddings

def get_embeddings(model_name: str, normalize_embeddings: bool = False) -> "HuggingFaceEmbeddings":
    """Build a ``HuggingFaceEmbeddings`` instance for the given model.

    Args:
        model_name: Model identifier, passed through as ``model_name``.
        normalize_embeddings: Value forwarded under the
            ``"normalize_embeddings"`` key of ``encode_kwargs``.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` object.
    """
    return HuggingFaceEmbeddings(
        model_name=model_name,
        encode_kwargs=dict(normalize_embeddings=normalize_embeddings),
    )


## Similarity Search

### flax-sentence-embeddings/st-codesearch-distilroberta-base
https://huggingface.co/flax-sentence-embeddings/st-codesearch-distilroberta-base

In [11]:
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain.schema import Document

# Load the code-search model; the second argument turns on embedding normalization.
embeddings = get_embeddings(
    "flax-sentence-embeddings/st-codesearch-distilroberta-base",
    normalize_embeddings=True,
)

In [12]:
# Unrelated pair: the first snippet is the query, the second is the candidate document.
query, text = different_codes[0], different_codes[1]
embeddings_filter = EmbeddingsFilter(
    embeddings=embeddings,
    k=3,
    similarity_threshold=0.1,
)
relevant_docs = embeddings_filter.compress_documents(
    [Document(page_content=text)],
    query,
)
relevant_docs

[]

In [17]:
# Loosely related pair: the first snippet is the query, the second is the candidate document.
query, text = not_so_similar_codes[0], not_so_similar_codes[1]
embeddings_filter = EmbeddingsFilter(
    embeddings=embeddings,
    k=3,
    similarity_threshold=0.7,  # a threshold of 0.8 produced no match
)
relevant_docs = embeddings_filter.compress_documents(
    [Document(page_content=text)],
    query,
)
relevant_docs

[_DocumentWithState(page_content='def minus(a, b): return a-b', metadata={}, state={'embedded_doc': [0.008230754174292088, 0.03332541510462761, -0.014657034538686275, 0.034727465361356735, -0.029198000207543373, 0.029760310426354408, 0.018251676112413406, 0.006322886329144239, -0.07438208907842636, 0.027163559570908546, -0.06512726843357086, -0.004351519048213959, 0.019957546144723892, 0.0114294970408082, 0.021411119028925896, 0.05009784549474716, 0.022515181452035904, -0.019878538325428963, 0.08738671988248825, 0.033173881471157074, -0.01091730035841465, 0.050297316163778305, 0.009348535910248756, 0.019123796373605728, 0.045457497239112854, -0.007370969746261835, -0.005877358838915825, 0.041818637400865555, 0.00866997055709362, -0.03136255592107773, -0.06933791190385818, -0.007248480338603258, -0.013956286944448948, 0.06818598508834839, -0.042350612580776215, 0.03364570811390877, -0.02642647549510002, -0.008222113363444805, 0.03502107039093971, -0.006781139876693487, 0.045976206660270

In [24]:
# Near-identical pair: the first snippet is the query, the second is the candidate document.
query, text = similar_codes[0], similar_codes[1]
embeddings_filter = EmbeddingsFilter(
    embeddings=embeddings,
    k=3,
    similarity_threshold=0.6,  # a threshold of 0.7 produced no match
)
relevant_docs = embeddings_filter.compress_documents(
    [Document(page_content=text)],
    query,
)
relevant_docs

[_DocumentWithState(page_content='def add(a, b): return a+b', metadata={}, state={'embedded_doc': [-0.02227601781487465, 0.03486264869570732, -0.017127836123108864, 0.05940736457705498, -0.07785016298294067, 0.0661739856004715, 0.017108680680394173, -0.030703052878379822, -0.03521185368299484, 0.025134187191724777, -0.09091367572546005, 0.07813747227191925, 0.03816189989447594, 0.019312266260385513, -0.004350805655121803, 0.08430055528879166, 0.07840937376022339, 0.0324038490653038, 0.04109436646103859, 0.040014881640672684, -0.017611755058169365, 0.04042954370379448, 0.044790223240852356, 0.017359133809804916, 0.006561123766005039, 0.025429224595427513, 0.04437395557761192, 0.0295557901263237, -0.025271669030189514, -0.040009498596191406, -0.08488870412111282, -0.001098329434171319, -0.003482363885268569, 0.023456940427422523, -0.042991455644369125, 0.02947199158370495, -0.01868848130106926, -0.0439484603703022, 0.021448660641908646, 0.044122613966464996, -0.012387655675411224, -0.046