# Creating A Vector Store with Simple Embeddings

In [1]:
from langchain_mongodb.vectorstores import MongoDBAtlasVectorSearch
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from typing import List
import numpy as np
import json
import os
import pandas as pd
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
import certifi

# Keep it free for now with a BERT model
class SentenceTransformerEmbeddings:
    """Wrapper for SentenceTransformers model to work with LangChain vector stores."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings for a list of documents."""
        embeddings = self.model.encode(texts, convert_to_numpy=True)  # Ensure NumPy array
        return embeddings.tolist()  # Convert array to list of lists

    def embed_query(self, text: str) -> List[float]:
        """Generate an embedding for a single query."""
        embedding = self.model.encode(text, convert_to_numpy=True)  # Ensure NumPy array
        return embedding.tolist()  # Convert array to a list


embedding_model = SentenceTransformerEmbeddings("all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
uri = f"mongodb+srv://{os.getenv('whiskeydb_admin')}:{os.getenv('whiskeydb_pwd')}@whiskeyrecommender.mvfds.mongodb.net/?retryWrites=true&w=majority&appName=WhiskeyRecommender"
ca = certifi.where()

# Create a new client and connect to the server
client = MongoClient(uri,
    server_api=ServerApi('1'),
    tls=True,
    tlsAllowInvalidCertificates=False,
    tlsCAFile=ca)
# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


### Connect to the Atlas Store

In [3]:
reddit_reviews = client.reddit_reviews
whiskies = reddit_reviews['whiskies']

whiskey_docs = [x for x in whiskies.find()]

In [4]:
collection = client["whiskey_database"]["vectordb_singleembed"]

In [37]:
vector_store = MongoDBAtlasVectorSearch(
   collection = collection,         # Collection to store embeddings
   embedding = embedding_model,   # Embedding model to use
   index_name = "test_index",    # Name of the vector search index
   relevance_score_fn = "cosine"   # Similarity score function, can also be "euclidean" or "dotProduct"
)

In [35]:
from langchain_mongodb import index
index.drop_vector_search_index(index_name="vector_index", collection=collection)

OperationFailure: Search index whiskey_database.vectordb_singleembed.vector_index cannot be found, full error: {'ok': 0.0, 'errmsg': 'Search index whiskey_database.vectordb_singleembed.vector_index cannot be found', 'code': 27, 'codeName': 'IndexNotFound', '$clusterTime': {'clusterTime': Timestamp(1738894772, 8), 'signature': {'hash': b'\\\xb9M\xd6\xd7a\xcdp\x90\xbdN,}\x9f\x9c\x97\xd1/W\xc6', 'keyId': 7422016508999499779}}, 'operationTime': Timestamp(1738894772, 8)}

In [38]:
#vector_store.create_vector_search_index(
#   dimensions = 384
#)
vector_store.create_vector_search_index(
    dimensions=384,
    filters=[
        {"type": "filter", "path": "distillery_region"}
    ]
)

In [33]:
whiskey_doc_format = []

for doc in whiskey_docs:
    text = doc.get('distillery') + ' ' +  doc.get('whiskey_name') + ' ' + doc.get('age') + ": \n" + 'Nose Notes:' + \
           ', '.join(doc.get('nose_tags')) + '\n' + 'Palette/Taste Notes:' + ', '.join(doc.get('palette_tags')) + '\n' \
           + 'Finish Notes: ' + ', '.join(doc.get('finish_tags'))
    doc_formatted = Document(page_content=text, metadata=doc)
    whiskey_doc_format.append(doc_formatted)

### Add Documents

In [37]:
vector_store.add_documents(documents=whiskey_doc_format)

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '68',
 '69',
 '70',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '80',
 '81',
 '82',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '90',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '97',
 '98',
 '99',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',
 '120',
 '121',
 '122',
 '123',
 '124',
 '125',
 '126',
 '127',
 '128',
 '129',
 '130',
 '131',
 '132',
 '133',
 '134',
 '135',
 '136',
 '137',
 '138'

### Execute Simple Query

In [47]:
vector_store.similarity_search_with_score('oak', k=5, pre_filter={'distillery':{"$eq":'Talisker'}, 'distillery_region':{'$eq':'Isle of Skye'}})

[(Document(metadata={'_id': 5874, 'year': 2016, 'whiskey_name': 'Port Ruighe', 'age': '3', 'distillery': 'Talisker', 'whiskey_country_of_origin': 'Scotland', 'distillery_region': 'Isle of Skye', 'is_blend': 'False', 'whiskey_type': 'Scotch', 'nose_tags': ['Sandalwood', 'Plums', 'Unrefined oak boards', 'Cherries', 'Brine', 'Acetone', 'Sticky red fruits', 'Woody talisker smoke', 'Wood glue', 'Woodspice', 'Sandalwood'], 'palette_tags': ['Salt', 'Peppermint', 'Fruity punchy note', 'Peat', 'Milk chocolate'], 'finish_tags': ['Long', 'Charred fruits', 'Citrus zest', 'Dark chocolate', 'Bitter wood', 'Pepper', 'Woodsmoke']}, page_content='Talisker Port Ruighe 3: \nNose Notes:Sandalwood, Plums, Unrefined oak boards, Cherries, Brine, Acetone, Sticky red fruits, Woody talisker smoke, Wood glue, Woodspice, Sandalwood\nPalette/Taste Notes:Salt, Peppermint, Fruity punchy note, Peat, Milk chocolate\nFinish Notes: Long, Charred fruits, Citrus zest, Dark chocolate, Bitter wood, Pepper, Woodsmoke'),
  0.

### Create Multiple Embeddings Per Doc

In [56]:
whiskey_docs[0:2]

[{'_id': 0,
  'year': 2018,
  'whiskey_name': 'Thor',
  'age': '',
  'distillery': 'Highland Park',
  'whiskey_country_of_origin': 'Scotland',
  'distillery_region': 'Island',
  'is_blend': 'False',
  'whiskey_type': 'Scotch',
  'nose_tags': [],
  'palette_tags': [],
  'finish_tags': []},
 {'_id': 1,
  'year': 2017,
  'whiskey_name': 'Peated',
  'age': '',
  'distillery': 'Longrow',
  'whiskey_country_of_origin': 'Scotland',
  'distillery_region': 'Campbeltown',
  'is_blend': 'False',
  'whiskey_type': 'Scotch',
  'nose_tags': [],
  'palette_tags': [],
  'finish_tags': []}]

In [82]:
def empty_stringify(text):
    if text is None:
        return ""
    else:
        return text

nose_tags = [','.join(x.get('nose_tags',[])) for x in whiskey_docs]
palette_tags = [','.join(x.get('palette_tags',[])) for x in whiskey_docs]
finish_tags = [','.join(x.get('finish_tags',[])) for x in whiskey_docs]

In [85]:
nose_embeddings = embedding_model.embed_documents(nose_tags)

In [86]:
palette_embeddings = embedding_model.embed_documents(palette_tags)

In [87]:
finish_embeddings = embedding_model.embed_documents(finish_tags)

In [97]:
from bson.binary import Binary, BinaryVectorDtype

formatted_docs = []

for i, doc in enumerate(whiskey_docs):

    doc['nose_embedding'] = Binary.from_vector(nose_embeddings[i], BinaryVectorDtype.FLOAT32)
    doc['palette_embedding'] = Binary.from_vector(palette_embeddings[i], BinaryVectorDtype.FLOAT32)
    doc['finish_embedding'] = Binary.from_vector(finish_embeddings[i], BinaryVectorDtype.FLOAT32)

    formatted_docs.append(doc)

### New collection in vectordb

In [98]:
whisky_db = client.whiskey_database
multiembed = whisky_db['vectordb_multiembed']

multiembed.insert_many(formatted_docs)

InsertManyResult([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 21

In [169]:
test_embedding =  Binary.from_vector(embedding_model.embed_query('smoke'), BinaryVectorDtype.FLOAT32)

nose_results = multiembed.aggregate([
    {
        "$vectorSearch": {
            "index": "multiembed_index_test",
            "path": "nose_embedding",
            "queryVector": test_embedding,
            "numCandidates": 5000,
            "limit": 5
        }
    },
    {
        "$project": {
            "_id": 1,
            "score": { "$meta": "vectorSearchScore" }
        }
    }
])

In [170]:
list(nose_results)

[{'_id': 6934, 'score': 0.8256073594093323},
 {'_id': 2769, 'score': 0.8231710195541382},
 {'_id': 1184, 'score': 0.8177462816238403},
 {'_id': 9922, 'score': 0.8085463047027588},
 {'_id': 8840, 'score': 0.8044465780258179}]

In [181]:
from collections import defaultdict
from collections import defaultdict
import numpy as np

def query_multiple(nose_tags=None, palette_tags=None, finish_tags=None, filters=None, top_k=5):

    if filters:
        filter_criteria = filters
    else:
        filter_criteria = {}

    if nose_tags:
        nose_embedding = Binary.from_vector(embedding_model.embed_query('oak'), BinaryVectorDtype.FLOAT32)
        nose_results = list(multiembed.aggregate([
            {"$vectorSearch": {
                "index": "multiembed_index_test",
                "path": "nose_embedding",
                "queryVector": nose_embedding,
                "numCandidates": 5000,
                "limit": top_k,
                "filter": filter_criteria
            }},
            {
                "$project": {
                    "_id": 1,
                    "score": {"$meta": "vectorSearchScore"}
                }
            }
        ]))

    if palette_tags:
        palette_embedding = Binary.from_vector(embedding_model.embed_query('cinnamon'), BinaryVectorDtype.FLOAT32)
        palette_results = list(multiembed.aggregate([
            {"$vectorSearch": {
                "index": "multiembed_index_test",
                "path": "palette_embedding",
                "queryVector": palette_embedding,
                "numCandidates": 5000,
                "limit": top_k,
                "filter": filter_criteria
            }},
            {
                "$project": {
                    "_id": 1,
                    "score": {"$meta": "vectorSearchScore"}
                }
            }
        ]))

    if finish_tags:
        finish_embedding = Binary.from_vector(embedding_model.embed_query('lingering'), BinaryVectorDtype.FLOAT32)
        finish_results = list(multiembed.aggregate([
            {"$vectorSearch": {
                "index": "multiembed_index_test",
                "path": "finish_embedding",
                "queryVector": finish_embedding,
                "numCandidates": 5000,
                "limit": top_k,
                "filter": filter_criteria
            }},
            {
                "$project": {
                    "_id": 1,
                    "score": {"$meta": "vectorSearchScore"}
                }
            }
        ]))

    # Combine and rerank
    scores = defaultdict(lambda: [])  # Store a list of scores for each whiskey

    # Assign scores (higher rank = lower index)
    if 'finish_results' in locals():
        for result in finish_results:
            scores[result["_id"]].append(result["score"])

    if 'palette_results' in locals():
        for result in palette_results:
            scores[result["_id"]].append(result["score"])

    if 'nose_results' in locals():
        for result in nose_results:
            scores[result["_id"]].append(result["score"])

    # Compute final score as the average of available scores
    final_scores = {k: np.mean(v) for k, v in scores.items()}  # Only average non-empty scores

    # Sort by best score
    sorted_results = sorted(final_scores.items(), key=lambda x: x[1], reverse=True)

    # Fetch final top results
    final_whiskeys = [(multiembed.find_one({"_id": whiskey_id}), score) for whiskey_id, score in sorted_results[:top_k]]

    return final_whiskeys

In [185]:
results = query_multiple(nose_tags='oak', palette_tags='caramel')

In [186]:
results

[({'_id': 9119,
   'year': 2017,
   'whiskey_name': 'Organic Ontario Oat',
   'age': 'N/A',
   'distillery': 'Toronto Distillery Co',
   'whiskey_country_of_origin': 'Canada',
   'distillery_region': 'Canada',
   'is_blend': 'False',
   'whiskey_type': 'Whiskey',
   'nose_tags': ['oak', 'sweet'],
   'palette_tags': ['nuts', 'caramel', 'honey', 'sweet', 'sour', 'molasses'],
   'finish_tags': ['long', 'sweet'],
   'nose_embedding': Binary(b'\'\x00o\xde_=\xc2\xe86\xbd\xdbx:\xbb\x00\xb3\xb7<\xebY\n\xbc\x97\xab\xc1\xbd\x1b\x9ej=\xe2|1\xbd\xeaV\x9c<\x03\xfd~<U7\x82<\x1e\x8er\xbd\xb3d<\xbd\xcet\x0c\xbdzv\x90<n\xb2\x14\xbd\xb0e*=\xc5\x0b\xcb\xbd\xa0c\x18\xbc\xf7\x9e\xf5\xbd\'c\x15\xbd0\xa2\xcc<H\x8c\xfd\xbc\x19\x89A=)bd<<r\xcf;\xcf\xbaK=\xbb\xaf\xf5<\xcc\xd4n\xbd\xe5\n\x90\xbd\xe9\xd9Q=\xd4g\xe7<\xf5\x13\xe4<KD\x06\xbc\xedy6\xbd\xda\xdf\n\xbc\x1c\x90\xb9\xbd/\xefy\xbde\xe7\xd0<\xaf\x8d =\xff\xcfS\xbd\xb3:u=8\xf2"=\xb1\xa3\x85=8\x84\xdc\xbd]{\xcc<$~";<\xd6\xea:\n98=*\xf1\x90=\xec*\x88<h6\xa3\xb