In [1]:
# Install the required dependencies (uncomment the line below on a fresh environment)
# %pip install tqdm pymongo python-dotenv together

In [2]:
# Importing libraries
import os
import tqdm
import pymongo
import together
from typing import List
from dotenv import load_dotenv

In [3]:
# Load environment variables (load_dotenv pulls them from a .env file when one exists)
load_dotenv()

MONGO_CONN_URI = os.getenv('MONGO_CONN_URI')
TOGETHER_API_KEY = os.getenv('TOGETHER_API_KEY')

# Fail fast when configuration is missing: pymongo.MongoClient(None) would
# silently target the default localhost host, and a missing API key would
# otherwise only surface later, inside the embedding call.
_missing_env_vars = [
    name
    for name, value in (
        ('MONGO_CONN_URI', MONGO_CONN_URI),
        ('TOGETHER_API_KEY', TOGETHER_API_KEY),
    )
    if not value
]
if _missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(_missing_env_vars)
    )

print("Env. variables intialized")

Env. variables intialized


In [4]:
# Open a pymongo client against the connection URI read from the environment
mongo_client = pymongo.MongoClient(host=MONGO_CONN_URI)

In [5]:
def generate_embeddings(input_texts: List[str], model_api_string: str) -> List[List[float]]:
    """
    Generate embeddings for a list of input texts using a specified model.

    A Together client is created with TOGETHER_API_KEY and all texts are sent
    in a single ``embeddings.create`` request.

    Args:
        input_texts (List[str]): A list of strings for which embeddings need to be generated.
            An empty list is answered locally with ``[]``; no request is made.
        model_api_string (str): The API string identifier for the model to be used for generating embeddings.

    Returns:
        List[List[float]]: The ``embedding`` of every item in the response's ``data``
            field, each represented as a list of floats. Empty when ``input_texts`` is empty.

    Example:
        input_texts = ["Hello world", "How are you?"]
        model_api_string = "example-model"
        embeddings = generate_embeddings(input_texts, model_api_string)
    """
    # Nothing to embed: skip client construction and the network round trip.
    if not input_texts:
        return []

    together_client = together.Together(api_key=TOGETHER_API_KEY)
    outputs = together_client.embeddings.create(
        input=input_texts,
        model=model_api_string,
    )
    return [x.embedding for x in outputs.data]


In [6]:
# Tunable settings for the embedding run

# Maximum number of documents pulled from the collection
NUM_DOC_LIMIT = 800

# Document field under which each embedding vector is stored
VECTOR_DATABASE_FIELD_NAME = "embedding_together_m2-bert-8k-retrieval"

# Model identifier passed to the embeddings API
EMBEDDING_MODEL_STRING = "togethercomputer/m2-bert-80M-8k-retrieval"

In [7]:
# Handles to the database and collection that hold the listings
db = mongo_client["sample_airbnb"]
collection = db["listingsAndReviews"]

# Document fields that are concatenated into the text sent for embedding
keys_to_extract = [
    "name",
    "summary",
    "space",
    "description",
    "neighborhood_overview",
    "notes",
    "transit",
    "access",
    "interaction",
    "house_rules",
    "property_type",
    "room_type",
    "bed_type",
    "minimum_nights",
    "maximum_nights",
    "accommodates",
    "bedrooms",
    "beds",
]

In [8]:
# Embed the descriptive fields of each listing and store the vector on the document.
listings_cursor = collection.find({"summary": {"$exists": True}}).limit(NUM_DOC_LIMIT)
for doc in tqdm.tqdm(listings_cursor, desc="Document Processing "):
    # Already embedded (e.g. by a previous run): no API call and no write needed.
    if VECTOR_DATABASE_FIELD_NAME in doc:
        continue

    # One "key: value" line per requested field that is present on the document.
    extracted_str = "\n".join([k + ": " + str(doc[k]) for k in keys_to_extract if k in doc])
    embedding = generate_embeddings([extracted_str], EMBEDDING_MODEL_STRING)[0]

    # $set writes only the embedding field instead of replacing the whole
    # document, so other fields are never overwritten with the values read
    # earlier by the cursor.
    collection.update_one(
        {'_id': doc['_id']},
        {'$set': {VECTOR_DATABASE_FIELD_NAME: embedding}},
    )
print("Processing Done")

Document Processing : 800it [03:19,  4.01it/s]

Processing Done



