In [None]:
%pip install pinecone-client modal sentence-transformers

# ENV Vars and Pinecone Index

In [75]:
import os
import pinecone
from dotenv import load_dotenv

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Warn early (without aborting the cell) about every credential that is missing.
api_key = os.getenv("PINECONE_API_KEY")
if not api_key:
    print("Warning: No Pinecone API key found.")
modal_token_id = os.getenv("MODAL_TOKEN_ID")
if not modal_token_id:
    print("Warning: No MODAL_TOKEN_ID found.")
modal_token_secret = os.getenv("MODAL_TOKEN_SECRET")
if not modal_token_secret:
    print("Warning: No MODAL_TOKEN_SECRET found.")

# Hand the key value itself to Pinecone. Wrapping it in os.getenv() a second
# time would look up an environment variable *named after the key* and pass
# None (or raise TypeError when the key is missing).
pinecone.init(api_key=api_key, environment="us-west1-gcp-free")

In [131]:
INDEX_NAME = "cs191"

# Create the index only on the first run; later runs reuse the existing one.
# 512 is presumably the embedding width of the CLIP model used below — confirm.
index_already_exists = INDEX_NAME in pinecone.list_indexes()
if not index_already_exists:
    pinecone.create_index(INDEX_NAME, dimension=512)


# Populate Embeddings JSON in Modal

In [95]:
# Project directories, resolved by climbing up from the notebook's working
# directory one level at a time: parent -> grandparent -> great-grandparent.
STATIC_DIR = os.path.split(os.path.abspath(os.curdir))[0]
FLASK_DIR = os.path.split(STATIC_DIR)[0]
ROOT_DIRECTORY = os.path.split(FLASK_DIR)[0]

# Everything this notebook reads or writes lives under <root>/instance.
INSTANCE_DIR = os.path.join(ROOT_DIRECTORY, "instance")
SYMLINK_DIR = os.path.join(ROOT_DIRECTORY, "instance", "symlink")
DB_PATH = os.path.join(ROOT_DIRECTORY, "instance", "photos.db")


In [119]:
# Get Images we are embedding and metadata we will add to the index
import sqlite3
from contextlib import closing

# sqlite3's own context manager only commits/rolls back the transaction; it
# does not close the connection. closing() releases the database handle as
# soon as the query has been read.
with closing(sqlite3.connect(DB_PATH)) as conn:
    rows = conn.execute(
        "SELECT DISTINCT PhotoID, DateTaken FROM copied ORDER BY DateTaken DESC"
    ).fetchall()

# Photo ids in query order (DateTaken descending).
images_to_embed = [row[0] for row in rows]
# Keep only the part of DateTaken before the first space; None when it is empty.
image_dates = {row[0]: row[1].split(" ")[0] if row[1] else None for row in rows}
len(images_to_embed)

846

In [121]:
import os
import shutil

# Staging directory that will hold one symlink per image to embed.
os.makedirs(SYMLINK_DIR, exist_ok=True)

# Start from a clean slate: drop every symlink left over from a previous run
# (regular files in the directory are left untouched).
for entry_name in os.listdir(SYMLINK_DIR):
    entry_path = os.path.join(SYMLINK_DIR, entry_name)
    if not os.path.islink(entry_path):
        continue
    os.unlink(entry_path)

# Link each selected image from converted_photos into the staging directory,
# collecting the names whose destination already exists.
failed_images = []
for image_name in images_to_embed:
    link_target = os.path.join(f"{STATIC_DIR}/converted_photos", image_name)
    link_path = os.path.join(SYMLINK_DIR, image_name)
    try:
        os.symlink(link_target, link_path)
    except FileExistsError:
        failed_images.append(image_name)
failed_images

[]

In [127]:
# Embeddings Generation
from modal.object import Object
from modal import Stub, Image, Mount, method

# Modal stub that the embedding class below is registered on (via stub.cls)
# and that is later started with stub.run().
stub = Stub()


def download_models():
    """Instantiate the CLIP model once so it is cached inside the Modal image.

    Having the model baked into the image makes subsequent cold starts faster.
    The constructed model object itself is discarded.
    """
    import sentence_transformers

    sentence_transformers.SentenceTransformer("sentence-transformers/clip-ViT-B-32")


# Where the local instance directory is mounted inside the Modal container.
REMOTE_PATH = "/root/instance"

# Container image: Debian slim + sentence-transformers, with download_models
# executed at build time.
_slim_with_deps = Image.debian_slim().pip_install("sentence-transformers")
container_image = _slim_with_deps.run_function(download_models)


@stub.cls(
    image=container_image,
    mounts=[
        # JIT Mounting of needed files
        Mount.from_local_dir(INSTANCE_DIR, remote_path=REMOTE_PATH),
    ],
)
class ModalEmbedding:
    """Remote worker that turns the mounted images into CLIP embeddings."""

    def __enter__(self):
        """Load the CLIP model onto the instance.

        NOTE(review): presumably invoked by Modal when the container starts —
        confirm against the Modal version in use.
        """
        from sentence_transformers import SentenceTransformer

        self.model = SentenceTransformer("sentence-transformers/clip-ViT-B-32")

    @method()
    def generate(self, images_to_embed, image_dates):
        """Embed every image in `images_to_embed` and return upsert-ready records.

        Args:
            images_to_embed: Iterable of image file names, looked up under
                ``{REMOTE_PATH}/symlink/``.
            image_dates: Mapping from image file name to the date stored in
                the record's metadata (may be None).

        Returns:
            A list of dicts with keys ``"id"``, ``"metadata"`` and ``"values"``,
            one per distinct image that could be opened. Images that fail to
            open are reported on stdout and skipped, so a single bad file no
            longer throws away every embedding computed so far.

        Raises:
            KeyError: If an image name has no entry in `image_dates`.
        """
        import PIL.Image
        from tqdm import tqdm

        embeddings = []
        # Names already handled. Checking membership against `embeddings`
        # itself can never match, because that list holds dicts, not names.
        seen_images = set()
        for image_name in tqdm(images_to_embed, desc="Generating embeddings"):
            if image_name in seen_images:
                continue
            seen_images.add(image_name)

            image_path = f"{REMOTE_PATH}/symlink/{image_name}"
            try:
                # The context manager closes the file; convert() returns a
                # fully loaded copy that outlives it.
                with PIL.Image.open(image_path) as raw_image:
                    image = raw_image.convert("RGB")
            except Exception as exc:
                print(image_path, exc)
                continue

            img_emb = self.model.encode(image).tolist()
            embeddings.append(
                {
                    "id": image_name,
                    "metadata": {
                        "date": image_dates[image_name],
                        "image_name": image_name,
                    },
                    "values": img_emb,
                }
            )

        return embeddings


In [None]:
# Run the embedding job while the Modal app is live. The context manager's
# value is deliberately not bound to `stub`: rebinding the module-level stub
# would make re-running this cell depend on whatever stub.run() yields.
with stub.run():
    embeddings = ModalEmbedding().generate.remote(images_to_embed, image_dates)


In [129]:
import json

# Persist the generated embeddings next to the database so the Pinecone
# upload below can run from the file.
embeddings_path = f"{INSTANCE_DIR}/embeddings.json"
with open(embeddings_path, "w") as out_file:
    json.dump(embeddings, out_file)

# Populate Pinecone With Embeddings

In [None]:
with open(f"{INSTANCE_DIR}/embeddings.json", "r") as json_file:
    embeddings = json.load(json_file)

# Show only the record count: echoing the full list would write every vector
# into the notebook's saved output.
len(embeddings)

In [132]:
index = pinecone.Index(INDEX_NAME)

# Pinecone limits the size of a single upsert request and recommends sending
# vectors in batches of about 100 rather than all at once.
UPSERT_BATCH_SIZE = 100


def _without_null_metadata(vector):
    """Return a copy of `vector` whose metadata has no None-valued keys.

    Pinecone rejects null metadata values, and the date stored for an image
    is None when the database has no DateTaken for it.
    """
    metadata = vector.get("metadata")
    if not metadata:
        return vector
    cleaned = {key: value for key, value in metadata.items() if value is not None}
    return {**vector, "metadata": cleaned}


upsert_responses = []
for batch_start in range(0, len(embeddings), UPSERT_BATCH_SIZE):
    batch = [
        _without_null_metadata(vector)
        for vector in embeddings[batch_start : batch_start + UPSERT_BATCH_SIZE]
    ]
    upsert_responses.append(
        index.upsert(
            vectors=batch,
            namespace='image_embeddings',
        )
    )

# Keep the original variable name bound (to the last batch's response).
upsert_response = upsert_responses[-1] if upsert_responses else None

In [9]:
import functools


@functools.lru_cache(maxsize=1)
def _load_clip_model():
    """Load the CLIP model on first use and reuse the same instance afterwards."""
    from sentence_transformers import SentenceTransformer

    return SentenceTransformer("clip-ViT-B-32")


def generate_embedding(query):
    """Embed a single query with the CLIP model.

    Args:
        query: The item to embed (either an image or a text string).

    Returns:
        A nested list holding one embedding: the query is encoded as a
        one-element batch and the batch result is converted with ``tolist()``.
    """
    # First, we encode the query (which can either be an image or a text string)
    query_emb = _load_clip_model().encode([query], show_progress_bar=False)
    return query_emb.tolist()
