In [1]:
from mm_rag.embeddings.bridgetower_embeddings import (
    BridgeTowerEmbeddings
)
from mm_rag.vectorstores.multimodal_lancedb import MultimodalLanceDB
import lancedb
import json
import os
from PIL import Image
from utils import load_json_file
from utils import display_retrieved_results


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from mm_rag.embeddings.bridgetower_embeddings import (
  from .autonotebook import tqdm as notebook_tqdm


BridgeTowerConfig {
  "_attn_implementation_autoset": true,
  "contrastive_hidden_size": 512,
  "drop_rate": 0.1,
  "head_hidden_scale": 2,
  "hidden_act": "gelu",
  "hidden_size": 1024,
  "init_layernorm_from_vision_encoder": false,
  "initializer_factor": 1,
  "layer_norm_eps": 1e-05,
  "link_tower_type": "add",
  "logit_scale_init_value": 2.6592,
  "model_type": "bridgetower",
  "num_attention_heads": 16,
  "num_hidden_layers": 6,
  "share_cross_modal_transformer_layers": true,
  "share_link_tower_layers": false,
  "text_config": {
    "_attn_implementation_autoset": true,
    "architectures": [
      "BridgeTowerTextModel"
    ],
    "attention_probs_dropout_prob": 0.1,
    "classifier_dropout": null,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 1024,
    "initializer_factor": 1,
    "initializer_range": 0.02,
    "intermediate_size": 4096,
    "layer_norm_eps": 1e-05,
    "max_position_embeddings": 514,
    "model_type": "bridgetower_text_model",
  

In [2]:
# declare host file
LANCEDB_HOST_FILE = "./shared_data/.lancedb"
# declare table name
TBL_NAME = "test_tbl"
# initialize vectorstore
db = lancedb.connect(LANCEDB_HOST_FILE)

In [3]:
# load metadata files
vid1_metadata_path = './shared_data/videos/video1/metadatas.json'
# vid2_metadata_path = './shared_data/videos/video2/metadatas.json'
vid1_metadata = load_json_file(vid1_metadata_path)
# vid2_metadata = load_json_file(vid2_metadata_path)

# collect transcripts and image paths
vid1_trans = [vid['transcript'] for vid in vid1_metadata]
vid1_img_path = [vid['extracted_frame_path'] for vid in vid1_metadata]

# vid2_trans = [vid['transcript'] for vid in vid2_metadata]
# vid2_img_path = [vid['extracted_frame_path'] for vid in vid2_metadata]

In [4]:
# for video1, we pick n = 7
n = 7
updated_vid1_trans = [
 ' '.join(vid1_trans[i-int(n/2) : i+int(n/2)]) if i-int(n/2) >= 0 else
 ' '.join(vid1_trans[0 : i + int(n/2)]) for i in range(len(vid1_trans))
]

# also need to update the updated transcripts in metadata
for i in range(len(updated_vid1_trans)):
    vid1_metadata[i]['transcript'] = updated_vid1_trans[i]

In [5]:
print(f'A transcript example before update:\n"{vid1_trans[6]}"')
print()
print(f'After update:\n"{updated_vid1_trans[6]}"')

A transcript example before update:
"spacewalk and to now have the chance to have done"

After update:
"two months. The view is always amazing I didn't think I would do another spacewalk and to now have the chance to have done four more was just icing on the cake for a a wonderful mission."


Ingest Data to Lance DB

In [6]:
# initialize an BridgeTower embedder 
embedder = BridgeTowerEmbeddings()


# you can pass in mode="append" 
# to add more entries to the vector store
# in case you want to start with a fresh vector store,
# you can pass in mode="overwrite" instead 

_ = MultimodalLanceDB.from_text_image_pairs(
    texts=updated_vid1_trans,
    image_paths=vid1_img_path,
    embedding=embedder,
    metadatas=vid1_metadata,
    connection=db,
    table_name=TBL_NAME,
    mode="overwrite", 
)

  4%|▍         | 1/26 [00:04<01:44,  4.16s/it]

Embedding shape after squeeze: (2048,)


  8%|▊         | 2/26 [00:07<01:30,  3.77s/it]

Embedding shape after squeeze: (2048,)


 12%|█▏        | 3/26 [00:11<01:23,  3.65s/it]

Embedding shape after squeeze: (2048,)


 15%|█▌        | 4/26 [00:14<01:16,  3.47s/it]

Embedding shape after squeeze: (2048,)


 19%|█▉        | 5/26 [00:17<01:11,  3.42s/it]

Embedding shape after squeeze: (2048,)


 23%|██▎       | 6/26 [00:20<01:06,  3.34s/it]

Embedding shape after squeeze: (2048,)


 27%|██▋       | 7/26 [00:24<01:03,  3.32s/it]

Embedding shape after squeeze: (2048,)


 31%|███       | 8/26 [00:27<00:59,  3.33s/it]

Embedding shape after squeeze: (2048,)


 35%|███▍      | 9/26 [00:30<00:56,  3.34s/it]

Embedding shape after squeeze: (2048,)


 38%|███▊      | 10/26 [00:34<00:53,  3.32s/it]

Embedding shape after squeeze: (2048,)


 42%|████▏     | 11/26 [00:37<00:49,  3.31s/it]

Embedding shape after squeeze: (2048,)


 46%|████▌     | 12/26 [00:40<00:46,  3.29s/it]

Embedding shape after squeeze: (2048,)


 50%|█████     | 13/26 [00:43<00:42,  3.28s/it]

Embedding shape after squeeze: (2048,)


 54%|█████▍    | 14/26 [00:47<00:39,  3.27s/it]

Embedding shape after squeeze: (2048,)


 58%|█████▊    | 15/26 [00:50<00:36,  3.32s/it]

Embedding shape after squeeze: (2048,)


 62%|██████▏   | 16/26 [00:54<00:33,  3.35s/it]

Embedding shape after squeeze: (2048,)


 65%|██████▌   | 17/26 [00:57<00:30,  3.34s/it]

Embedding shape after squeeze: (2048,)


 69%|██████▉   | 18/26 [01:00<00:26,  3.35s/it]

Embedding shape after squeeze: (2048,)


 73%|███████▎  | 19/26 [01:04<00:23,  3.35s/it]

Embedding shape after squeeze: (2048,)


 77%|███████▋  | 20/26 [01:07<00:20,  3.36s/it]

Embedding shape after squeeze: (2048,)


 81%|████████  | 21/26 [01:11<00:17,  3.42s/it]

Embedding shape after squeeze: (2048,)


 85%|████████▍ | 22/26 [01:14<00:13,  3.35s/it]

Embedding shape after squeeze: (2048,)


 88%|████████▊ | 23/26 [01:17<00:09,  3.30s/it]

Embedding shape after squeeze: (2048,)


 92%|█████████▏| 24/26 [01:20<00:06,  3.28s/it]

Embedding shape after squeeze: (2048,)


 96%|█████████▌| 25/26 [01:23<00:03,  3.22s/it]

Embedding shape after squeeze: (2048,)


100%|██████████| 26/26 [01:26<00:00,  3.34s/it]

Embedding shape after squeeze: (2048,)





Create connection to a table in lance db

In [7]:
# open a connection to table TBL_NAME
tbl = db.open_table(TBL_NAME)

print(f"There are {tbl.to_pandas().shape[0]} rows in the table")
# display the first 3 rows of the table
tbl.to_pandas()[['text', 'image_path']].head(3)

There are 26 rows in the table


Unnamed: 0,text,image_path
0,As I look back on the the mission that we've h...,./shared_data/videos/video1\extracted_frame\fr...
1,As I look back on the the mission that we've h...,./shared_data/videos/video1\extracted_frame\fr...
2,As I look back on the the mission that we've h...,./shared_data/videos/video1\extracted_frame\fr...


Creating a retreiver from lance db vector store

In [8]:
# Creating a LanceDB vector store 
vectorstore = MultimodalLanceDB(
    uri=LANCEDB_HOST_FILE, 
    embedding=embedder, 
    table_name=TBL_NAME)

# creating a retriever for the vector store
# search_type="similarity" 
#  declares that the type of search that the Retriever should perform 
#  is similarity search
# search_kwargs={"k": 1} means returning top-1 most similar document
retriever = vectorstore.as_retriever(
    search_type='similarity', 
    search_kwargs={"k": 1}
)

In [12]:
print(vectorstore._embedding)





In [9]:
retriever = vectorstore.as_retriever(
    search_type='similarity', 
    search_kwargs={"k": 1})
query2 = (
        "an astronaut's spacewalk "
        "with an amazing view of the earth from space behind"
)
results2 = retriever.invoke(query2)
display_retrieved_results(results2)

Output keys (text-only): odict_keys(['last_hidden_state', 'pooler_output'])
Pooler output shape (text-only): torch.Size([1, 1024])
Embedding shape after squeeze (text-only): (1024,)


RuntimeError: lance error: Invalid user input: query dim(1024) doesn't match the column vector vector dim(2048), C:\Users\runneradmin\.cargo\registry\src\index.crates.io-6f17d22bba15001f\lance-0.24.1\src\dataset\scanner.rs:724:25