### Indexing
This is developed according to 
https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/feature_store/online_feature_serving_and_vector_retrieval_bigquery_data_with_feature_store.ipynb

We use FeatureStore as it is the best approach for profduction ready development and online servings.

In [6]:
# Install the packages
!pip3 install --upgrade --quiet google-cloud-aiplatform\
                                 google-cloud-bigquery\
                                 db-dtypes

In [8]:
!pip install protobuf==3.20

Collecting protobuf==3.20
  Downloading protobuf-3.20.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (698 bytes)
Downloading protobuf-3.20.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.20.3
    Uninstalling protobuf-3.20.3:
      Successfully uninstalled protobuf-3.20.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
apache-beam 2.46.0 requires grpcio!=1.48.0,<2,>=1.33.1, but you have grpcio 1.48.0 which is incompatible.
google-api-core 2.11.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0.dev0,>=3.19.5, but y

In [7]:
#restart kernel
import IPython

app = IPython.Application.instance()
app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [2]:
from google.cloud import bigquery
from google.cloud.aiplatform_v1.types import NearestNeighborQuery

In [3]:
from google.cloud import bigquery
from google.cloud.aiplatform_v1.types import NearestNeighborQuery
from vertexai.resources.preview import (FeatureOnlineStore, FeatureView,
                                        FeatureViewBigQuerySource)
from vertexai.resources.preview.feature_store import utils

#set project info
PROJECT = !gcloud config get-value project
PROJECT_ID = PROJECT[0]
REGION = "us-central1" 

bq_client = bigquery.Client(project=PROJECT_ID)


### 1- Create Online Feature Ftore

In [5]:
FEATURE_ONLINE_STORE_ID = "nine_quality_test_multimodal_featurestore"  # @param {type: "string"}
ofs = FeatureOnlineStore.create_optimized_store(FEATURE_ONLINE_STORE_ID)


# get full information of the created feature online store instance
ofs.gca_resource

Creating FeatureOnlineStore
Create FeatureOnlineStore backing LRO: projects/494586852359/locations/us-central1/featureOnlineStores/nine_quality_test_multimodal_featurestore/operations/4466007496615526400
FeatureOnlineStore created. Resource name: projects/494586852359/locations/us-central1/featureOnlineStores/nine_quality_test_multimodal_featurestore
To use this FeatureOnlineStore in another session:
feature_online_store = aiplatform.FeatureOnlineStore('projects/494586852359/locations/us-central1/featureOnlineStores/nine_quality_test_multimodal_featurestore')


name: "projects/494586852359/locations/us-central1/featureOnlineStores/nine_quality_test_multimodal_featurestore"
create_time {
  seconds: 1724138637
  nanos: 344398000
}
update_time {
  seconds: 1724138638
  nanos: 226688000
}
etag: "AMEw9yMGNX63FuuH30kgFsQSG5TdCWBm6EKyMKvP84WJ4zdjIEAMGJJXYxvZQ_0c1nk="
state: STABLE
dedicated_serving_endpoint {
}
optimized {
}

In [6]:
# Use get to verify the store is created.
FeatureOnlineStore(FEATURE_ONLINE_STORE_ID).gca_resource

name: "projects/494586852359/locations/us-central1/featureOnlineStores/nine_quality_test_multimodal_featurestore"
create_time {
  seconds: 1724138637
  nanos: 344398000
}
update_time {
  seconds: 1724138638
  nanos: 226688000
}
etag: "AMEw9yOrrqOrwQ021QPiM91OudAanEkolvtDTDPpY5ySPjRUt72nnNDVqs7WljYTjBM="
state: STABLE
dedicated_serving_endpoint {
}
optimized {
}

### 2-Create Feature View Instance

In [7]:
FEATURE_VIEW_ID = "feature_view_nine_quality_test"  # @param {type: "string"}
# A schedule is created based on cron setting.
CRON_SCHEDULE = "TZ=America/Los_Angeles 00 13 11 8 *"  # @param {type: "string"}

In [8]:
# Index configs
DIMENSIONS = 1408  # @param {type: "number"}
EMBEDDING_COLUMN = "multimodal_embedding"  # @param {type: "string"}
# Optional
LEAF_NODE_EMBEDDING_COUNT = 10000  # @param {type: "number"}
# Optional
#to do: set this later
#CROWDING_COLUMN = "cited_by_filing_date"  # @param {type: "string"}
# Optional
#for multimodal embeddings this can be set to None
FILTER_COLUMNS = ["id","media_type","path","end_offset_sec_chapter","start_offset_sec_chapter"]  # @param

In [10]:
BQ_DATASET_ID='Nine_Quality_Test'
BQ_TABLE_ID='multimodal_embeddings'
BQ_TABLE_ID_FQN = f"{PROJECT_ID}.{BQ_DATASET_ID}.{BQ_TABLE_ID}"
DATA_SOURCE = f"bq://{BQ_TABLE_ID_FQN}"

big_query_source = FeatureViewBigQuerySource(
    uri=DATA_SOURCE, entity_id_columns=["id"]
)

index_config = utils.IndexConfig(
    embedding_column=EMBEDDING_COLUMN,
    dimensions=DIMENSIONS,
    crowding_column=None,# to do:  define CROWDING_COLUMN
    #filter_columns=FILTER_COLUMNS, #for multimodal embeddings this can be set to None
    algorithm_config=utils.TreeAhConfig(),
)

print(f"index_config: {index_config}")

nine_fv = ofs.create_feature_view(
    FEATURE_VIEW_ID,
    source=big_query_source,
     # Optional, can be set to None.
    #to do: set to CRON_SCHEDULE
    sync_config=None, #CRON_SCHEDULE
    index_config=index_config,
)

index_config: IndexConfig(embedding_column='multimodal_embedding', dimensions=1408, algorithm_config=TreeAhConfig(leaf_node_embedding_count=None), filter_columns=None, crowding_column=None, distance_measure_type=None)
Creating FeatureView
Create FeatureView backing LRO: projects/494586852359/locations/us-central1/featureOnlineStores/nine_quality_test_multimodal_featurestore/featureViews/feature_view_nine_quality_test/operations/7807396945147723776
FeatureView created. Resource name: projects/494586852359/locations/us-central1/featureOnlineStores/nine_quality_test_multimodal_featurestore/featureViews/feature_view_nine_quality_test
To use this FeatureView in another session:
feature_view = aiplatform.FeatureView('projects/494586852359/locations/us-central1/featureOnlineStores/nine_quality_test_multimodal_featurestore/featureViews/feature_view_nine_quality_test')


In [11]:
#Verify that the FeatureView instance is created by getting the feature view.
FeatureView(
    FEATURE_VIEW_ID, feature_online_store_id=FEATURE_ONLINE_STORE_ID
).gca_resource
     

name: "projects/494586852359/locations/us-central1/featureOnlineStores/nine_quality_test_multimodal_featurestore/featureViews/feature_view_nine_quality_test"
create_time {
  seconds: 1724138866
  nanos: 263997000
}
update_time {
  seconds: 1724138866
  nanos: 774961000
}
etag: "AMEw9yPf7sWwZBx9EmLitHfdVpv2rZsZ0UwvjNtky-p3nYsfW-1A5vf5OF0qMW4m4-zF"
big_query_source {
  uri: "bq://nine-quality-test.Nine_Quality_Test.multimodal_embeddings"
  entity_id_columns: "id"
}
index_config {
  embedding_column: "multimodal_embedding"
  embedding_dimension: 1408
  distance_measure_type: DOT_PRODUCT_DISTANCE
  tree_ah_config {
    leaf_node_embedding_count: 1000
  }
}

In [84]:
sync_response = nine_fv.sync()


In [85]:

import time

while True:
    feature_view_sync = nine_fv.get_sync(
        sync_response.resource_name.split("/")[9]
    ).gca_resource
    if feature_view_sync.run_time.end_time.seconds > 0:
        status = "Succeed" if feature_view_sync.final_status.code == 0 else "Failed"
        print(f"Sync {status} for {feature_view_sync.name}. \n {feature_view_sync}")
        # wait a little more for the job to properly shutdown
        time.sleep(30)
        break
    else:
        print("Sync ongoing, waiting for 30 seconds.")
    time.sleep(30)

Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync ongoing, waiting for 30 seconds.
Sync Succeed for projects/494586852359/locations/us-central1/featureOnlineStores/nine_quality_test_multimodal_featurestore/featureViews/feature_view_nine_quality_test/featureViewSyncs/7008757002046275584. 
 name: "projects/494586852359/locations/us-central1/featureOnlineStores/nine_quality_test_multimodal_featurestore/featureViews/feature_view_nine_quality_test/featureViewSyncs/7008757002046275584"
create_time {
  seconds: 1724144047
  nanos: 379210000
}
final_status {
}
run_time {
  start_time {
    seconds: 1724144047
    nanos: 379210000
  }
  end_time {
    seconds: 1724144166
    nanos: 606870000
  }
}
sync_summary {
  row_synced: 609
  total_slot: 4207
}



In [90]:
#for mutimodal embeddings we can ignore this, unless having a description column that want to search through it
# country_filter = NearestNeighborQuery.StringFilter(
#     name="country",
#     allow_tokens=["WIPO (PCT)"],  # try different allow tokens
#     deny_tokens=["United States"],  # try different deny tokens
# )

In [91]:
# EMBEDDINGS = [1] * DIMENSIONS

# result=nine_fv.search(
#     embedding_value=EMBEDDINGS,
#     neighbor_count=10,
#     #string_filters=[country_filter],#for multimodal embedding this can be set to None, unless having a description column
#     return_full_entity=True,  # returning entities with metadata
# )

In [92]:
# result=result.to_dict()

In [93]:
# nearest_neighbours=[]
# for neighbour in result['neighbors']:
#     nearest_neighbour={}
#     nearest_neighbour['entity_id']=neighbour['entity_id']
#     nearest_neighbour['distance']=neighbour['distance']
  
#     for feature in neighbour['entity_key_values']['key_values']['features']:
#         if 'value' in feature:
#             if type(list(feature['value'].values())[0]) is dict:
#                 nearest_neighbour[feature['name']]=[]#list(list(feature['value'].values())[0].values())[0]             
#             else:
#                 nearest_neighbour[feature['name']]=list(feature['value'].values())[0]             
#         else :
#             nearest_neighbour[feature['name']]=None
   
#     nearest_neighbours.append(nearest_neighbour)
    

In [94]:
# nearest_neighbours