In [1]:
!pip install --upgrade --user google-cloud-aiplatform google-cloud-storage
!pip install langchain_community
!pip install google-cloud-aiplatform
!pip install google-cloud-storage
!pip install langchain_community
!pip install langchain
!pip install pypdf
!pip install google-cloud-firestore
!pip install textract
!pip install PyPDF2
!pip install pandas
!pip install firebase_admin



In [2]:
import IPython
from IPython.display import Markdown, display
import time

app = IPython.Application.instance()
app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [1]:
PROJECT_ID = ! gcloud config get project
PROJECT_ID = PROJECT_ID[0]
LOCATION = "us-central1"

# generate an unique id for this session
from datetime import datetime

UID = datetime.now().strftime("%m%d%H%M")
PROJECT_ID

'qwiklabs-gcp-02-9649b4c6cbee'

In [2]:
#enable apis
!gcloud services enable \
  aiplatform.googleapis.com \
  storage.googleapis.com \
  run.googleapis.com \
  compute.googleapis.com \
  cloudresourcemanager.googleapis.com \
  container.googleapis.com \
  artifactregistry.googleapis.com \
  containerregistry.googleapis.com \
  containerscanning.googleapis.com

Operation "operations/acat.p2-365954090631-2694ef0f-da7b-4ae8-a8be-3030cec22261" finished successfully.


In [7]:
#create firestore db
#!gcloud config set project PROJECT_ID
!gcloud firestore databases create \
  --project=qwiklabs-gcp-02-9649b4c6cbee \
  --location=us-central1 \
  --type=firestore-native

metadata:
  '@type': type.googleapis.com/google.firestore.admin.v1.CreateDatabaseMetadata
name: projects/qwiklabs-gcp-02-9649b4c6cbee/databases/(default)/operations/7fYx3UWLrKbSQb3GwRd5qxAqMWxhcnRuZWMtc3ULIgUQA7aayeAQBrOhz44IDAovGg
response:
  '@type': type.googleapis.com/google.firestore.admin.v1.Database
  appEngineIntegrationMode: DISABLED
  concurrencyMode: PESSIMISTIC
  createTime: '2024-06-11T15:04:46.918988Z'
  deleteProtectionState: DELETE_PROTECTION_DISABLED
  earliestVersionTime: '2024-06-11T15:04:46.918988Z'
  etag: IIDesrHp04YDMMyKsrHp04YD
  locationId: us-central1
  name: projects/qwiklabs-gcp-02-9649b4c6cbee/databases/(default)
  pointInTimeRecoveryEnablement: POINT_IN_TIME_RECOVERY_DISABLED
  type: FIRESTORE_NATIVE
  uid: ab7917c1-c6bd-41d2-a6ac-8b45dd31f6ed
  updateTime: '2024-06-11T15:04:46.918988Z'
  versionRetentionPeriod: 3600s


In [8]:
def create_data_packet(file_name, file_type, page_number, file_content):
    """Creating a simple dictionary to store all information (content and metadata)
    extracted from the document"""
    data_packet = {}
    data_packet["file_name"] = file_name
    data_packet["file_type"] = file_type
    data_packet["page_number"] = page_number
    data_packet["content"] = file_content
    return data_packet

In [9]:
# if needed to load from cloud storage
# !mkdir test
!gsutil -m cp -r gs://qwiklabs-gcp-02-9649b4c6cbee/fpc-manual.pdf .

Copying gs://qwiklabs-gcp-02-9649b4c6cbee/fpc-manual.pdf...
/ [1/1 files][  8.5 MiB/  8.5 MiB] 100% Done                                    
Operation completed over 1 objects/8.5 MiB.                                      


In [10]:
#pdf parser
import os
import textract
from PyPDF2 import PdfReader

final_data = []

file_name = 'fpc-manual'
file_type ='pdf'

path = f"fpc-manual.pdf"

# loading pdf files, with page numbers as metadata.
reader = PdfReader(path)
for i, page in enumerate(reader.pages):

    text = page.extract_text()
    if text:
        packet = create_data_packet(
              file_name, file_type, page_number=int(i + 1), file_content=text
        )

        final_data.append(packet)

In [11]:
# converting the data that has been read from GCS to Pandas DataFrame for easy readibility and downstream logic
import pandas as pd
pdf_data = pd.DataFrame.from_dict(final_data)
pdf_data.reset_index(inplace=True, drop=True)
pdf_data.head()

Unnamed: 0,file_name,file_type,page_number,content
0,fpc-manual,pdf,1,The Health Code\nThese are regulations that we...
1,fpc-manual,pdf,2,The United States has one of the\nsafest food ...
2,fpc-manual,pdf,3,What is Ready-To- Eat Food?\nAny food product ...
3,fpc-manual,pdf,4,Ice-Point Method\n/H20321/H20321Fill a contain...
4,fpc-manual,pdf,5,Fresh fish\nThere is no inspection for fresh\n...


In [12]:
# init the vertexai package
import vertexai

vertexai.init(project=PROJECT_ID, location=LOCATION)

In [13]:
# Load the text embeddings model
from vertexai.preview.language_models import TextEmbeddingModel

model = TextEmbeddingModel.from_pretrained("textembedding-gecko@002")

In [14]:
#batch embeddings to embedding api
import time
import tqdm  # to show a progress bar

# get embeddings for a list of texts
BATCH_SIZE = 5


def get_embeddings_wrapper(texts):
    embs = []
    for i in tqdm.tqdm(range(0, len(texts), BATCH_SIZE)):
        time.sleep(1)  # to avoid the quota error
        result = model.get_embeddings(texts[i : i + BATCH_SIZE])
        embs = embs + [e.values for e in result]
    return embs


In [15]:
# get embeddings for the question titles and add them as "embedding" column
pdf_data = pdf_data.assign(embedding=get_embeddings_wrapper(list(pdf_data.content)))
pdf_data.head()

100%|██████████| 19/19 [00:23<00:00,  1.26s/it]


Unnamed: 0,file_name,file_type,page_number,content,embedding
0,fpc-manual,pdf,1,The Health Code\nThese are regulations that we...,"[-0.013569382950663567, -0.033456653356552124,..."
1,fpc-manual,pdf,2,The United States has one of the\nsafest food ...,"[-0.01585172675549984, -0.012489386834204197, ..."
2,fpc-manual,pdf,3,What is Ready-To- Eat Food?\nAny food product ...,"[-0.011286059394478798, -0.011021223850548267,..."
3,fpc-manual,pdf,4,Ice-Point Method\n/H20321/H20321Fill a contain...,"[-0.013480892404913902, -0.017172927036881447,..."
4,fpc-manual,pdf,5,Fresh fish\nThere is no inspection for fresh\n...,"[0.0021019037812948227, -0.018236981704831123,..."


In [16]:
#load data to firestore db
import pandas as pd
import firebase_admin
from firebase_admin import firestore

# Initialize the Firestore client
firebase_admin.initialize_app()
db = firestore.client()

# Get the PDF data
#pdf_data = pd.read_csv("pdf_data.csv", index_col=0)

# Iterate over the rows in the PDF data
for index, row in pdf_data.iterrows():
    # Get the page number
    page_number = row["page_number"]

    # Get the content and embedding
    content = row["content"]
    embedding = row["embedding"]

    # Create a document reference
    doc_ref = db.collection("pages").document(str(page_number))

    # Set the document data
    doc_ref.set({
        "content": content,
        "embedding": embedding
    })


In [17]:
#create a embedding.jsonl file
import pandas as pd
import json

# Get the PDF data
#pdf_data = pd.read_csv("pdf_data.csv", index_col=0)

# Create a list of dictionaries, where each dictionary contains the page number and embedding
embeddings = []
for index, row in pdf_data.iterrows():
    embeddings.append({
        "page_number": row["page_number"],
        "embedding": row["embedding"]
    })

# Convert the list of dictionaries to JSON-L format
json_l = "\n".join(json.dumps(embedding) for embedding in embeddings)

# Write the JSON-L data to a file
with open("embeddings.jsonl", "w") as f:
    f.write(json_l)

In [18]:
#save the embeddings.jsonl file to cloud storage
from google.cloud import storage

# Create a storage client
storage_client = storage.Client()

# Get the bucket
bucket = storage_client.bucket(PROJECT_ID)

# Upload the file
bucket.blob("embeddings.jsonl").upload_from_filename("embeddings.jsonl")

# Print a success message
print("File uploaded successfully to Google Cloud Storage.")

File uploaded successfully to Google Cloud Storage.


In [19]:
# init the aiplatform package
from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=LOCATION)

BUCKET_URI = "gs://"+PROJECT_ID+"/embeddings.jsonl"

In [20]:
# create index
my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=f"assessment-index",
    contents_delta_uri=BUCKET_URI,
    dimensions=768,
    approximate_neighbors_count=5,
)


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:Creating MatchingEngineIndex
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:Create MatchingEngineIndex backing LRO: projects/365954090631/locations/us-central1/indexes/8226796687671164928/operations/7377610081916944384
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:MatchingEngineIndex created. Resource name: projects/365954090631/locations/us-central1/indexes/8226796687671164928
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:To use this MatchingEngineIndex in another session:
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:index = aiplatform.MatchingEngineIndex('projects/365954090631/locations/us-central1/indexes/8226796687671164928')


In [21]:
# create IndexEndpoint
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=f"assessment-index-endpoint", public_endpoint_enabled=True
)


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:Creating MatchingEngineIndexEndpoint
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:Create MatchingEngineIndexEndpoint backing LRO: projects/365954090631/locations/us-central1/indexEndpoints/2402234994597560320/operations/3417257159598014464
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:MatchingEngineIndexEndpoint created. Resource name: projects/365954090631/locations/us-central1/indexEndpoints/2402234994597560320
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:To use this MatchingEngineIndexEndpoint in another session:
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/365954090631/locations/us-central1/indexEndpoints/2402234994597560320')


In [22]:
#DEPLOYED_INDEX_ID = f"assessmentindexendpoint"
DEPLOYED_INDEX_ID =  f"assessment_deployed1"
print(DEPLOYED_INDEX_ID)

assessment_deployed1


In [23]:
# deploy the Index to the Index Endpoint

my_index_endpoint.deploy_index(index=my_index, deployed_index_id=DEPLOYED_INDEX_ID)

INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/365954090631/locations/us-central1/indexEndpoints/2402234994597560320
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/365954090631/locations/us-central1/indexEndpoints/2402234994597560320/operations/4373709130460823552
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:MatchingEngineIndexEndpoint index_endpoint Deployed index. Resource name: projects/365954090631/locations/us-central1/indexEndpoints/2402234994597560320


<google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint.MatchingEngineIndexEndpoint object at 0x78c704d81870> 
resource name: projects/365954090631/locations/us-central1/indexEndpoints/2402234994597560320

In [24]:
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint(
    index_endpoint_name ="projects/365954090631/locations/us-central1/indexEndpoints/2402234994597560320"
)

In [33]:
def text_embedding(text_to_embed) -> list:
    """Text embedding with a Large Language Model."""
    model = TextEmbeddingModel.from_pretrained("textembedding-gecko@002")
    embeddings = model.get_embeddings(text_to_embed)
    for embedding in embeddings:
        vector = embedding.values
        #print(f"Length of Embedding Vector: {len(vector)}")
    return vector

# #query the vector database
# #question =

from vertexai.language_models import  TextEmbeddingInput

QUESTION="is the content about food safety"

question_with_task_type = TextEmbeddingInput(
    text=QUESTION,
    task_type='RETRIEVAL_QUERY'
)



In [34]:
#question_embeddings = get_embeddings_wrapper(question_with_task_type)
question_embeddings = text_embedding([question_with_task_type])

In [35]:

question_embeddings[0]

0.0032463090028613806

In [53]:

# run query
response = my_index_endpoint.find_neighbors(
    deployed_index_id = DEPLOYED_INDEX_ID,
    queries = [question_embeddings],
    num_neighbors = 5
)

# if response.results:
#     print("Query successful! Neighbors found.")
#     for neighbor in response.results:
#         print(f"  - Distance: {neighbor.distance:.2f}, ID: {neighbor.id}")
# else:
#     print("Query failed or no neighbors found.")

In [56]:
response

[]

In [57]:
# from google.cloud import aiplatform

# # ... (Your existing code)

# # Create a query embedding
# #query_embedding = [1.0, 2.0, 3.0]  # Example embedding

# # Create an IndexEndpointService client
# index_endpoint_service = aiplatform.gapic.IndexEndpointService()


# # Send a query
# response = index_endpoint_service.search_index_endpoint(
#     index_endpoint="projects/365954090631/locations/us-central1/indexEndpoints/2402234994597560320",
#     deployed_index_id=DEPLOYED_INDEX_ID,  # Replace with your deployed index ID
#     queries=[query_embeddings],
#     num_neighbors=5
# )

# # Check the response
# if response.results:
#     print("Query successful! Neighbors found.")
#     for neighbor in response.results:
#         print(f"  - Distance: {neighbor.distance:.2f}, ID: {neighbor.id}")
# else:
#     print("Query failed or no neighbors found.")


In [38]:
# ... (Your existing code)

# Run the query
response = my_index_endpoint.find_neighbors(
    deployed_index_id=DEPLOYED_INDEX_ID,
    queries=[question_embeddings],
    num_neighbors=5
)

# Print the results
for idx, neighbor in enumerate(response[0]):
    print(f"{idx+1}. Distance: {neighbor.distance:.2f}, ID: {neighbor.id}")


IndexError: list index out of range

In [None]:
# # show the results
# for idx, neighbor in enumerate(response[0]):
#     print(f"{neighbor.distance:.2f} {neighbor.id}")


In [None]:
# documents = {}
# for index, neighbor in enumerate(response[0]):
#     id = str(neighbor.id)
#     document = db.collection(collection_name).document(id).get()
#     documents.append(document.to_dict["pages"])

In [None]:
# pages = "\n\n".join(documents)
# print(len(pages))
# pages

0


''

In [None]:
# # show the result
# import numpy as np

# for idx, neighbor in enumerate(response[0]):
#     id = np.int64(neighbor.id)
#     similar = df.query("id == @id", engine="python")
#     print(f"{neighbor.distance:.4f} {similar.title.values[0]}")

In [None]:
#  documents = {}
#     for index, neighbor in enumerate(respone[0]):
#         id = str(neighbor.id)
#         document = db.collection(collection_name).document(id).get()
#         documents.append(document.to_dict["page"])


#     pages = "\n\n".join(documents)

In [None]:
# create a firestore database for the pages