In [19]:
!gcloud auth application-default login

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login&state=Tyl2uIlK696vDs6cAcp5BnpZoyvxxH&access_type=offline&code_challenge=WHhJLV9TjVXDIXHHrMuU9Fv82Dzsx97Qpt0p0GCBWyA&code_challenge_method=S256


Credentials saved to file: [C:\Users\sandi\AppData\Roaming\gcloud\application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).

Quota project "cloudrun-test-415115" was added to ADC which can be used by Google client libraries for billing and quota. Note that some services may still bill the project owning the resource.


In [20]:
from google.cloud import storage
from vertexai.language_models import TextEmbeddingModel
from google.cloud import aiplatform
import PyPDF2

import re
import os
import random
import json
import uuid

In [41]:
# Initialize variables

# Google Cloud project and region used for Vertex AI and the storage bucket.
project = "cloudrun-test-415115"
location = "us-central1"

# Raw strings keep the Windows backslashes literal. In a normal string,
# sequences such as "\P" or "\D" are invalid escapes that newer Python
# versions warn about, and a folder starting with "t" or "n" would silently
# turn into a tab or newline.
pdf_path = r"D:\Projects\Dementia-Hackathon\Data\WhatIsDementia.pdf"
bucket_name = "dementia-hackathon-text-embeddings"
embed_file_path = r"D:\Projects\Dementia-Hackathon\Files\dementia_embeddings.json"
sentence_file_path = r"D:\Projects\Dementia-Hackathon\Files\dementia_sentences.json"
index_name = "dementia_index"

In [22]:
# Extract sentences from PDF

def extract_sentences_from_pdf(pdf_path):
    """Read a PDF and split its text into rough sentences.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        list[str]: Non-empty, whitespace-stripped chunks obtained by
        splitting the combined page text on ". " (the separator itself is
        dropped). Pages whose extraction yields None are skipped.
    """
    page_texts = []
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            # Extract once per page and reuse the result instead of
            # repeating the extraction work for the None check.
            page_text = page.extract_text()
            if page_text is not None:
                # The trailing space keeps words on consecutive pages apart.
                page_texts.append(page_text + " ")
    text = "".join(page_texts)
    return [chunk.strip() for chunk in text.split('. ') if chunk.strip()]

In [23]:
# Generate Embeddings

def generate_text_embeddings(sentences, batch_size: int = 5) -> list:
    """Embed sentences with the Vertex AI "textembedding-gecko@001" model.

    The texts are sent in batches rather than in one request, because the
    embeddings endpoint limits how many texts a single request may carry.
    NOTE(review): the documented per-request limit depends on model and
    region; 5 is a conservative default -- confirm and raise if appropriate.

    Args:
        sentences: Iterable of strings to embed.
        batch_size: Maximum number of texts sent per request (>= 1).

    Returns:
        list: One vector (``embedding.values``) per input sentence, in input
        order. An empty input returns ``[]`` without contacting Vertex AI.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    sentences = list(sentences)
    if not sentences:
        # Nothing to embed, so skip SDK initialisation and the API call.
        return []

    # Project and region come from the notebook's configuration cell.
    aiplatform.init(project=project, location=location)
    model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

    vectors = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        embeddings = model.get_embeddings(batch)
        vectors.extend(embedding.values for embedding in embeddings)
    return vectors

In [24]:
#Save Embeddings

def generate_and_save_embeddings(pdf_path, sentence_file_path, embed_file_path):
    """Extract sentences from a PDF, embed them, and write two JSON Lines files.

    Each sentence gets a fresh UUID that is written to both files, so a
    record in the embeddings file can be joined back to its sentence text.

    Args:
        pdf_path: Path of the PDF to process.
        sentence_file_path: Output path; one ``{"id", "sentence"}`` JSON
            object per line.
        embed_file_path: Output path; one ``{"id", "embedding"}`` JSON
            object per line.

    Returns:
        None. If the PDF yields no usable sentences, no file is written.

    Raises:
        ValueError: If the number of embeddings returned differs from the
            number of sentences submitted.
    """
    def clean_text(text):
        without_bullets = re.sub(r'\u2022', '', text)  # Remove bullet points
        return re.sub(r'\s+', ' ', without_bullets).strip()  # Collapse whitespace

    raw_sentences = extract_sentences_from_pdf(pdf_path)
    # Clean BEFORE embedding so the stored sentence is exactly the text its
    # vector was computed from; drop sentences that cleaning leaves empty.
    sentences = [cleaned for cleaned in map(clean_text, raw_sentences) if cleaned]
    if not sentences:
        return

    embeddings = generate_text_embeddings(sentences)
    if len(embeddings) != len(sentences):
        # zip() would silently truncate and leave sentences without vectors.
        raise ValueError(
            f"Expected {len(sentences)} embeddings, got {len(embeddings)}"
        )

    with open(embed_file_path, 'w', encoding='utf-8') as embed_file, \
            open(sentence_file_path, 'w', encoding='utf-8') as sentence_file:
        for sentence, embedding in zip(sentences, embeddings):
            item_id = str(uuid.uuid4())  # shared key between the two files

            sentence_item = {"id": item_id, "sentence": sentence}
            embed_item = {"id": item_id, "embedding": embedding}

            sentence_file.write(json.dumps(sentence_item) + '\n')
            embed_file.write(json.dumps(embed_item) + '\n')

In [39]:
#Upload file to GCS bucket

def upload_file(bucket_name,file_path):
    """Upload a local file to a Cloud Storage bucket, creating the bucket if needed.

    Args:
        bucket_name: Name of the target bucket.
        file_path: Local path of the file to upload. The blob is named after
            the final path component; both "\\" and "/" are treated as
            separators.

    Returns:
        None.
    """
    storage_client = storage.Client()

    # Reuse the bucket when it already exists so the cell can be re-run;
    # create_bucket() fails once the bucket has been created.
    bucket = storage_client.lookup_bucket(bucket_name)
    if bucket is None:
        bucket = storage_client.create_bucket(bucket_name, location=location)

    # Split on either separator so Windows and POSIX style paths both work.
    file_name = re.split(r"[\\/]", file_path)[-1]
    blob = bucket.blob(file_name)
    blob.upload_from_filename(file_path)

In [26]:
#Create Vector Index

def create_vector_index(bucket_name, index_name, dimensions=768, approximate_neighbors_count=10):
    """Create a Tree-AH vector index from a bucket and deploy it to a public endpoint.

    Args:
        bucket_name: Bucket whose contents are passed as the index's
            ``contents_delta_uri`` ("gs://<bucket_name>").
        index_name: Used as the display name of both the index and the
            endpoint, and as the deployed index id.
        dimensions: Length of the stored vectors. NOTE(review): 768 is
            assumed to match the embedding model's output size -- confirm
            if the model changes.
        approximate_neighbors_count: Forwarded unchanged to
            ``create_tree_ah_index``.

    Returns:
        tuple: ``(index, index_endpoint)`` as returned by the SDK, so the
        caller can keep using them in the same session.
    """
    # Initialise here as well so this step does not depend on
    # generate_text_embeddings() having been run earlier in the kernel.
    aiplatform.init(project=project, location=location)

    dt_rag_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
        display_name=index_name,
        contents_delta_uri="gs://" + bucket_name,
        dimensions=dimensions,
        approximate_neighbors_count=approximate_neighbors_count,
    )

    dt_rag_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
        display_name=index_name,
        public_endpoint_enabled=True,
    )

    dt_rag_index_endpoint.deploy_index(
        index=dt_rag_index, deployed_index_id=index_name
    )

    return dt_rag_index, dt_rag_index_endpoint

In [27]:
# Extract sentences from the PDF, embed them, and write the sentence and
# embedding JSON Lines files configured above.
generate_and_save_embeddings(pdf_path,sentence_file_path,embed_file_path)

In [42]:
# Upload only the embeddings file to the bucket; the sentence file is not
# uploaded here.
upload_file(bucket_name,embed_file_path)

In [43]:
# Build the vector index from the bucket contents, create an index endpoint,
# and deploy the index to it.
create_vector_index(bucket_name, index_name)

Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/1025759812276/locations/us-central1/indexes/4329942760879554560/operations/2854268173513916416
MatchingEngineIndex created. Resource name: projects/1025759812276/locations/us-central1/indexes/4329942760879554560
To use this MatchingEngineIndex in another session:
index = aiplatform.MatchingEngineIndex('projects/1025759812276/locations/us-central1/indexes/4329942760879554560')
Creating MatchingEngineIndexEndpoint
Create MatchingEngineIndexEndpoint backing LRO: projects/1025759812276/locations/us-central1/indexEndpoints/4083089206304309248/operations/7130436019702202368
MatchingEngineIndexEndpoint created. Resource name: projects/1025759812276/locations/us-central1/indexEndpoints/4083089206304309248
To use this MatchingEngineIndexEndpoint in another session:
index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/1025759812276/locations/us-central1/indexEndpoints/4083089206304309248')
Deploying index 