In [30]:
import asyncio
import os
import uuid

import pandas as pd
from langchain_google_cloud_sql_pg import PostgresEngine, PostgresVectorStore
from langchain_google_vertexai import VertexAIEmbeddings
from sqlalchemy import text

In [31]:
# Set this to True to use Cloud SQL, or to False to use the local pgvector
# Docker image instead.
# USE_CLOUDSQL = False
USE_CLOUDSQL = True

project_id = "imrenagi-gemini-experiment" #change this to your project id
region = "us-central1"
gemini_embedding_model = "text-embedding-004"

# The database password can be supplied through the PYCONAPAC_DB_PASSWORD
# environment variable so it does not have to live in the notebook. The
# literals below are demo-only fallbacks that keep "Run All" working.
if not USE_CLOUDSQL:
    # use pgvector docker image for local development
    database_password = os.environ.get("PYCONAPAC_DB_PASSWORD", "pyconapac")
    database_name = "pyconapac"
    database_user = "pyconapac"
    database_host = "localhost"
else:
    # use cloudsql credential if you want to use cloudsql
    # (database_host is looked up from the instance in a later cell)
    instance_name = "pyconapac-demo"
    database_password = os.environ.get("PYCONAPAC_DB_PASSWORD", "testing")
    database_name = 'testing'
    database_user = 'testing'

# Fail fast on empty settings before any gcloud or database call is made.
assert database_name, "⚠️ Please provide a database name"
assert database_user, "⚠️ Please provide a database user"
assert database_password, "⚠️ Please provide a database password"

# Table that receives the chunk embeddings.
embeddings_table_name = "course_content_embeddings"


In [32]:
#@markdown ###Authenticate your Google Cloud Account and enable APIs.
# Authenticate gcloud.
# from google.colab import auth
# auth.authenticate_user()

# Configure gcloud.
!gcloud config set project {project_id}

# Grant Cloud SQL Client role to authenticated user
current_user = !gcloud auth list --filter=status:ACTIVE --format="value(account)"
print(f"{current_user}")
# enable aiplatform apiservices

Updated property [core/project].
['imre.nagi2812@gmail.com']


In [33]:
# IAM and API setup that is only performed when targeting Cloud SQL.
# NOTE(review): the `user:` member prefix assumes the active gcloud account is
# a user account rather than a service account — confirm.
if USE_CLOUDSQL:
  print(f"Granting Cloud SQL Client role to {current_user[0]}")
  # Grant the Cloud SQL Client role (roles/cloudsql.client) to the first
  # active account reported by `gcloud auth list`.
  !gcloud projects add-iam-policy-binding {project_id} \
    --member=user:{current_user[0]} \
    --role="roles/cloudsql.client"
  # Enable Cloud SQL Admin API
  !gcloud services enable sqladmin.googleapis.com

Granting Cloud SQL Client role to imre.nagi2812@gmail.com


Updated IAM policy for project [imrenagi-gemini-experiment].
bindings:
- members:
  - serviceAccount:service-896489987664@gcp-sa-aiplatform-cc.iam.gserviceaccount.com
  role: roles/aiplatform.customCodeServiceAgent
- members:
  - serviceAccount:service-896489987664@gcp-sa-vertex-ex-cc.iam.gserviceaccount.com
  role: roles/aiplatform.extensionCustomCodeServiceAgent
- members:
  - serviceAccount:service-896489987664@gcp-sa-vertex-ex.iam.gserviceaccount.com
  role: roles/aiplatform.extensionServiceAgent
- members:
  - serviceAccount:service-896489987664@gcp-sa-vertex-rag.iam.gserviceaccount.com
  role: roles/aiplatform.ragServiceAgent
- members:
  - serviceAccount:service-896489987664@gcp-sa-aiplatform-re.iam.gserviceaccount.com
  role: roles/aiplatform.reasoningEngineServiceAgent
- members:
  - serviceAccount:service-896489987664@gcp-sa-aiplatform.iam.gserviceaccount.com
  role: roles/aiplatform.serviceAgent
- members:
  - serviceAccount:service-896489987664@gcp-sa-artifactregistry.iam.g

In [34]:
if USE_CLOUDSQL:
  #@markdown Create and setup a Cloud SQL PostgreSQL instance, if not done already.
  database_version = !gcloud sql instances describe {instance_name} --format="value(databaseVersion)"
  if database_version[0].startswith("POSTGRES"):
    print("Found an existing Postgres Cloud SQL Instance!")
  else:
    print("Creating new Cloud SQL instance...")
    !gcloud sql instances create {instance_name} --database-version=POSTGRES_15 \
      --region={region} --cpu=1 --memory=4GB --root-password={database_password} \
      --authorized-networks=0.0.0.0/0
  # Create the database, if it does not exist.
  out = !gcloud sql databases list --instance={instance_name} --filter="NAME:{database_name}" --format="value(NAME)"
  if ''.join(out) == database_name:
    print("Database %s already exists, skipping creation." % database_name)
  else:
    !gcloud sql databases create {database_name} --instance={instance_name}
  # Create the database user for accessing the database.
  !gcloud sql users create {database_user} \
    --instance={instance_name} \
    --password={database_password}

Found an existing Postgres Cloud SQL Instance!
Database testing already exists, skipping creation.
Creating Cloud SQL user...done.                                                
Created user [testing].


In [35]:
if USE_CLOUDSQL:
    # get the ip address of the instance
    ip_addresses = !gcloud sql instances describe {instance_name} --project {project_id} --format 'value(ipAddresses.ipAddress)'
    # Split the IP addresses and take the first one
    database_host = ip_addresses[0].split(';')[0].strip()
    print(f"Using database host: {database_host}")

Using database host: 35.232.5.157


In [36]:
# Build a connection URL from the configured user, password, host and database
# name, using port 5432.
# NOTE(review): user and password are interpolated without URL-escaping —
# confirm they never contain reserved characters such as '@', ':' or '/'.
# NOTE(review): the bare expression below echoes the full URL, password
# included, into the cell output; clear the output before sharing.
db_conn_string = f"postgres://{database_user}:{database_password}@{database_host}:5432/{database_name}"
db_conn_string

'postgres://testing:testing@35.232.5.157:5432/testing'

In [37]:
# Load the course content; lines=True parses the file as one JSON record per line.
df = pd.read_json("course_content.jsonl", lines=True)
# Preview the first five rows (the default for head()).
df.head()

Unnamed: 0,id,title,content,file_path,slug
0,1,REST Security Cheat Sheet,# REST Security Cheat Sheet\n\n## Introduction...,sources/REST_Security_Cheat_Sheet.md,rest-security-cheat-sheet
1,2,Forgot Password Cheat Sheet,# Forgot Password Cheat Sheet\n\n## Introducti...,sources/Forgot_Password_Cheat_Sheet.md,forgot-password-cheat-sheet
2,3,Authentication Cheat Sheet,# Authentication Cheat Sheet\n\n## Introductio...,sources/Authentication_Cheat_Sheet.md,authentication-cheat-sheet
3,4,Password Storage Cheat Sheet,# Password Storage Cheat Sheet\n\n## Introduct...,sources/Password_Storage_Cheat_Sheet.md,password-storage-cheat-sheet
4,5,Authorization Cheat Sheet,# Authorization Cheat Sheet\n\n## Introduction...,sources/Authorization_Cheat_Sheet.md,authorization-cheat-sheet


In [38]:
from langchain.text_splitter import MarkdownTextSplitter
from langchain_core.documents import Document

# Markdown-aware splitter: chunks of at most 1000 units with a 200-unit overlap
# (units come from the splitter's length function — characters by default,
# per the LangChain docs).
text_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=200)

# One metadata dict per source row. .tolist() yields native Python values, so
# the metadata stays plain-JSON friendly.
chunk_metadatas = [
    {"course_content_id": course_content_id, "title": title}
    for course_content_id, title in zip(df["id"].tolist(), df["title"].tolist())
]

# create_documents splits every text and attaches the metadata at the same
# position to each resulting chunk, so no manual re-wrapping into Document
# objects is needed.
chunked = text_splitter.create_documents(
    df["content"].tolist(),
    metadatas=chunk_metadatas,
)

chunked[0]

Document(metadata={'course_content_id': 1, 'title': 'REST Security Cheat Sheet'}, page_content="# REST Security Cheat Sheet\n\n## Introduction\n\n[REST](http://en.wikipedia.org/wiki/Representational_state_transfer) (or **RE**presentational **S**tate **T**ransfer) is an architectural style first described in [Roy Fielding](https://en.wikipedia.org/wiki/Roy_Fielding)'s Ph.D. dissertation on [Architectural Styles and the Design of Network-based Software Architectures](https://www.ics.uci.edu/~fielding/pubs/dissertation/top.htm).\n\nIt evolved as Fielding wrote the HTTP/1.1 and URI specs and has been proven to be well-suited for developing distributed hypermedia applications. While REST is more widely applicable, it is most commonly used within the context of communicating with services via HTTP.")

In [39]:
# Overrides the embedding model name assigned in the configuration cell
# ("text-embedding-004"); the embeddings service created afterwards uses this value.
# NOTE(review): confirm which model is intended and keep a single assignment,
# so the configuration cell reflects what actually runs.
gemini_embedding_model = "textembedding-gecko@latest"

In [40]:
import time

import vertexai
from langchain_google_vertexai import VertexAIEmbeddings

# Point the Vertex AI SDK at the configured project and region.
vertexai.init(location=region, project=project_id)
# Embedding client for the model named by `gemini_embedding_model`.
embeddings_service = VertexAIEmbeddings(model_name=gemini_embedding_model)

In [41]:
async def create_vectorstore():
    engine = await PostgresEngine.afrom_instance(
        project_id,
        region,
        instance_name,
        database_name,
        user=database_user,
        password=database_password,
    )

    await engine.ainit_vectorstore_table(
        table_name=embeddings_table_name, vector_size=768, overwrite_existing=True
    )

    vector_store = await PostgresVectorStore.create(
        engine,
        table_name=embeddings_table_name,
        embedding_service=embeddings_service,
    )

    ids = [str(uuid.uuid4()) for i in range(len(chunked))]
    await vector_store.aadd_documents(chunked, ids=ids)

await create_vectorstore()