# Data Preparation

In [24]:
# installing dependencies
%pip install --upgrade --quiet \
    google-cloud-aiplatform \
    langchain \
    langchain_core \
    langchain_community \
    langchain-google-vertexai \
    langchain-openai \
    langchain_postgres \
    psycopg \
    cloudpickle \
    pydantic \
    langchain_google_community \
    google-cloud-discoveryengine \
    google-api-python-client \
    google-auth


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
# dependency imports
from langchain_openai import OpenAIEmbeddings

from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import MarkdownTextSplitter

from langchain_core.documents import Document
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector

import os

In [26]:
# Constant definitions.
# Name of the OpenAI embedding model used to vectorize the document chunks.
OPENAI_EMBEDDING_MODEL = "text-embedding-3-small"
# API key read from the environment; None when OPENAI_API_KEY is not set.
OPENAI_APIKEY = os.getenv('OPENAI_API_KEY')

In [5]:
# Load one cheat sheet from the local CheatSheetSeries checkout (path is relative
# to the notebook's working directory).
markdown_path = "CheatSheetSeries/cheatsheets/Authentication_Cheat_Sheet.md"
loader = UnstructuredMarkdownLoader(file_path=markdown_path)
# `data` is the list of Document objects returned by the loader; it is the input
# to the text splitter in the next step.
data = loader.load()

[Document(metadata={'source': 'CheatSheetSeries/cheatsheets/Authentication_Cheat_Sheet.md'}, page_content='Authentication Cheat Sheet\n\nIntroduction\n\nAuthentication (AuthN) is the process of verifying that an individual, entity, or website is who or what it claims to be by determining the validity of one or more authenticators (like passwords, fingerprints, or security tokens) that are used to back up this claim.\n\nDigital Identity is the unique representation of a subject engaged in an online transaction. A digital identity is always unique in the context of a digital service but does not necessarily need to be traceable back to a specific real-life subject.\n\nIdentity Proofing establishes that a subject is actually who they claim to be. This concept is related to KYC concepts and it aims to bind a digital identity with a real person.\n\nSession Management is a process by which a server maintains the state of an entity interacting with it. This is required for a server to remembe

In [43]:
# Chunk the loaded document(s) with a Markdown-aware splitter:
# chunk_size=1000 with chunk_overlap=200 between consecutive chunks.
text_splitter = MarkdownTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
split_docs = text_splitter.split_documents(data)

# Record each chunk's text length in its metadata under "content_length".
for doc in split_docs:
    doc.metadata.update(content_length=len(doc.page_content))

# Quick sanity check: chunk count and a 200-character preview of the first chunk.
print(f"Number of split documents: {len(split_docs)}")
print(f"{split_docs[0].page_content[:200]}...")

Number of split documents: 36
Authentication Cheat Sheet

Introduction

Authentication (AuthN) is the process of verifying that an individual, entity, or website is who or what it claims to be by determining the validity of one or...


In [44]:
# Display the first chunk's metadata: the 'source' path plus the
# 'content_length' entry written by the splitting cell.
split_docs[0].metadata

{'source': 'CheatSheetSeries/cheatsheets/Authentication_Cheat_Sheet.md',
 'content_length': 737}

In [28]:
# Embedding client configured with the model and API key from the constants cell.
embeddings_model = OpenAIEmbeddings(
    api_key=OPENAI_APIKEY,
    model=OPENAI_EMBEDDING_MODEL,
)

# Embed the text of every chunk. These vectors are only inspected here (count and
# dimension); the vector store cell is given `embeddings_model` and `split_docs`,
# not this `embeddings` list.
chunk_texts = [doc.page_content for doc in split_docs]
embeddings = embeddings_model.embed_documents(chunk_texts)

print(f"Number of embeddings created: {len(embeddings)}")
print(f"Dimension of each embedding: {len(embeddings[0])}")

Number of embeddings created: 36
Dimension of each embedding: 1536


In [45]:
# Requires a running Postgres instance with the pgvector extension enabled.
# The "+psycopg" driver suffix selects psycopg3. Set PGVECTOR_CONNECTION to point
# at a different database; the default is the local development instance.
connection = os.environ.get(
    "PGVECTOR_CONNECTION",
    "postgresql+psycopg://pyconapac:pyconapac@localhost:5432/pyconapac",
)
collection_name = "my_docs"

# Embed every chunk with `embeddings_model` and store it in the collection.
# No ids are supplied, so without pre_delete_collection=True each re-run of this
# cell would append another copy of every chunk and similarity search would return
# duplicates. Dropping the existing collection first keeps the cell idempotent
# under Restart & Run All.
vectorstore = PGVector.from_documents(
    documents=split_docs,
    embedding=embeddings_model,
    connection=connection,
    collection_name=collection_name,
    pre_delete_collection=True,
    use_jsonb=True,
    async_mode=False,
)

In [None]:
# Smoke-test retrieval: fetch the 10 stored chunks most similar to the query
# and print each chunk's text followed by its metadata.
results = vectorstore.similarity_search(query="authentication", k=10)

for doc in results:
    print(f"* {doc.page_content} [{doc.metadata}]")