# Data Preparation


In [24]:
# installing dependencies
%pip install --upgrade --quiet \
    google-cloud-aiplatform \
    langchain \
    langchain_core \
    langchain_community \
    langchain-google-vertexai \
    langchain-openai \
    langchain_postgres \
    psycopg \
    cloudpickle \
    pydantic \
    langchain_google_community \
    google-cloud-discoveryengine \
    google-api-python-client \
    google-auth


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
# dependency imports
from langchain_google_vertexai import VertexAIEmbeddings

from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import MarkdownTextSplitter

from langchain_core.documents import Document
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector

import os

In [42]:
# constan definitions
PROJECT_ID = "imrenagi-gemini-experiment" #change this to your project id
LOCATION = "us-central1" #change this to project location

NUM_OF_DOCUMENTS = 2 # number of documents to be processed
DOCUMENT_DIRECTORY = "CheatSheetSeries/cheatsheets" # directory of the documents

GEMINI_EMBEDDING_MODEL = "text-embedding-004"

PGVEC_CONNECTION = "postgresql+psycopg://pyconapac:pyconapac@localhost:5432/pyconapac"  # Uses psycopg3!
PGVEC_COLLECTION_NAME = "courses"

import vertexai
vertexai.init(project=PROJECT_ID, location=LOCATION)

In [43]:

markdown_files = [f for f in os.listdir(DOCUMENT_DIRECTORY) if f.endswith('.md')][:NUM_OF_DOCUMENTS]
data = []

for file in markdown_files:
    markdown_path = os.path.join(DOCUMENT_DIRECTORY, file)
    loader = UnstructuredMarkdownLoader(markdown_path)
    data.extend(loader.load())

print(f"Loaded {len(data)} documents.")

for doc in data:
  print("title: ", doc.metadata["source"])

Loaded 2 documents.
title:  CheatSheetSeries/cheatsheets/Symfony_Cheat_Sheet.md
title:  CheatSheetSeries/cheatsheets/Deserialization_Cheat_Sheet.md


In [47]:
# Initialize the MarkdownTextSplitter
text_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=200)

split_docs = text_splitter.split_documents(data)
for doc in split_docs:
  doc.metadata["content_length"] = len(doc.page_content)

print(f"Number of split documents: {len(split_docs)}")
print(f"{split_docs[0].page_content[:200]}...")

Number of split documents: 54
Symfony Cheat Sheet

Introduction

This cheat sheet aims to provide developers with security tips when building applications using the Symfony framework.
It covers common vulnerabilities and best prac...


In [49]:
# Initialize the a specific Embeddings Model version
embeddings_model = VertexAIEmbeddings(model_name=GEMINI_EMBEDDING_MODEL)

# Create embeddings for all split documents
embeddings = embeddings_model.embed_documents([doc.page_content for doc in split_docs])

print(f"Number of embeddings created: {len(embeddings)}")
print(f"Dimension of each embedding: {len(embeddings[0])}")

print("Embeddings:", embeddings[0])

Number of embeddings created: 54
Dimension of each embedding: 768
Embeddings: [-0.0014167791232466698, -0.018394969403743744, -0.07092701643705368, -0.04370620474219322, 0.015060554258525372, -0.030686041340231895, 0.04158584028482437, 0.043601199984550476, -0.012581294402480125, 0.05282050743699074, -0.006957633420825005, 0.009388705715537071, 0.06234574317932129, -0.032270774245262146, -0.023218780755996704, -0.06949851661920547, -0.01650051400065422, -0.04405389353632927, -0.08122014999389648, 0.03288331255316734, 0.032570090144872665, -0.0045962524600327015, 0.008447544649243355, -0.0019308238988742232, -0.07021798938512802, -0.0037519768811762333, 0.016630006954073906, 0.0005860924138687551, -0.03136564791202545, 0.03141026198863983, 0.005134090315550566, 0.01834082417190075, 0.01607775129377842, -0.016795029863715172, 0.014221941120922565, 0.00489181699231267, 0.002634611912071705, 0.03677445650100708, 0.031205855309963226, -0.021951455622911453, -0.008959117345511913, 0.04643293

In [8]:
# See docker command above to launch a postgres instance with pgvector enabled.
vectorstore = PGVector.from_documents(
                embedding=embeddings_model,
                documents=split_docs,
                connection=PGVEC_CONNECTION,
                collection_name=PGVEC_COLLECTION_NAME,
                use_jsonb=True,
                async_mode=False,
            )

In [50]:
results = vectorstore.similarity_search(
    "authentication", k=10,
)
for doc in results:
    print(f"* {doc.page_content} [{doc.metadata}]")

* Authentication Cheat Sheet

Introduction

Authentication (AuthN) is the process of verifying that an individual, entity, or website is who or what it claims to be by determining the validity of one or more authenticators (like passwords, fingerprints, or security tokens) that are used to back up this claim.

Digital Identity is the unique representation of a subject engaged in an online transaction. A digital identity is always unique in the context of a digital service but does not necessarily need to be traceable back to a specific real-life subject.

Identity Proofing establishes that a subject is actually who they claim to be. This concept is related to KYC concepts and it aims to bind a digital identity with a real person. [{'source': 'CheatSheetSeries/cheatsheets/Authentication_Cheat_Sheet.md', 'content_length': 737}]
* Identity Proofing establishes that a subject is actually who they claim to be. This concept is related to KYC concepts and it aims to bind a digital identity wi