In [24]:
import os

from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import SupabaseVectorStore
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter

from supabase.client import Client, create_client
from dotenv import dotenv_values

In [25]:
# Set up environment variables
# Read the key/value pairs from the local .env file into a plain dict
# (nothing is exported to the process environment by this call itself).
env_vars = dotenv_values(".env")

# Fail fast with one clear message listing every setting that is absent or
# empty, instead of a bare KeyError for the first one (or a TypeError when a
# key is present without a value).
required_keys = ("OPENAI_API_KEY", "LANGCHAIN_API_KEY", "SUPABASE_URL", "SUPABASE_KEY")
missing_keys = [key for key in required_keys if not env_vars.get(key)]
if missing_keys:
    raise KeyError(f"Missing or empty setting(s) in .env: {', '.join(missing_keys)}")

# Export the API keys so libraries that read them from the environment can find them.
os.environ["OPENAI_API_KEY"] = env_vars["OPENAI_API_KEY"]
os.environ["LANGCHAIN_API_KEY"] = env_vars["LANGCHAIN_API_KEY"]
os.environ["LANGCHAIN_TRACING_V2"] = "true"

# Supabase credentials are kept as notebook variables and passed explicitly
# when the client is created further down.
supabase_url = env_vars["SUPABASE_URL"]
supabase_key = env_vars["SUPABASE_KEY"]

In [26]:
# Configure a loader for every PDF found (recursively) under ./sfdr-docs/.
# Nothing is read yet; the files are only loaded when `loader.load()` is called.
loader = DirectoryLoader(
    './sfdr-docs/',
    glob="**/*.pdf",
    loader_cls=PyPDFLoader,
    use_multithreading=True,
    show_progress=True,
)

In [27]:
# Run the loader configured above and keep the loaded documents in `docs`.
# NOTE(review): with PyPDFLoader this is presumably one Document per PDF page — confirm.
docs = loader.load()

100%|██████████| 3/3 [00:00<00:00,  7.53it/s]


In [28]:
# Add metadata
# Attach the public EUR-Lex URL of each regulation version to every document
# that was loaded from the corresponding PDF.
# Matching is done on the file name only, so it keeps working when the path
# prefix or separator differs (e.g. './sfdr-docs/...' or Windows backslashes).
source_urls_by_filename = {
    '12-07-2020-SFDR-amended.pdf': 'https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A02019R2088-20200712&qid=1715177988813',
    '09-01-2024-SFDR-consolidated.pdf': 'https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A02019R2088-20240109&qid=1715177988813',
    '27-11-2019-SFDR-original.pdf': 'https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A32019R2088&qid=1715177988813',
}

for doc in docs:
    source_url = source_urls_by_filename.get(os.path.basename(doc.metadata['source']))
    # Documents from files not listed above are left without a "source_url".
    if source_url is not None:
        doc.metadata["source_url"] = source_url

In [29]:
# Split the loaded documents into small overlapping chunks for embedding.
# Sizes are passed straight to CharacterTextSplitter.
CHUNK_SIZE = 300
CHUNK_OVERLAP = 50

text_splitter = CharacterTextSplitter(chunk_overlap=CHUNK_OVERLAP, chunk_size=CHUNK_SIZE)
# NOTE: `docs` is reassigned here, so re-running this cell on its own splits
# the already-split chunks again; re-run from the load cell instead.
docs = text_splitter.split_documents(docs)

In [30]:
# Embedding model used to turn text into vectors.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment set earlier — confirm.
embedding_function = OpenAIEmbeddings()

# Supabase client built from the URL and key read from .env.
supabase = create_client(supabase_url, supabase_key)

# Handle to the vector store backed by the "documents" table, using the
# "match_documents" query name for lookups.
vector_store = SupabaseVectorStore(
    embedding=embedding_function,
    client=supabase,
    query_name="match_documents",
    table_name="documents",
)

In [31]:
# Add documents to vectorstore
# Passes the chunked `docs` to the vector store and keeps the call's return
# value in `doc_ids`.
# NOTE(review): presumably every run inserts new rows, so re-running this cell
# may store duplicate chunks — confirm before re-executing.
doc_ids = vector_store.add_documents(docs)