In [None]:
# Make sure you are in a conda environment with the dependencies from requirements.txt installed.
# From the root of the repo, run:
# conda env create -f environment.yml
# conda activate semantic_retrieval
# Then, when selecting the kernel for this notebook, pick the conda semantic_retrieval kernel.

In [1]:
# Install the local package in editable mode (-e) into this kernel's environment.
# `.` is the current working directory, so run this from the directory containing pyproject.toml.
%pip install -e .

Obtaining file:///Users/suyogsonwalkar/Projects/semantic-retrieval/python
  Installing build dependencies ... [?25ldone
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25ldone
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
[?25hBuilding wheels for collected packages: semantic-retrieval
  Building editable for semantic-retrieval (pyproject.toml) ... [?25ldone
[?25h  Created wheel for semantic-retrieval: filename=semantic_retrieval-0.1.0-0.editable-py3-none-any.whl size=1704 sha256=01e0d701c2e42807dda4b44266f38e67d28477a94b2a77446bc9acc508169b7b
  Stored in directory: /private/var/folders/6m/pqj9tppx6vdg693y_4vlbs2r0000gn/T/pip-ephem-wheel-cache-j9tmqc25/wheels/67/f8/8a/49b91a1167a230dc2c74ea3f6c474e28026431d337e3e4c4a2
Successfully built semantic-retrieval
Installing collected packages: semantic-retrieval
  Attempting uninstall: semantic-retrieval
    Found existing installation: sema

In [2]:
# You need a .env environment file (I put mine in the root of the project for this notebook).
# It should define OPENAI_API_KEY, PINECONE_INDEX_NAME, PINECONE_ENVIRONMENT, and PINECONE_API_KEY.
# Otherwise, creating the embeddings and uploading to Pinecone will error out.

In [3]:
import os
from semantic_retrieval.document.metadata.in_memory_document_metadata_db import (
    InMemoryDocumentMetadataDB,
)
from semantic_retrieval.data_store.vector_dbs.pinecone_vector_db import PineconeVectorDB, PineconeVectorDBConfig
from semantic_retrieval.transformation.embeddings.openai_embeddings import OpenAIEmbeddings, OpenAIEmbeddingsConfig
from semantic_retrieval.ingestion.data_sources.fs.file_system import FileSystem
from semantic_retrieval.document_parsers.multi_document_parser import (
    MultiDocumentParser,
    ParserConfig,
)
from semantic_retrieval.transformation.document.text.separator_text_chunker import (
    SeparatorTextChunker,
    SeparatorTextChunkerParams,
)
from semantic_retrieval.transformation.document.text.text_chunk_transformer import (
    TextChunkConfig,
)
from dotenv import load_dotenv

  from tqdm.autonotebook import tqdm


In [4]:
# Document metadata store shared by the parser, the chunker, and the vector DB upload below.
metadata_db = InMemoryDocumentMetadataDB()


def _require_env(name: str) -> str:
    """Return the value of environment variable `name`; raise if it is unset or empty."""
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Missing required environment variable {name!r}; "
            "define it in your .env file or in the kernel's environment."
        )
    return value


async def test_create_index():
    """Ingest the Don Quixote example text and upload its chunks to a Pinecone index.

    Steps, in order:
      1. Load environment variables from a .env file via `load_dotenv()`.
      2. Load the example file through `FileSystem`.
      3. Parse the raw documents with `MultiDocumentParser`.
      4. Chunk the parsed documents with `SeparatorTextChunker`
         (chunk_size_limit=500, chunk_overlap=100, size_fn=len).
      5. Embed the chunks with `OpenAIEmbeddings` and upload them through
         `PineconeVectorDB.from_documents`.

    Required environment variables: PINECONE_INDEX_NAME, PINECONE_API_KEY,
    PINECONE_ENVIRONMENT, OPENAI_API_KEY. PINECONE_NAMESPACE is optional and
    defaults to "abc".

    Raises:
        RuntimeError: if any required environment variable is unset or empty.
    """
    load_dotenv()

    # Resolve every credential up front so that a missing key fails immediately
    # with a clear message, instead of after parsing/chunking with an opaque
    # error from the Pinecone or OpenAI client.
    pinecone_index_name = _require_env("PINECONE_INDEX_NAME")
    pinecone_api_key = _require_env("PINECONE_API_KEY")
    pinecone_environment = _require_env("PINECONE_ENVIRONMENT")
    pinecone_namespace = os.getenv("PINECONE_NAMESPACE", "abc")
    openai_api_key = _require_env("OPENAI_API_KEY")

    # other_example_dir = "examples/example_data/financial_report/portfolios"
    # The path is relative to the kernel's current working directory.
    example_dir = "../examples/example_data/ingestion/DonQuixote.txt"
    cwd = os.path.normpath(os.getcwd())
    full_path = os.path.join(cwd, example_dir)
    file_system = FileSystem(full_path)
    raw_documents = file_system.load_documents()

    parsed_documents = await MultiDocumentParser().parse_documents(
        raw_documents,
        parser_config=ParserConfig(
            # TODO: Add Access Control Policy Factory & need separate docs / users for this
            metadata_db=metadata_db, access_control_policy_factory=None
        ),
    )

    document_transformer = SeparatorTextChunker(
        SeparatorTextChunkerParams(
            metadata_db=metadata_db,
            text_chunk_config=TextChunkConfig(
                chunk_size_limit=500, chunk_overlap=100, size_fn=len
            ),
        )
    )

    # Transform (chunk) the parsed documents
    transformed_documents = await document_transformer.transform_documents(
        parsed_documents
    )

    # Create the embeddings and upload them to the Pinecone index
    await PineconeVectorDB.from_documents(
        transformed_documents,
        PineconeVectorDBConfig(
            index_name=pinecone_index_name,
            api_key=pinecone_api_key,
            environment=pinecone_environment,
            namespace=pinecone_namespace,
        ),
        embeddings=OpenAIEmbeddings(
            OpenAIEmbeddingsConfig(api_key=openai_api_key)
        ),
        metadata_db=metadata_db,
    )

In [5]:
# Run the ingestion pipeline. Top-level `await` is supported directly in notebook cells.
await test_create_index()