In [1]:
# Ollama serves as the backend to host the LLM
from langchain_community.llms import Ollama

# packages to help load in the pdf file. 
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

from langchain_huggingface import HuggingFaceEmbeddings
from bs4 import BeautifulSoup as Soup
from langchain.utils.html import (PREFIXES_TO_IGNORE_REGEX,
                                  SUFFIXES_TO_IGNORE_REGEX)

from config import *
import logging
import sys
import os


In [2]:
INDEX_PERSIST_DIRECTORY = os.getenv('INDEX_PERSIST_DIRECTORY', "./data/chromadb")

In [3]:
# file storage for pdfs
dir_path = "file_storage/"
files = [f for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f)) and f[0] != '.']

In [4]:
# define the embedding creator and the db initialization
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
collection_name = 'de-confluence'
chroma_db = Chroma(collection_name, embeddings, persist_directory=INDEX_PERSIST_DIRECTORY)

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange


In [5]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

In [6]:
for i in dir_path:
    document = PyPDFLoader(
        file_path=dir_path + files[0]
    ).load()
    
    document_split = text_splitter.split_documents(document)
    chroma_db.add_documents(document_split)

In [7]:
chroma_db.get()

{'ids': ['000002f7-2d5b-4b21-9ed9-81501becc53e',
  '0000778a-88da-44db-9209-81f5c0fbe199',
  '000615c2-e052-4bf4-98a1-a390330bdd5e',
  '00064b86-22b9-4580-a33a-82bf680ccfc4',
  '000dede0-96db-4683-a370-8605537f9dc6',
  '001370a5-3ba3-4785-92ee-fef8e95ecbc3',
  '0015c4a0-5ddb-4fbc-b318-2e50ccc8d46e',
  '0016f63c-4865-4091-83ee-badc4d98c7f2',
  '001ee087-2bca-4174-8bb6-fe4c4dfb0e77',
  '002a11af-488f-44fe-ade3-163e5555c9d5',
  '002e5602-c9d5-4f67-b3f5-22126454b0f4',
  '0031565d-088a-4749-aef9-835e473f71d8',
  '003e33b6-8b5b-4afa-9fd3-a88e7652fe76',
  '003f7e31-dacb-43d5-b3b2-34d58697bd46',
  '003f8bdc-ab13-41e0-8aea-c52e71a23d32',
  '0044d7c2-fe66-4ea8-952f-65253425ff0e',
  '00467ff0-82d0-4a27-bb57-97f5b071d427',
  '0047d693-edd6-4b9a-bafe-a98035d9c548',
  '0049df94-d25c-4753-97e7-9f88378681c3',
  '00558c1c-c19b-46e4-b348-ccb795a1188a',
  '00598b1f-3637-47d3-b0a9-f2fcb49080e6',
  '005e19b5-7538-4a76-8fe4-0b24cd7bcffb',
  '0060ae55-9ab8-4c01-8e28-f0b6eba9e01e',
  '00622b89-d2d1-43c0-983d-

In [8]:
!pip install -U langchain-huggingface

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting langchain-huggingface
  Downloading langchain_huggingface-0.0.3-py3-none-any.whl.metadata (1.2 kB)
Downloading langchain_huggingface-0.0.3-py3-none-any.whl (17 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.0.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
