In [40]:
# Show the kernel's working directory: the relative paths used later in this
# notebook ('.env' and the "Data" folder) are resolved against it.
# NOTE(review): the recorded output of this cell exposes an absolute local
# path; consider clearing it before sharing the notebook.
%pwd

'/Users/arda/Desktop/A.I./Projects/FinanceChatbot'

In [41]:
# Import libraries
import os
import warnings
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_qdrant import QdrantVectorStore
from langchain.embeddings import HuggingFaceEmbeddings

# Ignore all warnings
warnings.filterwarnings("ignore")

In [42]:
# Load environment variables from .env file
load_dotenv()

# Settings (read once, then validated below)
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
QDRANT_URL = os.getenv("QDRANT_URL")
COLLECTION_NAME = "finance-chatbot"
DATA_DIR = "Data"

# Report on the configuration. The values themselves are checked first: they
# can be present in the process environment without a .env file in the
# working directory, and in that case a "file not found" warning would be
# misleading. The .env existence check only decides which warning to print
# when something is actually missing.
if QDRANT_API_KEY and QDRANT_URL:
    print("Environment variables loaded successfully.")
elif not os.path.exists('.env'):
    print("Warning: .env file not found!")
else:
    print("Warning: QDRANT_API_KEY or QDRANT_URL not found in .env file!")

Environment variables loaded successfully.


In [43]:
# Load and extract data from PDFs
def load_pdf_file(data_dir):
    """Load the PDF files found in ``data_dir``.

    Args:
        data_dir: Directory handed to ``DirectoryLoader`` as the search root.

    Returns:
        The result of ``DirectoryLoader.load()`` for files matching the
        ``*.pdf`` glob, each one read with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data_dir, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

extracted_data = load_pdf_file(DATA_DIR)

# File names the success message below promises; the check compares against
# these names instead of only counting files.
EXPECTED_PDFS = {"Basics.pdf", "Statementanalysis.pdf", "Financialterms.pdf"}

# Verify the loaded PDFs by checking unique file sources
unique_files = set(doc.metadata.get('source', 'Unknown') for doc in extracted_data)
print(f"Number of unique PDF files loaded: {len(unique_files)}")
print("Loaded files:")
# Sorted so the listing is identical on every run (set order is arbitrary).
for i, file in enumerate(sorted(unique_files), 1):
    print(f"File {i}: {file}")

# Compare by base name so the directory prefix of each source path is ignored.
loaded_names = {os.path.basename(source) for source in unique_files}
missing_pdfs = sorted(EXPECTED_PDFS - loaded_names)

# Success requires both the right count and the right file names.
if len(unique_files) == len(EXPECTED_PDFS) and not missing_pdfs:
    print("Success: All 3 PDFs (Basics.pdf, Statementanalysis.pdf, Financialterms.pdf) have been loaded.")
else:
    print(f"Warning: Expected 3 PDFs, but {len(unique_files)} unique files were loaded. Check the Data directory.")
    if missing_pdfs:
        print(f"Missing expected files: {', '.join(missing_pdfs)}")

# Additional info: Total number of pages (documents)
print(f"Total number of pages loaded: {len(extracted_data)}")

Number of unique PDF files loaded: 3
Loaded files:
File 1: Data/Basics.pdf
File 2: Data/Statementanalysis.pdf
File 3: Data/Financialterms.pdf
Success: All 3 PDFs (Basics.pdf, Statementanalysis.pdf, Financialterms.pdf) have been loaded.
Total number of pages loaded: 547


In [44]:
# Split the data into text chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Chunk size forwarded to the splitter. Defaults to 500,
            the value that used to be hard-coded.
        chunk_overlap: Overlap between consecutive chunks, forwarded to the
            splitter. Defaults to 20, the value that used to be hard-coded.

    Returns:
        Whatever ``split_documents`` returns for ``extracted_data``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

# Chunk every loaded page, then report how many pieces were produced
text_chunks = text_split(extracted_data)
chunk_count = len(text_chunks)
print("Length of Text Chunks:", chunk_count)

Length of Text Chunks: 2756


In [45]:
# Build the Hugging Face embedding model used for every chunk and query
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# Embed a probe sentence and report the length of the resulting vector
query_result = embeddings.embed_query("Hello world")
embedding_dim = len(query_result)
print("Embedding Dimension:", embedding_dim)

Embedding Dimension: 384


In [47]:
# Create the Qdrant collection and upload every text chunk.
# force_recreate=True drops an existing collection of the same name first, so
# re-running this cell replaces the data instead of appending a second copy
# of every chunk to it.
try:
    qdrant = QdrantVectorStore.from_documents(
        documents=text_chunks,
        embedding=embeddings,
        url=QDRANT_URL,
        api_key=QDRANT_API_KEY,
        collection_name=COLLECTION_NAME,
        force_recreate=True,
    )
    print("Qdrant collection created and populated successfully.")
except Exception as e:
    print(f"Error creating Qdrant collection: {e}")
    # Re-raise so the failure stops a "Run All" here rather than leaving
    # `qdrant` undefined for any later cell that uses it.
    raise

Qdrant collection created and populated successfully.
