In [1]:
# Necessary Libraries for Vector Database
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader           # Extract the text from the dataset and convert it into a format that LangChain can work with
from langchain_text_splitters import RecursiveCharacterTextSplitter                     # Split the full text of the loaded documents into meaningful chunks
from langchain_huggingface import HuggingFaceEmbeddings                                 # Convert text chunks into numerical representations
from langchain_community.vectorstores import FAISS                                      # Used to store, index, and efficiently search a large collection of vector embeddings

from dotenv import load_dotenv



In [8]:
# Constant Variables
# Filesystem locations shared by the cells below; both are relative to the
# directory the notebook is run from.
DATASET_PATH = "./data/"                              # folder scanned for the source PDF files
VECTOR_DATABASE_PATH = "./vector_database/db_faiss"   # folder the FAISS index is saved to

In [3]:
# Dataset Preparation
# Load every file matching "*.pdf" in DATASET_PATH with PyPDFLoader. The result
# is a list of Document objects (the recorded output shows one per PDF page).
docs_loader = DirectoryLoader(DATASET_PATH, glob="*.pdf", loader_cls=PyPDFLoader)
medical_docs = docs_loader.load()

# Dataset Checking
# Fail fast on an empty result: if no PDF matched the glob, continuing would
# only surface later as a confusing error when the vector index is built from
# zero chunks.
if not medical_docs:
    raise ValueError(
        f"No PDF pages were loaded from {DATASET_PATH!r}; "
        "check that the directory contains at least one readable .pdf file."
    )

print(f"Number of PDF pages: {len(medical_docs)}")

Number of PDF pages: 759


In [5]:
# Create Text Chunks
# Split the loaded pages into chunks of at most 500 characters, letting
# consecutive chunks overlap by up to 50 characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
dataset = text_splitter.split_documents(medical_docs)

# Preview the first ten chunks as the cell output.
dataset[:10]

[Document(metadata={'producer': 'GPL Ghostscript 9.10', 'creator': '', 'creationdate': '2017-05-01T10:37:35-07:00', 'moddate': '2017-05-01T10:37:35-07:00', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'source': 'data/The-Glae-Encyclopedia-of-Medicine.pdf', 'total_pages': 759, 'page': 0, 'page_label': '1'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION'),
 Document(metadata={'producer': 'GPL Ghostscript 9.10', 'creator': '', 'creationdate': '2017-05-01T10:37:35-07:00', 'moddate': '2017-05-01T10:37:35-07:00', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'source': 'data/The-Glae-Encyclopedia-of-Medicine.pdf', 'total_pages': 759, 'page': 1, 'page_label': '2'}, page_content='The G ALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nC-F\n2'),
 Document(metadata={'producer': 'GPL Ghostscript 9.10', 'creator': '', 'creationdate': '2017-05-01T10:37:35-07:00', 'moddate': '2017

In [7]:
# Load the variables defined in the .env file (e.g. an API key) into the environment
load_dotenv()

# Create Vector Embeddings
# Sentence-transformers model used to turn each text chunk into a vector.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Store Embeddings in FAISS
# Embed every chunk with the embedding model and build the FAISS index from the
# results, then write the index to VECTOR_DATABASE_PATH on disk.
vector_database = FAISS.from_documents(
    documents=dataset,
    embedding=embedding_model,
)
vector_database.save_local(folder_path=VECTOR_DATABASE_PATH)