In [1]:

import os
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI


In [2]:

# === 1. Extract Text from PDF Files ===
def extract_text_from_pdf(path):
    doc = fitz.open(path)
    return "\n".join([page.get_text() for page in doc])

def load_all_pdfs_from_directory(directory):
    texts = []
    for filename in os.listdir(directory):
        if filename.lower().endswith(".pdf"):
            full_path = os.path.join(directory, filename)
            print(f"Extracting: {full_path}")
            text = extract_text_from_pdf(full_path)
            texts.append(text)
    return texts

# === 2. Chunk the Text ===
def chunk_texts(texts, chunk_size=500, chunk_overlap=100):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = splitter.create_documents(texts)
    return documents

# === 3. Embed the Chunks ===
def embed_documents(documents):
    embeddings = OpenAIEmbeddings()  # Requires OPENAI_API_KEY env var
    vectorstore = FAISS.from_documents(documents, embeddings)
    return vectorstore

# === 4. Setup RAG ===
def setup_rag(vectorstore):
    retriever = vectorstore.as_retriever(search_type="similarity", k=5)
    llm = ChatOpenAI()  # Defaults to gpt-3.5-turbo
    qa_chain = RetrievalQA(llm=llm, retriever=retriever)
    return qa_chain

In [3]:

# Section 1: Extract and process PDFs
pdf_directory = "pdf"  # Change this to your PDF directory path
texts = load_all_pdfs_from_directory(pdf_directory)

Extracting: pdf/IntroductionToFuelCellAndElectrolyzerModule.pdf
Extracting: pdf/CorrosionModuleUsersGuide.pdf
Extracting: pdf/IntroductionToPorousMediaFlowModule.pdf
Extracting: pdf/IntroductionToPolymerFlowModule.pdf
Extracting: pdf/IntroductionToElectrodepositionModule.pdf
Extracting: pdf/IntroductionToOptimizationModule.pdf
Extracting: pdf/fnp_LicAdmin.pdf
Extracting: pdf/IntroductionToThermodynamicProperties.pdf
Extracting: pdf/COMSOL_MultiphysicsInstallationGuide.pdf
Extracting: pdf/CFDModuleUsersGuide.pdf
Extracting: pdf/IntroductionToMEMSModule.pdf
Extracting: pdf/IntroductionToSubsurfaceFlowModule.pdf
Extracting: pdf/IntroductionToLiveLinkForRevit.pdf
Extracting: pdf/IntroductionToLiquidAndGasPropertiesModule.pdf
Extracting: pdf/PorousMediaFlowModuleUsersGuide.pdf
Extracting: pdf/IntroductionToACDCModule.pdf
Extracting: pdf/IntroductionToLiveLinkForPTCCreoParametric.pdf
Extracting: pdf/ModelManagerServerManual.pdf
Extracting: pdf/ElectrodepositionModuleUsersGuide.pdf
Extracting

In [4]:

# Save texts to a file
output_file = "extracted_texts.txt"
with open(output_file, "w", encoding="utf-8") as f:
    for i, text in enumerate(texts):
        f.write(f"=== Document {i+1} ===\n")
        f.write(text)
        f.write("\n\n")
print(f"Texts saved to {output_file}")


Texts saved to extracted_texts.txt


In [8]:
pip install -U sentence-transformers

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import VLLM
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document

In [2]:

# Load texts
with open("extracted_texts.txt", "r", encoding="utf-8") as f:
    texts = f.read().split("=== Document ")[1:]
    texts = [doc.split("\n", 1)[1] for doc in texts]

# Chunk texts
def chunk_texts(texts):
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    return [Document(page_content=chunk) for text in texts for chunk in splitter.split_text(text)]

documents = chunk_texts(texts)


In [3]:

# # Embeddings with CUDA
# embeddings = HuggingFaceEmbeddings(
#     model_name="sentence-transformers/all-MiniLM-L6-v2",
#     model_kwargs={"device": "cuda"}
# )

# Use Qwen2 embedding model
embeddings = HuggingFaceEmbeddings(
    model_name="Alibaba-NLP/gte-Qwen2-7B-instruct",
    model_kwargs={"device": "cuda"}  # Make sure you have enough GPU memory
)


  embeddings = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 7/7 [00:00<00:00, 201.42it/s]


In [4]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 32768, 'do_lower_case': False}) with Transformer model: Qwen2Model 
  (1): Pooling({'word_embedding_dimension': 3584, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': True, 'include_prompt': True})
  (2): Normalize()
), model_name='Alibaba-NLP/gte-Qwen2-7B-instruct', cache_folder=None, model_kwargs={'device': 'cuda'}, encode_kwargs={}, multi_process=False, show_progress=False)

In [5]:

# Build vectorstore (can be swapped with Chroma if needed)
vectorstore = FAISS.from_documents(documents, embeddings)
# Save to local folder
vectorstore.save_local("faiss_index")

KeyboardInterrupt: 

In [7]:
# from tqdm import tqdm
# from langchain.vectorstores import Chroma

# chroma = Chroma(embedding_function=embeddings, persist_directory="chroma_db")

# for doc in tqdm(documents, desc="Adding to Chroma"):
#     chroma.add_documents([doc])
# chroma.persist()

# batched
from langchain.vectorstores import Chroma
from tqdm import tqdm

# Setup Chroma
vectorstore = Chroma(embedding_function=embeddings, persist_directory="chroma_db")

# Batched insertion
batch_size = 64
for i in tqdm(range(0, len(documents), batch_size), desc="Adding to Chroma (batched)"):
    batch = documents[i:i + batch_size]
    vectorstore.add_documents(batch)

# Save to disk
vectorstore.persist()



Adding to Chroma (batched):   6%|▋         | 118/1837 [06:27<1:34:00,  3.28s/it]


KeyboardInterrupt: 

In [None]:

# Use DeepSeek LLM via vLLM
llm = VLLM(
    model="deepseek-ai/deepseek-llm-7b-chat",
    trust_remote_code=True,
    tensor_parallel_size=1,
    max_tokens=1024,
    top_p=0.95,
    temperature=0.7
)

# Build RAG chain
retriever = vectorstore.as_retriever()
qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

# Run query
response = qa.run("Summarize this corpus.")
print(response)