In [6]:
# Load and clean PDF using pdfminer
from pdfminer.high_level import extract_text
from pathlib import Path
import re

# Step 1: Load PDF
# Use a path relative to the notebook's working directory, like every other
# file this notebook touches ('../data/...', '../chunks/...', '../vectordb/...').
# An absolute per-user path only works on the machine it was written on.
pdf_path = '../data/training_document.pdf'
text = extract_text(pdf_path)

# Step 2: Clean the extracted text
def clean_text(raw_text: str) -> str:
    """Collapse all whitespace in ``raw_text`` to single spaces.

    Every run of whitespace (spaces, tabs, newlines, blank lines) is replaced
    by exactly one space, and leading/trailing whitespace is stripped. Line
    and paragraph breaks are therefore NOT preserved: the result is one
    continuous line of text.

    Args:
        raw_text: The text to normalise.

    Returns:
        The text with whitespace runs collapsed and the ends trimmed.
    """
    # One pass is enough: the whitespace class already matches newlines, so
    # de-duplicating newlines first would not change the result.
    return re.sub(r'\s+', ' ', raw_text).strip()

# Normalise the whitespace of the text extracted from the PDF.
cleaned_text = clean_text(text)

# Step 3: Save cleaned text to a file. The chunking cell reads this file back
# from disk, so the write is required for the rest of the notebook.
# No encoding is passed, so the text is written in the platform default.
cleaned_path = Path("../data/cleaned_document.txt")
cleaned_path.write_text(cleaned_text)

print("Document loaded and cleaned successfully.")


Document loaded and cleaned successfully.


In [6]:
import json
import os

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Step 1: Load cleaned document
# The file is produced by Path.write_text() without an explicit encoding,
# i.e. in the platform default. Leave `encoding` unset here so the read uses
# that same default and round-trips the text exactly. A hard-coded 'latin1'
# never raises, but it silently mis-decodes non-ASCII characters whenever the
# default differs from it (e.g. cp1252 on Windows, UTF-8 on Linux/macOS).
with open('../data/cleaned_document.txt', 'r') as f:
    full_text = f.read()

# Step 2: Configure the text splitter.
# Sizes are measured with len(), i.e. in characters.
CHUNK_SIZE = 1000      # Approx. 200 words (adjustable)
CHUNK_OVERLAP = 200    # Characters shared between neighbouring chunks
SEPARATORS = ["\n\n", "\n", ".", "!", "?", " "]

text_splitter = RecursiveCharacterTextSplitter(
    separators=SEPARATORS,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)

# Step 3: Split the document and report how many chunks came out.
chunks = text_splitter.split_text(full_text)
chunk_count = len(chunks)
print(f"Number of chunks created: {chunk_count}")

# Step 4: Save chunks to JSON
# open(..., 'w') does not create missing parent directories, so make sure the
# output folder exists first; otherwise a fresh checkout without '../chunks'
# fails here with FileNotFoundError.
os.makedirs('../chunks', exist_ok=True)
with open('../chunks/chunks.json', 'w', encoding='utf-8') as f:
    # ensure_ascii=False writes non-ASCII characters literally instead of as
    # \uXXXX escapes, which is why the file is opened as UTF-8.
    json.dump(chunks, f, ensure_ascii=False, indent=2)

print("Chunks saved to /chunks/chunks.json")


Number of chunks created: 91
Chunks saved to /chunks/chunks.json


In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json
import os

# Step 1: Load chunks
# The chunking cell writes this file with encoding='utf-8' and
# ensure_ascii=False, so it must be decoded as UTF-8. Reading it as 'latin1'
# never raises, but it turns every non-ASCII character into mojibake that
# would then be embedded and stored alongside the index.
with open('../chunks/chunks.json', 'r', encoding='utf-8') as f:
    chunks = json.load(f)

# Fail fast, before loading the model, if there is nothing to embed. Without
# this guard an empty chunk list only surfaces later as an opaque IndexError
# when the embedding dimension is read.
if not chunks:
    raise ValueError("No chunks to embed: '../chunks/chunks.json' is empty.")

# Step 2: Load embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Step 3: Generate embeddings (one vector per chunk, in chunk order)
embeddings = model.encode(chunks, show_progress_bar=True)
# FAISS takes row-major float32 matrices. NOTE(review): encode() is expected
# to return such an array already, making this a cheap copy; confirm for the
# installed sentence-transformers version.
embeddings = np.array(embeddings, dtype=np.float32, order='C')

# Step 4: Create FAISS index
# Exact (brute-force) L2 index sized to the embedding dimension.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Step 5: Persist the FAISS index, creating the target folder on demand.
os.makedirs(name='../vectordb', exist_ok=True)
faiss.write_index(index, '../vectordb/index.faiss')

# Store the chunk texts next to the index. This is the same list, in the same
# order, that was embedded, so retrieval can map a vector position to its text.
with open('../vectordb/chunks.json', mode='w', encoding='utf-8') as chunks_file:
    json.dump(chunks, fp=chunks_file, indent=2, ensure_ascii=False)

print(" FAISS index created and saved!")
