In [31]:
# Install the third-party packages this notebook relies on, using the %pip magic.
# NOTE(review): no versions are pinned here — consider pinning them so reruns are reproducible.
%pip install ollama PyMuPDF langchain langchain-community scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Import the classes from the rag.py file
from rag import SimplePDFProcessor, IntelligentTextSplitter, OllamaEmbeddingGenerator, SimpleVectorSearch

In [None]:
# Build the PDF processor for the source document.
# The path is kept in a named constant so it is easy to spot and change.
PDF_PATH = "hec_outline.pdf"
pdf_processor = SimplePDFProcessor(PDF_PATH)
print("📄 PDF Processor initialized!")

📄 PDF Processor initialized!


In [33]:
# Build the text splitter.
# Both tuning values are passed straight through to IntelligentTextSplitter;
# they are named here so a reader can adjust them in one place.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
text_splitter = IntelligentTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
print("✂️ Intelligent Text Splitter initialized!")

✂️ Intelligent Text Splitter initialized!


In [34]:
# Initialize the embedding generator.
# No arguments are passed, so whatever defaults OllamaEmbeddingGenerator defines
# in rag.py (model, endpoint, ...) are used — see that file for the actual values.
embedding_generator = OllamaEmbeddingGenerator()
print("🧠 Ollama Embedding Generator initialized!")

🧠 Ollama Embedding Generator initialized!


In [35]:
# Step 1: Pull the raw text out of the PDF
print("📖 Extracting text from PDF...")
raw_text = pdf_processor.extract_text()

# Step 2: Run the processor's cleaning pass over the raw text
cleaned_text = pdf_processor.clean_text(raw_text)

# Report before/after sizes and the first 200 characters of the cleaned text
print(
    "📊 Text extraction summary:",
    f"   - Raw text length: {len(raw_text)} characters",
    f"   - Cleaned text length: {len(cleaned_text)} characters",
    f"   - Text preview: {cleaned_text[:200]}...",
    sep="\n",
)


📖 Extracting text from PDF...
✅ Extracted 286158 characters from PDF
📊 Text extraction summary:
   - Raw text length: 286158 characters
   - Cleaned text length: 273998 characters
   - Text preview: 1 CURRICULUM OF COMPUTER SCIENCE, SOFTWARE ENGINEERING, AND INFORMATION TECHNOLOGY (Bachelors & Masters Programs) (Revised 2017) HIGHER EDUCATION COMMISSION ISLAMABAD CURRICULUM DIVISION, HEC 2 Prof. ...


In [36]:
# Step 3: Split the cleaned text into overlapping chunks
print("✂️ Splitting text into semantic chunks...")

# Split text into chunks using intelligent overlap
chunks = text_splitter.split_with_overlap(cleaned_text)

# Create documents with metadata
documents = text_splitter.create_documents_with_metadata(chunks)

# Measure each chunk once; the statistics below all reuse this list.
chunk_lengths = [len(chunk) for chunk in chunks]

print(f"📊 Text splitting summary:")
print(f"   - Number of chunks: {len(chunks)}")
if chunk_lengths:
    # Plain sum/len keeps this cell free of any numpy dependency, so it runs on a
    # fresh kernel where `np` has not been imported.
    average_length = sum(chunk_lengths) / len(chunk_lengths)
    print(f"   - Average chunk size: {average_length:.0f} characters")
    print(f"   - Chunk size range: {min(chunk_lengths)} - {max(chunk_lengths)} characters")
else:
    # min()/max() raise ValueError on an empty sequence, so report the situation instead.
    print("   - No chunks were produced; check the extracted text")

# Display first few chunks as examples
print(f"\n📝 Sample chunks:")
for i, chunk in enumerate(chunks[:3]):
    print(f"   Chunk {i+1}: {chunk[:100]}...")


✂️ Splitting text into semantic chunks...
📊 Text splitting summary:
   - Number of chunks: 381
   - Average chunk size: 809 characters
   - Chunk size range: 57 - 1000 characters

📝 Sample chunks:
   Chunk 1: 1 CURRICULUM OF COMPUTER SCIENCE, SOFTWARE ENGINEERING, AND INFORMATION TECHNOLOGY (Bachelors & Mast...
   Chunk 2: . Curricula Consideration .............................................................................
   Chunk 3: . Master Degree Programs in Computing .................................................................


In [37]:
# Create and save embeddings
# The texts to embed are defined here so this cell runs on a fresh kernel
# (Restart Kernel -> Run All) instead of relying on leftover kernel state.
# NOTE(review): assumes the texts to embed are exactly the chunks produced by the
# splitting step (the recorded run embedded 381 texts, matching 381 chunks) — confirm.
chunk_texts = list(chunks)

# Embed the texts in batches of 5, then persist vectors and texts together as JSON.
embeddings = embedding_generator.generate_embeddings_batch(chunk_texts, batch_size=5)
embedding_generator.save_embeddings(embeddings, chunk_texts, "hec_outline_embeddings.json")

print(f"✅ Generated {len(embeddings)} embeddings")
print(f"📊 Embedding dimension: {len(embeddings[0]) if embeddings else 'N/A'}")

🔄 Generating embeddings for 381 texts...
📊 Progress: 5/381 embeddings generated
📊 Progress: 10/381 embeddings generated
📊 Progress: 15/381 embeddings generated
📊 Progress: 20/381 embeddings generated
📊 Progress: 25/381 embeddings generated
📊 Progress: 30/381 embeddings generated
📊 Progress: 35/381 embeddings generated
📊 Progress: 40/381 embeddings generated
📊 Progress: 45/381 embeddings generated
📊 Progress: 50/381 embeddings generated
📊 Progress: 55/381 embeddings generated
📊 Progress: 60/381 embeddings generated
📊 Progress: 65/381 embeddings generated
📊 Progress: 70/381 embeddings generated
📊 Progress: 75/381 embeddings generated
📊 Progress: 80/381 embeddings generated
📊 Progress: 85/381 embeddings generated
📊 Progress: 90/381 embeddings generated
📊 Progress: 95/381 embeddings generated
📊 Progress: 100/381 embeddings generated
📊 Progress: 105/381 embeddings generated
📊 Progress: 110/381 embeddings generated
📊 Progress: 115/381 embeddings generated
📊 Progress: 120/381 embeddings gener

In [39]:
# Build the vector search index, but only when both of its inputs already exist
# in the notebook namespace; otherwise tell the reader what is missing.
if 'embeddings' not in locals() or 'chunk_texts' not in locals():
    print("⚠️ Load embeddings first to initialize vector search")
else:
    vector_search = SimpleVectorSearch(embeddings, chunk_texts)
    print("🔍 Vector search ready!")


✅ Vector search initialized with 381 documents
🔍 Vector search ready!


In [40]:
# Smoke-test the vector search with a handful of sample queries.
# Bail out with a hint when the search index or the embedder is not in the namespace yet.
if 'vector_search' not in locals() or 'embedding_generator' not in locals():
    print("⚠️ Vector search not initialized. Run the previous cells first.")
else:
    print("🔍 Testing vector search...")

    # Example search queries
    queries = [
        "computer science curriculum",
        "software engineering requirements",
        "information technology courses",
    ]

    for query in queries:
        print(f"\n🔎 Searching for: '{query}'")
        # Ask for the three closest matches to the query text
        results = vector_search.search_by_text(query, embedding_generator, top_k=3)

        # Print each hit with a 1-based rank, its similarity, and a 100-character preview
        for i, result in enumerate(results, start=1):
            print(f"  {i}. Similarity: {result['similarity']:.3f}")
            print(f"     Text: {result['text'][:100]}...")
            print()


🔍 Testing vector search...

🔎 Searching for: 'computer science curriculum'
  1. Similarity: 0.798
     Text: . Development in Computer Science .....................................................................

  2. Similarity: 0.798
     Text: . Computer Science is the application of a systematic, disciplined and quantifiable approach to the ...

  3. Similarity: 0.787
     Text: . Master Degree Programs in Computing .................................................................


🔎 Searching for: 'software engineering requirements'
  1. Similarity: 0.793
     Text: . Describe the requirements engineering process 2. Effectively analyze software requirements for the...

  2. Similarity: 0.771
     Text: . Published by IEEE Computer Society Press and McGraw-Hill Book Company, 2008 Requirements Engineeri...

  3. Similarity: 0.766
     Text: . Teaching Methodology: Lecturing, Written Assignments, Project, Report Writing Course Assessment: S...


🔎 Searching for: 'information techno