## **Import Required Libraries**

In [3]:
import os
import sys
from pathlib import Path
from dotenv import load_dotenv
import json

## **Install Dependencies**

In [18]:
## Install the packages listed in ../requirements.txt with pip, invoked through
## the interpreter at sys.executable. The requirements path is relative, so it
## depends on the directory the shell command runs in.
!{sys.executable} -m pip install -r ../requirements.txt

Collecting tiktoken (from -r ../requirements.txt (line 15))
  Downloading tiktoken-0.12.0-cp310-cp310-win_amd64.whl.metadata (6.9 kB)
Downloading tiktoken-0.12.0-cp310-cp310-win_amd64.whl (879 kB)
   ---------------------------------------- 0.0/879.4 kB ? eta -:--:--
   ----------- ---------------------------- 262.1/879.4 kB ? eta -:--:--
   ----------- ---------------------------- 262.1/879.4 kB ? eta -:--:--
   ----------------------------------- ---- 786.4/879.4 kB 1.0 MB/s eta 0:00:01
   ---------------------------------------- 879.4/879.4 kB 1.0 MB/s eta 0:00:00
Installing collected packages: tiktoken
Successfully installed tiktoken-0.12.0


## **Setup Working Directory**

In [16]:
## Locate the project root: start at the current working directory and walk
## upward until a directory containing 'src' is found, or the filesystem root
## is reached (where a path equals its own parent).
CURRENT_DIR=Path.cwd()
PROJECT_ROOT=CURRENT_DIR

while not (PROJECT_ROOT/'src').exists() and PROJECT_ROOT!=PROJECT_ROOT.parent:
    PROJECT_ROOT=PROJECT_ROOT.parent

if not (PROJECT_ROOT/'src').exists():
    ## FileNotFoundError is still an Exception subclass, so existing handlers keep working
    raise FileNotFoundError(
        f"Could not find the 'src' folder in '{CURRENT_DIR}' or any of its parents. "
        "Check the folder structure."
    )

## Put the project root at the front of the python path.
## Any earlier copy is dropped first so re-running this cell does not pile up duplicates.
project_root_entry=str(PROJECT_ROOT)
sys.path[:]=[entry for entry in sys.path if entry!=project_root_entry]
sys.path.insert(0,project_root_entry)

## Load environment variables from the .env file at the project root
load_dotenv(PROJECT_ROOT/'.env')

## Read the API key configuration
openrouter_api_key=os.getenv("OPENROUTER_API_KEY")
openai_api_key=os.getenv("OPENAI_API_KEY")

## Report exactly which keys are missing or empty (names only, never the values)
missing_api_keys=[
    key_name
    for key_name,key_value in (
        ("OPENROUTER_API_KEY",openrouter_api_key),
        ("OPENAI_API_KEY",openai_api_key),
    )
    if not key_value
]
if missing_api_keys:
    raise ValueError(
        f"API keys not found: {', '.join(missing_api_keys)}.\n"
        f"Please add the missing API keys to the .env file at '{PROJECT_ROOT/'.env'}'."
    )


## This import must come after the sys.path update above, which is what makes
## the 'src' package importable from this notebook.
from src.context_engineering.config import(
    VECTOR_DIR,EMBEDDING_MODEL,CRAWL_OUT_DIR,PROVIDER
)

print("="*60)
print("Environment Loading............")
print("="*60)

print(f"Provider:{PROVIDER}")
print(f"Project Root Directory:{PROJECT_ROOT}")
print("✅Environment Loaded")

Enviroment Loadeding............
Provider:openrouter
Project Root Directory:c:\UOC pdf\AI Engineering Bootcamp\mini-project-03
✅Environment Loaded


## **Chunking Strategies**

In [22]:
## Import the chunking entry point from the project package.
## This relies on the project root already being on the python path.
from src.context_engineering.chunking.chunkers import Chunking

print("Chunking strategies loaded from the chunking layer")
print("Location:src.context_engineering.chunking")
print("="*60)
print("Available Chunking Strategies:")

## Menu of strategy names with a one-line description of each
print("   1. semantic_chunk      - Split by heading structure")
print("   2. fixed_chunk         - Uniform token chunks with overlap")
print("   3. sliding_chunk       - Overlapping windows for better recall")
print("   4. parent_child_chunk  - Two-tier parent-child chunking")
print("   5. late_chunk          - Large base chunks (late splitting)")
## The emoji below was stored as mis-decoded bytes; it is written as real UTF-8 here
print("   🔄 late_retrieval      - Split + context window (retrieval phase)")

Chunking strategies loaded from the chunking layer
Location:src.context_engineering.chunking
Available Chunking Strategies:
   1. semantic_chunk      - Split by heading structure
   2. fixed_chunk         - Uniform token chunks with overlap
   3. sliding_chunk       - Overlapping windows for better recall
   4. parent_child_chunk  - Two-tier parent-child chunking
   5. late_chunk          - Large base chunks (late splitting)
   🔄 late_retrieval      - Split + context window (retrieval phase)


## **Load Prime Lands Corpus**

In [28]:
## Build the path to the crawled corpus (a JSONL file: one JSON document per line)
corpus_path=PROJECT_ROOT/CRAWL_OUT_DIR/"prime_lands_corpus.jsonl"

if not corpus_path.exists():
    ## Include the path that was checked so a wrong directory is easy to spot
    raise FileNotFoundError(f"Corpus can't be found in provided directory: {corpus_path}")

## Parse each non-blank line as JSON. Blank lines (for example a trailing empty
## line at the end of the file) are skipped, because json.loads raises on them.
with open(corpus_path,"r",encoding="UTF-8") as f:
    documents=[json.loads(line) for line in f if line.strip()]

print("="*60)
print("Loading Documents..............")
print("="*60)
print(f"Number of Documents:{len(documents)}")
## Every document is expected to carry a 'content' entry; its length is summed here
print(f"Total content size: {sum(len(d['content']) for d in documents):,} chars")
print("Document Loaded")

Loading Documents..............
Numbebr of Documents:20
Total content size: 50,479 chars
Document Loaded


## **Apply Each Chunking Strategy**