## **Import Required Libraries**

In [1]:
import os
import sys
from pathlib import Path
from dotenv import load_dotenv
import time
import json
import nest_asyncio
import asyncio
import re

In [2]:
if sys.platform == 'win32':
    asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())

## **Initialize the Project Directory and Other Configurations**

In [3]:
## Add project root to system path
CURRENT_DIR=Path.cwd()
PROJECT_ROOT=CURRENT_DIR

while not (PROJECT_ROOT/'src').exists() and PROJECT_ROOT!=PROJECT_ROOT.parent:
    PROJECT_ROOT=PROJECT_ROOT.parent 

if not (PROJECT_ROOT/'src').exists():
    raise Exception("Could not find the 'src' folder.Check the folder structure")

## add project root to the python path
sys.path.insert(0,str(PROJECT_ROOT))

load_dotenv(PROJECT_ROOT/'.env')

## get the API keys from LLM providers
openrouter_api_key=os.getenv("OPENROUTER_API_KEY")
openai_api_key=os.getenv("OPENAI_API_KEY")
gemini_api_key=os.getenv("GEMINI_API_KEY")


## validate API keys
if not openrouter_api_key and not openai_api_key and not gemini_api_key:
    raise(
        "No API keys found"
        "Add API keys into the .env file"
    )

provider="openrouter" if openrouter_api_key else openai_api_key

print(f"LLM provider:{provider}")
print(f"Project Root Directory:{PROJECT_ROOT}")

LLM provider:openrouter
Project Root Directory:c:\UOC pdf\AI Engineering Bootcamp\mini-project-03


## **Install Dependencise**

In [4]:
!{sys.executable} -m pip install -r ../requirements.txt
!{sys.executable} -m playwright install



## **Setup Web Crawler Directries**

In [5]:
## Import configuration 
from src.context_engineering.config import (
    MARKDOWN_DIR,
    CRAWL_OUT_DIR,
    BASE_URL,
    MAX_DEPTH,
    MAX_PAGES,
    TIMEOUT,
    RATE_LIMIT_SECONDS,
    show_confiurations
)

##print all the configurations
show_confiurations()

## directries for web crawling
print("=========================DIRECTRIES FOR WEB CRAWLING======================")
print(f"WEB CRAWLING OUTPUT DIR:{CRAWL_OUT_DIR}")
print(F"WEB CRAWLING MARKDOWN DIR:{MARKDOWN_DIR}")
print("="*60)

Directory not exists:c:\UOC pdf\AI Engineering Bootcamp\config\config.yaml
Directory not exists:c:\UOC pdf\AI Engineering Bootcamp\config\llm_models.yaml
Pirinting All the Configuation(Non-Secrets)
PROVIDER
LLM PROVIDER:openrouter
LLM MODEL TIER:general
LLM CHAT MODEL:openai/gpt-4o-mini
EMBEDDING MODEL:openai/text-embedding-3-large
DIRECTRIES
DATA DIRECTORTY:data
VECTOR DB STORE DIRECTORY:data/vectorstore
WEB CRAWLING OUTPUT DIRECTORY:data/processed
MARKDWON DIRECTORY:data/markdown
WEB CRAWLING OUTPUT DIR:data/processed
WEB CRAWLING MARKDOWN DIR:data/markdown


## **Import Crawler Service**

In [6]:
## Import crawl web service for Prime Lands url
from src.context_engineering.crawler.primelands_crawler import PrimeLandsCrawler


print("‚úÖ PrimeLandsCrawler loaded from service layer")
print("üìç Location: src.context_engineering.crawler.primelands_crawler.PrimeLandsCrawler")

‚úÖ PrimeLandsCrawler loaded from service layer
üìç Location: src.context_engineering.crawler.primelands_crawler.PrimeLandsCrawler


## **Load Crawl Configurations**

In [7]:
## crawl configurations
BASE_URL="https://www.primelands.lk"

START_URL = [
    "/land/en",
    "/house/en",
    "/apartment/ongoing/en",
    "/apartment/completed/en",
    "/portfolio-property/en"
]

START_URL=[BASE_URL+path for path in START_URL]


print("="*60)
print("Crawl Configurations....")
print("="*60)
print(f"Base URL:{len(START_URL)}")
print(f"Max Depth:{MAX_DEPTH}")
print(f"Max pages:{MAX_PAGES}")
print(f"Timeout Limit:{TIMEOUT}")
print(f"Rate Limit Seconds:{RATE_LIMIT_SECONDS}")


Crawl Configurations....
Base URL:5
Max Depth:3
Max pages:20
Timeout Limit:30000
Rate Limit Seconds:2.0


## **Execute Crawl Setup**

In [8]:
## start time initialized
start_time=time.time()

## call the "PrimeLandsCrawler" class and create an object
land_crawler=PrimeLandsCrawler(
    base_url=BASE_URL,
    max_depth=MAX_DEPTH,
    max_pages=MAX_PAGES,
    timeout=TIMEOUT,
    rate_limit_seconds=RATE_LIMIT_SECONDS
    )

## execute crawler
print(f"\nüöÄ Starting crawl at {time.strftime('%H:%M:%S')}\n")
documents= land_crawler.crawl(START_URL)

elapsed=time.time() -start_time

print(f"\n‚úÖ Crawl complete in {elapsed:.1f}s")
print(f"üìÑ Documents collected: {len(documents)}")
print(f"üîó URLs visited: {len(land_crawler.visited)}")


üöÄ Starting crawl at 12:14:39

[0] Crawling:https://www.primelands.lk/land/en
 Saved (2597 chars)
[0] Crawling:https://www.primelands.lk/house/en
 Saved (2599 chars)
[0] Crawling:https://www.primelands.lk/apartment/ongoing/en
 Saved (2631 chars)
[0] Crawling:https://www.primelands.lk/apartment/completed/en
 Saved (603 chars)
[0] Crawling:https://www.primelands.lk/portfolio-property/en
 Saved (2618 chars)
[1] Crawling:https://www.primelands.lk/land/LUXORA-KIRIBATHGODA/en
Error:Page.goto: Timeout 30000ms exceeded.
Call log:
  - navigating to "https://www.primelands.lk/land/LUX
[1] Crawling:https://www.primelands.lk/land/district/Ratnapura/en
 Saved (2635 chars)
[1] Crawling:https://www.primelands.lk/agriculture-land/en
 Saved (2614 chars)
[1] Crawling:https://www.primelands.lk/land/THALAHENA-LUXE/en
 Saved (2631 chars)
[1] Crawling:https://www.primelands.lk/land/NEXUS-TOWN-ALUTHGAMA/en
Error:Page.goto: Timeout 30000ms exceeded.
Call log:
  - navigating to "https://www.primelands.lk/la

## **Save Output Files(JSONL and Markdown Format)**

In [9]:
## save output as a JASONL format
name_of_file="prime_lands_corpus.jsonl"
JSON_PATH=PROJECT_ROOT/CRAWL_OUT_DIR/name_of_file
with open(JSON_PATH,"w",encoding="UTF-8") as f:
    for doc in documents:
        f.write(json.dumps(doc,ensure_ascii=False)+'\n')
print(f"Save Prime Lands Corpus to {JSON_PATH}")

Save Prime Lands Corpus to c:\UOC pdf\AI Engineering Bootcamp\mini-project-03\data\processed\prime_lands_corpus.jsonl


In [12]:
## save output as Markdown format
for doc in documents:
    property_id = doc.get("property_id") or "page"
    filename = re.sub(r'[^a-zA-Z0-9_-]', '_', str(property_id))
    filepath = PROJECT_ROOT/MARKDOWN_DIR / f"{filename}.md"

    markdown_content = f"""# {doc.get('title', 'No Title')}

URL: {doc.get('url')}
Headings: {doc.get('headings')}
Price: {doc.get('price')}
Bedrooms: {doc.get('bedrooms')}
Bathrooms: {doc.get('bathrooms')}
Size (sqft): {doc.get('sqft')}
Agent: {doc.get('agent')}

---

## Content

{doc.get('content')}
"""

    with open(filepath, "w", encoding="utf-8") as f:
        f.write(markdown_content)

print(f"‚úÖ Saved Markdown files to {PROJECT_ROOT/MARKDOWN_DIR}")

‚úÖ Saved Markdown files to c:\UOC pdf\AI Engineering Bootcamp\mini-project-03\data\markdown
