In [None]:
from bs4 import BeautifulSoup

# Static sample markup for a single project card (an <a class="project-card">
# wrapped in a grid column). It is parsed further down in this cell, so the
# CSS selectors can be tried out without loading the live listing page.
# NOTE(review): presumably copied from the site's project listing — confirm
# it still matches the live markup before relying on the selectors.
html = '''
<div class="col-md-6 col-lg-4">
    <a class="project-card shadow-sm" href="https://residensiwilayah.jwp.gov.my/projek/jhZOy7OMDf9UThdx6ngJrB3wQlgHAM">
      <div class="project-card-img" style="background-image: url('https://erumawip.jwp.gov.my/images/imageProjek/68b7a2702521e.png')">
        <div class="project-status status-open">
          Permohonan dibuka
        </div>
      </div>
      <div class="project-card-info">
        <div class="project-card-name">
          Residensi Metropolitan Kepong (Residensi Wilayah)
        </div>
        <div class="project-card-list">
          <div class="title"><i class="uil uil-sign-right"></i> Harga</div>
          <div class="font-weight-bold">RM 300,000</div>
        </div>
        <div class="project-card-list">
          <div class="title"><i class="uil uil-key-skeleton"></i> Bil. Unit</div>
          <div>552 Unit </div>
        </div>
        <div class="project-card-list">
          <div class="title"><i class="uil uil-expand-arrows-alt"></i> Keluasan</div>
          <div>800 kps</div>
        </div>
        <div class="project-card-footer">
          <div>Kepong </div>
        </div>
      </div>
    </a>
</div>
'''

# Parse the sample card markup.
soup = BeautifulSoup(html, 'html.parser')

# Locate each field's element first. select_one() returns None when nothing
# matches, so no .get_text() call is made until the result has been checked.
name_section = soup.select_one('.project-card-name')
price_section = soup.select_one('.font-weight-bold')

# The unit and area rows carry no distinguishing class of their own, so they
# are identified by the icon inside the row's title and the sibling <div>
# that is not the title is taken as the value.
unit_section = soup.select_one('.project-card-list:has(.uil-key-skeleton) div:not(.title)')
area_section = soup.select_one('.project-card-list:has(.uil-expand-arrows-alt) div:not(.title)')

# Every field falls back to 'N/A' so a card with a missing element still
# produces output instead of raising AttributeError.
project_name = name_section.get_text(strip=True) if name_section else 'N/A'
price = price_section.get_text(strip=True) if price_section else 'N/A'
unit_info = unit_section.get_text(strip=True) if unit_section else 'N/A'
area = area_section.get_text(strip=True) if area_section else 'N/A'

# Show the extracted fields.
print("Project Name:", project_name)
print("Price:", price)
print("Units:", unit_info)
print("Area:", area)



In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time

# Explicit-wait helpers from selenium (already a dependency of this cell).
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Upper bound, in seconds, on how long to wait for the project cards to render.
PAGE_LOAD_TIMEOUT_S = 15

# Setup Chrome options
chrome_options = Options()
#chrome_options.add_argument("--headless")  # comment this line if you want to see Chrome open
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Use webdriver_manager to automatically get the right ChromeDriver
service = Service(ChromeDriverManager().install())

# Start Chrome. The driver is intentionally left open at the end of this cell:
# the pagination cell further down reuses it and is responsible for quitting it.
driver = webdriver.Chrome(service=service, options=chrome_options)

# Open the website
url = "https://residensiwilayah.jwp.gov.my/projek"
driver.get(url)

# Wait until at least one project card is present instead of sleeping for a
# fixed time: this returns as soon as the content is there and tolerates slow
# loads up to the timeout. On timeout the page is still parsed, so the report
# below simply shows zero projects rather than the cell crashing.
try:
    WebDriverWait(driver, PAGE_LOAD_TIMEOUT_S).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, ".project-card"))
    )
except TimeoutException:
    print(f"⚠️ No project cards appeared within {PAGE_LOAD_TIMEOUT_S}s; parsing the page as-is.")

# Parse page
soup = BeautifulSoup(driver.page_source, "html.parser")

# Find all project cards
projects = soup.select(".project-card")

print(f"Total projects found: {len(projects)}\n")

# Loop through projects
for idx, project in enumerate(projects, start=1):
    name = project.select_one(".project-card-name")
    price = project.select_one(".font-weight-bold")
    # Unit/area rows are identified by the icon inside the row title; the
    # value is the sibling <div> that is not the title.
    unit_section = project.select_one(".project-card-list:has(.uil-key-skeleton) div:not(.title)")
    area_section = project.select_one(".project-card-list:has(.uil-expand-arrows-alt) div:not(.title)")

    # select_one() yields None for a missing element; fall back to "N/A".
    name = name.get_text(strip=True) if name else "N/A"
    price = price.get_text(strip=True) if price else "N/A"
    units = unit_section.get_text(strip=True) if unit_section else "N/A"
    area = area_section.get_text(strip=True) if area_section else "N/A"

    print(f"{idx}. {name}")
    print(f"   Price: {price}")
    print(f"   Units: {units}")
    print(f"   Area:  {area}\n")




In [None]:
from bs4 import BeautifulSoup
import time

from urllib.parse import urljoin

# Listing URL; also serves as page 1 and as the base for relative page links.
BASE_URL = "https://residensiwilayah.jwp.gov.my/projek"
# Seconds to let each page settle after navigation before reading its source.
PAGE_SETTLE_SECONDS = 3


def extract_project(card, page_num):
    """Build one result record from a single ``.project-card`` element.

    Args:
        card: BeautifulSoup element for one project card.
        page_num: 1-based position of the page the card was found on.

    Returns:
        dict with keys "Page", "Name", "Price", "Units" and "Area". Any text
        field whose element is missing from the card is reported as "N/A".
    """
    def text_or_na(selector):
        node = card.select_one(selector)
        return node.get_text(strip=True) if node else "N/A"

    return {
        "Page": page_num,
        "Name": text_or_na(".project-card-name"),
        "Price": text_or_na(".font-weight-bold"),
        # Unit/area rows are identified by the icon inside the row title; the
        # value is the sibling <div> that is not the title.
        "Units": text_or_na(".project-card-list:has(.uil-key-skeleton) div:not(.title)"),
        "Area": text_or_na(".project-card-list:has(.uil-expand-arrows-alt) div:not(.title)"),
    }


all_projects = []

# The browser is released in `finally`, so a failure on any page (navigation
# error, dead session, ...) does not leave a Chrome process behind.
try:
    # Step 1: Collect the page URLs from the pagination on the current page
    soup = BeautifulSoup(driver.page_source, "html.parser")
    pagination = soup.select("ul.pagination li a.page-link")

    # .get() tolerates anchors without an href; urljoin leaves absolute URLs
    # untouched and resolves relative ones (e.g. "?page=2") against BASE_URL.
    hrefs = (a.get("href") for a in pagination)
    page_links = [urljoin(BASE_URL, href) for href in hrefs if href and "page=" in href]
    page_links = list(dict.fromkeys(page_links))  # remove duplicates, keep order
    page_links.insert(0, BASE_URL)  # add page 1

    print(f"Found {len(page_links)} pages.\n")

    # Step 2: Loop through each page
    for page_num, page_url in enumerate(page_links, start=1):
        driver.get(page_url)
        time.sleep(PAGE_SETTLE_SECONDS)

        soup = BeautifulSoup(driver.page_source, "html.parser")
        projects = soup.select(".project-card")

        print(f"--- Page {page_num}: {len(projects)} projects ---")

        for project in projects:
            project_data = extract_project(project, page_num)
            all_projects.append(project_data)
            print(
                f"- {project_data['Name']} | {project_data['Price']} | "
                f"{project_data['Units']} | {project_data['Area']}"
            )

        print()

    print(f"\n✅ Total projects scraped: {len(all_projects)}")
finally:
    # Close browser when done, whether or not scraping succeeded
    driver.quit()


In [None]:
import json
import os

# Notebook cells share kernel state, so `all_projects` only exists once the
# scraping cell has run. Treat a missing name like an empty result so the
# warning below is shown instead of a NameError.
try:
    scraped_projects = all_projects
except NameError:
    scraped_projects = []

if not scraped_projects:
    print("⚠️ No project data found. Make sure the scraping loop ran first.")
else:
    output_file = "ExtractedProjectRumaWip.json"
    # ensure_ascii=False keeps non-ASCII text readable in the file; the file is
    # opened as UTF-8 to match.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(scraped_projects, f, ensure_ascii=False, indent=4)
    print(f"\n💾 Data successfully saved to '{os.path.abspath(output_file)}'")
