In [5]:
import requests
from bs4 import BeautifulSoup

url = "https://www.immoweb.be/en/classified/house/for-sale/kortrijk/8500/20229361"
# Browser-like User-Agent header sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
}

# Without a timeout, requests waits indefinitely when the server never answers.
response = requests.get(url, headers=headers, timeout=30)
# Parsed unconditionally so `soup` exists for later cells either way.
soup = BeautifulSoup(response.text, 'html.parser')

if response.status_code == 200:
    print("Success!")
    # Dump the markup only for a successful response; an error page is not the listing.
    print(soup.prettify())
else:
    print(f"HTTP error occurred: {response.status_code}")

Success!
<!DOCTYPE html>
<html itemscope="" itemtype="http://schema.org/WebPage" lang="en">
 <head>
  <meta charset="utf-8"/>
  <script>
   if (!Array.prototype.flat) { window.location.replace('https://www.immoweb.be/en/outdated-browser') }
  </script>
  <script type="text/javascript">
   const variantName = "display_ai_tab_for_unauth_user";
    const isABTestWithCookie = false;

    /* START: AB Test Homepage - AI tab for unauthenticated users. */
    const LDConsent = hasCookieConsentById("vagdYWLSXoC06v");
    /* END: AB Test Homepage - AI tab for unauthenticated users. */

    if (
    /* START: AB Test Homepage - AI tab for unauthenticated users. */
    !LDConsent || 
    /* END: AB Test Homepage - AI tab for unauthenticated users. */
    !variantName || isABTestWithCookie) { // AB test is not configured or started using cookie for storing variant
        localStorage.removeItem("ab-test");
        window.ABTestVariant = null;
    } else {
        setABTestVariant();
    }

    fu

The final dataset should be a `csv` file with at least the following 18 columns:
- Property ID
- Locality name
- Postal code
- Price
- Type of property (house or apartment)
- Subtype of property (bungalow, chalet, mansion, ...)
- Type of sale (_note_: exclude life sales)
- Number of rooms
- Living area (area in m²)
- Equipped kitchen (0/1)
- Furnished (0/1)
- Open fire (0/1)
- Terrace (area in m² or null if no terrace)
- Garden (area in m² or null if no garden)
- Number of facades
- Swimming pool (0/1)
- State of building (new, to be renovated, ...)

# Each element is looked up once and the guard tests that same element, so a
# missing tag yields None instead of raising AttributeError on `.text`.
# (Previously the title lookup was guarded by a different tag,
# span.classified__type, than the one actually dereferenced.)
title_tag = soup.find('h1', class_="classified__title")
property_type = title_tag.text.strip() if title_tag else None

subtype_tag = soup.find('span', class_='classified__subtype')
subtype = subtype_tag.text.strip() if subtype_tag else None

# '1' when the furnished marker element is present on the page, otherwise '0'.
furnished = '1' if soup.find('span', class_='classified__furnished') else '0'


In [49]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import re


# Listing page used to try out the single-property functions below.
url = "https://www.immoweb.be/en/classified/house/for-sale/kortrijk/8500/20229361"
# Chrome session that get_html() drives.
driver = webdriver.Chrome()

# for getting html
def get_html(url, delay=3):
    """Open ``url`` in the module-level Selenium ``driver`` and return its HTML.

    Args:
        url: Address of the page to load.
        delay: Seconds to sleep after navigation before the page source is
            read. Defaults to 3, the value that used to be hard-coded.

    Returns:
        ``driver.page_source`` as read after the pause.
    """
    driver.get(url)
    # Fixed pause before reading the DOM; no explicit wait condition is used here.
    time.sleep(delay)
    return driver.page_source

# Function that parse the info
def parse_property_info(html):
    """Extract the basic listing fields from the HTML of one property page.

    Everything is read from ``html`` itself (the address used to be read from
    the live global ``driver``, which tied the parser to whatever page the
    browser happened to show).

    Args:
        html: Page source of a single listing.

    Returns:
        dict with the keys 'Property ID', 'Locality name', 'Postal code',
        'Price', 'Number of rooms' and 'Living area (m²)'. A value is None
        when the corresponding element is not found in the page.
    """
    soup = BeautifulSoup(html, 'html.parser')

    id_tag = soup.find('div', class_="classified__header--immoweb-code")
    property_id = id_tag.text.strip() if id_tag else None

    # The displayed amount sits in a nested <span aria-hidden="true">; guard both levels.
    price = None
    price_tag = soup.find('p', class_="classified__price")
    if price_tag:
        amount_tag = price_tag.find('span', {'aria-hidden': 'true'})
        price = amount_tag.text.strip() if amount_tag else None

    # The second address row is split on the em dash into postal code and locality.
    postal_code = locality = None
    address_rows = soup.find_all('span', class_="classified__information--address-row")
    if len(address_rows) > 1:
        before, dash, after = address_rows[1].text.partition('—')
        if dash:  # leave both as None when the separator is absent
            postal_code, locality = before, after

    rooms_tag = soup.find('span', class_='overview__text')
    number_of_rooms = rooms_tag.text.strip() if rooms_tag else None

    # Scan the detail table for the first row whose header mentions "Living area".
    living_area = None
    for row in soup.find_all('tr', class_='classified-table__row'):
        header = row.find('th')
        if header and "Living area" in header.text:
            cell = row.find('td')
            if cell:
                # Keep digits only (drops the unit and any whitespace).
                living_area = re.sub(r'[^\d]', '', cell.text)
            break

    return {
        'Property ID': property_id,
        'Locality name': locality.strip() if locality else None,
        'Postal code': postal_code.strip() if postal_code else None,
        'Price': price,
        'Number of rooms': number_of_rooms,
        'Living area (m²)': living_area
    }


# Close the browser even when fetching or parsing raises, so no Chrome
# process is left behind by a failed run of this cell.
try:
    html = get_html(url)
    property_info = parse_property_info(html)
    print(property_info)
finally:
    driver.quit()


{'Property ID': 'Immoweb code : 20229361', 'Locality name': 'Kortrijk', 'Postal code': '8500', 'Price': '€164,500', 'Number of rooms': '3 bedrooms', 'Living area (m²)': '120'}


In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome()


base_url = "https://www.immoweb.be/en/search/house/for-sale?page="

# Listing URLs collected across all visited result pages
all_urls = []

# Number of result pages to visit
number_of_pages = 5

# try/finally guarantees the browser is closed even if a page fails to load or parse.
try:
    for page_num in range(1, number_of_pages + 1):
        url = base_url + str(page_num)
        driver.get(url)
        time.sleep(3)  # Page loading delay
        html = driver.page_source

        # Parse the rendered page with BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')

        # Links inside the "similar-results-container" block are excluded.
        # Their hrefs are gathered once per page instead of re-searching the
        # container for every listing.
        similar_results_container = soup.find('div', class_='similar-results-container')
        excluded_hrefs = set()
        if similar_results_container:
            excluded_hrefs = {
                anchor['href']
                for anchor in similar_results_container.find_all('a', href=True)
            }

        # Keep every 'card__title-link' anchor that has an href and is not excluded
        for listing in soup.find_all('a', class_='card__title-link'):
            href = listing.get('href')
            if not href or href in excluded_hrefs:
                continue
            all_urls.append(href)

        # The count printed here is cumulative over the pages visited so far
        print(f"Collect {len(all_urls)} links from page {page_num}")

    # Total number of links collected
    print(f"Total number of collected links : {len(all_urls)}")
finally:
    driver.quit()


Collect 30 links from page 1
Collect 60 links from page 2
Collect 90 links from page 3
Collect 120 links from page 4
Collect 150 links from page 5
Total number of collected links : 150


In [2]:
# Store the collected links as plain text, one URL per line.
with open('property_links_filtered.txt', 'w') as links_file:
    for url in all_urls:
        print(url, file=links_file)

print("Links saved in property_links_filtered.txt")


Links saved in property_links_filtered.txt


In [3]:
import csv

# Write explicitly as UTF-8: the cells that read this file back open it with
# encoding='utf-8', whereas the platform default encoding may differ.
with open('property_links_filtered.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['URL'])  # Header row with the single column name
    for url in all_urls:
        writer.writerow([url])

print("Links are saved into property_links_filtered.csv")




Links are saved into property_links_filtered.csv


In [4]:
import asyncio
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import csv
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Load the listing URLs from the first column of property_links_filtered.csv
urls = []
with open('property_links_filtered.csv', 'r', encoding='utf-8') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header line
    urls.extend(row[0] for row in reader)

# Semaphore allowing at most 10 holders at a time
semaphore = asyncio.Semaphore(10)

# Asynchronous function to get HTML
async def fetch_url(url):
    """Fetch the rendered HTML of ``url`` using a throw-away Chrome session.

    Selenium calls block the calling thread, so the browser work is handed to
    the event loop's default executor. Run directly inside the coroutine it
    would never yield to the loop, and the coroutines gathered by the caller
    would execute strictly one after another. The module-level ``semaphore``
    limits how many browser sessions are open at the same time.

    Args:
        url: Listing page to load.

    Returns:
        The page source once an element with class
        ``classified__header--immoweb-code`` is present, or None when
        navigation or the 10-second wait fails (the error is printed).
    """

    def _load_page(driver_path):
        # Blocking helper executed in a worker thread.
        driver = webdriver.Chrome(service=Service(driver_path))
        try:
            driver.get(url)
            # Wait until the listing header is in the DOM before reading the page
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "classified__header--immoweb-code"))
            )
            return driver.page_source
        except Exception as e:
            print(f"Error loading page {url}: {e}")
            return None  # the caller skips entries whose page did not load
        finally:
            # Runs on success and on failure, so no browser is left open
            driver.quit()

    async with semaphore:  # hold a slot for the whole lifetime of the browser
        # Resolved on the event-loop thread so that concurrent tasks do not
        # call the driver manager's install step in parallel.
        driver_path = ChromeDriverManager().install()
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, _load_page, driver_path)

# An asynchronous function for processing pages
async def process_property_info(url):
    """Fetch one listing page and extract its basic fields.

    Args:
        url: Listing page to scrape.

    Returns:
        None when ``fetch_url`` returned no HTML; otherwise a dict with the
        keys 'Property ID', 'Locality name', 'Postal code', 'Price',
        'Number of rooms' and 'Living area (m²)'. A value is None when the
        corresponding element is not found in the page.
    """
    html = await fetch_url(url)
    if html is None:
        return None  # page could not be loaded, skip this entry

    soup = BeautifulSoup(html, 'html.parser')

    id_tag = soup.find('div', class_="classified__header--immoweb-code")
    property_id = id_tag.text.strip() if id_tag else None

    # The displayed amount sits in a nested <span aria-hidden="true">; guard both levels.
    price = None
    price_tag = soup.find('p', class_="classified__price")
    amount_tag = price_tag.find('span', {'aria-hidden': 'true'}) if price_tag else None
    if amount_tag:
        price = amount_tag.text.strip()
    if price:
        # Keep only the text before the first '-', then drop the euro sign and commas
        price = price.split('-')[0].strip()
        price = price.replace('€', '').replace(',', '').strip()

    # The second address row is split on the em dash into postal code and locality.
    postal_code = locality = None
    address_rows = soup.find_all('span', class_="classified__information--address-row")
    if len(address_rows) > 1:
        before, dash, after = address_rows[1].text.partition('—')
        if dash:  # leave both as None when the separator is absent
            postal_code, locality = before, after

    rooms_tag = soup.find('span', class_='overview__text')
    number_of_rooms = rooms_tag.text.strip() if rooms_tag else None

    # Scan the detail table for the first row whose header mentions "Living area".
    living_area = None
    for row in soup.find_all('tr', class_='classified-table__row'):
        header = row.find('th')
        if header and "Living area" in header.text:
            cell = row.find('td')
            cell_text = cell.text.strip() if cell else None
            living_area = cell_text.replace('m²', '').strip() if cell_text else None
            break

    return {
        'Property ID': property_id,
        'Locality name': locality.strip() if locality else None,
        'Postal code': postal_code.strip() if postal_code else None,
        'Price': price,
        'Number of rooms': number_of_rooms,
        'Living area (m²)': living_area
    }

# Main function for processing all URLs
async def process_all_properties(urls):
    """Scrape every URL in ``urls`` and write the results to properties_async.csv.

    Entries for which ``process_property_info`` returned None are left out of
    the file. The number of rows written is printed at the end.
    """
    scraped = await asyncio.gather(*(process_property_info(link) for link in urls))

    # Drop the entries that came back as None
    kept = []
    for record in scraped:
        if record is not None:
            kept.append(record)

    # Explicit UTF-8 so characters outside the default encoding can be written
    columns = ['Property ID', 'Locality name', 'Postal code', 'Price', 'Number of rooms', 'Living area (m²)']
    with open('properties_async.csv', 'w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=columns)
        csv_writer.writeheader()
        for record in kept:
            csv_writer.writerow(record)

    print(f"Save data of {len(kept)} properties in properties_async.csv")


# In a Jupyter notebook the coroutine is awaited directly at the top level of
# the cell rather than being passed to asyncio.run().
await process_all_properties(urls)


Error loading page https://www.immoweb.be/en/classified/house/for-sale/rhode-saint-genese/1640/20236654: Message: 
Stacktrace:
	GetHandleVerifier [0x00695523+24195]
	(No symbol) [0x0062AA04]
	(No symbol) [0x00522093]
	(No symbol) [0x00566ED2]
	(No symbol) [0x0056711B]
	(No symbol) [0x005A76F2]
	(No symbol) [0x0058AB84]
	(No symbol) [0x005A5280]
	(No symbol) [0x0058A8D6]
	(No symbol) [0x0055BA27]
	(No symbol) [0x0055C43D]
	GetHandleVerifier [0x0095CE13+2938739]
	GetHandleVerifier [0x009AEC69+3274185]
	GetHandleVerifier [0x007209C2+594722]
	GetHandleVerifier [0x00727EDC+624700]
	(No symbol) [0x006337CD]
	(No symbol) [0x00630528]
	(No symbol) [0x006306C5]
	(No symbol) [0x00622CA6]
	BaseThreadInitThunk [0x7500FCC9+25]
	RtlGetAppContainerNamedObjectPath [0x76FD80CE+286]
	RtlGetAppContainerNamedObjectPath [0x76FD809E+238]

Save data of 149 properties in properties_async.csv


In [1]:
import asyncio
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import csv
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Load at most the first 50 listing URLs from the first column of the CSV
urls = []
with open('property_links_filtered.csv', 'r', encoding='utf-8') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header line
    for row in reader:
        if len(urls) >= 50:  # stop once 50 URLs have been collected
            break
        urls.append(row[0])

# Semaphore allowing at most 10 holders at a time
semaphore = asyncio.Semaphore(10)

# Asynchronous function to get HTML
async def fetch_url(url):
    """Fetch the rendered HTML of ``url`` using a throw-away Chrome session.

    Selenium calls block the calling thread, so the browser work is handed to
    the event loop's default executor. Run directly inside the coroutine it
    would never yield to the loop, and the coroutines gathered by the caller
    would execute strictly one after another. The module-level ``semaphore``
    limits how many browser sessions are open at the same time.

    Args:
        url: Listing page to load.

    Returns:
        The page source once an element with class
        ``classified__header--immoweb-code`` is present, or None when
        navigation or the 10-second wait fails (the error is printed).
    """

    def _load_page(driver_path):
        # Blocking helper executed in a worker thread.
        driver = webdriver.Chrome(service=Service(driver_path))
        try:
            driver.get(url)
            # Wait until the listing header is in the DOM before reading the page
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "classified__header--immoweb-code"))
            )
            return driver.page_source
        except Exception as e:
            print(f"Error loading page {url}: {e}")
            return None  # the caller skips entries whose page did not load
        finally:
            # Runs on success and on failure, so no browser is left open
            driver.quit()

    async with semaphore:  # hold a slot for the whole lifetime of the browser
        # Resolved on the event-loop thread so that concurrent tasks do not
        # call the driver manager's install step in parallel.
        driver_path = ChromeDriverManager().install()
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, _load_page, driver_path)

# An asynchronous function for processing pages
async def process_property_info(url):
    """Fetch one listing page and extract its basic fields.

    Args:
        url: Listing page to scrape.

    Returns:
        None when ``fetch_url`` returned no HTML; otherwise a dict with the
        keys 'Property ID', 'Locality name', 'Postal code', 'Price',
        'Number of rooms' and 'Living area (m²)'. A value is None when the
        corresponding element is not found in the page.
    """
    html = await fetch_url(url)
    if html is None:
        return None  # page could not be loaded, skip this entry

    soup = BeautifulSoup(html, 'html.parser')

    id_tag = soup.find('div', class_="classified__header--immoweb-code")
    property_id = id_tag.text.strip() if id_tag else None

    # The displayed amount sits in a nested <span aria-hidden="true">; guard both levels.
    price = None
    price_tag = soup.find('p', class_="classified__price")
    amount_tag = price_tag.find('span', {'aria-hidden': 'true'}) if price_tag else None
    if amount_tag:
        price = amount_tag.text.strip()
    if price:
        # Keep only the text before the first '-', then drop the euro sign and commas
        price = price.split('-')[0].strip()
        price = price.replace('€', '').replace(',', '').strip()

    # The second address row is split on the em dash into postal code and locality.
    postal_code = locality = None
    address_rows = soup.find_all('span', class_="classified__information--address-row")
    if len(address_rows) > 1:
        before, dash, after = address_rows[1].text.partition('—')
        if dash:  # leave both as None when the separator is absent
            postal_code, locality = before, after

    rooms_tag = soup.find('span', class_='overview__text')
    number_of_rooms = rooms_tag.text.strip() if rooms_tag else None

    # Scan the detail table for the first row whose header mentions "Living area".
    living_area = None
    for row in soup.find_all('tr', class_='classified-table__row'):
        header = row.find('th')
        if header and "Living area" in header.text:
            cell = row.find('td')
            cell_text = cell.text.strip() if cell else None
            living_area = cell_text.replace('m²', '').strip() if cell_text else None
            break

    return {
        'Property ID': property_id,
        'Locality name': locality.strip() if locality else None,
        'Postal code': postal_code.strip() if postal_code else None,
        'Price': price,
        'Number of rooms': number_of_rooms,
        'Living area (m²)': living_area
    }

# Main function for processing all URLs
async def process_all_properties(urls):
    """Scrape every URL in ``urls`` and write the results to properties_async_last.csv.

    Entries for which ``process_property_info`` returned None are left out of
    the file. The number of rows written is printed at the end.
    """
    scraped = await asyncio.gather(*(process_property_info(link) for link in urls))

    # Drop the entries that came back as None
    kept = []
    for record in scraped:
        if record is not None:
            kept.append(record)

    # Explicit UTF-8 encoding for the output file
    columns = ['Property ID', 'Locality name', 'Postal code', 'Price', 'Number of rooms', 'Living area (m²)']
    with open('properties_async_last.csv', 'w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=columns)
        csv_writer.writeheader()
        for record in kept:
            csv_writer.writerow(record)

    print(f"Save data of {len(kept)} properties in properties_async_last.csv")


# In a Jupyter notebook the coroutine is awaited directly at the top level of
# the cell rather than being passed to asyncio.run().
await process_all_properties(urls)


Save data of 50 properties in properties_async.csv
