In [5]:
import functools
import io
import itertools

import pandas as pd
import requests
from tqdm.contrib.concurrent import thread_map

In [6]:
def get_price_range(minPrice, maxPrice, session, max_pages=330, page_size=30):
    """Collect the ids of every listing found between two price bounds.

    The first results page for the range is requested to read its
    ``marketingCount`` field, from which the number of result pages is
    derived. If that exceeds ``max_pages`` the range is halved at its
    midpoint and both halves are crawled recursively; the two id sets are
    unioned, so the midpoint being queried by both halves cannot introduce
    duplicates.

    Args:
        minPrice: Lower price bound placed in the search URL.
        maxPrice: Upper price bound placed in the search URL.
        session: Object with a ``get(url)`` method whose response offers
            ``.json()`` (e.g. ``requests.Session``).
        max_pages: Largest page count crawled for a single range before the
            range is split. NOTE(review): presumably the site refuses to
            serve pages beyond this number -- confirm.
        page_size: Divisor used to turn ``marketingCount`` into a page count.
            NOTE(review): presumably the number of results per page --
            confirm.

    Returns:
        set: Listing ids gathered by ``get_ids_for_category`` for the range
        (or for all of its sub-ranges).
    """
    api_url = f"https://www.immoweb.be/en/search-results/house-and-apartment/for-sale?countries=BE&page=1&orderBy=newest&isALifeAnnuitySale=false&minPrice={minPrice}&maxPrice={maxPrice}"
    total_listings = session.get(api_url).json()['marketingCount']
    # Always yields at least one page; overshoots by one page when the count
    # is an exact multiple of page_size.
    num_pages = total_listings // page_size + 1
    if num_pages > max_pages:
        mid_price = (minPrice + maxPrice) // 2
        # Only split when the midpoint lies strictly inside the range, i.e.
        # both halves are genuinely smaller. Without this guard a range such
        # as (5, 6) recurses on itself forever.
        if minPrice < mid_price < maxPrice:
            lower_ids = get_price_range(minPrice, mid_price, session, max_pages, page_size)
            upper_ids = get_price_range(mid_price, maxPrice, session, max_pages, page_size)
            return lower_ids.union(upper_ids)
        # The range cannot be narrowed any further: crawl what the page
        # limit allows instead of recursing without end.
        num_pages = max_pages
    return get_ids_for_category(minPrice, maxPrice, num_pages, session)

def get_ids_from_page(i, minPrice, maxPrice, session):
    """Return the set of listing ids found on results page ``i``.

    Requests page ``i`` of the search results for the given price bounds via
    ``session.get``, decodes the response as JSON and gathers the ``'id'``
    value of every entry under its ``'results'`` key.
    """
    api_url = f"https://www.immoweb.be/en/search-results/house-and-apartment/for-sale?countries=BE&page={i}&orderBy=newest&isALifeAnnuitySale=false&minPrice={minPrice}&maxPrice={maxPrice}"
    payload = session.get(api_url).json()
    listings = payload['results']
    return {listing['id'] for listing in listings}

def get_ids_for_category(minPrice, maxPrice, num_pages, session, max_workers=64):
    """Fetch result pages 1..``num_pages`` concurrently and merge their ids.

    Args:
        minPrice: Lower price bound forwarded to ``get_ids_from_page``.
        maxPrice: Upper price bound forwarded to ``get_ids_from_page``.
        num_pages: Number of result pages to request, starting at page 1.
        session: HTTP session shared by all worker threads.
        max_workers: Thread-pool size handed to ``thread_map``; defaults to
            the previously hard-coded 64 and mirrors the parameter of the
            same name on ``get_properties``.

    Returns:
        set: Union of the id sets returned for the individual pages.
    """
    fetch_page = functools.partial(
        get_ids_from_page, minPrice=minPrice, maxPrice=maxPrice, session=session
    )
    page_numbers = range(1, num_pages + 1)
    ids_per_page = thread_map(fetch_page, page_numbers, max_workers=max_workers)
    return set(itertools.chain.from_iterable(ids_per_page))

def get_property(id, session):
    """Download one classified page and flatten its HTML tables into one frame.

    All tables parsed from the page are concatenated, column ``0`` becomes
    the index and the result is transposed, so the values that were in
    column ``0`` end up as column names. The frame is then indexed by the
    listing ``id`` and repeated column names are dropped (first occurrence
    wins).
    NOTE(review): presumably each table is a two-column label/value list,
    which would make the result a single row -- confirm.

    Args:
        id: Listing identifier interpolated into the classified URL.
        session: Object with a ``get(url, timeout=...)`` method whose
            response exposes ``.text`` (e.g. ``requests.Session``).

    Returns:
        pandas.DataFrame indexed by ``id``, or ``None`` when the page could
        not be fetched or parsed (best-effort: one bad listing must not
        abort the whole crawl).
    """
    property_url = f"http://www.immoweb.be/en/classified/{id}"
    try:
        html = session.get(property_url, timeout=5).text
        # Recent pandas deprecates passing literal HTML to read_html; a
        # file-like wrapper is accepted by old and new versions alike.
        tables = pd.read_html(io.StringIO(html))
        df = pd.concat(tables).set_index(0).T
        df['id'] = id
        df = df.set_index('id')
        return df.loc[:, ~df.columns.duplicated()]
    except Exception:
        # Deliberately broad, but no longer a bare ``except``: Ctrl-C
        # (KeyboardInterrupt) and SystemExit now propagate.
        return None

def get_properties(ids, session, max_workers=64):
    """Fetch every listing in ``ids`` concurrently and stack the results.

    Args:
        ids: Iterable of listing identifiers.
        session: HTTP session shared by all worker threads.
        max_workers: Thread-pool size handed to ``thread_map``.

    Returns:
        pandas.DataFrame: The frames returned by ``get_property``
        concatenated row-wise. Listings for which ``get_property`` returned
        ``None`` are skipped. When nothing could be fetched (empty ``ids``
        or every fetch failed) an empty frame indexed by ``id`` is returned
        instead of letting ``pd.concat`` raise ``ValueError``.
    """
    fetch_one = functools.partial(get_property, session=session)
    fetched = thread_map(fetch_one, ids, max_workers=max_workers)
    frames = [frame for frame in fetched if frame is not None]
    if not frames:
        return pd.DataFrame(index=pd.Index([], name='id'))
    return pd.concat(frames)

In [7]:
# Full crawl over one shared HTTP session: gather the listing ids for the
# price range 0 .. 10**7, then download the detail tables of every id.
# Long-running: the recorded run below needed about 17 minutes for 65,313 ids.
with requests.Session() as session:
    ids = get_price_range(0, 10**7, session)
    properties = get_properties(ids, session)

100%|██████████| 205/205 [00:03<00:00, 58.63it/s]
100%|██████████| 220/220 [00:03<00:00, 57.35it/s]
100%|██████████| 246/246 [00:04<00:00, 49.90it/s] 
100%|██████████| 306/306 [00:08<00:00, 37.33it/s]
100%|██████████| 306/306 [00:06<00:00, 47.24it/s] 
100%|██████████| 270/270 [00:05<00:00, 48.14it/s]
100%|██████████| 207/207 [00:03<00:00, 53.60it/s]
100%|██████████| 272/272 [00:05<00:00, 46.98it/s] 
100%|██████████| 268/268 [00:05<00:00, 50.81it/s]
100%|██████████| 253/253 [00:04<00:00, 56.62it/s]
100%|██████████| 103/103 [00:02<00:00, 41.52it/s]
100%|██████████| 61/61 [00:04<00:00, 14.49it/s]
100%|██████████| 46/46 [00:00<00:00, 46.79it/s]
100%|██████████| 65313/65313 [17:22<00:00, 62.67it/s]  


In [8]:
# Persist the scraped table as properties.csv (relative path, so it lands in
# the kernel's current working directory).
properties.to_csv('properties.csv')