In [1]:
import functools
import io
import itertools

import pandas as pd
import requests
from tqdm.contrib.concurrent import thread_map

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def get_ids_from_page(i, property_type, session, timeout=10):
    """Return the listing ids found on one page of Immoweb search results.

    Args:
        i: Page number substituted into the ``page`` query parameter.
        property_type: Path segment of the search URL selecting the kind of
            listing (e.g. ``'apartment'`` or ``'house'``).
        session: Object exposing a ``requests.Session``-style ``get`` method.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the calling thread
            indefinitely.

    Returns:
        list: The ``id`` field of every entry under the ``results`` key of the
        JSON response, in response order.

    Raises:
        Whatever ``response.raise_for_status()`` raises on a 4xx/5xx answer
        (``requests.HTTPError`` for a real session), plus any error from
        decoding the body or from a missing ``results``/``id`` key.
    """
    api_url = f"https://www.immoweb.be/en/search-results/{property_type}/for-sale?countries=BE&page={i}&orderBy=newest&isALifeAnnuitySale=false"
    response = session.get(api_url, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
    response.raise_for_status()
    return [result['id'] for result in response.json()['results']]

def get_ids_for_category(property_type, session, max_pages=333):
    """Collect the unique listing ids for one property type across result pages.

    Pages ``1`` through ``max_pages`` (inclusive) are fetched concurrently with
    ``thread_map``, which also displays a progress bar.

    Args:
        property_type: Forwarded to ``get_ids_from_page`` to select the kind of
            listing.
        session: HTTP session shared by all page requests.
        max_pages: Number of result pages to request. Defaults to 333, the
            value previously hard-coded here.
            NOTE(review): presumably the site's pagination cap -- confirm.

    Returns:
        set: Every id returned by any of the requested pages, de-duplicated.
    """
    fetch_page = functools.partial(
        get_ids_from_page, property_type=property_type, session=session
    )
    ids_per_page = thread_map(fetch_page, range(1, max_pages + 1))
    # Flatten the per-page lists; the set removes ids that appear on several pages.
    return set(itertools.chain.from_iterable(ids_per_page))

def get_property(id, session):
    """Fetch one classified page and flatten its HTML tables into a single row.

    Every table on the page is concatenated, the first column is used as the
    label, and the result is transposed so that each label becomes a column.
    The row is indexed by the listing ``id``. When the same label occurs more
    than once only its first occurrence is kept.

    Args:
        id: Listing identifier, substituted into the classified URL and used
            as the index value of the returned row.
        session: Object exposing a ``requests.Session``-style ``get`` method.

    Returns:
        pandas.DataFrame | None: The flattened listing, or ``None`` when the
        page could not be fetched or parsed (best-effort scraping: a single
        failing listing must not abort the whole run).
    """
    # HTTPS to match the search endpoint on the same host and avoid a plaintext hop.
    property_url = f"https://www.immoweb.be/en/classified/{id}"
    try:
        html = session.get(property_url, timeout=5).text
        # Wrap the markup in a file-like object: passing a literal HTML string
        # to read_html is deprecated in recent pandas releases.
        tables = pd.read_html(io.StringIO(html))
        df = pd.concat(tables).set_index(0).T
        df['id'] = id
        df = df.set_index('id')
        return df.loc[:, ~df.columns.duplicated()]
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and SystemExit
        # must still propagate so a long scrape can be stopped.
        return None

def get_properties(ids, session, max_workers=64):
    """Download every listing in ``ids`` concurrently and stack the results.

    Args:
        ids: Iterable of listing identifiers, each passed to ``get_property``.
        session: HTTP session shared by all worker threads.
        max_workers: Thread-pool size handed to ``thread_map``.

    Returns:
        pandas.DataFrame: Concatenation of the per-listing frames returned by
        ``get_property``.
    """
    fetch_one = functools.partial(get_property, session=session)
    frames = thread_map(fetch_one, ids, max_workers=max_workers)
    return pd.concat(frames)

In [5]:
# Gather the listing ids of both property types, then download every listing.
# A single Session is shared so all requests go through the same session object.
ids = set()
with requests.Session() as session:
    for property_type in ('apartment', 'house'):
        ids |= get_ids_for_category(property_type, session)
    properties = get_properties(ids, session)

100%|██████████| 333/333 [00:13<00:00, 24.05it/s]
100%|██████████| 333/333 [00:13<00:00, 25.29it/s]
100%|██████████| 19980/19980 [05:27<00:00, 60.95it/s] 


In [6]:
# Persist the scraped listings to properties.csv in the working directory.
properties.to_csv('properties.csv')