In [85]:
import functools
import io
import itertools
from multiprocessing import Pool

import pandas as pd
import requests
from tqdm.contrib.concurrent import thread_map, process_map

In [86]:
def get_price_range(minPrice, maxPrice, session):
    """Split a price interval into sub-ranges small enough to page through.

    Asks the search endpoint how many listings are priced between
    ``minPrice`` and ``maxPrice`` and converts that count into a page count
    (``marketingCount // 30 + 1``; presumably 30 results per page — confirm
    against the API). A range that needs more than 333 pages is halved and
    both halves are examined recursively.

    Adjacent sub-ranges share their boundary price (``mid_price`` ends one
    range and starts the next), so a listing priced exactly at a boundary may
    be reported by both; callers should de-duplicate ids.

    Args:
        minPrice: Lower price bound sent as the ``minPrice`` query parameter.
        maxPrice: Upper price bound sent as the ``maxPrice`` query parameter.
        session: Object exposing a requests-style ``get(url)`` whose result
            has a ``json()`` method.

    Returns:
        A list of ``(minPrice, maxPrice, num_pages)`` tuples covering the
        requested interval, ordered by ascending price. A tuple may carry a
        page count above 333 only when its range is too narrow to be split
        any further.

    Raises:
        KeyError: If the response payload has no ``marketingCount`` entry.
    """
    api_url = f"https://www.immoweb.be/en/search-results/house-and-apartment/for-sale?countries=BE&page=1&orderBy=newest&isALifeAnnuitySale=false&minPrice={minPrice}&maxPrice={maxPrice}"
    num_pages = session.get(api_url).json()['marketingCount'] // 30 + 1
    # Only split while halving actually shrinks the range. Once the bounds are
    # at most 1 apart, mid_price equals minPrice and the second half would be
    # the very same range again, recursing until the stack overflows.
    if num_pages > 333 and maxPrice - minPrice > 1:
        mid_price = (minPrice + maxPrice) // 2
        return get_price_range(minPrice, mid_price, session) + get_price_range(mid_price, maxPrice, session)
    return [(minPrice, maxPrice, num_pages)]

def get_ids_from_page(i, minPrice, maxPrice, session):
    """Return the listing ids found on one page of a price-filtered search.

    Args:
        i: Page number inserted into the ``page`` query parameter.
        minPrice: Lower price bound sent as ``minPrice``.
        maxPrice: Upper price bound sent as ``maxPrice``.
        session: Object exposing a requests-style ``get(url)`` whose result
            has a ``json()`` method.

    Returns:
        A list with the ``'id'`` of every entry under the payload's
        ``'results'`` key, in response order.
    """
    api_url = f"https://www.immoweb.be/en/search-results/house-and-apartment/for-sale?countries=BE&page={i}&orderBy=newest&isALifeAnnuitySale=false&minPrice={minPrice}&maxPrice={maxPrice}"
    payload = session.get(api_url).json()
    listing_ids = []
    for listing in payload['results']:
        listing_ids.append(listing['id'])
    return listing_ids

def get_ids_for_category(minPrice, maxPrice, num_pages, session):
    """Collect the unique listing ids from pages 1..num_pages of one price range.

    Each page is fetched through ``get_ids_from_page`` via ``thread_map``,
    and the per-page id lists are merged into a single set.

    Args:
        minPrice: Lower price bound of the range.
        maxPrice: Upper price bound of the range.
        num_pages: Number of pages to request (pages are 1-based).
        session: Session object forwarded to ``get_ids_from_page``.

    Returns:
        A set containing every id returned by any of the requested pages.
    """
    fetch_page = functools.partial(
        get_ids_from_page, minPrice=minPrice, maxPrice=maxPrice, session=session
    )
    page_numbers = range(1, num_pages + 1)
    ids_per_page = thread_map(fetch_page, page_numbers)

    unique_ids = set()
    for page_ids in ids_per_page:
        unique_ids.update(page_ids)
    return unique_ids

def get_property(id, session):
    """Download one classified page and flatten its HTML tables into one row.

    All tables found on the page are concatenated, the first column is used
    as the label index, and the result is transposed so that labels become
    columns. The row is indexed by ``id``; when a label occurs more than
    once only its first column is kept.

    Args:
        id: Listing identifier inserted into the classified URL.
        session: Object exposing a requests-style ``get(url, timeout=...)``
            whose result has a ``text`` attribute.

    Returns:
        A DataFrame indexed by ``id``, or ``None`` when the page could not
        be fetched or parsed (best-effort: one bad listing must not abort a
        bulk download).
    """
    property_url = f"http://www.immoweb.be/en/classified/{id}"
    try:
        html = session.get(property_url, timeout=5).text
        # Recent pandas deprecates passing literal HTML to read_html; a
        # file-like wrapper is accepted by old and new versions alike.
        tables = pd.read_html(io.StringIO(html))
        df = pd.concat(tables).set_index(0).T
        df['id'] = id
        df = df.set_index('id')
        return df.loc[:, ~df.columns.duplicated()]
    except Exception:
        # Deliberately not a bare `except:` so that KeyboardInterrupt and
        # SystemExit still propagate and the download can be stopped.
        return None

def get_properties(ids, session, max_workers=64):
    """Fetch many listings concurrently and stack them into one DataFrame.

    Args:
        ids: Iterable of listing identifiers handed to ``get_property``.
        session: Session object forwarded to ``get_property``.
        max_workers: Thread count passed to ``thread_map``.

    Returns:
        The concatenation of every frame ``get_property`` produced.
        Listings for which it returned ``None`` are skipped. When there is
        nothing to concatenate (no ids, or every fetch failed) an empty
        DataFrame is returned instead of letting ``pd.concat`` raise
        ``ValueError``.
    """
    fetch = functools.partial(get_property, session=session)
    frames = [
        frame
        for frame in thread_map(fetch, ids, max_workers=max_workers)
        if frame is not None
    ]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [87]:
# Gather the ids of every listing priced between 0 and 10**7, one price
# bucket at a time, then download the details of each listing over the same
# HTTP session using 16 worker threads.
ids = set()
with requests.Session() as session:
    price_buckets = get_price_range(0, 10**7, session)
    for minPrice, maxPrice, num_pages in price_buckets:
        ids |= get_ids_for_category(minPrice, maxPrice, num_pages, session)
    properties = get_properties(ids, session, 16)

100%|██████████| 205/205 [00:07<00:00, 27.72it/s]
100%|██████████| 219/219 [00:06<00:00, 34.17it/s]
100%|██████████| 246/246 [00:09<00:00, 26.86it/s]
100%|██████████| 305/305 [00:12<00:00, 23.47it/s]
100%|██████████| 306/306 [00:11<00:00, 26.55it/s]
100%|██████████| 270/270 [00:09<00:00, 28.64it/s]
100%|██████████| 207/207 [00:07<00:00, 28.09it/s]
100%|██████████| 272/272 [00:10<00:00, 26.25it/s]
100%|██████████| 268/268 [00:09<00:00, 29.75it/s]
100%|██████████| 253/253 [00:09<00:00, 27.66it/s]
100%|██████████| 103/103 [00:04<00:00, 24.24it/s]
100%|██████████| 61/61 [00:02<00:00, 30.11it/s]
100%|██████████| 46/46 [00:01<00:00, 28.34it/s]
  with self._cond:
  3%|▎         | 1663/65212 [00:58<37:14, 28.44it/s]  


KeyboardInterrupt: 

In [None]:
# Write the scraped listings to properties.csv.
# NOTE(review): `properties` is only assigned at the end of the scraping cell; in the
# recorded run that cell stopped with KeyboardInterrupt, so re-run it to completion first.
properties.to_csv('properties.csv')

In [84]:
# Run the standalone async experiment in a subprocess.
# NOTE(review): the traceback recorded for this cell shows
# `with httpx.AsyncClient() as session:` failing with `AttributeError: __enter__`
# (test_async.py line 9); the script presumably needs `async with` there — fix it in the script.
!python3 test_async.py

Traceback (most recent call last):
  File "/home/snape/Documents/challenge-collecting-data/utils/test_async.py", line 13, in <module>
    print(asyncio.run(get_ids_for_category(100000, 200000, 100)))
  File "/usr/lib/python3.10/asyncio/runners.py", line 44, in run
    return loop.run_until_complete(main)
  File "/usr/lib/python3.10/asyncio/base_events.py", line 646, in run_until_complete
    return future.result()
  File "/home/snape/Documents/challenge-collecting-data/utils/test_async.py", line 9, in get_ids_for_category
    with httpx.AsyncClient() as session:
AttributeError: __enter__


In [81]:
# Schedule the async id collection on the current event loop and keep the resulting future.
# NOTE(review): `asyncio` is not imported in the import cell of this notebook, and the
# `get_ids_for_category` defined above is a plain function that requires a `session`
# argument — this cell presumably relied on an async variant defined elsewhere
# (e.g. test_async.py); confirm before re-running on a fresh kernel.
# NOTE(review): run_coroutine_threadsafe is intended for submitting work from a different
# thread than the one running the loop — verify this is what is wanted here.
loop = asyncio.get_event_loop()
moo = asyncio.run_coroutine_threadsafe(get_ids_for_category(minPrice=0, maxPrice=10**5, num_pages=100), loop)

In [82]:
# Inspect the future; the output recorded for this cell shows it finished by raising AttributeError.
moo

<Future at 0x7fe184ccf430 state=finished raised AttributeError>