## HouseHunt Scraper

In [None]:
# imports
# %pip install undetected-chromedriver selenium beautifulsoup4
import os
from bs4 import BeautifulSoup
import time
import random

from selenium import webdriver 
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc ## use py 3.11 or lesser for this to work.


import json
import re
import pandas as pd
import numpy as np

# Parquet file holding the listings scraped so far; later cells read it back
# to skip listings whose URL was already scraped.
output_file = 'output.parquet'

# Set up Chrome options and start the single undetected-chromedriver browser
# instance that every scraping step below shares.
options = uc.ChromeOptions()
options.add_argument('--no-sandbox')
driver = uc.Chrome(options=options)

In [None]:
## General Functions
# Function to simulate human-like waiting (random sleep time)
def human_sleep(min_sleep=2, max_sleep=5):
    """Block for a randomly chosen number of seconds.

    The pause length is drawn uniformly between ``min_sleep`` and
    ``max_sleep`` so that consecutive requests are not evenly spaced.
    """
    pause_seconds = random.uniform(min_sleep, max_sleep)
    time.sleep(pause_seconds)

### Funda

TBD:
- ~~Filters for different attributes as an input~~
- cleanup kenmerken, choose only columns you want and extract the rest to columns. 
- ~~process listings on next page of homepage and so forth~~
- setup orchestrator/trigger rule. 
-  ~~Have a comparison step before scraping. If the URL is already in the df, ignore it and go to the next listing URL.~~
- ~~Extract phone number & makelaar name~~
- On the home/search_results page, check for status. Some that I saw: 'Verhuurd onder voorbehoud', 'Onder optie'. This can be useful to update the status of an already scraped listing, but might be too much work.
- Email/Telegram automatic message or something


In [None]:
## Funda Scrape Funcs

def get_value_preceding_text(soup, text_search: str, element: str = "span"):
    """Return the text of the element that precedes a labelled element.

    Looks for the first ``element`` tag whose string contains ``text_search``
    (compared case-insensitively) and returns the stripped text of its
    closest previous sibling with the same tag name.

    Args:
        soup: Parsed document (or tag) exposing BeautifulSoup's ``find``.
        text_search: Label text to look for, e.g. ``"wonen"``.
        element: Tag name used for both the label and the value element.

    Returns:
        The stripped text of the preceding sibling, or ``None`` when the
        label or its preceding sibling cannot be found.
    """
    # Lower-case the needle as well as the haystack; otherwise a search term
    # containing an uppercase letter could never match.
    needle = text_search.lower()
    label_tag = soup.find(element, string=lambda t: t and needle in t.lower())
    if not label_tag:
        return None

    value_tag = label_tag.find_previous_sibling(element)
    if not value_tag:
        return None
    return value_tag.text.strip()
        
def kenmerken_extract(sec_kenmerken):
    """Extract the Funda "kenmerken" (features) table into a dict.

    Every ``<dt>`` inside ``sec_kenmerken`` is paired with the ``<dd>``
    sibling that follows it. Working from the ``<dt>`` element itself (rather
    than searching for it again by its stripped text) also covers labels that
    carry surrounding whitespace or nested markup.

    Args:
        sec_kenmerken: Tag containing the ``<dt>``/``<dd>`` pairs, or ``None``
            when the section was not found on the page.

    Returns:
        Dict mapping the stripped ``<dt>`` text to the stripped ``<dd>`` text.
        When a label occurs more than once, the first occurrence wins. Labels
        without a following ``<dd>`` are reported and skipped.
    """
    kenmerken_output = {}
    if sec_kenmerken is None:
        print("No kenmerken section found.")
        return kenmerken_output

    for dt_element in sec_kenmerken.find_all("dt"):
        name = dt_element.text.strip()
        dd_element = dt_element.find_next_sibling("dd")
        if dd_element is None:
            print(f"No <dd> found after <dt> {name!r}.")
            continue
        # setdefault keeps the first value for a repeated label.
        kenmerken_output.setdefault(name, dd_element.text.strip())

    print(f"Num Kenmerken: {len(kenmerken_output)}")
    return kenmerken_output

def format_number(number):
    """Normalise a scraped Dutch phone number.

    All non-digit characters are removed first. The result is then returned
    as:

    * ``+31...`` when the digits start with the country code, written either
      as ``31`` or with the international call prefix ``0031``;
    * the bare digits when they start with ``0`` (national notation, mobile
      ``06...`` as well as landline area codes);
    * the bare digits, after printing an error, for anything else.

    Args:
        number: Raw phone number text, possibly with spaces, dashes,
            parentheses or a leading ``+``.

    Returns:
        The normalised number as a string.
    """
    digits = re.sub(r"\D", "", number)
    if digits.startswith('0031'):
        # Drop the "00" call prefix so both international spellings agree.
        return '+' + digits[2:]
    if digits.startswith('31'):
        return '+' + digits
    if digits.startswith('0'):
        return digits
    print('ERROR: Weird phone number.')
    return digits
        

def retrieve_html(url:str, driver):
    """Open ``url`` in ``driver`` and return the loaded page parsed as soup.

    The page source is read immediately after ``driver.get`` returns and is
    parsed with the ``html.parser`` backend.
    """
    driver.get(url)
    page_markup = driver.page_source
    return BeautifulSoup(page_markup, 'html.parser')

Funda Filters

selected_area=["utrecht","amsterdam"]&price="1000-"&object_type=["apartment","parking"]&publication_date="3"&floor_area="30-"&plot_area="30-"&rooms="1-"&bedrooms="1-"&bathrooms="1-"&rental_agreement=["indefinite_duration","temporary_rent"]&renting_condition=["furnished","partially_furnished","service_cost_included","service_cost_excluded"]&energy_label=["A%2B%2B%2B%2B","G"]&exterior_space_type=["balcony","terrace","garden"]&construction_type=["newly_built","resale"]&construction_period=["unknown","before_1906","from_1931_to_1944"]&open_house=["all","coming_weekend","today"]


In [None]:
## Search filters. Every value ends up wrapped in double quotes in the URL
## (added below), so write plain strings here without embedded quotes.
input_dict = dict()
input_dict["selected_area"] = ["utrecht", "rotterdam", "ijsselstein"]
input_dict["price"] = "600-1400"
input_dict["object_type"] = ["house", "apartment"] #["house","apartment","parking","land","storage_space","storage","berth","substructure","pitch"]
# input_dict["publication_date"]= "5"
input_dict['availability']=["available","negotiations"] #"unavailable"
# input_dict["floor_area"] = "30-"
# input_dict["plot_area"]="30-"
# input_dict["rooms"]="1-"
# input_dict["bedrooms"] = "1-"
# input_dict["bathrooms"]="1-"
# input_dict["rental_agreement"] = ["indefinite_duration", "temporary_rent"]
# input_dict["renting_condition"] = ["furnished", "partially_furnished", "service_cost_included", "service_cost_excluded"]
# input_dict["construction_type"] = ["newly_built","resale"]
# input_dict["open_house"] = ["all","coming_weekend","today"]


def _format_filter(name, value):
    """Render one filter as ``name="value"`` or ``name=["a","b"]``.

    Raises:
        TypeError: if ``value`` is neither a string nor a list/tuple, so a
            mistyped filter is reported instead of silently left out of the URL.
    """
    if isinstance(value, str):
        return f'{name}="{value}"'
    if isinstance(value, (list, tuple)):
        # Each list item is wrapped in double quotes as well.
        quoted_items = ",".join(f'"{item}"' for item in value)
        return f"{name}=[{quoted_items}]"
    raise TypeError(
        f"Filter {name!r} must be a str or a list of str, got {type(value).__name__}"
    )


all_filters = [_format_filter(name, value) for name, value in input_dict.items()]


# URL of homepage
homepage_url = "https://www.funda.nl/zoeken/huur"+"?"+"&".join(all_filters)
print(homepage_url)



In [None]:
## Process flow to extract ads from funda
## Step 1: open the search results page and parse it.
driver.get(homepage_url)

# Best effort: click the decline button if it shows up within 10 seconds.
# NOTE(review): the element id suggests this is the cookie-consent banner — confirm.
# Any failure (button absent, timeout, click error) is only reported, so the
# scrape continues either way.
try:
    # Wait for the decline button to be clickable (ensure the page has loaded)
    decline_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="didomi-notice-disagree-button"]'))
    )

    # Click the decline button
    decline_button.click()
    human_sleep()  # Simulate delay after clicking the decline button

except Exception as e:
    print(f"Error while clicking decline button: {e}")

# Wait for some more time to ensure page content has loaded before scraping
human_sleep(3, 6)

# Snapshot the rendered page and parse it; the cells below read the pagination
# links and the listing URLs from soup_search_results.
html = driver.page_source
soup_search_results = BeautifulSoup(html, 'html.parser')

# driver.quit()

### Work out how many result pages there are from the pagination links
### (all <a> tags whose href contains "?page=<number>").
pages = soup_search_results.find_all('a', href=re.compile(r'\?page=\d+'))
page_numbers = []
for page_link in pages:
    # Only hrefs that end in digits carry a usable page number.
    page_match = re.search(r'(\d+)$', page_link['href'])
    if page_match:
        page_numbers.append(int(page_match.group(1)))
# A single page of results has no pagination links; fall back to one page
# instead of letting max() fail on an empty list.
total_pages = max(page_numbers, default=1)


## Collect the listing URLs from the JSON-LD block of every result page.
listing_urls = []
for page_number in range(1, total_pages + 1):
    if page_number > 1:
        # Page 1 is already loaded in soup_search_results; fetch the others.
        human_sleep()
        soup_search_results = retrieve_html(homepage_url + "&search_result="+str(page_number), driver)

    # On the search results page this element holds the URLs of all listings.
    script_tag = soup_search_results.find('script', {'type': 'application/ld+json'})
    if not (script_tag and script_tag.string):
        print("No script tag with JSON-LD data found.")
        continue

    json_data = json.loads(script_tag.string)
    page_urls = [item['url'] for item in json_data.get('itemListElement', [])]
    listing_urls.extend(page_urls)

    # Print only the URLs found on this page, not the whole list again.
    for url in page_urls:
        print(url)



## Skip listings that were already scraped in an earlier run.
# Load the saved data once, up front: this avoids re-reading the parquet file
# for every URL and guarantees saved_df exists even when listing_urls is empty.
if os.path.exists(output_file):
    saved_df = pd.read_parquet(output_file)
    scraped_urls = set(saved_df['listing_url'])
else:
    saved_df = pd.DataFrame()
    scraped_urls = set()

processing_listing_urls = []
for url in listing_urls:
    if url in scraped_urls:
        print('Already scraped. Ignore or repeat scrape later.')
    else:
        print('New listing. Processing:')
        processing_listing_urls.append(url)


## Scrape every new listing into one row of df_funda.
df_funda = pd.DataFrame(columns=['price', 'address', 'city', 'province', 'postcode', 'living_area', 'num_bedrooms', 'kenmerken', 'omschrijving', 'phone', 'makelaar_name', 'makelaar_url', 'status', 'listing_url', 'timestamp'])
for url in processing_listing_urls:
    # Start each listing from an all-None row keyed by column name. A field
    # that cannot be extracted then stays None instead of crashing the run or
    # silently reusing the value scraped for the previous listing.
    listing = dict.fromkeys(df_funda.columns)
    listing['listing_url'] = url

    driver.get(url)
    scrape_time = time.time()
    listing['timestamp'] = scrape_time
    human_sleep(3, 6)  # allow time for page content to load dynamically

    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')

    ## Price, read from the page's JSON-LD block.
    ld_json_script = soup.find('script', type='application/ld+json')
    try:
        listing['price'] = json.loads(ld_json_script.string)['offers']['price']
    except (AttributeError, TypeError, KeyError, ValueError) as e:
        # Missing script tag, empty tag, unexpected structure or invalid JSON.
        print(f"Price not found. {e!r}")

    ## Address: the text after the colon in the page title, without a trailing "[Funda]".
    title_text = soup.title.get_text() if soup.title else ''
    match = re.search(r':\s*(.*?)\s*(?:\[Funda\])?$', title_text)
    if match:
        address = match.group(1)
        print('Address: '+ address)  # e.g. Hoekeindseweg 162 2665 KH Bleiswijk
    else:
        address = None
        print("No match found.")
    listing['address'] = address

    ## City, province and postcode are attributes of the first <div> that has a "city" attribute.
    city_div = soup.find('div', attrs={"city": True})
    if city_div is not None:
        listing['city'] = city_div.get('city')
        listing['province'] = city_div.get('province')
        listing['postcode'] = city_div.get('postcode')
    else:
        print("Address details not found.")

    listing['num_bedrooms'] = get_value_preceding_text(soup, 'slaapkamers')

    ## Kenmerken (features) table.
    sec_kenmerken = soup.find('section', {'id': 'features'})
    alle_kenmerken = kenmerken_extract(sec_kenmerken) if sec_kenmerken is not None else {}
    print(alle_kenmerken)
    # Stored as None when nothing could be extracted.
    listing['kenmerken'] = alle_kenmerken or None

    # Living area: the first run of digits in the "Wonen" feature.
    living_area_match = re.search(r'\d+', alle_kenmerken.get('Wonen', ''))
    listing['living_area'] = living_area_match.group() if living_area_match else None

    listing['status'] = alle_kenmerken.get('Status')

    ## Description: the 'listing-description-text' div following the "Omschrijving" heading.
    omschrijving = None
    h2_element = soup.find('h2', string="Omschrijving")
    if h2_element:
        description_div = h2_element.find_next('div', class_='listing-description-text')
        if description_div:
            omschrijving = description_div.get_text(strip=True)
            print(omschrijving)
        else:
            print("Description div not found.")
    else:
        print("Heading 'Omschrijving' not found.")
    listing['omschrijving'] = omschrijving

    ## Phone number: the first element whose text contains "Bel" and at least one digit.
    formatted_phone = None
    elements = soup.find_all(lambda tag: tag.name and tag.string and "Bel" in tag.string and any(char.isdigit() for char in tag.string))
    if elements:
        raw_text = elements[0].get_text()
        match = re.search(r"(\+?\(?\d+\)?[\d\- ]+)", raw_text)
        if match:
            formatted_phone = format_number(match.group(0))
            print("Formatted number:", formatted_phone)
        else:
            print("No valid phone number found in:", raw_text)
    else:
        print('Phone number not found.')
    listing['phone'] = formatted_phone

    ## Makelaar name and URL, from links pointing at funda.nl/makelaars/.
    ## As before, the last matching link on the page wins.
    makelaar_details = {'makelaar_url':None, 'makelaar_name':None}
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        if 'https://www.funda.nl/makelaars/' in href.lower():
            # The title attribute holds the name; empty string if it is absent.
            makelaar_details = {'makelaar_url':href, 'makelaar_name':a_tag.get('title', '')}
    listing['makelaar_name'] = makelaar_details['makelaar_name']
    listing['makelaar_url'] = makelaar_details['makelaar_url']

    # Append the row, with values ordered like the DataFrame's columns.
    df_funda.loc[len(df_funda)] = [listing[column] for column in df_funda.columns]

driver.quit()

## Append the newly scraped rows to the previously saved ones and save.
# Leave empty frames out of the concat: pandas deprecates concatenating empty
# frames, and the result is the same without them.
non_empty_frames = [frame for frame in (saved_df, df_funda) if not frame.empty]
if non_empty_frames:
    updated_df = pd.concat(non_empty_frames, ignore_index=True)
else:
    updated_df = df_funda.copy()

# Write to output_file, the same path the saved data is read from, rather
# than to a second hard-coded copy of the file name.
updated_df.to_parquet(output_file, index=False)
print(f"Updated data saved to {output_file}")


# # Find the span that contains "wonen"
# wonen_span = soup.find('span', string=lambda t: t and 'perceel' in t.lower())

# # Get the previous sibling span - Useful to find area
# if wonen_span:
#     prev_span = wonen_span.find_previous_sibling('span')
#     if prev_span:
#         print(prev_span.text.strip())  # Output: 110 m²
# soup.find('section', {'id': 'features'})


In [None]:
# Display the combined DataFrame (saved rows plus this run's rows) as the cell output.
updated_df

## Pararius

Try to retrieve the same attributes as for Funda to keep the database consistent; attributes that are not available are filled with None.