In [1]:
from selenium import webdriver
import requests
from parsel import Selector
from bs4 import BeautifulSoup

## 1) We obtain the URLs of about 10,000 properties with the web driver.

In [3]:
# Start a Chrome session driven by the chromedriver binary at the given path.
# NOTE(review): recent Selenium 4 releases dropped the `executable_path`
#   argument in favour of a Service object - confirm the installed version.
driver = webdriver.Chrome(executable_path='chromedriver.exe')

# houses_url holds one list of listing URLs per search-result page, so
#   houses_url[p][k] is the k-th listing URL found on the (p+1)-th page.
houses_url = []

# XPath query selecting the link of every house on an immoweb result page.
xpath_houses = '//*[@id="main-content"]/li//h2//a/@href'

try:
    # An implicit wait tells WebDriver to poll the DOM for a certain amount
    #   of time when trying to find an element that is not immediately
    #   available. It is a session-wide setting, so it is configured once
    #   here instead of on every loop iteration.
    # NOTE(review): this wait applies to element look-ups; `page_source`
    #   below is read as soon as `get` returns - confirm the listings are
    #   already rendered at that point.
    driver.implicitly_wait(10)

    # The page number 'i' is used to build the URL of each result page.
    #   The original notes mention 333 result pages; this range only visits
    #   pages 1 and 2. Raise the upper bound to crawl more pages.
    for i in range(1, 3):
        page_query = str(i) + '&orderBy=relevance'
        url = 'https://www.immoweb.be/en/search/house/for-sale?countries=BE&page=' + page_query

        # Navigate the browser to the result page.
        driver.get(url)

        # `Selector` allows selecting parts of an XML or HTML text using CSS
        #   or XPath expressions and extracting data from it.
        sel = Selector(text=driver.page_source)

        # Find the nodes matching the XPath query and return their hrefs.
        page_houses_url = sel.xpath(xpath_houses).extract()

        # According to the original notes there are approximately 30 houses
        #   per page. Each page's URL list is appended as one row.
        houses_url.append(page_houses_url)
finally:
    # Always shut the browser down, even when a page fails to load, so that
    #   no Chrome / chromedriver process is left running after the cell.
    driver.quit()

print(houses_url[1][4])
print(len(houses_url))

https://www.immoweb.be/en/classified/mixed-use-building/for-sale/forest/1190/8758672?searchId=5f6ca5c7d8ae3
2


## 2) We scrape all the data of each house with requests

#### Each URL represents a house, and a house has many attributes such as locality, type_property, etc.

In [6]:
import re
class HouseScraping:
    """Download one listing page and expose its attributes.

    On construction the page at ``url`` is fetched with ``requests`` and
    parsed with both ``parsel.Selector`` and ``BeautifulSoup``. Every
    extraction method is then called once and its result is stored on the
    instance under the same name, so callers read plain attributes such as
    ``house.price`` or ``house.pool``.

    Most extraction methods are still placeholders that return ``None``;
    only ``pool`` is implemented.
    """

    # Seconds to wait for the server before giving up; without a timeout a
    # stalled connection would block the whole scraping loop indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, url):
        """Fetch ``url`` and populate every listing attribute.

        Args:
            url: Address of the listing page to download.

        Raises:
            requests.exceptions.RequestException: if the page cannot be
                downloaded (including when the timeout expires).
        """
        self.url = url
        response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
        # Use the decoded body. Calling str() on the raw bytes would produce
        # the "b'...'" repr, with newlines and non-ASCII characters turned
        # into literal backslash escapes.
        self.html = response.text
        self.sel = Selector(text=self.html)
        # Name the parser explicitly so the result does not depend on which
        # parsers happen to be installed (and to avoid bs4's warning).
        # lxml should be available because parsel depends on it.
        self.soup = BeautifulSoup(self.html, 'lxml')

        # Each result replaces the bound method of the same name on this
        # instance; afterwards the names refer to values, not callables.
        self.locality = self.locality()
        self.type_property = self.type_property()
        self.subtype = self.subtype()
        self.price = self.price()
        self.type_sale = self.type_sale()
        self.num_rooms = self.num_rooms()
        self.area = self.area()
        self.kitchen = self.kitchen()
        self.furnished = self.furnished()
        self.fire = self.fire()
        self.terrace_area = self.terrace_area()
        self.garden_area = self.garden_area()
        self.land = self.land()
        self.land_plot = self.land_plot()
        self.num_facade = self.num_facade()
        self.pool = self.pool()
        self.state = self.state()

    def locality(self):
        """Placeholder for the locality; currently always returns None."""
        return None

    def type_property(self):
        """Placeholder for the property type; currently always returns None."""
        return None

    def subtype(self):
        """Placeholder for the property subtype; currently always returns None."""
        return None

    def price(self):
        """Placeholder for the price; currently always returns None."""
        return None

    def type_sale(self):
        """Placeholder for the type of sale; currently always returns None."""
        return None

    def num_rooms(self):
        """Placeholder for the number of rooms; currently always returns None."""
        return None

    def area(self):
        """Placeholder for the area; currently always returns None."""
        return None

    def kitchen(self):
        """Placeholder for the kitchen information; currently always returns None."""
        return None

    def furnished(self):
        """Placeholder for the furnished flag; currently always returns None."""
        return None

    def fire(self):
        """Placeholder for the open-fire flag; currently always returns None."""
        return None

    def terrace_area(self):
        """Placeholder for the terrace area; currently always returns None."""
        return None

    def garden_area(self):
        """Placeholder for the garden area; currently always returns None."""
        return None

    def land(self):
        """Placeholder for the land surface; currently always returns None."""
        return None

    def land_plot(self):
        """Placeholder for the plot surface; currently always returns None."""
        return None

    def num_facade(self):
        """Placeholder for the number of facades; currently always returns None."""
        return None

    def pool(self):
        """Return 1 if the page text contains 'swimming pool', otherwise 0.

        The match is case-sensitive and looks at the whole page text, not at
        a specific field of the listing.
        """
        return 1 if re.search('swimming pool', self.html) else 0

    def state(self):
        """Placeholder for the building state; currently always returns None."""
        return None

#### We collect all the data in `houses_dict`.

In [7]:
from collections import defaultdict
# Output column label paired with the HouseScraping attribute that feeds it.
# The order of this list is the order in which columns are first created.
column_sources = [
    ('Locality', 'locality'),
    ('Type of property', 'type_property'),
    ('Subtype of property', 'subtype'),
    ('Price', 'price'),
    ('Type of sale', 'type_sale'),
    ('Number of rooms', 'num_rooms'),
    ('Area', 'area'),
    ('Fully equipped kitchen', 'kitchen'),
    ('Furnished', 'furnished'),
    ('Open fire', 'fire'),
    ('Terrace', 'terrace_area'),
    ('Garden', 'garden_area'),
    ('Surface of the land', 'land'),
    ('Surface area of the plot of land', 'land_plot'),
    ('Number of facades', 'num_facade'),
    ('Swimming pool', 'pool'),
    ('State of the building', 'state'),
]

# Maps each column label to the list of values collected so far.
houses_dict = defaultdict(list)

for page_urls in houses_url:
    # Only the first three listings of every result page are scraped.
    for house_url in page_urls[:3]:
        house = HouseScraping(house_url)

        # Append one value per column for this house.
        for column, attribute in column_sources:
            houses_dict[column].append(getattr(house, attribute))

## 3) We store all the data in a CSV file using a DataFrame.

In [8]:
import pandas as pd
    
# Turn the collected columns into a table, then write that table to disk.
df = pd.DataFrame(data=houses_dict)
df.to_csv(path_or_buf='all_data_of_the_houses.csv')

In [9]:
# Show the DataFrame as the cell output.
df

Unnamed: 0,Locality,Type of property,Subtype of property,Price,Type of sale,Number of rooms,Area,Fully equipped kitchen,Furnished,Open fire,Terrace,Garden,Surface of the land,Surface area of the plot of land,Number of facades,Swimming pool,State of the building
0,,,,,,,,,,,,,,,,0,
1,,,,,,,,,,,,,,,,0,
2,,,,,,,,,,,,,,,,0,
3,,,,,,,,,,,,,,,,1,
4,,,,,,,,,,,,,,,,0,
5,,,,,,,,,,,,,,,,0,
