# Scraping data from a real estate agency website

## 0) Import all required libraries

In [6]:
# to extract all property URLs (a real browser is needed to handle the JavaScript)
from selenium import webdriver 

# to access the html content of a single property url
import requests 

# to select parts of an XML or HTML text using CSS or XPath and extract data from it
from parsel import Selector 

# to select parts of an XML or HTML using BeautifulSoup (XPath not supported)
from bs4 import BeautifulSoup 

# to use regular expressions
import re 

# to build the dataframe
import pandas as pd 

# to build a dictionary from a string
import json 

# to build a defaultdict
from collections import defaultdict

## 1) Obtain 10000 URLs of houses with webdriver (apartments below)

In [13]:
class PropertyLinkScraping:
    """Walk paginated search results and scrape every listing they link to.

    ``link`` is the search URL up to and including ``page=``; the page number
    and the ``&orderBy=relevance`` suffix are appended for each request.
    """

    def __init__(self, link):
        self.url = link

    def collect_property_link(self, first_page=1, last_page=1):
        """Scrape every listing linked from the requested result pages.

        Each result page is loaded in Chrome, the listing links are extracted
        from the rendered HTML, and every listing is handed to
        ``HouseApartmentScraping``, which appends one row to the CSV file.

        Args:
            first_page: Number of the first result page to visit.
            last_page: Number of the last result page to visit (inclusive).
                The defaults visit page 1 only.
        """
        # NOTE(review): recent Selenium 4 releases expect the driver path to be
        # passed through a Service object instead of ``executable_path`` -
        # confirm against the installed Selenium version.
        driver = webdriver.Chrome(executable_path='chromedriver.exe')
        try:
            # The implicit wait applies to the whole session, so set it once.
            # NOTE(review): it only affects element lookups, which this method
            # never performs - page_source is read right after get() returns.
            driver.implicitly_wait(10)

            # XPath matching the link of every listing on a result page
            xpath_property = '//*[@id="main-content"]/li//h2//a/@href'

            for page in range(first_page, last_page + 1):
                page_url = self.url + str(page) + '&orderBy=relevance'
                driver.get(page_url)

                # Parse the browser-rendered HTML and pull out the listing URLs
                sel = Selector(text=driver.page_source)
                for property_url in sel.xpath(xpath_property).extract():
                    HouseApartmentScraping(property_url).add_csv()
        finally:
            # Always shut the browser down, even if a page or a listing fails
            driver.quit()

In [None]:
class HouseApartmentScraping:
    """Scrape one property listing page and expose its attributes.

    Constructing an instance downloads ``url``, parses the page with
    BeautifulSoup, decodes the JSON object found in the page's
    ``window.classified`` script and extracts every field eagerly.

    NOTE: ``__init__`` stores each extracted value under the same name as the
    method that computed it (``self.price = self.price()``), so on a fully
    constructed instance ``obj.price`` is the value, not a callable.

    A field that is absent from the listing data is stored as ``None``.
    """

    # Errors that mean "this field is absent or malformed in the listing":
    # a missing key, indexing into None or a non-mapping, or a failed int().
    _FIELD_ERRORS = (KeyError, TypeError, ValueError, OverflowError)

    def __init__(self, url):
        self.url = url

        # The timeout keeps one unresponsive listing from hanging the crawl
        self.html = requests.get(self.url, timeout=30).content
        self.soup = BeautifulSoup(self.html, 'html.parser')

        # house_dict must be resolved first: every extractor below reads it
        self.house_dict = self.house_dict()

        self.type_property = self.type_property()
        self.locality = self.locality()
        self.subtype = self.subtype()
        self.price = self.price()
        self.type_sale = self.type_sale()
        self.num_rooms = self.num_rooms()
        self.area = self.area()
        self.kitchen = self.kitchen()
        self.furnished = self.furnished()
        self.fire = self.fire()
        self.terrace_area = self.terrace_area()
        self.garden_area = self.garden_area()
        self.land = self.land()
        self.num_facade = self.num_facade()
        self.pool = self.pool()
        self.state = self.state()

    @staticmethod
    def _parse_classified(script_text):
        """Decode the outermost ``{...}`` object in a script's text, or None."""
        start = script_text.find("{")
        end = script_text.rfind("}")
        if start == -1 or end < start:
            return None
        try:
            return json.loads(script_text[start:end + 1])
        except ValueError:  # json.JSONDecodeError is a ValueError
            return None

    def _field(self, *keys):
        """Return the nested value at ``keys`` in the listing data.

        Raises one of ``_FIELD_ERRORS`` when any step of the path is missing.
        """
        node = self.house_dict
        for key in keys:
            node = node[key]
        return node

    def house_dict(self):
        """Return the listing data embedded in the page, or None if absent."""
        scripts = self.soup.find_all('script', attrs={"type": "text/javascript"})
        for tag in scripts:
            script_text = tag.string
            # The first script mentioning 'window.classified' holds the data
            if script_text is not None and 'window.classified' in script_text:
                return self._parse_classified(script_text)
        return None

    def type_property(self):
        """Return the property type, or None if unavailable."""
        try:
            return self._field('property', 'type')
        except self._FIELD_ERRORS:
            return None

    def locality(self):
        """Return the postal code of the property, or None if unavailable."""
        try:
            return self._field('property', 'location', 'postalCode')
        except self._FIELD_ERRORS:
            return None

    def subtype(self):
        """Return the property subtype, or None if unavailable."""
        try:
            return self._field('property', 'subtype')
        except self._FIELD_ERRORS:
            return None

    def price(self):
        """Return the sale price as an int, or None if unavailable."""
        try:
            return int(self._field('transaction', 'sale', 'price'))
        except self._FIELD_ERRORS:
            return None

    def type_sale(self):
        """Return a label for the kind of sale, or None if no flag is set."""
        try:
            if self._field('flags', 'isPublicSale') is True:
                return 'Public Sale'
            if self._field('flags', 'isNotarySale') is True:
                return 'Notary Sale'
            if self._field('flags', 'isAnInteractiveSale') is True:
                return 'Interactive Sale'
            return None
        except self._FIELD_ERRORS:
            return None

    def num_rooms(self):
        """Return the bedroom count as an int, or None if unavailable."""
        try:
            return int(self._field('property', 'bedroomCount'))
        except self._FIELD_ERRORS:
            return None

    def area(self):
        """Return the net habitable surface as an int, or None if unavailable."""
        try:
            return int(self._field('property', 'netHabitableSurface'))
        except self._FIELD_ERRORS:
            return None

    def kitchen(self):
        """Return 1 if a kitchen type is given, 0 if it is empty, else None."""
        try:
            return 1 if self._field('property', 'kitchen', 'type') else 0
        except self._FIELD_ERRORS:
            return None

    def furnished(self):
        """Return 1 if sold furnished, 0 if not, None if the field is missing."""
        try:
            flag = self._field('transaction', 'sale', 'isFurnished')
        except self._FIELD_ERRORS:
            return None
        return 1 if flag is True else 0

    def fire(self):
        """Return 1 if a fireplace exists, 0 if not, None if unavailable."""
        try:
            flag = self._field('property', 'fireplaceExists')
        except self._FIELD_ERRORS:
            return None
        return 1 if flag is True else 0

    def terrace_area(self):
        """Return the terrace surface as an int, 0 without a terrace, else None."""
        try:
            if self._field('property', 'hasTerrace') is True:
                return int(self._field('property', 'terraceSurface'))
            return 0
        except self._FIELD_ERRORS:
            return None

    def garden_area(self):
        """Return the garden surface, 0 without a garden, else None.

        Unlike ``terrace_area`` the surface is returned as found in the
        listing data, without conversion to int.
        """
        try:
            if self._field('property', 'hasGarden') is True:
                return self._field('property', 'gardenSurface')
            return 0
        except self._FIELD_ERRORS:
            return None

    def land(self):
        """Return the land surface, 0 when no land entry is set, else None."""
        try:
            if self._field('property', 'land') is not None:
                return self._field('property', 'land', 'surface')
            return 0
        except self._FIELD_ERRORS:
            return None

    def num_facade(self):
        """Return the number of facades as an int, or None if unavailable."""
        try:
            return int(self._field('property', 'building', 'facadeCount'))
        except self._FIELD_ERRORS:
            return None

    def pool(self):
        """Return 1 if the raw page mentions 'swimming pool', otherwise 0."""
        # Searches the whole page text, not the structured listing data
        return 1 if 'swimming pool' in str(self.html) else 0

    def state(self):
        """Return the building condition, or None if unavailable."""
        try:
            return self._field('property', 'building', 'condition')
        except self._FIELD_ERRORS:
            return None

    def add_csv(self):
        """Append this listing as one comma-separated row to the CSV file.

        Every value goes through ``str()``, so a missing field is written as
        ``None`` instead of raising ``TypeError`` part-way through the row.
        """
        values = (
            self.locality,
            self.type_property,
            self.subtype,
            self.price,
            self.type_sale,
            self.num_rooms,
            self.area,
            self.kitchen,
            self.furnished,
            self.fire,
            self.terrace_area,
            self.garden_area,
            self.land,
            self.num_facade,
            self.pool,
            self.state,
        )
        # Build the whole row first so a failure never leaves a partial line
        row = ",".join(str(value) for value in values) + "\n"
        with open('houses_apartments_urls.csv', 'a', encoding='utf-8') as file:
            file.write(row)

In [14]:
# Scrape the houses for sale: walk the search results and append every
# listing found to the CSV file.
houses_url = PropertyLinkScraping(
    link='https://www.immoweb.be/en/search/house/for-sale?countries=BE&page='
)
houses_url.collect_property_link()

In [15]:
# Scrape the apartments for sale the same way; rows are appended to the
# same CSV file as the houses.
apartments_url = PropertyLinkScraping(
    link='https://www.immoweb.be/en/search/apartment/for-sale?countries=BE&page='
)
apartments_url.collect_property_link()

In [32]:
# Column names for the scraped CSV, which is written without a header row.
# The order matches the order in which the fields are written per listing.
header = [
    'Locality',
    'Type of property',
    'Subtype of property',
    'Price',
    'Type of sale',
    'Number of rooms',
    'Living surface area',
    'Kitchen',
    'Furnished',
    'Open fire',
    'Terrace',
    'Garden',
    'Surface of the land',
    'Number of facades',
    'Swimming pool',
    'State of the building',
]

# Load every scraped listing, labelling the columns with the names above
# (the comma is already read_csv's default separator).
df = pd.read_csv("houses_apartments_urls.csv", names=header)

In [33]:
# Size of the loaded data as (rows, columns).
df.shape

(59, 16)

In [34]:
# Preview the first rows (the house listings, which were scraped first).
df.head()

Unnamed: 0,Locality,Type of property,Subtype of property,Price,Type of sale,Number of rooms,Living surface area,Kitchen,Furnished,Open fire,Terrace,Garden,Surface of the land,Number of facades,Swimming pool,State of the building
0,4180,HOUSE,HOUSE,195000.0,,3.0,242.0,1.0,0,0,36,100,475,3.0,0,GOOD
1,7500,HOUSE_GROUP,HOUSE_GROUP,,,,,,0,0,0,0,0,,0,
2,4480,HOUSE,HOUSE,245000.0,,5.0,142.0,1.0,0,0,3749,250,615,4.0,0,TO_BE_DONE_UP
3,4180,HOUSE,MIXED_USE_BUILDING,295000.0,,3.0,242.0,1.0,0,0,36,1000,1403,3.0,0,GOOD
4,4570,HOUSE,HOUSE,549000.0,,5.0,365.0,1.0,0,0,15,1850,25157,4.0,0,GOOD


In [25]:
# Preview the last rows (the apartment listings, which were scraped last).
df.tail()

Unnamed: 0,Index,Locality,Type of property,Subtype of property,Price,Type of sale,Number of rooms,Living surface area,Kitchen,Furnished,Open fire,Terrace,Garden,Surface of the land,Number of facades,Swimming pool,State of the building
54,1000,APARTMENT_GROUP,APARTMENT_GROUP,,,,,,0,0,0,0.0,0,,0,,
55,1000,APARTMENT,APARTMENT,1795000.0,,4.0,650.0,1.0,0,1,400,,0,3.0,1,AS_NEW,
56,1160,APARTMENT,APARTMENT,630000.0,,3.0,160.0,1.0,0,0,23,0.0,0,4.0,0,AS_NEW,
57,1180,APARTMENT,PENTHOUSE,1500000.0,,3.0,220.0,1.0,0,0,60,0.0,0,3.0,0,AS_NEW,
58,5300,APARTMENT_GROUP,APARTMENT_GROUP,,,,,,0,0,0,0.0,0,,0,,
