## Import some packages

In [1]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import time
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import date
from datetime import datetime
from random import randrange
import glob
import os
from tqdm import tqdm_notebook
from tqdm import tqdm

## Start web browser using Selenium

By using Selenium we can basically imitate a real web browser while navigating a website. This is useful because with other methods not every element of the webpage gets loaded.

We can also set some preferences that speed up our web scraping. In this case I do not want to scrape images, so our browser will not load them.

This is the easiest way to start Selenium. As you will see, it will open a Chrome window:

In [2]:
# Chrome preferences: the managed "images" content setting is set to 2 so the
# browser does not load images, which speeds up page loads while scraping.
chrome_options = webdriver.ChromeOptions()
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)

# Pass the options through `options=`; the `chrome_options=` keyword is
# deprecated and triggers a warning when this cell runs.
# NOTE(review): recent Selenium 4 releases also expect the driver path to be
# wrapped in a `Service` object instead of being passed positionally - confirm
# against the installed Selenium version.
driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)



Current google-chrome version is 97.0.4692
Get LATEST driver version for 97.0.4692
Driver [C:\Users\menyh\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=chrome_options)


We can test that it works by running the chunk below which will navigate to a given page and download its content using Selenium and BeautifulSoup.

In [3]:
# Smoke test: open the first listing page in the Selenium-driven browser and
# parse the rendered HTML with BeautifulSoup.
starting_link_to_scrape = "https://ingatlan.com/lista/elado+lakas+budapest?page=1"
driver.get(starting_link_to_scrape)
time.sleep(1)  # fixed one-second pause between navigating and reading the page source
base_source = driver.page_source
base_soup = BeautifulSoup(base_source, 'html.parser')

In [4]:
#base_soup

# Start

We want to scrape www.ingatlan.com, the leading Hungarian housing website, where you can find apartments and houses to buy and rent. First we will focus on the buying market. On the buying market we need to handle apartments and houses differently because they have different properties and require different scraping methods. For now I will just show how to do the scraping for apartments.

# SCRAPING APARTMENTS

## Write a function that gets all the links from one page for apartments

In [5]:
def _listing_text_or_na(listing, css_class):
    """Return the text of the first matching <div> inside `listing`, or 'NA' if absent."""
    node = listing.find('div', {'class': css_class})
    return node.text if node is not None else 'NA'


def get_links_and_other_from_one_page(link = 'https://ingatlan.com/lista/elado+lakas+budapest?page=1',
                                     sleep_time = 1):
    """Scrape the summary data of every listing on one result page.

    Navigates the module-level Selenium `driver` to `link`, waits
    `sleep_time` seconds, parses the page source and extracts one row per
    listing anchor.

    Parameters
    ----------
    link : str
        URL of the result page to load.
    sleep_time : int or float
        Seconds to sleep after navigation before reading the page source.

    Returns
    -------
    pandas.DataFrame
        Columns: 'links', 'prices', 'adresses', 'area_sizes',
        'room_numbers', 'balcony_sizes'. A listing without an href gets
        'no_link'; any other missing field gets 'NA'.
    """
    # Navigate the web browser using Selenium to the given page.
    driver.get(link)
    time.sleep(sleep_time)
    base_soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Every listing on the page is wrapped in an anchor with this class string.
    properties = base_soup.find_all('a', {'class': 'listing__link js-listing-active-area'})

    # Output column name -> class string of the <div> holding that value.
    field_classes = {
        'prices': 'price',
        'adresses': 'listing__address',
        'area_sizes': 'listing__parameter listing__data--area-size',
        'room_numbers': 'listing__parameter listing__data--room-count',
        'balcony_sizes': 'listing__parameter listing__data--balcony-size',
    }

    records = []
    for one_prop in properties:
        # The original fallback called the misspelled `links.appen`, which
        # would itself raise; `.get` with a default needs no try/except.
        record = {'links': one_prop.get('href', 'no_link')}
        for column, css_class in field_classes.items():
            record[column] = _listing_text_or_na(one_prop, css_class)
        records.append(record)

    # Passing `columns` keeps the schema stable even when the page has no listings.
    df = pd.DataFrame(records, columns=['links'] + list(field_classes))

    return df

Test the function for one page and we get the following data:

In [6]:
# Try the single-page scraper on one result page.
# NOTE(review): the variable is called links_page1 but the URL requests page=4.
links_page1 = get_links_and_other_from_one_page('https://ingatlan.com/lista/elado+lakas+budapest?page=4')

In [7]:
# Show the listings scraped from that page.
links_page1

Unnamed: 0,links,prices,adresses,area_sizes,room_numbers,balcony_sizes
0,/xiii-ker/elado+lakas/tegla-epitesu-lakas/3216...,51.9 M Ft,"Szegedi út, XIII. kerület",50 m² terület,2 szoba,8 m² erkély
1,/xiii-ker/elado+lakas/tegla-epitesu-lakas/3231...,50.39 M Ft,"Tahi utca 59-61, XIII. kerület",45 m² terület,2 szoba,9 m² erkély
2,/xiii-ker/elado+lakas/tegla-epitesu-lakas/3232...,77.48 M Ft,"Tahi utca 59-61, XIII. kerület",76 m² terület,4 szoba,7.8 m² erkély
3,/xiii-ker/elado+lakas/tegla-epitesu-lakas/3143...,38 M Ft,"Tahi utca, XIII. kerület",47 m² terület,1 + 1 fél szoba,1 m² erkély
4,/xiii-ker/elado+lakas/tegla-epitesu-lakas/3201...,87.03 M Ft,"Cserhalom utca, XIII. kerület",86 m² terület,4 szoba,26.3 m² erkély
5,/xviii-ker/elado+lakas/csuszozsalus-lakas/3264...,34.9 M Ft,"Lakatos-lakótelep, XVIII. kerület",43 m² terület,1 + 2 fél szoba,2 m² erkély
6,/xiii-ker/elado+lakas/tegla-epitesu-lakas/3205...,46.99 M Ft,"Szabolcs utca 4-6, XIII. kerület",45 m² terület,2 szoba,11.12 m² erkély
7,/xiii-ker/elado+lakas/tegla-epitesu-lakas/3244...,39.64 M Ft,"Cserhalom utca, XIII. kerület",30 m² terület,1 szoba,5.32 m² erkély
8,/xiii-ker/elado+lakas/tegla-epitesu-lakas/3240...,48.2 M Ft,"Csata utca, XIII. kerület",46 m² terület,2 szoba,9 m² erkély
9,/xi-ker/elado+lakas/tegla-epitesu-lakas/32505373,86.4 M Ft,"Kánai út, XI. kerület",69 m² terület,3 szoba,15.81 m² erkély


## Write a function that scrapes all pages using the previous function

In [8]:
def get_properties_for_many_pages(web_page_base = 'https://ingatlan.com/lista/elado+lakas+budapest?page=',
                                  from_page = 1, to_page = 10,
                                  save_csv = True):
    """Scrape listing summaries from a range of result pages.

    Calls `get_links_and_other_from_one_page` for each page number and
    stacks the per-page frames into one DataFrame.

    Parameters
    ----------
    web_page_base : str
        URL prefix; the page number is appended to it. (Previously this
        argument was overwritten inside the function and had no effect.)
    from_page : int
        First page number to scrape.
    to_page : int
        End of the page range, exclusive: pages from_page .. to_page - 1
        are scraped, as given by `range(from_page, to_page)`.
    save_csv : bool
        When True, write the result to 'base_data<YYYYmmdd_HHMMSS>.csv' in
        the current working directory.

    Returns
    -------
    pandas.DataFrame
        All scraped rows with an extra 'webpage_all' column holding the
        absolute URL of each listing. Empty if no page could be scraped.
    """
    print('Started Scraping')
    print('Scraping the Following Page:')

    page_frames = []

    for i in tqdm(range(from_page, to_page), desc = "Progress: "):
        # tqdm() automatically creates a progress bar
        web_page_link = web_page_base + str(i)
        try:
            page_frames.append(get_links_and_other_from_one_page(web_page_link))
        except Exception as exc:
            # Best effort: skip the page, but report it so gaps in the data
            # can be traced instead of disappearing silently.
            tqdm.write('Skipping ' + web_page_link + ': ' + repr(exc))
            continue

    # Concatenate once at the end instead of re-copying the frame every iteration.
    if page_frames:
        all_data = pd.concat(page_frames, axis=0, ignore_index=True)
    else:
        all_data = pd.DataFrame()

    # Guard: an empty result has no 'links' column to build absolute URLs from.
    if 'links' in all_data.columns:
        all_data['webpage_all'] = 'https://ingatlan.com' + all_data['links']

    if save_csv:
        dt_string = datetime.now().strftime("%Y%m%d_%H%M%S")
        file_name_for_saving = 'base_data' + dt_string + '.csv'

        all_data.to_csv(file_name_for_saving, encoding='utf-8', index=False)

    return all_data

In [9]:
#list_of_aps = get_properties_for_many_pages(from_page=1, to_page=1600, save_csv = True)

## Import the file that will be the basis of further scraping: it contains all the apartments that we have just scraped

In [10]:
# Load a previously saved listing table; the file name follows the
# 'base_data<timestamp>.csv' pattern written by get_properties_for_many_pages.
filename_of_apt_file = 'base_data20220111_181100.csv'
list_of_aps_data = pd.read_csv(filename_of_apt_file)
list_of_aps_data

Unnamed: 0,links,prices,adresses,area_sizes,room_numbers,balcony_sizes,webpage_all
0,/xii-ker/elado+lakas/tegla-epitesu-lakas/32477547,134.9 M Ft,"Bartha utca, XII. kerület",146 m² terület,1 + 3 fél szoba,13 m² erkély,https://ingatlan.com/xii-ker/elado+lakas/tegla...
1,/xiii-ker/elado+lakas/tegla-epitesu-lakas/3264...,88.85 M Ft,"Úszódaru utca 2, XIII. kerület",63 m² terület,2 szoba,13 m² erkély,https://ingatlan.com/xiii-ker/elado+lakas/tegl...
2,/x-ker/elado+lakas/tegla-epitesu-lakas/32632861,35.04 M Ft,"Zágrábi utca 8, X. kerület",37 m² terület,1 + 1 fél szoba,8.91 m² erkély,https://ingatlan.com/x-ker/elado+lakas/tegla-e...
3,/iii-ker/elado+lakas/panel-lakas/32109922,29.9 M Ft,"Bogdáni út, III. kerület",51 m² terület,2 szoba,,https://ingatlan.com/iii-ker/elado+lakas/panel...
4,/iii-ker/elado+lakas/panel-lakas/32308587,29.9 M Ft,"Bogdáni út, III. kerület",51 m² terület,2 szoba,,https://ingatlan.com/iii-ker/elado+lakas/panel...
...,...,...,...,...,...,...,...
29460,/v-ker/elado+lakas/tegla-epitesu-lakas/6248324,540 M Ft,"Vörösmarty tér, V. kerület",260 m² terület,3 szoba,,https://ingatlan.com/v-ker/elado+lakas/tegla-e...
29461,/v-ker/elado+lakas/tegla-epitesu-lakas/6181917,349 M Ft,"Belgrád rakpart, V. kerület",246 m² terület,5 + 1 fél szoba,,https://ingatlan.com/v-ker/elado+lakas/tegla-e...
29462,/vii-ker/elado+lakas/tegla-epitesu-lakas/5196070,97.39 M Ft,"Damjanich utca, VII. kerület",109 m² terület,3 szoba,6 m² erkély,https://ingatlan.com/vii-ker/elado+lakas/tegla...
29463,/viii-ker/elado+lakas/tegla-epitesu-lakas/4563923,79.8 M Ft,"Népszínház utca, VIII. kerület",102 m² terület,3 + 1 fél szoba,3 m² erkély,https://ingatlan.com/viii-ker/elado+lakas/tegl...


### Get one random apartment's link

In [11]:
# Pick one row at random (unseeded, so the choice differs between runs)
# and take its absolute listing URL.
random_number = randrange(len(list_of_aps_data))
print(random_number)
one_link = list_of_aps_data['webpage_all'][random_number]
print(one_link)

16835
https://ingatlan.com/iii-ker/elado+lakas/panel-lakas/32590304


## Write a function that can scrape one apartment

By using this function we can get the data for one apartment in a Pandas data frame

In [12]:
def scrape_one_apartment(link = 'https://ingatlan.com/xiii-ker/elado+lakas/tegla-epitesu-lakas/32076067',
                        sleep_time = 1):
    """Scrape the parameter table of a single apartment page.

    Navigates the module-level Selenium `driver` to `link` and reads the
    <dt class="parameterName"> / <dd class="parameterValue"> pairs inside
    the page's <dl class="parameters"> element.

    Parameters
    ----------
    link : str
        URL of the apartment page.
    sleep_time : int or float
        Seconds to sleep after navigation and again after the page source
        has been captured.

    Returns
    -------
    pandas.DataFrame
        One row. On success: one column per parameter name (non-word
        characters and underscores stripped from names and values), plus
        'link' and 'error' = 'no'. If the parameter table cannot be parsed:
        'link' and 'error' = 'yes', so failed links can be found and retried.

    Errors raised by `driver.get` itself are not caught here and propagate
    to the caller.
    """
    driver.get(link)
    time.sleep(sleep_time)
    base_source = driver.page_source
    # The source is already captured at this point; this second pause only
    # adds delay before the function returns.
    time.sleep(sleep_time)
    base_soup = BeautifulSoup(base_source, 'html.parser')

    try:
        parameters_base = base_soup.find('dl', {'class': 'parameters'})
        if parameters_base is None:
            raise ValueError('parameter table not found on the page')

        parameterNames = [
            re.sub(r'[\W_]+', '', name_tag.text)
            for name_tag in parameters_base.find_all('dt', {'class': 'parameterName'})
        ]
        parameterValues = [
            re.sub(r'[\W_]+', '', value_tag.text)
            for value_tag in parameters_base.find_all('dd', {'class': 'parameterValue'})
        ]

        # Pass the names as a flat list: wrapping them in another list (as
        # before) makes pandas build MultiIndex columns, which do not line up
        # with the flat columns of the error frame below when concatenated.
        dataframe = pd.DataFrame(data = [parameterValues], columns = parameterNames)
        dataframe['link'] = link
        dataframe['error'] = 'no'

    except Exception:
        # Missing table or mismatched name/value counts: flag the row and
        # keep the link so the failure stays traceable.
        dataframe = pd.DataFrame({'link': [link], 'error': ['yes']})

    return dataframe

Test the function

In [13]:
# Scrape the randomly chosen apartment page.
one_ap_params = scrape_one_apartment(link=one_link)

In [14]:
# Show the one-row parameter table for that apartment.
one_ap_params

Unnamed: 0,Ingatlanállapota,Építéséve,Komfort,Energiatanúsítvány,Emelet,Épületszintjei,Lift,Belmagasság,Fűtés,Légkondicionáló,...,FürdőésWC,Tájolás,Kilátás,Erkélymérete,Kertkapcsolatos,Tetőtér,Panelprogram,Parkolás,link,error
0,jóállapotú,1950és1980között,összkomfortos,nincsmegadva,7,10,van,nincsmegadva,távfűtésegyediméréssel,van,...,különhelyiségben,kelet,panorámás,nincsmegadva,nincsmegadva,nemtetőtéri,résztvett,utcaközterület,https://ingatlan.com/iii-ker/elado+lakas/panel...,no


## As of now we have all the components to scrape the data for all the apartments

We need to collect all the links that refer to the apartments (we have them in our pandas data frame) and loop through them

In [15]:
# Collect every listing URL, then drop duplicates: dict.fromkeys keeps the
# first occurrence of each URL and preserves the original order.
all_links_for_apartments_all = list_of_aps_data['webpage_all']
print(len(all_links_for_apartments_all))
all_links_for_apartments = list(dict.fromkeys(all_links_for_apartments_all))
print(len(all_links_for_apartments))

29465
29444


The second number above is how many unique apartments we have

## Write a function that loops through all the apartments and saves them

In [16]:
def get_all_apt_params(links = all_links_for_apartments[1:5], save_csv = True, filename_input = 'apt_data'):
    """Scrape the parameter tables of one or more apartment pages.

    Parameters
    ----------
    links : str or iterable of str
        A single apartment URL or a collection of them. Previously the
        argument was always wrapped as `[links]`, so a list of URLs
        (including this function's own default) was passed on as one
        invalid link; now each URL in a collection is scraped.
    save_csv : bool
        When True, write the result to
        '<filename_input><YYYYmmdd_HHMMSS>.csv'.
    filename_input : str
        Path prefix for the CSV file; any directory in it must already exist.

    Returns
    -------
    pandas.DataFrame
        One row per successfully processed link, as returned by
        `scrape_one_apartment`, with an added 'download_time' column.
        Empty if every link failed.
    """
    # A bare string is a single link; iterating it would yield characters.
    if isinstance(links, str):
        links = [links]
    else:
        links = list(links)

    scraped_frames = []

    for link in links:
        dt_string = datetime.now().strftime("%Y.%m.%d %H:%M:%S")

        try:
            one_data = scrape_one_apartment(link)
        except Exception as exc:
            # Best effort: skip this link but say so, rather than dropping it silently.
            print('Failed to scrape ' + str(link) + ': ' + repr(exc))
            continue

        one_data['download_time'] = dt_string
        scraped_frames.append(one_data)

    # Concatenate once at the end instead of re-copying the frame per link.
    if scraped_frames:
        all_data = pd.concat(scraped_frames, axis=0, ignore_index=True)
    else:
        all_data = pd.DataFrame()

    if save_csv:
        dt_string = datetime.now().strftime("%Y%m%d_%H%M%S")
        apt_data_filename = str(filename_input) + str(dt_string) + '.csv'

        all_data.to_csv(apt_data_filename, encoding='utf-8', index=False)

    return all_data

In [17]:
# Dry run on the first unique link only (a single URL string); nothing is written to disk.
apt_data = get_all_apt_params(links = all_links_for_apartments[0], save_csv = False)

In [18]:
# Show the scraped row, including the added download_time column.
apt_data

Unnamed: 0,Ingatlanállapota,Építéséve,Komfort,Energiatanúsítvány,Emelet,Épületszintjei,Lift,Belmagasság,Fűtés,Légkondicionáló,...,Tájolás,Kilátás,Erkélymérete,Kertkapcsolatos,Tetőtér,Parkolás,Parkolóhelyára,link,error,download_time
0,felújított,1950előtt,luxus,nincsmegadva,2,4,nincs,3mnélmagasabb,gázcirko,van,...,kelet,panorámás,13m²,nem,tetőtéri,önállógarázskötelezőmegvenni,8000000Ft,https://ingatlan.com/xii-ker/elado+lakas/tegla...,no,2022.01.12 12:29:08


## Run this code in chunks to be able to figure out if there is an error

In [19]:
# Output folder (relative path) and the file-name prefix inside it; the prefix
# is later passed to get_all_apt_params as `filename_input`.
folder_name = 'all_apartments_220111'
filname_input_string = str(folder_name) + '/apt_data'
filname_input_string

'all_apartments_220111/apt_data'

In [20]:
# Index of the first link to process; raise it to resume an interrupted run.
start_index = 5200

# to_csv does not create missing directories. Without this the save fails for
# every link and the failure was previously swallowed by a bare `except`.
os.makedirs(folder_name, exist_ok=True)

# Links whose scrape or save raised, kept so they can be retried afterwards.
failed_links = []

for link_idx in tqdm(range(start_index, len(all_links_for_apartments)), desc="Progress: "):
    link = all_links_for_apartments[link_idx]
    try:
        # One call, and therefore one timestamped CSV, per apartment.
        get_all_apt_params(links = link, save_csv = True, filename_input=filname_input_string)
    except Exception as exc:
        failed_links.append(link)
        tqdm.write('Failed at index ' + str(link_idx) + ' (' + str(link) + '): ' + repr(exc))

Progress: 100%|██████████| 24244/24244 [21:35:30<00:00,  3.21s/it]   


## Read in apartment data

For this we need to read in all the files in the previous folder

In [21]:
# setting the path for joining multiple files
# Glob pattern matching every CSV file in the output folder
files = os.path.join(folder_name, '*.csv')
# List of the file paths that match the pattern (nothing is read or merged yet)
files_to_join = glob.glob(files)

In [22]:
# Number of CSV files found in the output folder
len(files_to_join)

30747