In [1]:
# Standard imports 
import numpy as np
import pandas as pd

# OS and time packages 
import os
import time
import tqdm
import datetime

# HTML and text processing 
import nltk
import requests
from bs4 import BeautifulSoup
import json
import re

# Plotting 
import matplotlib.pyplot as plt 
import seaborn as sns

plt.style.use('seaborn-whitegrid')
%matplotlib inline

plt.rc('font', size=14)             # controls default text sizes
plt.rc('axes', titlesize=18)        # fontsize of the axes title
plt.rc('axes', labelsize=18)        # fontsize of the x and y labels
plt.rc('xtick', labelsize=14)       # fontsize of the tick labels
plt.rc('ytick', labelsize=14)       # fontsize of the tick labels
plt.rc('legend', fontsize=14)       # legend fontsize
plt.rc('figure', titlesize=20)      # fontsize of the figure title

plt.rcParams['figure.figsize'] = 10, 4 # set default size of plots

# Silence warnings that would otherwise clutter the notebook output
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning

# Turn off pandas' chained-assignment warning
pd.options.mode.chained_assignment = None
# Ignore scikit-learn convergence warnings
simplefilter(action="ignore", category=ConvergenceWarning)

  plt.style.use('seaborn-whitegrid')


In [3]:
def log(response: requests.Response):
    """
    Creates or appends a log-file with information from a `requests.get()`-call.

    The file is called 'log' and is located in the current working directory.
    It is a semicolon-separated text file; a header line is written when the
    file does not exist yet, and one line is appended per call.

    The information gathered is:
    - - - - - - - -
        timestamp   :   Current local time.
        status_code :   Status code from requests call.
        length      :   Length of the HTML-string.
        output_path :   Current working directory (header column 'output_file').
        url         :   The URL of the response.

    Input:
    - - - - - - - -
    response (requests.Response) :  Response whose `status_code`, `text` and `url` are logged.
    """

    log_path = 'log'
    columns = ['timestamp', 'status_code', 'length', 'output_file', 'url'] # Header names

    # Decide before opening: append mode creates the file, so the check must come first
    write_header = not os.path.isfile(log_path)

    # Gather log information
    status_code = response.status_code # Status code from the request result
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # Local time
    length = len(response.text) # Length of the HTML-string
    output_path = os.getcwd() # Output path
    url = response.url # URL-string

    # A single handle, closed by the context manager, writes the header (if needed)
    # and the entry, so their order does not depend on when an unclosed handle is flushed
    with open(log_path, 'a') as log_file:
        if write_header:
            log_file.write(';'.join(columns) + "\n")
        log_file.write(f'{timestamp};{status_code};{length};{output_path};{url}' + "\n")

In [4]:
def get_soup(url: str, header: dict, timeout: float = 30.0) -> BeautifulSoup:
    """
    Requests the given URL and parses the response body into a BeautifulSoup object.
    Requests are logged, see `log()`.

    Input:
    - - - - - - - -
    url (str)       :    URL of the website to receive the HTML-string from.
    header (dict)   :    Dictionary sent as HTTP headers with the request.
    timeout (float) :    Seconds to wait for the server before giving up (default 30).
                         Without a timeout a stalled connection would block forever.

    Returns:
    - - - - - - - -
    soup (BeautifulSoup) :  HTML-string in the class of BeautifulSoup with 'lxml' parser.

    Raises:
    - - - - - - - -
    requests.exceptions.RequestException :  If the request fails or times out.
    """

    response = requests.get(url, headers=header, timeout=timeout) # Request, bounded by the timeout
    log(response) # Log
    soup = BeautifulSoup(response.content, 'lxml') # Convert the response to HTML

    return soup

In [5]:
def create_url_boliga(page: int) -> str:
    """
    Builds the URL of one page of boliga's list of sales results.

    Input:
    - - - - - - - -
    page (int) :    Pagenumber to request from the boliga website.

    Returns:
    - - - - - - - -
    url (str)  :    URL of the sales results (search tab 1, sorted by 'date-a') for that page.
    """

    base = 'https://www.boliga.dk/salg/resultater'
    query = f'searchTab=1&page={page}&sort=date-a' # Page number goes into the query string

    return f'{base}?{query}'

In [6]:
# Passed as `header` to `get_soup()`; states who is scraping and for what purpose
header = {
    'name': 'Jørgen Baun Høst',
    'email': 'pjz633@econ.ku.dk',
    'intention': 'Scrape Boliga for academic purposes',
}

In [7]:
def extract_info_boliga(soup:BeautifulSoup) -> pd.DataFrame:
    """
    Extracts the sales listed on one boliga results page into a DataFrame.

    The number of rows read is one less than `len(soup.tbody)`. All values
    are kept as the raw strings found in the HTML.

    Input:
    - - - - - - - -
    soup (BeautifulSoup) :  Parsed results page, e.g. from `get_soup()`.

    Returns:
    - - - - - - - -
    df (pd.DataFrame) :  One row per sale with the columns 'link', 'bbr_link',
                         'address', 'price', 'date_of_sale', 'type_of_sale',
                         'house_size_m2', 'house_price_per_m2', 'no_of_rooms'
                         and 'year_built'. Empty (but with these columns) when
                         the page holds no rows.

    Raises:
    - - - - - - - -
    TypeError  :  If the page has no <tbody> (`soup.tbody` is None).
    IndexError :  If fewer matching elements than expected rows are found.
    """

    # Built before the loop so that a page without rows still returns a
    # DataFrame instead of failing on an unbound variable
    data_dict = {
        'link': [],
        'bbr_link': [],
        'address': [],
        'price': [],
        'date_of_sale': [],
        'type_of_sale': [],
        'house_size_m2': [],
        'house_price_per_m2': [],
        'no_of_rooms': [],
        'year_built': []
    }

    sales_ads_entries = len(soup.tbody)

    # Search the document once per page instead of once per row and field
    cell_class = "table-col d-print-table-cell text-center"
    link_tags = soup.find_all(class_="text-primary font-weight-bolder text-left")
    price_cells = soup.find_all(class_=cell_class)[0::7]    # every 7th cell, offset 0
    date_type_cells = soup.find_all(class_="d-flex flex-column date-type-cell")
    size_cells = soup.find_all(class_='d-flex flex-column')
    body_cells = soup.tbody.find_all(class_=cell_class)
    room_cells = body_cells[3::7]                           # every 7th cell, offset 3
    year_cells = body_cells[4::7]                           # every 7th cell, offset 4

    for sale_ads_no in range(0,sales_ads_entries-1):
        # Link to sales ad
        link_tag = link_tags[sale_ads_no]
        href = link_tag['href']
        data_dict['link'].append(f'https://www.boliga.dk{href}')

        # Link to BBR info (for later); the id is the last path segment of the href
        listing_id = re.search(r'[^/]+$', href)[0]
        data_dict['bbr_link'].append(f'https://www.boliga.dk/bbrinfo/{listing_id}')

        # Address
        data_dict['address'].append(link_tag.text)

        # Price
        data_dict['price'].append(price_cells[sale_ads_no].text)

        # Date of sale is the first element after the cell, type of sale the second
        date_tag = date_type_cells[sale_ads_no].find_next()
        data_dict['date_of_sale'].append(date_tag.text)
        data_dict['type_of_sale'].append(date_tag.find_next().text)

        # House size is the first element after the cell, price per m2 the second
        size_tag = size_cells[sale_ads_no].find_next()
        data_dict['house_size_m2'].append(size_tag.text)
        data_dict['house_price_per_m2'].append(size_tag.find_next().text)

        # Number of rooms
        data_dict['no_of_rooms'].append(room_cells[sale_ads_no].text)

        # Year built
        data_dict['year_built'].append(year_cells[sale_ads_no].text)

    return pd.DataFrame(data_dict)

In [2]:
errors = []  # URLs of the pages that could not be scraped

startpage = 2645
endpage = 34784

# Create the output directory up front; otherwise every page would fail on write
os.makedirs('data/frontpage', exist_ok=True)

for page in tqdm.tqdm(range(startpage, endpage+1)):
    url = create_url_boliga(page)
    try:
        soup = get_soup(url, header)
        df = extract_info_boliga(soup)
        if df.empty:
            # A results page without rows is treated as a failed scrape
            raise ValueError('no sales rows found')
        df.to_parquet(f'data/frontpage/boliga_{page}.pq')
    except Exception as exc: # skip page if errors, but let Ctrl-C / kernel interrupt through
        print(f'Error encountered on page {page}: {exc!r}')
        errors.append(url)
        # Named column: integer column labels are rejected by the parquet
        # writer in some pandas versions, which would crash the handler itself
        pd.DataFrame({'url': errors}).to_parquet('errors.pq')
    finally:
        time.sleep(0.1) # pause between requests, whether the page succeeded or not

  0%|          | 0/32140 [00:00<?, ?it/s]


NameError: name 'create_url_boliga' is not defined

In [25]:
# Inspect the scraped result of the first page
pd.read_parquet('data/frontpage/boliga_1.pq')

Unnamed: 0,link,bbr_link,address,price,date_of_sale,type_of_sale,house_size_m2,house_price_per_m2,no_of_rooms,year_built
0,https://www.boliga.dk/salg/info/621/201217/5BF...,https://www.boliga.dk/bbrinfo/5BFBC165-4E64-44...,Christianshave 31 6000 Kolding,560.000 kr.,05-11-1974,Alm. Salg,155 m²,3.613 kr/m²,5,2015
1,https://www.boliga.dk/salg/info/430/20036/A35B...,https://www.boliga.dk/bbrinfo/A35B1617-43AA-43...,Rønnebærvænget 5B 5856 Ryslinge,268.000 kr.,14-11-1985,Alm. Salg,63 m²,4.254 kr/m²,2,1978
2,https://www.boliga.dk/salg/info/860/3278/76D17...,https://www.boliga.dk/bbrinfo/76D17D74-9E4D-43...,de Plessenvej 15 9850 Hirtshals,116.400 kr.,13-01-1986,Fam. Salg,93 m²,1.252 kr/m²,4,1985
3,https://www.boliga.dk/salg/info/707/112163/DA0...,https://www.boliga.dk/bbrinfo/DA030E60-4C83-46...,Fasanvej 27 8950 Ørsted,52.750 kr.,14-07-1987,Fam. Salg,109 m²,484 kr/m²,4,1987
4,https://www.boliga.dk/salg/info/101/40300/519B...,https://www.boliga.dk/bbrinfo/519BD5E8-11CF-48...,"Ottilia Jacobsens Plads 17, 2. th 1799 Københ...",3.580.136 kr.,05-12-1987,Alm. Salg,82 m²,43.660 kr/m²,3,2020
5,https://www.boliga.dk/salg/info/707/110547/38E...,https://www.boliga.dk/bbrinfo/38E98D16-E36B-4C...,Sandagervej 2A 8961 Allingåbro,84.893 kr.,31-03-1990,Andet,60 m²,1.415 kr/m²,2,1942
6,https://www.boliga.dk/salg/info/360/9329/5653D...,https://www.boliga.dk/bbrinfo/5653DFEA-FBB9-43...,Bekkasinvej 10 4930 Maribo,585.000 kr.,31-12-1991,Fam. Salg,130 m²,4.500 kr/m²,4,1981
7,https://www.boliga.dk/salg/info/461/123260/99B...,https://www.boliga.dk/bbrinfo/99B3004D-BD56-42...,Fraugde-Kærby-Vej 84 5220 Odense SØ,572.200 kr.,31-12-1991,Fam. Salg,200 m²,2.861 kr/m²,5,1900
8,https://www.boliga.dk/salg/info/621/49240/F2E8...,https://www.boliga.dk/bbrinfo/F2E8ACB0-C017-42...,Fåborgvej 4 6000 Kolding,785.000 kr.,31-12-1991,Fam. Salg,152 m²,5.164 kr/m²,5,1973
9,https://www.boliga.dk/salg/info/330/8263/8C96F...,https://www.boliga.dk/bbrinfo/8C96F17D-5A93-46...,Lyrekrogen 18 4220 Korsør,75.840 kr.,31-12-1991,Alm. Salg,91 m²,833 kr/m²,6,1969
