In [1]:
# Standard imports 
import numpy as np
import pandas as pd

# OS and time packages 
import os
import time
import tqdm
import datetime

# HTML and text processing 
import nltk
import requests
from bs4 import BeautifulSoup
import json
import re

# Plotting 
import matplotlib.pyplot as plt 
import seaborn as sns

plt.style.use('seaborn-whitegrid')
%matplotlib inline

plt.rc('font', size=14)             # controls default text sizes
plt.rc('axes', titlesize=18)        # fontsize of the axes title
plt.rc('axes', labelsize=18)        # fontsize of the x and y labels
plt.rc('xtick', labelsize=14)       # fontsize of the tick labels
plt.rc('ytick', labelsize=14)       # fontsize of the tick labels
plt.rc('legend', fontsize=14)       # legend fontsize
plt.rc('figure', titlesize=20)      # fontsize of the figure title

plt.rcParams['figure.figsize'] = 10, 4 # set default size of plots

# Silence warnings that would otherwise clutter the notebook output
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning

pd.set_option('mode.chained_assignment', None)       # turn off pandas' chained-assignment check
simplefilter("ignore", category=ConvergenceWarning)  # ignore sklearn convergence warnings

  plt.style.use('seaborn-whitegrid')


In [2]:
def log(response: requests.Response):
    """
    Creates or appends a log-file with information from a `requests.get()`-call.

    The log is the semicolon-separated file `log` in the current working
    directory. A header row is written when the file does not exist yet, and
    one row is appended for every call.

    The information gathered is:
    - - - - - - - -
        timestamp   :   Current local time.
        status_code :   Status code from requests call.
        length      :   Length of the HTML-string.
        output_path :   Current working directory (the header row names this column `output_file`).
        url         :   The URL of the response.

    Input:
    - - - - - - - -
    response (requests.Response) :  The response to log.
    """

    columns = ['timestamp', 'status_code', 'length', 'output_file', 'url'] # Header names

    # Gather log information
    status_code = response.status_code # Status code from the request result
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # Local time
    length = len(response.text) # Length of the HTML-string
    output_path = os.getcwd() # Output path
    url = response.url # URL-string

    # Decide about the header before opening: append-mode creates a missing file
    write_header = not os.path.isfile('log')

    # A single `with` block writes header and row through the same handle and
    # guarantees that the file is closed again (no leaked file objects).
    with open('log', 'a') as log_file:
        if write_header:
            log_file.write(';'.join(columns) + "\n")
        log_file.write(f'{timestamp};{status_code};{length};{output_path};{url}' + "\n")

In [3]:
def get_soup(url: str, header: dict, timeout: float = 30) -> BeautifulSoup:
    """
    Requests the given URL and parses the response body into a BeautifulSoup object.
    Requests are logged, see `log()`.

    Input:
    - - - - - - - -
    url (str)       :    URL of the website to receive the HTML-string from. \n
    header (dict)   :    Dictionary that is sent as HTTP headers with the request. \n
    timeout (float) :    Seconds to wait for the server before the request is
                         aborted with an exception. Defaults to 30. Without a
                         timeout a single stalled connection can block forever.

    Returns:
    - - - - - - - -
    soup (BeautifulSoup) :  The response content parsed with the 'lxml' parser.
    """

    response = requests.get(url, headers=header, timeout=timeout) # Request, bounded in time
    log(response) # Log every response, whatever its status code

    return BeautifulSoup(response.content, 'lxml') # Parse the raw response bytes as HTML

In [4]:
def create_url_boliga(page: int) -> str:
    """
    Builds the URL of one page of boliga's sales results.

    Input:
    - - - - - - - -
    page (int) :    Number that is inserted as the `page` query parameter.

    Returns:
    - - - - - - - -
    url (str)  :    The boliga results URL (`searchTab=1`, `sort=date-a`) for that page.
    """

    # The page number is the only variable part of the address
    return f'https://www.boliga.dk/salg/resultater?searchTab=1&page={page}&sort=date-a'

In [5]:
# Identification that is passed along with every request (see `get_soup`),
# telling the site who is scraping and why.
header = dict(
    name='Jørgen Baun Høst',
    email='pjz633@econ.ku.dk',
    intention='Scrape Boliga for academic purposes',
)

In [6]:
def extract_info_boliga(soup: BeautifulSoup) -> list:
    """
    Extracts the information about one sale from a row of the boliga results table.

    Input:
    - - - - - - - -
    soup (BeautifulSoup) :  Parsed HTML of a single table row; the scraping loop
                            passes the elements of `soup.find_all('tr')`.

    Returns:
    - - - - - - - -
    info (list) :  Ten strings in the order
                   [url, url_bbr, address, price, date_of_sale, type_of_sale,
                    house_size, price_per_m2, no_of_rooms, year_built].
                   The values are the raw texts from the page; nothing is
                   cleaned or converted here.

    Raises:
    - - - - - - - -
    An exception (e.g. TypeError, AttributeError or IndexError) if one of the
    expected elements is missing from the row.
    """

    # The address link carries both the address text and the relative URL
    link = soup.find(class_="text-primary font-weight-bolder text-left")
    href = link['href']
    url = f'https://www.boliga.dk{href}'
    item_id = re.search(r'[^/]+$', href)[0] # Last path segment of the link
    url_bbr = f'https://www.boliga.dk/bbrinfo/{item_id}'
    address = link.text

    price = soup.find(class_="table-col d-print-table-cell text-center").text

    # First two <span> elements inside the first <div>: date and type of sale
    sale_spans = soup.div.find_all('span')
    date_of_sale = sale_spans[0].text
    type_of_sale = sale_spans[1].text

    # First two <span> elements inside the 'd-flex flex-column' element: size and price per m2
    size_spans = soup.find(class_='d-flex flex-column').find_all('span')
    house_size = size_spans[0].text
    price_per_m2 = size_spans[1].text

    # Number of rooms and year built are read from the fifth and sixth table cell
    cells = soup.find_all('td')
    no_of_rooms = cells[4].text
    year_built = cells[5].text

    return [url, url_bbr, address, price, date_of_sale, type_of_sale, house_size, price_per_m2, no_of_rooms, year_built]


In [7]:
# Column labels for the scraped data, in the same order as the values
# returned by `extract_info_boliga`.
column_names = [
    'link',
    'bbr_link',
    'address',
    'price',
    'date_of_sale',
    'type_of_sale',
    'house_size_m2',
    'house_price_per_m2',
    'no_of_rooms',
    'year_built',
]

In [8]:
# Scrape the result pages `startpage`..`endpage` (both included) and store
# every page as its own parquet file, so an interrupted run keeps what it has.
errors = [] # URLs of the pages that could not be scraped

startpage = 13501
endpage = 14200

# Create the output folder up front; without it every `to_parquet` call fails
os.makedirs('data/frontpage', exist_ok=True)

for page in tqdm.tqdm(range(startpage, endpage+1)):
    url = create_url_boliga(page)
    try:
        soup = get_soup(url, header)
        list_of_ads = soup.find_all('tr')

        # One list of values per table row
        output = [extract_info_boliga(ad) for ad in list_of_ads]

        df = pd.DataFrame(output, columns=column_names)
        df.to_parquet(f'data/frontpage/boliga_{page}.pq')
    except Exception as exc: # skip page if errors
        # `Exception` instead of a bare `except:` so that interrupting the
        # kernel (KeyboardInterrupt) still stops the loop. The exception is
        # printed so a failure can be diagnosed afterwards.
        print(f'Error encountered on page {page}: {exc!r}')
        errors.append(url)
        pd.DataFrame(errors).to_csv('errors.csv') # Rewrite the error list after every failure
    time.sleep(0.1) # Short pause between requests, successful or not

100%|██████████| 700/700 [40:27<00:00,  3.47s/it]
