In [1]:
# Standard imports 
import numpy as np
import pandas as pd

# OS and time packages 
import os
import time
import tqdm
import concurrent.futures
from pathlib import Path
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# HTML and text processing 
import requests
from bs4 import BeautifulSoup
import json
import re

import time
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

# Plotting 
import matplotlib.pyplot as plt 
import seaborn as sns

plt.style.use('seaborn-whitegrid')
%matplotlib inline

plt.rc('font', size=14)             # controls default text sizes
plt.rc('axes', titlesize=18)        # fontsize of the axes title
plt.rc('axes', labelsize=18)        # fontsize of the x and y labels
plt.rc('xtick', labelsize=14)       # fontsize of the tick labels
plt.rc('ytick', labelsize=14)       # fontsize of the tick labels
plt.rc('legend', fontsize=14)       # legend fontsize
plt.rc('figure', titlesize=20)      # fontsize of the figure title

plt.rcParams['figure.figsize'] = 10, 4 # set default size of plots

# Filter warnings
# Switch off pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
# Install an "ignore" filter for scikit-learn's ConvergenceWarning category.
simplefilter("ignore", category=ConvergenceWarning)

ModuleNotFoundError: No module named 'selenium'

In [None]:
def log(response: requests.Response):
    """
    Appends one row describing a `requests.get()`-call to the log-file 'log3'.

    The file is looked up in the current working directory. If it does not exist
    yet it is created and a header row is written first. Fields are separated
    by ';' and every row ends with a newline.

    The information gathered is:
    - - - - - - - -
        timestamp   :   Current local time.
        status_code :   Status code from requests call.
        length      :   Length of the HTML-string.
        output_file :   Current working directory.
        url         :   The URL of the response.

    Input:
    - - - - - - - -
    response (requests.Response) :  The response to describe.
    """

    # Check before opening: opening in append mode creates the file
    needs_header = not os.path.isfile('log3')

    # Gather log information
    status_code = response.status_code # Status code from the request result
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # Local time
    length = len(response.text) # Length of the HTML-string
    output_path = os.getcwd() # Output path
    url = response.url # URL-string

    # One handle for header and row; the with-block closes it even if a write fails
    with open('log3', 'a') as log_file:
        if needs_header:
            header = ['timestamp', 'status_code', 'length', 'output_file', 'url'] # Header names
            log_file.write(';'.join(header) + "\n")
        log_file.write(f'{timestamp};{status_code};{length};{output_path};{url}' + "\n")

In [None]:
def get_soup(url: str, header: dict, selenium=False, driver=None) -> BeautifulSoup:
    """
    Fetches the page behind the given URL and parses it with BeautifulSoup.

    Without selenium the page is fetched with `requests.get()` and the request
    is logged, see `log()`. With selenium the page is loaded through `driver`
    and its rendered page source is parsed; that path is not logged and does
    not use `header`.

    Input:
    - - - - - - - - 
    url (str)        :    URL of the website to receive the HTML-string from.
    header (dict)    :    Dictionary sent as HTTP headers with the `requests` call.
    selenium (bool)  :    If truthy, load the page through `driver` instead of `requests`.
    driver           :    Selenium WebDriver to use; required when `selenium` is truthy.

    Returns:
    - - - - - - - - 
    soup (BeautifulSoup) :  HTML-string in the class of BeautifulSoup with 'lxml' parser.

    Raises:
    - - - - - - - - 
    ValueError :  If `selenium` is truthy but no `driver` was given.
    """

    if selenium:
        # Fail with a clear message instead of an AttributeError on None
        if driver is None:
            raise ValueError('get_soup(selenium=True) needs a WebDriver passed as `driver`')
        driver.get(url)
        return BeautifulSoup(driver.page_source, 'lxml')

    response = requests.get(url, headers=header) # Request
    log(response) # Log
    return BeautifulSoup(response.content, 'lxml') # Convert the response to HTML

In [None]:
def create_url_boliga(page: int, city=None) -> str:
    """
    Creates a boliga URL with the given pagenumber for one of the supported cities.

    Input:
    - - - - - - - -
    page (int) :    Pagenumber for the boliga website.
    city (str) :    One of 'Aarhus', 'Odense' or 'Copenhagen'.

    Returns:
    - - - - - - - -
    url (str)  :    URL of the boliga website for given page and city.

    Raises:
    - - - - - - - -
    ValueError :    If `city` is not one of the supported cities (including None).
    """

    # Value of the `municipality` query parameter for each supported city
    municipality_codes = {'Aarhus': 751, 'Odense': 461, 'Copenhagen': 101}

    if city not in municipality_codes:
        supported = ', '.join(municipality_codes)
        raise ValueError(f'Unknown city {city!r}; expected one of: {supported}')

    # Construct url with f-string
    return (
        'https://www.boliga.dk/salg/resultater?searchTab=1'
        f'&page={page}&sort=date-a&propertyType=1'
        '&salesDateMin=1992&salesDateMax=2015'
        f'&municipality={municipality_codes[city]}'
    )

In [None]:
# Identification passed to get_soup(), which forwards it with each `requests` call
header = {
    'name': 'Jørgen Baun Høst',
    'email': 'pjz633@econ.ku.dk',
    'intention': 'Scrape Boliga for academic purposes',
}

In [None]:
def extract_info_boliga(soup: BeautifulSoup) -> list:
    """
    Pulls the fields of one sales record out of its HTML element.

    Input:
    - - - - - - - -
    soup (BeautifulSoup) :  Element holding a single record.
                            NOTE(review): presumably one `<tr>` of the results table - confirm against the caller.

    Returns:
    - - - - - - - -
    info (list) :   [url, url_bbr, address, price, date_of_sale, type_of_sale,
                    house_size, price_per_m2, no_of_rooms, year_built], in the
                    same order as `column_names`. The two URLs are built from
                    the record's link; the other entries are the element texts
                    exactly as found (no cleaning or type conversion).

    Raises:
    - - - - - - - -
    TypeError, AttributeError or IndexError if an expected element is missing.
    """

    # The address link carries both the address text and the href of the record
    address_link = soup.find(class_="text-primary font-weight-bolder text-left")
    href = address_link['href']
    url = f'https://www.boliga.dk{href}'
    record_id = re.search(r'[^/]+$', href)[0] # Last path segment of the href
    url_bbr = f'https://www.boliga.dk/bbrinfo/{record_id}'
    address = address_link.text
    price = soup.find(class_="table-col d-print-table-cell text-center").text

    # Look each group of elements up once instead of once per field
    sale_spans = soup.div.find_all('span')
    date_of_sale = sale_spans[0].text
    type_of_sale = sale_spans[1].text

    size_spans = soup.find(class_='d-flex flex-column').find_all('span')
    house_size = size_spans[0].text
    price_per_m2 = size_spans[1].text

    cells = soup.find_all('td')
    no_of_rooms = cells[4].text
    year_built = cells[5].text

    return [url, url_bbr, address, price, date_of_sale, type_of_sale, house_size, price_per_m2, no_of_rooms, year_built]


In [None]:
# Column labels for the lists returned by extract_info_boliga(), in the same order
column_names = [
    'link',
    'bbr_link',
    'address',
    'price',
    'date_of_sale',
    'type_of_sale',
    'house_size_m2',
    'house_price_per_m2',
    'no_of_rooms',
    'year_built',
]

In [None]:
list_of_url_kbh = []
list_of_url_aar = []
list_of_url_ode = []

# One row per city: (list to fill, `municipality` query value, last page to include)
for url_list, municipality, last_page in (
    (list_of_url_kbh, 101, 358),
    (list_of_url_ode, 461, 654),
    (list_of_url_aar, 751, 906),
):
    for page in range(1, last_page + 1):
        url = f'https://www.boliga.dk/salg/resultater?searchTab=1&page={page}&sort=date-a&propertyType=1&salesDateMin=1992&salesDateMax=2015&municipality={municipality}'
        url_list.append(url)

In [None]:
# errors = []

# def process_url(id_url_pair):
#     id_, url = id_url_pair
#     try:
#         soup = get_soup(url, header)
#         list_of_ads = soup.find_all('tr')
#         output = []

#         for ad in list_of_ads:
#             info = extract_info_boliga(ad)
#             output.append(info)
#         return id_, output
#     except:
#         print(f'Error encountered on url {url}')
#         errors.append(url)
#         pd.DataFrame(errors).to_csv
#         return id_, None

# id_url_pairs = [(id_, url) for id_, url in enumerate(list_of_url_aar)]

# with concurrent.futures.ThreadPoolExecutor() as executor:
#     results = list(tqdm.tqdm(executor.map(process_url, id_url_pairs), total=len(id_url_pairs)))

# for result in results:
#     id_, data = result
#     output = []
#     if data is not None:
#         output.append(data)
#         df = pd.DataFrame(output[0], columns=column_names)
#         df.to_parquet(f'data/aarhus/boliga_{id_}.pq')

In [None]:
# Merge every parquet file in data/odense into one frame and store it.
# Files are read in sorted name order: directory listings come back in an
# arbitrary, file-system dependent order, and later cells number the rows by
# position, so the order has to be the same on every run.
data_dir = Path('data/odense')
full_df1 = pd.concat(
    pd.read_parquet(parquet_file)
    for parquet_file in sorted(data_dir.glob('*.pq'))
)

full_df1.to_parquet('data/odense.pq')

In [None]:
# Merge every parquet file in data/copenhagen into one frame and store it.
# Files are read in sorted name order: directory listings come back in an
# arbitrary, file-system dependent order, and later cells number the rows by
# position, so the order has to be the same on every run.
data_dir = Path('data/copenhagen')
full_df2 = pd.concat(
    pd.read_parquet(parquet_file)
    for parquet_file in sorted(data_dir.glob('*.pq'))
)

full_df2.to_parquet('data/copenhagen.pq')

In [None]:
# Merge every parquet file in data/aarhus into one frame and store it.
# Files are read in sorted name order: directory listings come back in an
# arbitrary, file-system dependent order, and later cells number the rows by
# position, so the order has to be the same on every run.
data_dir = Path('data/aarhus')
full_df3 = pd.concat(
    pd.read_parquet(parquet_file)
    for parquet_file in sorted(data_dir.glob('*.pq'))
)

full_df3.to_parquet('data/aarhus.pq')

In [None]:
def extract_info_boliga_bbr_extra(soup: BeautifulSoup) -> list:
    """
    Reads seven text fields from a parsed page.

    The first field is the text of the element with class "house-name". The
    other six are the `div > span` texts of the elements with class
    "col-md-6 column" at positions 0, 2, 1, 5, 7 and 9.
    NOTE(review): what each position stands for is taken from the variable
    names only - verify against the live page layout.

    Input:
    - - - - - - - -
    soup (BeautifulSoup) :  Parsed page.

    Returns:
    - - - - - - - -
    info (list) :   [dwelling_type, dwelling_type2, dwelling_type3, kitchen_type,
                    bathrooms, toilet_type, toilets] as untouched strings.

    Raises:
    - - - - - - - -
    AttributeError or IndexError if an expected element is missing.
    """
    dwelling_type = soup.find(class_="house-name").text

    # Search the document once and pick the wanted positions from the result
    columns = soup.find_all(class_="col-md-6 column")

    def column_text(position):
        return columns[position].div.span.text

    return [
        dwelling_type,
        column_text(0),  # dwelling_type2
        column_text(2),  # dwelling_type3
        column_text(1),  # kitchen_type
        column_text(5),  # bathrooms
        column_text(7),  # toilet_type
        column_text(9),  # toilets
    ]


In [None]:
def extract_info_boliga_bbr(soup: BeautifulSoup) -> list:
    """
    Collects (land value, date) text pairs from the `<td>` cells of a parsed page.

    All `<td>` cells are read in groups of five: the fourth cell of a group
    supplies the land-value text and the first cell the date text.
    NOTE(review): this assumes every `<td>` on the page belongs to one
    five-column table - confirm against the page layout.

    Input:
    - - - - - - - -
    soup (BeautifulSoup) :  Parsed page.

    Returns:
    - - - - - - - -
    pairs (list) :  List of (land_value, date) tuples of whitespace-stripped
                    strings, in document order. Empty if there are fewer than
                    four `<td>` cells.
    """
    cells = soup.find_all('td')
    # cells[3::5] is never longer than cells[0::5], so zip pairs every land value
    return [
        (value_cell.text.strip(), date_cell.text.strip())
        for value_cell, date_cell in zip(cells[3::5], cells[0::5])
    ]


In [None]:
# Stack the three city frames and give the result a fresh 0..n-1 index
df = pd.concat([full_df1, full_df2, full_df3], ignore_index=True)

In [None]:
# Show the BBR link stored in the row labelled 0
df.loc[0, 'bbr_link']

'https://www.boliga.dk/bbrinfo/A7ACE50C-94F6-4D59-AF2E-1C641F4BA09F'

In [None]:
# Append the '#info-valueChanges' fragment to every BBR link.
# Links that already end with it are left untouched, so running this cell
# more than once does not stack the fragment.
bbr_fragment = '#info-valueChanges'
already_tagged = df['bbr_link'].str.endswith(bbr_fragment, na=False)
df['bbr_link'] = df['bbr_link'].where(already_tagged, df['bbr_link'] + bbr_fragment)

In [None]:
# Plain Python list of all BBR links, in row order
list_of_bbr = df['bbr_link'].tolist()

In [None]:
# Start the Chrome session whose `driver` is later handed to get_soup(..., selenium=True).
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # pass the --headless switch to Chrome
# The value returned by ChromeDriverManager().install() is given to Chrome as its first positional argument.
# NOTE(review): this looks like the Selenium 3 way of passing the driver path; recent
# Selenium 4 releases expect a Service object instead - confirm against the installed version.
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)

In [20]:
%%time
errors = []

def process_url(id_url_pair):
    id_, url = id_url_pair
    try:
        soup = get_soup(url, header, selenium=True, driver=driver)
        output = extract_info_boliga_bbr(soup)
        return id_, [url,output]
    except:
        print(f'Error encountered on url {url}')
        errors.append(url)
        return id_, None

id_url_pairs = [(id_, url) for id_, url in enumerate(list_of_bbr[:50000])]


for id_ in tqdm.tqdm(range(len(id_url_pairs))):
    id_, data = process_url(id_url_pairs[id_])
    df = pd.DataFrame(data)
    df=df.T
    df.columns = ['bbr_link', 'land_value']
    df.to_parquet(f'data/bbr3/bbr_{id_}.pq')

  0%|          | 3/45846 [00:03<13:34:19,  1.07s/it]

In [20]:
# %%time
# import concurrent.futures

# errors = []

# def process_url(id_url_pair):
#     id_, url = id_url_pair
#     try:
#         soup = get_soup(url, header, selenium=True, driver=driver)
#         output = extract_info_boliga_bbr(soup)
#         return id_, [url,output]
#     except:
#         print(f'Error encountered on url {url}')
#         errors.append(url)
#         return id_, None

# id_url_pairs = [(id_, url) for id_, url in enumerate(list_of_bbr[50001:])]

# with concurrent.futures.ThreadPoolExecutor() as executor:
#     futures = [executor.submit(process_url, id_url_pair) for id_url_pair in id_url_pairs]
#     for future in tqdm.tqdm(concurrent.futures.as_completed(futures)):
#         id_, data = future.result()
#         if data is not None:
#             df = pd.DataFrame(data)
#             df = df.T
#             df.columns = ['bbr_link', 'land_value']
#             df.to_parquet(f'data/bbr2/bbr_{id_}.pq')
            

127it [03:41,  1.74s/it]


In [35]:
# Load one stored BBR file and show the first entry of its 'land_value' column
bbr_sample = pd.read_parquet('data/bbr2/bbr_5.pq')
bbr_sample['land_value'][0]

array([array(['Grundværdi  1.086.500\xa0kr.', 'Ændret  1. okt. 2022'],
             dtype=object)                                            ,
       array(['Grundværdi  1.086.500\xa0kr.', 'Ændret  1. okt. 2021'],
             dtype=object)                                            ,
       array(['Grundværdi  1.086.500\xa0kr.', 'Ændret  1. okt. 2019'],
             dtype=object)                                            ,
       array(['Grundværdi  1.086.500\xa0kr.', 'Ændret  1. okt. 2018'],
             dtype=object)                                            ,
       array(['Grundværdi  1.086.500\xa0kr.', 'Ændret  1. okt. 2017'],
             dtype=object)                                            ,
       array(['Grundværdi  1.086.500\xa0kr.', 'Ændret  1. okt. 2016'],
             dtype=object)                                            ,
       array(['Grundværdi  1.086.500\xa0kr.', 'Ændret  1. okt. 2015'],
             dtype=object)                                            ,