In [276]:
# Standard imports 
import numpy as np
import pandas as pd

# OS and time packages 
import os
import time
import tqdm
import datetime

# HTML and text processing 
import nltk
import requests
from bs4 import BeautifulSoup
import json
import re

# Plotting 
import matplotlib.pyplot as plt 
import seaborn as sns

plt.style.use('seaborn-whitegrid')
%matplotlib inline

plt.rc('font', size=14)             # controls default text sizes
plt.rc('axes', titlesize=18)        # fontsize of the axes title
plt.rc('axes', labelsize=18)        # fontsize of the x and y labels
plt.rc('xtick', labelsize=14)       # fontsize of the tick labels
plt.rc('ytick', labelsize=14)       # fontsize of the tick labels
plt.rc('legend', fontsize=14)       # legend fontsize
plt.rc('figure', titlesize=20)      # fontsize of the figure title

plt.rcParams['figure.figsize'] = 10, 4 # set default size of plots

# Filter warnings 
pd.options.mode.chained_assignment = None
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

  plt.style.use('seaborn-whitegrid')


In [2]:
# Download page 1 of the boliga.dk sales search (sales dates 2007-2014, sort key 'date-a').
request = requests.get(
    url='https://www.boliga.dk/salg/resultater?searchTab=1&salesDateMin=2007&salesDateMax=2014&page=1&sort=date-a'
)

In [3]:
# Display the raw HTML text of the response to see what the server returned.
request.text

'<!DOCTYPE html><html lang="da"><head><link rel="preconnect" href="https://fonts.gstatic.com" crossorigin="">\n    <link rel="preconnect" href="https://consent.cookiebot.com">\n    <link rel="preconnect" href="https://api.boliga.dk">\n    <link rel="preconnect" href="https://i.boliga.org">\n    <title>Resultatside - Salgspriser på boliger i Danmark</title>\n    <base href="/">\n    <meta charset="utf-8">\n    <meta name="description" content="Se udbudspriser og salgspriser på boliger i Danmark baseret på din søgning. Se resultaterne på en liste her.">\n    <meta name="keywords" content="">\n    <meta name="msapplication-TileColor" content="#da532c">\n    <meta name="theme-color" content="#ffffff">\n    <meta name="referrer" content="strict-origin-when-cross-origin">\n    <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=0">\n    <link rel="icon" type="image/x-icon" href="favicon.ico">\n    <link rel="apple-touch-icon" sizes="180x180" hre

In [4]:
# Parse the HTML and display the parsed document. The parser is named explicitly:
# without it Beautiful Soup picks whichever parser happens to be installed and
# emits a warning. 'lxml' matches the parser used by `get_soup()` in this notebook.
BeautifulSoup(request.text, 'lxml')

<!DOCTYPE html>
<html lang="da"><head><link crossorigin="" href="https://fonts.gstatic.com" rel="preconnect"/>
<link href="https://consent.cookiebot.com" rel="preconnect"/>
<link href="https://api.boliga.dk" rel="preconnect"/>
<link href="https://i.boliga.org" rel="preconnect"/>
<title>Resultatside - Salgspriser på boliger i Danmark</title>
<base href="/"/>
<meta charset="utf-8"/>
<meta content="Se udbudspriser og salgspriser på boliger i Danmark baseret på din søgning. Se resultaterne på en liste her." name="description"/>
<meta content="" name="keywords"/>
<meta content="#da532c" name="msapplication-TileColor"/>
<meta content="#ffffff" name="theme-color"/>
<meta content="strict-origin-when-cross-origin" name="referrer"/>
<meta content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=0" name="viewport"/>
<link href="favicon.ico" rel="icon" type="image/x-icon"/>
<link href="/assets/apple-touch-icon.png" rel="apple-touch-icon" sizes="180x180"/>
<link href="/assets/fa

In [5]:
# Parse the downloaded page with an explicit parser so the result does not
# depend on which parsers are installed ('lxml' is also used by `get_soup()`).
soup = BeautifulSoup(request.text, 'lxml')

# All elements carrying exactly this class string; other cells read the link
# target ('href') and the text of these elements.
temp = soup.find_all(class_="text-primary font-weight-bolder text-left")
# `temp` is a list-like ResultSet, so pick one element before reading from it,
# e.g. temp[0].text or temp[0]['href'].

'/salg/info/621/201217/5BFBC165-4E64-4406-AAED-D5DF7FC14825'

In [10]:
# NOTE(review): `test_list` is not referenced by any other cell visible in this
# notebook; it looks like a leftover. Confirm before removing.
test_list = []

In [11]:
# Show the link target ('href') of the element at index 49 of `temp`.
temp[49]['href']

'/salg/info/810/1023/A30C1961-A14B-4037-9B58-A750B53BD418'

In [12]:
# One search-result URL per page number, from page 0 up to and including page 15785.
url_list = [
    f'https://www.boliga.dk/salg/resultater?searchTab=1&salesDateMin=2007&salesDateMax=2014&page={page_no}&sort=date-a'
    for page_no in range(15785 + 1)
]

In [13]:
# Keep the first five URLs as a small sample for trying out the scraping loop.
test = url_list[:5]
test

['https://www.boliga.dk/salg/resultater?searchTab=1&salesDateMin=2007&salesDateMax=2014&page=0&sort=date-a',
 'https://www.boliga.dk/salg/resultater?searchTab=1&salesDateMin=2007&salesDateMax=2014&page=1&sort=date-a',
 'https://www.boliga.dk/salg/resultater?searchTab=1&salesDateMin=2007&salesDateMax=2014&page=2&sort=date-a',
 'https://www.boliga.dk/salg/resultater?searchTab=1&salesDateMin=2007&salesDateMax=2014&page=3&sort=date-a',
 'https://www.boliga.dk/salg/resultater?searchTab=1&salesDateMin=2007&salesDateMax=2014&page=4&sort=date-a']

In [153]:
# Collect the absolute URL of every sales ad found on the sample pages.
links = []

for page_url in tqdm.tqdm(test):
    request = requests.get(page_url)
    # Explicit parser: same one `get_soup()` uses, and no parser-guessing warning.
    soup = BeautifulSoup(request.text, 'lxml')
    href_text = soup.find_all(class_="text-primary font-weight-bolder text-left")
    time.sleep(0.1)  # short pause between page requests

    # Iterate over the matched elements directly. The earlier index loop ran over
    # range(0, len(href_text) - 1) and therefore dropped the last ad of every page.
    for anchor in href_text:
        links.append(f'https://www.boliga.dk{anchor["href"]}')

100%|██████████| 5/5 [00:03<00:00,  1.25it/s]


In [152]:
# Count how many elements with the sales-ad link classes the current `soup` contains.
len(soup.find_all(class_="text-primary font-weight-bolder text-left"))

50

In [131]:
# Alternative way to reach an ad link: navigate from the table body to its
# first <tr> and that row's first <a>, then read the link target.
temp = soup.tbody  # NOTE: rebinds `temp`, which earlier held a find_all() result
temp.tr.a['href']

'/salg/info/621/201217/5BFBC165-4E64-4406-AAED-D5DF7FC14825'

In [132]:
def log(response: requests.Response) -> None:
    """
    Creates or appends a log-file with information from a `requests.get()`-call.

    The file is named `log` and lives in the current working directory. It is a
    semicolon-separated text file; a header row is written when the file is created.

    The information gathered is:
    - - - - - - - -
        timestamp   :   Current local time.
        status_code :   Status code from requests call.
        length      :   Length of the HTML-string.
        output_path :   Current working directory (header column is named `output_file`).
        url         :   The URL of the response.

    Input:
    - - - - - - - -
    response (requests.Response) :  The response to log.
    """

    # Gather log information
    status_code = response.status_code # Status code from the request result
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # Local time
    length = len(response.text) # Length of the HTML-string
    output_path = os.getcwd() # Output path
    url = response.url # URL-string

    # Decide before opening: append mode creates the file, after which it always exists
    write_header = not os.path.isfile('log')

    # A single context-managed handle guarantees the file is flushed and closed.
    # The handle is not called `log`, so it no longer shadows this function's name.
    with open('log', 'a') as log_file:
        if write_header:
            header = ['timestamp', 'status_code', 'length', 'output_file', 'url'] # Header names
            log_file.write(';'.join(header) + "\n")
        log_file.write(f'{timestamp};{status_code};{length};{output_path};{url}' + "\n")

In [314]:
def get_soup(url: str, header: dict, timeout: float = 30.0) -> BeautifulSoup:
    """
    Requests the given URL and parses the response into a BeautifulSoup object.
    Requests are logged, see `log()`. 

    Input:
    - - - - - - - - 
    url (str)       :    URL of the website to receive the HTML-string from. \n
    header (dict)   :    Dictionary sent as HTTP headers with the request. \n
    timeout (float) :    Seconds to wait for the server before giving up (default 30).

    Returns:
    - - - - - - - - 
    soup (BeautifulSoup) :  The response body parsed by BeautifulSoup with the 'lxml' parser.

    Raises:
    - - - - - - - - 
    requests.exceptions.RequestException :  If the request fails or times out.
    """

    # Without a timeout a stalled connection would block the scraping loop forever
    response = requests.get(url, headers=header, timeout=timeout) # Request
    log(response) # Log 
    soup = BeautifulSoup(response.content, 'lxml') # Convert the response to parsed HTML

    return soup

In [None]:
def create_url_boliga(page: int) -> str:
    """
    Builds the boliga.dk sales-result URL for one page number.

    Input:
    - - - - - - - -
    page (int) :    Page number to insert into the URL.

    Returns:
    - - - - - - - -
    url (str)  :    The sales-result URL pointing at the given page.
    """

    base = 'https://www.boliga.dk/salg/resultater'        # Search-result endpoint
    query = f'searchTab=1&page={page}&sort=date-a'        # Query string with the page number

    return f'{base}?{query}'

In [317]:
# Identification passed as `header` to `get_soup()`: who is scraping and why.
header = dict(
    name='Jørgen Baun Høst',
    email='pjz633@econ.ku.dk',
    intention='Scrape Boliga for academic purposes',
)

In [387]:
def extract_info_boliga(soup: BeautifulSoup) -> pd.DataFrame:
    """
    Extracts the sales-ad table of one boliga.dk result page into a DataFrame.

    Input:
    - - - - - - - -
    soup (BeautifulSoup) :  Parsed HTML of a result page, e.g. from `get_soup()`.

    Returns:
    - - - - - - - -
    df (pd.DataFrame) :  One row per sales ad with the columns `link`, `address`,
                         `price`, `date_of_sale`, `type_of_sale`, `house_size_m2`,
                         `house_price_per_m2`, `no_of_rooms` and `year_built`.
                         All values are the raw text found in the page. A page
                         without rows gives an empty frame with these columns.

    Raises:
    - - - - - - - -
    ValueError :  If the page has no `<tbody>` element.
    IndexError :  If a selector yields fewer elements than there are rows.
    """

    columns = [
        'link', 'address', 'price', 'date_of_sale', 'type_of_sale',
        'house_size_m2', 'house_price_per_m2', 'no_of_rooms', 'year_built',
    ]

    table_body = soup.tbody
    if table_body is None:
        # Previously this surfaced as an opaque TypeError from len(None)
        raise ValueError('No <tbody> element found: the page holds no result table')

    # len() of a tag counts all of its direct children. The row count is kept as
    # "children minus one" -- presumably one child is not a sale row; TODO confirm.
    sales_ads_entries = max(len(table_body) - 1, 0)

    # Run every selector once instead of once per row
    centered_class = "table-col d-print-table-cell text-center"
    address_links = soup.find_all(class_="text-primary font-weight-bolder text-left")
    date_type_cells = soup.find_all(class_="d-flex flex-column date-type-cell")
    size_cells = soup.find_all(class_='d-flex flex-column')
    price_cells = soup.find_all(class_=centered_class)[0::7]        # every 7th cell from the 1st
    room_cells = table_body.find_all(class_=centered_class)[3::7]   # every 7th cell from the 4th
    year_cells = table_body.find_all(class_=centered_class)[4::7]   # every 7th cell from the 5th

    records = []
    for sale_ads_no in range(sales_ads_entries):
        address_link = address_links[sale_ads_no]
        price_cell = price_cells[sale_ads_no]

        # The element after the date/type cell holds the date, the one after that the type
        date_element = date_type_cells[sale_ads_no].find_next()
        type_element = date_element.find_next()

        # The element after the size cell holds the size, the one after that the price per m2
        size_element = size_cells[sale_ads_no].find_next()
        price_per_m2_element = size_element.find_next()

        records.append({
            'link': f'https://www.boliga.dk{address_link["href"]}',
            'address': address_link.text,
            'price': price_cell.text,
            'date_of_sale': date_element.text,
            'type_of_sale': type_element.text,
            'house_size_m2': size_element.text,
            'house_price_per_m2': price_per_m2_element.text,
            'no_of_rooms': room_cells[sale_ads_no].text,
            'year_built': year_cells[sale_ads_no].text,
        })

    # Built after the loop, so a page with zero rows yields an empty frame
    # instead of failing on an undefined variable
    return pd.DataFrame(records, columns=columns)

In [None]:
errors = []  # URLs of the pages that could not be scraped

startpage = 1
endpage = 5

# to_parquet() does not create missing folders, so make sure the target exists
os.makedirs('data/frontpage', exist_ok=True)

for page in tqdm.tqdm(range(startpage, endpage+1)):
    url = create_url_boliga(page)
    try:
        soup = get_soup(url, header)
        df = extract_info_boliga(soup)
        df.to_parquet(f'data/frontpage/boliga_{page}.pq')
    except Exception as error: # skip page if errors, but report the reason
        print(f'Error encountered on page {page}: {error!r}')
        errors.append(url)

    # time.sleep() takes its argument positionally; the former `sleep=0.1`
    # keyword raised a TypeError on every page, also inside the error handler.
    time.sleep(0.1)

In [None]:
def data_boliga_pages(startpage: int, endpage: int, header: dict, sleep: float) -> tuple:
    """
    Compound function that scrapes an interval of result pages from boliga.dk, extracts the \n
    sales-ad information of each page, and logs requests in `log`-file. 

    Input:
    - - - - - - - -
    startpage (int)  :  The first page to scrape. \n
    endpage   (int)  :  The last page to scrape (inclusive). \n
    header    (dict) :  Dictionary passed on to `get_soup()` for the request. \n
    sleep     (float):  Seconds to sleep between each request.

    Returns:
    - - - - - - - -
    list_of_output (list) : One element per successfully scraped page, each being the output of `extract_info_boliga()`.
    errors         (list) : An error list containing the URLs for the pages where the error occured.
    """

    list_of_output = [] # initialize empty list for the page outputs
    errors = [] 

    # Loop through pages and track progress with tqdm
    for page in tqdm.tqdm(range(startpage, endpage+1)):
        # The previous body called PolitiFact helpers (`*_pf_pages`) that this
        # notebook does not define; the boliga helpers are used instead.
        url = create_url_boliga(page) # create url

        try: # circumvent problem with empty or malformed pages
            soup = get_soup(url, header) # construct html
            output = extract_info_boliga(soup) # extract sales-ad information
        except Exception as error: # skip page, but report why it failed
            print(f'Error encountered on page {page}: {error!r}')
            errors.append(url)
        else:
            list_of_output.append(output)

        time.sleep(sleep) # pause once per page, whether it succeeded or not

    return list_of_output, errors