In [1]:
# Standard imports 
import numpy as np
import pandas as pd

# OS and time packages 
import os
import time
import tqdm

from pathlib import Path

# HTML and text processing 
import requests
from bs4 import BeautifulSoup
import json
import re

# Plotting 
import matplotlib.pyplot as plt 
import seaborn as sns

plt.style.use('seaborn-whitegrid')
%matplotlib inline

plt.rc('font', size=14)             # controls default text sizes
plt.rc('axes', titlesize=18)        # fontsize of the axes title
plt.rc('axes', labelsize=18)        # fontsize of the x and y labels
plt.rc('xtick', labelsize=14)       # fontsize of the tick labels
plt.rc('ytick', labelsize=14)       # fontsize of the tick labels
plt.rc('legend', fontsize=14)       # legend fontsize
plt.rc('figure', titlesize=20)      # fontsize of the figure title

plt.rcParams['figure.figsize'] = 10, 4 # set default size of plots

# Filter warnings 
pd.options.mode.chained_assignment = None
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

  plt.style.use('seaborn-whitegrid')


In [2]:
def log(response: requests.Response):
    """
    Creates or appends a log-file with information from a `requests.get()`-call.
    
    The information gathered is:
    - - - - - - - -
        timestamp   :   Current local time.
        status_code :   Status code from requests call.
        length      :   Length of the HTML-string.
        output_path :   Current working directory.
        url         :   The URL of the response.
    """

    # Open or create the csv file
    if os.path.isfile('log2'):
        log = open('log2','a')
    else: 
        log = open('log2','w')
        header = ['timestamp', 'status_code', 'length', 'output_file', 'url'] # Header names
        log.write(';'.join(header) + "\n")
        
    # Gather log information
    status_code = response.status_code # Status code from the request result
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # Local time
    length = len(response.text) # Length of the HTML-string
    output_path = os.getcwd() # Output path
    url = response.url # URL-string
    
    # Open the log file and append the gathered log information
    with open('log2','a') as log:
        log.write(f'{timestamp};{status_code};{length};{output_path};{url}' + "\n") 

In [3]:
def get_soup(url: str, header: dict) -> BeautifulSoup:
    """
    Constructs a HTML-string from a request of the given URL. 
    Requests are logged, see `log()`. 

    Input:
    - - - - - - - - 
    url (str)     :    URL of the website to receive the HTML-string from. \n
    header (dict) :    Dictionary to send in the query string for the request.

    Returns:
    - - - - - - - - 
    soup (BeautifulSoup) :  HTML-string in the class of BeutifulSoup with 'lxml' parser.
    """

    response = requests.get(url, headers=header) # Request
    log(response) # Log 
    soup = BeautifulSoup(response.content, 'lxml') # Convert to response to HTML

    return soup

In [4]:
def create_url_boliga(page: int) -> str:
    """
    Creates a boliga URL with the given pagenumber.
    Input:
    - - - - - - - -
    page (int) :    Pagenumber for the boliga website.

    Returns:
    - - - - - - - -
    url (str)  :    URL of the boliga website for given page. 
    """

    url = f'https://www.boliga.dk/salg/resultater?searchTab=1&page={page}&sort=date-a' # Construct url with f-string

    return url

In [5]:
header = {'name' : 'Jørgen Baun Høst',          'email' : 'pjz633@econ.ku.dk',
          'intention': 'Scrape Boliga for academic purposes'}

In [6]:
def extract_info_boliga(soup:BeautifulSoup) -> pd.DataFrame:
        
    href = soup.find(class_="text-primary font-weight-bolder text-left")['href']
    url = f'https://www.boliga.dk{href}'
    id = re.search(r'[^/]+$', href)[0]
    url_bbr = f'https://www.boliga.dk/bbrinfo/{id}'
    address = soup.find(class_="text-primary font-weight-bolder text-left").text
    price = soup.find(class_="table-col d-print-table-cell text-center").text
    date_of_sale = soup.div.find_all('span')[0].text
    type_of_sale = soup.div.find_all('span')[1].text
    house_size = soup.find(class_='d-flex flex-column').find_all('span')[0].text
    price_per_m2 = soup.find(class_='d-flex flex-column').find_all('span')[1].text
    no_of_rooms = soup.find_all('td')[4].text
    year_built = soup.find_all('td')[5].text

    return [url, url_bbr, address, price, date_of_sale, type_of_sale, house_size, price_per_m2, no_of_rooms, year_built]


In [7]:
column_names = ['link', 'bbr_link', 'address', 'price', 'date_of_sale','type_of_sale', 'house_size_m2', 'house_price_per_m2', 'no_of_rooms', 'year_built']

In [8]:
errors = []

startpage = 12005
endpage = 13500

for page in tqdm.tqdm(range(startpage, endpage+1)):
    url = create_url_boliga(page)
    try:
        soup = get_soup(url, header)
        list_of_ads = soup.find_all('tr')

        output = []

        for ad in list_of_ads:
            info = extract_info_boliga(ad)
            output.append(info)

        df = pd.DataFrame(output, columns=column_names)
        df.to_parquet(f'data/frontpage/boliga_{page}.pq')
        time.sleep(0.1)
    except: #skip page if errors
        print(f'Error encountered on page {page}')
        errors.append(url)
        df_error = pd.DataFrame(errors).to_csv('errors.csv')
        time.sleep(0.1)
        continue

100%|██████████| 1496/1496 [1:19:37<00:00,  3.19s/it]


In [25]:
pd.read_parquet(f'data/frontpage/boliga_1.pq')

Unnamed: 0,link,bbr_link,address,price,date_of_sale,type_of_sale,house_size_m2,house_price_per_m2,no_of_rooms,year_built
0,https://www.boliga.dk/salg/info/621/201217/5BF...,https://www.boliga.dk/bbrinfo/5BFBC165-4E64-44...,Christianshave 31 6000 Kolding,560.000 kr.,05-11-1974,Alm. Salg,155 m²,3.613 kr/m²,5,2015
1,https://www.boliga.dk/salg/info/430/20036/A35B...,https://www.boliga.dk/bbrinfo/A35B1617-43AA-43...,Rønnebærvænget 5B 5856 Ryslinge,268.000 kr.,14-11-1985,Alm. Salg,63 m²,4.254 kr/m²,2,1978
2,https://www.boliga.dk/salg/info/860/3278/76D17...,https://www.boliga.dk/bbrinfo/76D17D74-9E4D-43...,de Plessenvej 15 9850 Hirtshals,116.400 kr.,13-01-1986,Fam. Salg,93 m²,1.252 kr/m²,4,1985
3,https://www.boliga.dk/salg/info/707/112163/DA0...,https://www.boliga.dk/bbrinfo/DA030E60-4C83-46...,Fasanvej 27 8950 Ørsted,52.750 kr.,14-07-1987,Fam. Salg,109 m²,484 kr/m²,4,1987
4,https://www.boliga.dk/salg/info/101/40300/519B...,https://www.boliga.dk/bbrinfo/519BD5E8-11CF-48...,"Ottilia Jacobsens Plads 17, 2. th 1799 Københ...",3.580.136 kr.,05-12-1987,Alm. Salg,82 m²,43.660 kr/m²,3,2020
5,https://www.boliga.dk/salg/info/707/110547/38E...,https://www.boliga.dk/bbrinfo/38E98D16-E36B-4C...,Sandagervej 2A 8961 Allingåbro,84.893 kr.,31-03-1990,Andet,60 m²,1.415 kr/m²,2,1942
6,https://www.boliga.dk/salg/info/360/9329/5653D...,https://www.boliga.dk/bbrinfo/5653DFEA-FBB9-43...,Bekkasinvej 10 4930 Maribo,585.000 kr.,31-12-1991,Fam. Salg,130 m²,4.500 kr/m²,4,1981
7,https://www.boliga.dk/salg/info/461/123260/99B...,https://www.boliga.dk/bbrinfo/99B3004D-BD56-42...,Fraugde-Kærby-Vej 84 5220 Odense SØ,572.200 kr.,31-12-1991,Fam. Salg,200 m²,2.861 kr/m²,5,1900
8,https://www.boliga.dk/salg/info/621/49240/F2E8...,https://www.boliga.dk/bbrinfo/F2E8ACB0-C017-42...,Fåborgvej 4 6000 Kolding,785.000 kr.,31-12-1991,Fam. Salg,152 m²,5.164 kr/m²,5,1973
9,https://www.boliga.dk/salg/info/330/8263/8C96F...,https://www.boliga.dk/bbrinfo/8C96F17D-5A93-46...,Lyrekrogen 18 4220 Korsør,75.840 kr.,31-12-1991,Alm. Salg,91 m²,833 kr/m²,6,1969


In [130]:
data_dir = Path('C:/Users/JBH/Dropbox/11_semester/Public_Econ_seminar/data/frontpage')
full_df = pd.concat(
    pd.read_parquet(parquet_file)
    for parquet_file in data_dir.glob('*.pq')
)

full_df.to_parquet('data/boliga.pq')

In [134]:
df = pd.read_parquet('data/boliga.pq')
df = df.reset_index(drop=True)

In [135]:
df['price']

0           560.000 kr.
1           268.000 kr.
2           116.400 kr.
3            52.750 kr.
4         3.580.136 kr.
              ...      
971395      798.000 kr.
971396      655.000 kr.
971397    1.425.000 kr.
971398    2.850.000 kr.
971399      800.000 kr.
Name: price, Length: 971400, dtype: object

In [136]:
df['price']=df['price'].str.replace(r'[.]|kr[.]','')
df['price']=df['price'].astype(int)

  df['price']=df['price'].str.replace(r'[.]|kr[.]','')


In [137]:
df['price']

0          560000
1          268000
2          116400
3           52750
4         3580136
           ...   
971395     798000
971396     655000
971397    1425000
971398    2850000
971399     800000
Name: price, Length: 971400, dtype: int32

In [138]:
df['date_of_sale']=pd.to_datetime(df['date_of_sale'])
df['year']=df.date_of_sale.dt.year
df['month']=df.date_of_sale.dt.month
df['week']=df.date_of_sale.dt.week
df['day']=df.date_of_sale.dt.day
df['quarter']=df.date_of_sale.dt.quarter

  df['date_of_sale']=pd.to_datetime(df['date_of_sale'])
  df['week']=df.date_of_sale.dt.week


In [139]:
df.columns

Index(['link', 'bbr_link', 'address', 'price', 'date_of_sale', 'type_of_sale',
       'house_size_m2', 'house_price_per_m2', 'no_of_rooms', 'year_built',
       'year', 'month', 'week', 'day', 'quarter'],
      dtype='object')

In [140]:
df['type_of_sale']=df['type_of_sale'].astype('category')
df['house_price_per_m2']

0           3.613 kr/m²
1           4.254 kr/m²
2           1.252 kr/m²
3             484 kr/m²
4          43.660 kr/m²
              ...      
971395      7.255 kr/m²
971396      5.598 kr/m²
971397     13.971 kr/m²
971398     13.768 kr/m²
971399     20.000 kr/m²
Name: house_price_per_m2, Length: 971400, dtype: object

In [150]:
df['house_price_per_m2']=df['house_price_per_m2'].str.replace('kr/m²', '')
df['house_price_per_m2']=df['house_price_per_m2'].str.replace(r'[.]', '')
df['house_size_m2']=df['house_size_m2'].str.strip()
df['house_size_m2']=df['house_size_m2'].str.replace('m²', '')
df['house_size_m2']=df['house_size_m2'].str.replace(r'[.]', '')

  df['house_price_per_m2']=df['house_price_per_m2'].str.replace(r'[.]', '')
  df['house_size_m2']=df['house_size_m2'].str.replace(r'[.]', '')


In [151]:
df['house_size_m2']

0         155 
1          63 
2          93 
3         109 
4          82 
          ... 
971395    110 
971396    117 
971397    102 
971398    207 
971399     40 
Name: house_size_m2, Length: 971400, dtype: object

In [158]:
df['house_size_m2']=df['house_size_m2'].astype(int)
df['house_price_per_m2']=df['house_price_per_m2'].str.strip()
df=df.dropna()

In [167]:
df=df[df['house_price_per_m2']!='']
df.reset_index(drop=True)

Unnamed: 0,link,bbr_link,address,price,date_of_sale,type_of_sale,house_size_m2,house_price_per_m2,no_of_rooms,year_built,year,month,week,day,quarter
0,https://www.boliga.dk/salg/info/621/201217/5BF...,https://www.boliga.dk/bbrinfo/5BFBC165-4E64-44...,Christianshave 31 6000 Kolding,560000,1974-05-11,Alm. Salg,155,3613,5,2015,1974,5,19,11,2
1,https://www.boliga.dk/salg/info/430/20036/A35B...,https://www.boliga.dk/bbrinfo/A35B1617-43AA-43...,Rønnebærvænget 5B 5856 Ryslinge,268000,1985-11-14,Alm. Salg,63,4254,2,1978,1985,11,46,14,4
2,https://www.boliga.dk/salg/info/860/3278/76D17...,https://www.boliga.dk/bbrinfo/76D17D74-9E4D-43...,de Plessenvej 15 9850 Hirtshals,116400,1986-01-13,Fam. Salg,93,1252,4,1985,1986,1,3,13,1
3,https://www.boliga.dk/salg/info/707/112163/DA0...,https://www.boliga.dk/bbrinfo/DA030E60-4C83-46...,Fasanvej 27 8950 Ørsted,52750,1987-07-14,Fam. Salg,109,484,4,1987,1987,7,29,14,3
4,https://www.boliga.dk/salg/info/101/40300/519B...,https://www.boliga.dk/bbrinfo/519BD5E8-11CF-48...,"Ottilia Jacobsens Plads 17, 2. th 1799 Københ...",3580136,1987-05-12,Alm. Salg,82,43660,3,2020,1987,5,20,12,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
971338,https://www.boliga.dk/salg/info/671/45114/EC57...,https://www.boliga.dk/bbrinfo/EC570063-A4E9-49...,Struergårdvej 20 7600 Struer,798000,2006-03-20,Alm. Salg,110,7255,4,1964,2006,3,12,20,1
971339,https://www.boliga.dk/salg/info/370/29268/31A0...,https://www.boliga.dk/bbrinfo/31A0832E-FADD-4B...,Søgårdsvej 8 4160 Herlufmagle,655000,2006-03-20,Alm. Salg,117,5598,3,1952,2006,3,12,20,1
971340,https://www.boliga.dk/salg/info/851/173595/CC2...,https://www.boliga.dk/bbrinfo/CC22B413-8B1F-4F...,Landlystvej 25 9210 Aalborg SØ,1425000,2006-03-20,Alm. Salg,102,13971,4,1963,2006,3,12,20,1
971341,https://www.boliga.dk/salg/info/621/82221/DAA8...,https://www.boliga.dk/bbrinfo/DAA86B62-B046-46...,Langelinie 6 6000 Kolding,2850000,2006-03-20,Alm. Salg,207,13768,7,1878,2006,3,12,20,1


In [170]:
df['house_price_per_m2']=df['house_price_per_m2'].astype(int)

In [184]:
df['no_of_rooms']=df['no_of_rooms'].astype(int)
df['year_built']=df['year_built'].astype(int)

In [180]:
df['time_q']=pd.PeriodIndex(df['date_of_sale'], freq='Q')
df=df[df['year']>=1992]

In [189]:
df=df.reset_index(drop=True)
df

Unnamed: 0,link,bbr_link,address,price,date_of_sale,type_of_sale,house_size_m2,house_price_per_m2,no_of_rooms,year_built,year,month,week,day,quarter,time_q
0,https://www.boliga.dk/salg/info/147/195478/977...,https://www.boliga.dk/bbrinfo/977B144E-0353-41...,"Finsensvej 47B, 4. 410 2000 Frederiksberg",335000,1992-01-01,Alm. Salg,60,5583,1,1961,1992,1,1,1,1,1992Q1
1,https://www.boliga.dk/salg/info/580/22564/DE2C...,https://www.boliga.dk/bbrinfo/DE2CAFD7-09D0-4F...,Farversmøllevej 152 6200 Aabenraa,750000,1992-01-01,Fam. Salg,168,4464,4,1979,1992,1,1,1,1,1992Q1
2,https://www.boliga.dk/salg/info/846/9640/269EB...,https://www.boliga.dk/bbrinfo/269EB015-2ABF-48...,Tyttebærvej 4 9560 Hadsund,780000,1992-01-01,Alm. Salg,204,3000,5,1973,1992,1,1,1,1,1992Q1
3,https://www.boliga.dk/salg/info/661/130636/68B...,https://www.boliga.dk/bbrinfo/68B47361-7C32-4E...,Smedegårdvej 21E 7500 Holstebro,275000,1992-01-01,Alm. Salg,56,4911,2,1980,1992,1,1,1,1,1992Q1
4,https://www.boliga.dk/salg/info/706/4935/98131...,https://www.boliga.dk/bbrinfo/98131C65-2A1D-40...,Syrenvej 18 8400 Ebeltoft,252000,1992-01-01,Alm. Salg,86,2930,5,1972,1992,1,1,1,1,1992Q1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
971293,https://www.boliga.dk/salg/info/671/45114/EC57...,https://www.boliga.dk/bbrinfo/EC570063-A4E9-49...,Struergårdvej 20 7600 Struer,798000,2006-03-20,Alm. Salg,110,7255,4,1964,2006,3,12,20,1,2006Q1
971294,https://www.boliga.dk/salg/info/370/29268/31A0...,https://www.boliga.dk/bbrinfo/31A0832E-FADD-4B...,Søgårdsvej 8 4160 Herlufmagle,655000,2006-03-20,Alm. Salg,117,5598,3,1952,2006,3,12,20,1,2006Q1
971295,https://www.boliga.dk/salg/info/851/173595/CC2...,https://www.boliga.dk/bbrinfo/CC22B413-8B1F-4F...,Landlystvej 25 9210 Aalborg SØ,1425000,2006-03-20,Alm. Salg,102,13971,4,1963,2006,3,12,20,1,2006Q1
971296,https://www.boliga.dk/salg/info/621/82221/DAA8...,https://www.boliga.dk/bbrinfo/DAA86B62-B046-46...,Langelinie 6 6000 Kolding,2850000,2006-03-20,Alm. Salg,207,13768,7,1878,2006,3,12,20,1,2006Q1


In [85]:
df.to_parquet('data/boliga.pq')

In [8]:
df = pd.read_parquet('data/boliga.pq')
# df=df[df['type_of_sale']=='Alm. Salg']
# df.to_parquet('data/boliga.pq')

In [9]:
df['address']=df['address'].str.rstrip()
df['address']=df['address'].str.lstrip()
df['address']

0         Finsensvej 47B, 4. 410 2000 Frederiksberg
2                        Tyttebærvej 4 9560 Hadsund
3                   Smedegårdvej 21E 7500 Holstebro
4                         Syrenvej 18 8400 Ebeltoft
5                         Skiffardvej 9 8560 Kolind
                            ...                    
971293                 Struergårdvej 20 7600 Struer
971294                Søgårdsvej 8 4160 Herlufmagle
971295               Landlystvej 25 9210 Aalborg SØ
971296                    Langelinie 6 6000 Kolding
971297                Grantoften 14 3630 Jægerspris
Name: address, Length: 830760, dtype: object

In [10]:
regex = r'\d{4}\s[A-Za-zÆØÅæøå\s]+'
df['post_no_city']=df['address'].apply(lambda x: re.findall(regex, x)[0] if re.findall(regex, x) else '')
df['post_no_city']

0         2000 Frederiksberg
2               9560 Hadsund
3             7500 Holstebro
4              8400 Ebeltoft
5                8560 Kolind
                 ...        
971293           7600 Struer
971294      4160 Herlufmagle
971295       9210 Aalborg SØ
971296          6000 Kolding
971297       3630 Jægerspris
Name: post_no_city, Length: 830760, dtype: object

In [11]:
regex = r'\d{4}'
df['post_no']=df['address'].apply(lambda x: re.findall(regex, x)[0] if re.findall(regex, x) else '')
df['post_no']=df['post_no'].astype(int)
df['post_no']

0         2000
2         9560
3         7500
4         8400
5         8560
          ... 
971293    7600
971294    4160
971295    9210
971296    6000
971297    3630
Name: post_no, Length: 830760, dtype: int32

In [12]:
regex = r'\d{4}(.+)'
df['city']=df['post_no_city'].apply(lambda x: re.findall(regex, x)[0] if re.findall(regex, x) else '')
df['city']=df['city'].str.lstrip()
df['city']

0         Frederiksberg
2               Hadsund
3             Holstebro
4              Ebeltoft
5                Kolind
              ...      
971293           Struer
971294      Herlufmagle
971295       Aalborg SØ
971296          Kolding
971297       Jægerspris
Name: city, Length: 830760, dtype: object

In [13]:
df_kbh=df[df['city'].str.contains('København')]
df_kbh

Unnamed: 0,link,bbr_link,address,price,date_of_sale,type_of_sale,house_size_m2,house_price_per_m2,no_of_rooms,year_built,year,month,week,day,quarter,time_q,post_no_city,post_no,city
70,https://www.boliga.dk/salg/info/101/533972/416...,https://www.boliga.dk/bbrinfo/416D6FE5-45A5-4B...,"Strandboulevarden 98, 3. th 2100 København Ø",790000,1992-11-03,Alm. Salg,128,6172,5,1903,1992,11,45,3,4,1992Q4,2100 København Ø,2100,København Ø
147,https://www.boliga.dk/salg/info/101/187276/933...,https://www.boliga.dk/bbrinfo/933F5DC3-E067-45...,"Grækenlandsvej 37, 1. th 2300 København S",319000,1994-02-14,Alm. Salg,53,6019,2,1934,1994,2,7,14,1,1994Q1,2300 København S,2300,København S
281,https://www.boliga.dk/salg/info/101/705960/1BA...,https://www.boliga.dk/bbrinfo/1BAB9A84-670B-4C...,"Weidekampsgade 59, 4. th 2300 København S",2645000,2006-03-21,Alm. Salg,93,28441,2,2006,2006,3,12,21,1,2006Q1,2300 København S,2300,København S
290,https://www.boliga.dk/salg/info/101/716210/B3B...,https://www.boliga.dk/bbrinfo/B3B0A4DA-8B77-4E...,"Klaksvigsgade 9, 3. tv 2300 København S",3300000,2006-03-21,Alm. Salg,110,30000,3,2006,2006,3,12,21,1,2006Q1,2300 København S,2300,København S
291,https://www.boliga.dk/salg/info/101/707610/408...,https://www.boliga.dk/bbrinfo/408813A5-3EAC-46...,"Weidekampsgade 61, 5. tv 2300 København S",2500000,2006-03-21,Alm. Salg,93,26882,2,2006,2006,3,12,21,1,2006Q1,2300 København S,2300,København S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
971083,https://www.boliga.dk/salg/info/101/462242/5F3...,https://www.boliga.dk/bbrinfo/5F314831-8A6C-46...,Rodosvej 20 2300 København S,4725000,2006-03-19,Alm. Salg,119,39706,5,1922,2006,3,11,19,1,2006Q1,2300 København S,2300,København S
971133,https://www.boliga.dk/salg/info/101/756190/2CB...,https://www.boliga.dk/bbrinfo/2CB8A34B-44A8-45...,"Digevej 52, 3. tv 2300 København S",1895000,2006-03-19,Alm. Salg,89,21292,3,2005,2006,3,11,19,1,2006Q1,2300 København S,2300,København S
971134,https://www.boliga.dk/salg/info/101/755550/4A7...,https://www.boliga.dk/bbrinfo/4A7A3B20-658F-47...,"Digevej 52, st. th 2300 København S",1850000,2006-03-19,Alm. Salg,100,18500,3,2005,2006,3,11,19,1,2006Q1,2300 København S,2300,København S
971135,https://www.boliga.dk/salg/info/101/756050/7E3...,https://www.boliga.dk/bbrinfo/7E3CC8FE-E12B-44...,"Digevej 52, 2. th 2300 København S",1995000,2006-03-19,Alm. Salg,100,19950,3,2005,2006,3,11,19,1,2006Q1,2300 København S,2300,København S


In [14]:
df_aar=df[df['city'].str.contains('Aarhus')]
df_aar

Unnamed: 0,link,bbr_link,address,price,date_of_sale,type_of_sale,house_size_m2,house_price_per_m2,no_of_rooms,year_built,year,month,week,day,quarter,time_q,post_no_city,post_no,city
246,https://www.boliga.dk/salg/info/751/151039/6D8...,https://www.boliga.dk/bbrinfo/6D8BB9A3-0B71-42...,"Gustav Wieds Vej 29, st. th 8000 Aarhus C",2348000,2006-03-20,Alm. Salg,92,25522,3,1952,2006,3,12,20,1,2006Q1,8000 Aarhus C,8000,Aarhus C
424,https://www.boliga.dk/salg/info/751/217234/B6A...,https://www.boliga.dk/bbrinfo/B6AB96AD-6A65-4C...,"Janus La Cours Gade 3, 1. tv 8000 Aarhus C",2251000,2006-03-21,Alm. Salg,81,27790,3,1928,2006,3,12,21,1,2006Q1,8000 Aarhus C,8000,Aarhus C
550,https://www.boliga.dk/salg/info/751/78315/EDAE...,https://www.boliga.dk/bbrinfo/EDAE6F0C-7A7D-41...,"Eckersbergsgade 21, kl 8000 Aarhus C",1995000,2006-03-22,Alm. Salg,0,68793,1,1936,2006,3,12,22,1,2006Q1,8000 Aarhus C,8000,Aarhus C
784,https://www.boliga.dk/salg/info/751/370032/386...,https://www.boliga.dk/bbrinfo/386A9EA4-DDB2-46...,Præstehaven 42 8210 Aarhus V,2557000,2006-03-23,Alm. Salg,98,26092,4,1940,2006,3,12,23,1,2006Q1,8210 Aarhus V,8210,Aarhus V
861,https://www.boliga.dk/salg/info/751/977247/239...,https://www.boliga.dk/bbrinfo/2391EA66-ABBC-45...,"Bronzealdertoften 4, 1. 1 8210 Aarhus V",1375000,2006-03-23,Alm. Salg,59,23305,2,2006,2006,3,12,23,1,2006Q1,8210 Aarhus V,8210,Aarhus V
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
970455,https://www.boliga.dk/salg/info/751/728174/A3B...,https://www.boliga.dk/bbrinfo/A3B9BE37-1263-40...,"Østbanetorvet 4, 3. mf 8000 Aarhus C",3268000,2006-03-14,Alm. Salg,125,26144,3,1895,2006,3,11,14,1,2006Q1,8000 Aarhus C,8000,Aarhus C
970632,https://www.boliga.dk/salg/info/751/957505/D8F...,https://www.boliga.dk/bbrinfo/D8FCF4B1-8878-4B...,"Bülowsgade 6, 2 8000 Aarhus C",1399000,2006-03-15,Alm. Salg,35,39971,1,1876,2006,3,11,15,1,2006Q1,8000 Aarhus C,8000,Aarhus C
970906,https://www.boliga.dk/salg/info/751/249691/284...,https://www.boliga.dk/bbrinfo/284474DD-0EAB-41...,Klokkerbakken 51 8210 Aarhus V,2775000,2006-03-17,Alm. Salg,120,23125,5,1966,2006,3,11,17,1,2006Q1,8210 Aarhus V,8210,Aarhus V
971245,https://www.boliga.dk/salg/info/751/430361/0A1...,https://www.boliga.dk/bbrinfo/0A1FBF5D-9C35-4E...,"Skovfaldet 2, 2. tv 8200 Aarhus N",3325000,2006-03-19,Alm. Salg,127,26181,3,1961,2006,3,11,19,1,2006Q1,8200 Aarhus N,8200,Aarhus N


In [15]:
df_fre=df[df['city'].str.contains('Frederiksberg')]
df_fre

Unnamed: 0,link,bbr_link,address,price,date_of_sale,type_of_sale,house_size_m2,house_price_per_m2,no_of_rooms,year_built,year,month,week,day,quarter,time_q,post_no_city,post_no,city
0,https://www.boliga.dk/salg/info/147/195478/977...,https://www.boliga.dk/bbrinfo/977B144E-0353-41...,"Finsensvej 47B, 4. 410 2000 Frederiksberg",335000,1992-01-01,Alm. Salg,60,5583,1,1961,1992,1,1,1,1,1992Q1,2000 Frederiksberg,2000,Frederiksberg
148,https://www.boliga.dk/salg/info/147/31629/6C3D...,https://www.boliga.dk/bbrinfo/6C3DDFFB-917C-48...,Femte Juni Plads 7 2000 Frederiksberg,1100000,1994-02-14,Alm. Salg,336,3274,15,1913,1994,2,7,14,1,1994Q1,2000 Frederiksberg,2000,Frederiksberg
324,https://www.boliga.dk/salg/info/147/257995/987...,https://www.boliga.dk/bbrinfo/98732290-94A7-49...,"Holger Danskes Vej 87, 2. 6 2000 Frederiksberg",1175000,2006-03-21,Alm. Salg,28,41964,1,2006,2006,3,12,21,1,2006Q1,2000 Frederiksberg,2000,Frederiksberg
338,https://www.boliga.dk/salg/info/147/258005/694...,https://www.boliga.dk/bbrinfo/694F92A0-A4BF-4F...,"Holger Danskes Vej 87, 3. 6 2000 Frederiksberg",1225000,2006-03-21,Alm. Salg,28,43750,1,2006,2006,3,12,21,1,2006Q1,2000 Frederiksberg,2000,Frederiksberg
339,https://www.boliga.dk/salg/info/147/258006/6B8...,https://www.boliga.dk/bbrinfo/6B83F9DF-3C9F-41...,"Holger Danskes Vej 87, 3. 7 2000 Frederiksberg",1245000,2006-03-21,Alm. Salg,27,46111,1,2006,2006,3,12,21,1,2006Q1,2000 Frederiksberg,2000,Frederiksberg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
970412,https://www.boliga.dk/salg/info/147/258020/DFA...,https://www.boliga.dk/bbrinfo/DFA7ACC0-8418-46...,"Holger Danskes Vej 87, 5. 1 2000 Frederiksberg",1695000,2006-03-14,Alm. Salg,37,45811,1,2006,2006,3,11,14,1,2006Q1,2000 Frederiksberg,2000,Frederiksberg
970658,https://www.boliga.dk/salg/info/147/227310/51A...,https://www.boliga.dk/bbrinfo/51A5D9FF-CE27-45...,Drosselvej 55B 2000 Frederiksberg,5800000,2006-03-15,Alm. Salg,119,48739,4,1988,2006,3,11,15,1,2006Q1,2000 Frederiksberg,2000,Frederiksberg
970982,https://www.boliga.dk/salg/info/147/229259/FBE...,https://www.boliga.dk/bbrinfo/FBE92C89-7FF8-44...,"Christian Winthers Vej 6, st 1860 Frederiksberg C",7350000,2006-03-18,Alm. Salg,168,43750,4,1951,2006,3,11,18,1,2006Q1,1860 Frederiksberg C,1860,Frederiksberg C
971019,https://www.boliga.dk/salg/info/147/61838/2848...,https://www.boliga.dk/bbrinfo/284830CA-14CF-43...,"Howitzvej 59, 4. th 2000 Frederiksberg",3050000,2006-03-18,Alm. Salg,85,35882,3,1904,2006,3,11,18,1,2006Q1,2000 Frederiksberg,2000,Frederiksberg


In [16]:
df_ode=df[df['city'].str.contains('Odense')]
df_ode

Unnamed: 0,link,bbr_link,address,price,date_of_sale,type_of_sale,house_size_m2,house_price_per_m2,no_of_rooms,year_built,year,month,week,day,quarter,time_q,post_no_city,post_no,city
12,https://www.boliga.dk/salg/info/461/451373/8E4...,https://www.boliga.dk/bbrinfo/8E47B014-8ACD-46...,Østergårds Allé 44 5250 Odense SV,550000,1992-09-01,Alm. Salg,112,4911,4,1953,1992,9,36,1,3,1992Q3,5250 Odense SV,5250,Odense SV
15,https://www.boliga.dk/salg/info/461/352308/207...,https://www.boliga.dk/bbrinfo/207A8560-5DDD-44...,Skippervænget 4 5000 Odense C,375000,1992-09-01,Alm. Salg,117,3205,4,1810,1992,9,36,1,3,1992Q3,5000 Odense C,5000,Odense C
56,https://www.boliga.dk/salg/info/461/622151/2CD...,https://www.boliga.dk/bbrinfo/2CD73CB4-67C9-4B...,Ugletoften 213 5260 Odense S,604000,1992-11-03,Alm. Salg,109,5541,4,1986,1992,11,45,3,4,1992Q4,5260 Odense S,5260,Odense S
69,https://www.boliga.dk/salg/info/461/146406/CC4...,https://www.boliga.dk/bbrinfo/CC43E6AD-99B0-4A...,H.J. Poulsens Allé 104 5250 Odense SV,543100,1992-11-03,Alm. Salg,125,4345,5,1961,1992,11,45,3,4,1992Q4,5250 Odense SV,5250,Odense SV
89,https://www.boliga.dk/salg/info/461/568408/301...,https://www.boliga.dk/bbrinfo/301E3D23-DE4F-4B...,Blangstedgårdsvej 84 5220 Odense SØ,510000,1992-11-03,Alm. Salg,87,5862,4,1981,1992,11,45,3,4,1992Q4,5220 Odense SØ,5220,Odense SØ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
970918,https://www.boliga.dk/salg/info/461/188923/CDC...,https://www.boliga.dk/bbrinfo/CDC44BEA-0AC5-45...,"Jagtvej 23, 2. tv 5000 Odense C",6900000,2006-03-17,Alm. Salg,65,106154,2,1911,2006,3,11,17,1,2006Q1,5000 Odense C,5000,Odense C
970925,https://www.boliga.dk/salg/info/461/188923/922...,https://www.boliga.dk/bbrinfo/922032EA-018A-48...,"Jagtvej 23, 1. tv 5000 Odense C",6900000,2006-03-17,Alm. Salg,65,106154,2,1911,2006,3,11,17,1,2006Q1,5000 Odense C,5000,Odense C
971013,https://www.boliga.dk/salg/info/461/608442/ACE...,https://www.boliga.dk/bbrinfo/ACED4E9E-F0FC-44...,Væbnerhatten 54 5220 Odense SØ,2350000,2006-03-18,Alm. Salg,161,14596,6,1985,2006,3,11,18,1,2006Q1,5220 Odense SØ,5220,Odense SØ
971141,https://www.boliga.dk/salg/info/461/329977/4AB...,https://www.boliga.dk/bbrinfo/4ABE4DDB-C97A-46...,Rulkehøjen 36 5260 Odense S,2170000,2006-03-19,Alm. Salg,164,13232,6,1970,2006,3,11,19,1,2006Q1,5260 Odense S,5260,Odense S


In [17]:
df_sub = pd.concat([df_kbh, df_aar, df_ode])
df_sub=df_sub.reset_index(drop=True)
df_sub

Unnamed: 0,link,bbr_link,address,price,date_of_sale,type_of_sale,house_size_m2,house_price_per_m2,no_of_rooms,year_built,year,month,week,day,quarter,time_q,post_no_city,post_no,city
0,https://www.boliga.dk/salg/info/101/533972/416...,https://www.boliga.dk/bbrinfo/416D6FE5-45A5-4B...,"Strandboulevarden 98, 3. th 2100 København Ø",790000,1992-11-03,Alm. Salg,128,6172,5,1903,1992,11,45,3,4,1992Q4,2100 København Ø,2100,København Ø
1,https://www.boliga.dk/salg/info/101/187276/933...,https://www.boliga.dk/bbrinfo/933F5DC3-E067-45...,"Grækenlandsvej 37, 1. th 2300 København S",319000,1994-02-14,Alm. Salg,53,6019,2,1934,1994,2,7,14,1,1994Q1,2300 København S,2300,København S
2,https://www.boliga.dk/salg/info/101/705960/1BA...,https://www.boliga.dk/bbrinfo/1BAB9A84-670B-4C...,"Weidekampsgade 59, 4. th 2300 København S",2645000,2006-03-21,Alm. Salg,93,28441,2,2006,2006,3,12,21,1,2006Q1,2300 København S,2300,København S
3,https://www.boliga.dk/salg/info/101/716210/B3B...,https://www.boliga.dk/bbrinfo/B3B0A4DA-8B77-4E...,"Klaksvigsgade 9, 3. tv 2300 København S",3300000,2006-03-21,Alm. Salg,110,30000,3,2006,2006,3,12,21,1,2006Q1,2300 København S,2300,København S
4,https://www.boliga.dk/salg/info/101/707610/408...,https://www.boliga.dk/bbrinfo/408813A5-3EAC-46...,"Weidekampsgade 61, 5. tv 2300 København S",2500000,2006-03-21,Alm. Salg,93,26882,2,2006,2006,3,12,21,1,2006Q1,2300 København S,2300,København S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68985,https://www.boliga.dk/salg/info/461/188923/CDC...,https://www.boliga.dk/bbrinfo/CDC44BEA-0AC5-45...,"Jagtvej 23, 2. tv 5000 Odense C",6900000,2006-03-17,Alm. Salg,65,106154,2,1911,2006,3,11,17,1,2006Q1,5000 Odense C,5000,Odense C
68986,https://www.boliga.dk/salg/info/461/188923/922...,https://www.boliga.dk/bbrinfo/922032EA-018A-48...,"Jagtvej 23, 1. tv 5000 Odense C",6900000,2006-03-17,Alm. Salg,65,106154,2,1911,2006,3,11,17,1,2006Q1,5000 Odense C,5000,Odense C
68987,https://www.boliga.dk/salg/info/461/608442/ACE...,https://www.boliga.dk/bbrinfo/ACED4E9E-F0FC-44...,Væbnerhatten 54 5220 Odense SØ,2350000,2006-03-18,Alm. Salg,161,14596,6,1985,2006,3,11,18,1,2006Q1,5220 Odense SØ,5220,Odense SØ
68988,https://www.boliga.dk/salg/info/461/329977/4AB...,https://www.boliga.dk/bbrinfo/4ABE4DDB-C97A-46...,Rulkehøjen 36 5260 Odense S,2170000,2006-03-19,Alm. Salg,164,13232,6,1970,2006,3,11,19,1,2006Q1,5260 Odense S,5260,Odense S


In [18]:
regex = r'^\w+'
df_sub['kommune']=df_sub['city'].apply(lambda x: re.findall(regex, x)[0] if re.findall(regex, x) else '')+' Kommune'
df_sub['kommune']

0        København Kommune
1        København Kommune
2        København Kommune
3        København Kommune
4        København Kommune
               ...        
68985       Odense Kommune
68986       Odense Kommune
68987       Odense Kommune
68988       Odense Kommune
68989       Odense Kommune
Name: kommune, Length: 68990, dtype: object

In [19]:
df_sub['kommune']=df_sub['kommune'].replace('København Kommune', 'Københavns Kommune')
df_sub

Unnamed: 0,link,bbr_link,address,price,date_of_sale,type_of_sale,house_size_m2,house_price_per_m2,no_of_rooms,year_built,year,month,week,day,quarter,time_q,post_no_city,post_no,city,kommune
0,https://www.boliga.dk/salg/info/101/533972/416...,https://www.boliga.dk/bbrinfo/416D6FE5-45A5-4B...,"Strandboulevarden 98, 3. th 2100 København Ø",790000,1992-11-03,Alm. Salg,128,6172,5,1903,1992,11,45,3,4,1992Q4,2100 København Ø,2100,København Ø,Københavns Kommune
1,https://www.boliga.dk/salg/info/101/187276/933...,https://www.boliga.dk/bbrinfo/933F5DC3-E067-45...,"Grækenlandsvej 37, 1. th 2300 København S",319000,1994-02-14,Alm. Salg,53,6019,2,1934,1994,2,7,14,1,1994Q1,2300 København S,2300,København S,Københavns Kommune
2,https://www.boliga.dk/salg/info/101/705960/1BA...,https://www.boliga.dk/bbrinfo/1BAB9A84-670B-4C...,"Weidekampsgade 59, 4. th 2300 København S",2645000,2006-03-21,Alm. Salg,93,28441,2,2006,2006,3,12,21,1,2006Q1,2300 København S,2300,København S,Københavns Kommune
3,https://www.boliga.dk/salg/info/101/716210/B3B...,https://www.boliga.dk/bbrinfo/B3B0A4DA-8B77-4E...,"Klaksvigsgade 9, 3. tv 2300 København S",3300000,2006-03-21,Alm. Salg,110,30000,3,2006,2006,3,12,21,1,2006Q1,2300 København S,2300,København S,Københavns Kommune
4,https://www.boliga.dk/salg/info/101/707610/408...,https://www.boliga.dk/bbrinfo/408813A5-3EAC-46...,"Weidekampsgade 61, 5. tv 2300 København S",2500000,2006-03-21,Alm. Salg,93,26882,2,2006,2006,3,12,21,1,2006Q1,2300 København S,2300,København S,Københavns Kommune
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68985,https://www.boliga.dk/salg/info/461/188923/CDC...,https://www.boliga.dk/bbrinfo/CDC44BEA-0AC5-45...,"Jagtvej 23, 2. tv 5000 Odense C",6900000,2006-03-17,Alm. Salg,65,106154,2,1911,2006,3,11,17,1,2006Q1,5000 Odense C,5000,Odense C,Odense Kommune
68986,https://www.boliga.dk/salg/info/461/188923/922...,https://www.boliga.dk/bbrinfo/922032EA-018A-48...,"Jagtvej 23, 1. tv 5000 Odense C",6900000,2006-03-17,Alm. Salg,65,106154,2,1911,2006,3,11,17,1,2006Q1,5000 Odense C,5000,Odense C,Odense Kommune
68987,https://www.boliga.dk/salg/info/461/608442/ACE...,https://www.boliga.dk/bbrinfo/ACED4E9E-F0FC-44...,Væbnerhatten 54 5220 Odense SØ,2350000,2006-03-18,Alm. Salg,161,14596,6,1985,2006,3,11,18,1,2006Q1,5220 Odense SØ,5220,Odense SØ,Odense Kommune
68988,https://www.boliga.dk/salg/info/461/329977/4AB...,https://www.boliga.dk/bbrinfo/4ABE4DDB-C97A-46...,Rulkehøjen 36 5260 Odense S,2170000,2006-03-19,Alm. Salg,164,13232,6,1970,2006,3,11,19,1,2006Q1,5260 Odense S,5260,Odense S,Odense Kommune


In [20]:
land_tax = pd.read_excel('data/kmn_grundskyldspromille.xlsx')
land_tax['year']=pd.to_datetime(land_tax['year'], format='%Y').dt.year
land_tax

Unnamed: 0,kommune,year,land_tax
0,Københavns Kommune,1993,34.00
1,Odense Kommune,1993,12.90
2,Aarhus Kommune,1993,18.90
3,Københavns Kommune,1994,34.00
4,Odense Kommune,1994,12.90
...,...,...,...
85,Odense Kommune,2021,21.71
86,Aarhus Kommune,2021,24.58
87,Københavns Kommune,2022,34.00
88,Odense Kommune,2022,21.71


In [21]:
df_sub_tax=pd.merge(df_sub, land_tax, left_on=['kommune','year'], right_on=['kommune','year'])
df_sub_tax['id']=np.arange(0, len(df_sub_tax['bbr_link']))

In [57]:
url = df_sub_tax['bbr_link'][0]
url

'https://www.boliga.dk/bbrinfo/933F5DC3-E067-456C-9B4D-151491BD2C43'

In [58]:
soup = get_soup(url=url, header=header)
soup

<!DOCTYPE html>
<html lang="da"><head><link crossorigin="" href="https://fonts.gstatic.com" rel="preconnect"/>
<link href="https://consent.cookiebot.com" rel="preconnect"/>
<link href="https://api.boliga.dk" rel="preconnect"/>
<link href="https://i.boliga.org" rel="preconnect"/>
<title>BBR: Grækenlandsvej 37, 1. th, 2300 København S</title>
<base href="/"/>
<meta charset="utf-8"/>
<meta content="Se alle BBR- og boligoplysninger om Grækenlandsvej 37, 1. th, 2300 København S. Ejerlejlighed på 53 m²" name="description"/>
<meta content="" name="keywords"/>
<meta content="#da532c" name="msapplication-TileColor"/>
<meta content="#ffffff" name="theme-color"/>
<meta content="strict-origin-when-cross-origin" name="referrer"/>
<meta content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=0" name="viewport"/>
<link href="favicon.ico" rel="icon" type="image/x-icon"/>
<link href="/assets/apple-touch-icon.png" rel="apple-touch-icon" sizes="180x180"/>
<link href="/assets/favicon-

In [62]:
dwelling_type=soup.find(class_="house-name").text
dwelling_type

' Ejerlejlighed '

In [70]:
dwelling_type2=soup.find_all(class_="col-md-6 column")[0].div.span.text
dwelling_type2

' Egentlig beboelseslejlighed '

In [72]:
kitchen_type=soup.find_all(class_="col-md-6 column")[1].div.span.text
kitchen_type

' Eget køkken med afløb '

In [74]:
dwelling_type3=soup.find_all(class_="col-md-6 column")[2].div.span.text
dwelling_type3

' Bolig i etageejendom, flerfamiliehus eller to-familiehus '

In [81]:
bathrooms=soup.find_all(class_="col-md-6 column")[5].div.span.text
bathrooms

' 1 '

In [83]:
toilet_type=soup.find_all(class_="col-md-6 column")[7].div.span
toilet_type

<span _ngcontent-sc375=""> Vandskyllende toilet i enheden </span>

In [89]:
toilets=soup.find_all(class_="col-md-6 column")[9].div.span.text
toilets

' 1 '

In [26]:
list_of_bbr = df_sub_tax['bbr_link'].to_list()
list_of_bbr

['https://www.boliga.dk/bbrinfo/933F5DC3-E067-456C-9B4D-151491BD2C43',
 'https://www.boliga.dk/bbrinfo/86B95A74-C7FE-4D0E-8F9F-C8FA46FF142B',
 'https://www.boliga.dk/bbrinfo/85CF431F-69B2-42E7-ADC9-F8004FA9D263',
 'https://www.boliga.dk/bbrinfo/8BAEBEB5-5012-4A90-9785-5E953109833D',
 'https://www.boliga.dk/bbrinfo/2BAED633-9E9D-4443-80BD-206CB9A9FDAE',
 'https://www.boliga.dk/bbrinfo/ABC93EA2-8A32-404F-81A3-B21B9E47E150',
 'https://www.boliga.dk/bbrinfo/DB36CE0E-C748-44FF-A972-63E881BD641D',
 'https://www.boliga.dk/bbrinfo/FAEA6AF6-3672-464D-A711-8313C9A7C8DA',
 'https://www.boliga.dk/bbrinfo/66926C35-D6F8-43A2-A006-F87EEE4ADD3F',
 'https://www.boliga.dk/bbrinfo/B1C07D35-C5C7-4701-B5ED-2BA13CE2490F',
 'https://www.boliga.dk/bbrinfo/98F227ED-C72C-4EC4-BC7F-0FA13AEE9FF5',
 'https://www.boliga.dk/bbrinfo/62E11F5A-F28E-4C62-A863-5582D735A35F',
 'https://www.boliga.dk/bbrinfo/9408DF18-43EA-4E3F-A205-79F9BF01FA4A',
 'https://www.boliga.dk/bbrinfo/13102C8C-EE52-4C7D-8A3E-A3669AB926B2',
 'http

In [27]:
def create_url_boliga_bbr(bbr_id: int) -> str:
    """
    Creates a boliga URL with the given pagenumber.
    Input:
    - - - - - - - -
    page (int) :    Pagenumber for the boliga website.

    Returns:
    - - - - - - - -
    url (str)  :    URL of the boliga website for given page. 
    """

    url = f'{bbr_id}' # Construct url with f-string

    return url

In [28]:
def extract_info_boliga_bbr(soup:BeautifulSoup) -> pd.DataFrame:
    href = soup.find(class_="text-primary font-weight-bold")['href']
    bbr_link = f'https://www.boliga.dk/{href}'
    dwelling_type=soup.find(class_="house-name").text
    dwelling_type2=soup.find_all(class_="col-md-6 column")[0].div.span.text
    dwelling_type3=soup.find_all(class_="col-md-6 column")[2].div.span.text
    kitchen_type=soup.find_all(class_="col-md-6 column")[1].div.span.text
    bathrooms=soup.find_all(class_="col-md-6 column")[5].div.span.text
    toilet_type=soup.find_all(class_="col-md-6 column")[7].div.span.text
    toilets=soup.find_all(class_="col-md-6 column")[9].div.span.text

    return [bbr_link,dwelling_type, dwelling_type2, dwelling_type3, kitchen_type, bathrooms, toilet_type, toilets]


In [29]:
column_names = ['bbr_link','dwelling_type', 'dwelling_type2', 'dwelling_type3', 'kitchen_type', 'bathrooms', 'toilet_type', 'toilets']

In [30]:
list_of_bbr[:10]

['https://www.boliga.dk/bbrinfo/933F5DC3-E067-456C-9B4D-151491BD2C43',
 'https://www.boliga.dk/bbrinfo/86B95A74-C7FE-4D0E-8F9F-C8FA46FF142B',
 'https://www.boliga.dk/bbrinfo/85CF431F-69B2-42E7-ADC9-F8004FA9D263',
 'https://www.boliga.dk/bbrinfo/8BAEBEB5-5012-4A90-9785-5E953109833D',
 'https://www.boliga.dk/bbrinfo/2BAED633-9E9D-4443-80BD-206CB9A9FDAE',
 'https://www.boliga.dk/bbrinfo/ABC93EA2-8A32-404F-81A3-B21B9E47E150',
 'https://www.boliga.dk/bbrinfo/DB36CE0E-C748-44FF-A972-63E881BD641D',
 'https://www.boliga.dk/bbrinfo/FAEA6AF6-3672-464D-A711-8313C9A7C8DA',
 'https://www.boliga.dk/bbrinfo/66926C35-D6F8-43A2-A006-F87EEE4ADD3F',
 'https://www.boliga.dk/bbrinfo/B1C07D35-C5C7-4701-B5ED-2BA13CE2490F']

In [31]:
list_of_bbr[0]

'https://www.boliga.dk/bbrinfo/933F5DC3-E067-456C-9B4D-151491BD2C43'

In [32]:
how_many = len(list_of_bbr)
how_many/2

33696.0

In [35]:
output=[]
dfs = []
errors=[]
id_=34070

for url in tqdm.tqdm((list_of_bbr[34070:])):
    try:
        soup = get_soup(url, header)
        output.append(extract_info_boliga_bbr(soup))
        df=pd.DataFrame(output, columns=column_names)
        df.to_parquet(f'data/bbr/boliga_{id_}.pq')
        id_+=1
        time.sleep(0.2)
    except: #skip page if errors
        print(f'Error encountered on url {id_}')
        errors.append(url)
        df_error = pd.DataFrame(errors).to_csv('errors2.csv')
        id_+=1
        time.sleep(0.2)
        continue

  9%|▉         | 3066/33322 [1:53:58<18:16:31,  2.17s/it]

Error encountered on url 37136


  9%|▉         | 3068/33322 [1:54:11<32:29:57,  3.87s/it]

In [130]:
pd.concat(dfs).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55 entries, 0 to 9
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   bbr_link        55 non-null     object
 1   dwelling_type   55 non-null     object
 2   dwelling_type2  55 non-null     object
 3   dwelling_type3  55 non-null     object
 4   kitchen_type    55 non-null     object
 5   bathrooms       55 non-null     object
 6   toilet_type     55 non-null     object
 7   toilets         55 non-null     object
dtypes: object(8)
memory usage: 3.9+ KB
