In [1]:
# Toolbox
import tools as tools

# Standard imports 
import pandas as pd


# OS and time packages 
import time
from pathlib import Path
import tqdm
import concurrent.futures

**Table of contents**<a id='toc0_'></a>    
- [Fetch meta data](#toc1_)    
  - [Make list of urls to webscrape](#toc1_1_)    
  - [Merge and clean data from Boliga](#toc1_2_)    
  - [Read BBR urls to scrape](#toc1_3_)    
- [Webscrape BBR data from boliga.dk](#toc2_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

Header identifying the scraper (states the non-commercial/academic intention of the requests)

In [2]:
# Who is scraping and why; handed to tools.get_json together with every URL
header = dict(
    name='Jørgen Baun Høst',
    email='pjz633@econ.ku.dk',
    intention='Scrape Boliga for academic purposes',
)

# <a id='toc1_'></a>[Fetch meta data](#toc0_)

In [3]:
# Sample BBR endpoint URL for a single id (not used by the request below)
bbr_test_url = 'https://api.boliga.dk/api/v2/bbrinfo/bbr?id=69cd6d3d-e858-43aa-b530-bd20f132e3b8'

# First page of the sold-properties search; only its paging metadata is needed here
url = (
    'https://api.boliga.dk/api/v2/sold/search/results'
    '?pageSize=2000&page=1'
    '&salesDateMin=2000&salesDateMax=2008'
    '&propertytype=1&saleType=1'
    '&sort=date-d&buildYearMax=2005'
)
output = tools.get_json(url=url, header=header)

# Show the paging metadata (includes 'totalPages') as the cell output
output['meta']

{'pageIndex': 1,
 'pageSize': 2000,
 'totalCount': 441678,
 'totalPages': 221,
 'minPage': 1,
 'maxPage': 6,
 'countFrom': 1,
 'countTo': 2000}

## <a id='toc1_1_'></a>[Make list of urls to webscrape](#toc0_)

In [5]:
# Number of result pages reported in the metadata of the first request
total_pages = output['meta']['totalPages']

# One search URL per result page; page numbers run from 1 to total_pages inclusive
list_of_url = [
    (
        'https://api.boliga.dk/api/v2/sold/search/results'
        f'?pageSize=2000&page={page_no}'
        '&salesDateMin=2000&salesDateMax=2008'
        '&propertytype=1&saleType=1'
        '&sort=date-d&buildYearMax=2005'
    )
    for page_no in range(1, total_pages + 1)
]

In [None]:
# URLs whose download or processing failed; appended to by process_url
errors = []

def process_url(id_url_pair):
    """Fetch one page of search results and store it as a parquet file.

    Parameters
    ----------
    id_url_pair : tuple
        ``(id_, url)``. ``id_`` becomes part of the output file name and
        ``url`` is passed to ``tools.get_json`` together with the
        module-level ``header``.

    Returns
    -------
    None
        On success, after writing ``data/boliga/boliga_{id_}.pq``.
    tuple
        ``(id_, None)`` if any step failed. The URL is then also appended
        to the module-level ``errors`` list so it can be inspected or retried.
    """
    id_, url = id_url_pair
    try:
        # Pause before every request to limit the load on the server
        time.sleep(0.5)

        # Get the JSON file and turn its 'results' entry into a table
        out = tools.get_json(url, header)
        df = pd.DataFrame(out['results'])

        # Select numeric cols: i) Floats ii) Integers
        fcols = df.select_dtypes('float').columns
        icols = df.select_dtypes('integer').columns

        # Downcast to the smallest suitable dtype to conserve memory
        df[fcols] = df[fcols].apply(pd.to_numeric, downcast='float')
        df[icols] = df[icols].apply(pd.to_numeric, downcast='integer')

        # Save in data folder
        df.to_parquet(f'data/boliga/boliga_{id_}.pq')
        return None
    except Exception as exc:
        # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit are
        # not swallowed; the exception itself is printed to make failures diagnosable
        print(f'Error encountered on url {url}: {exc!r}')
        errors.append(url)
        return id_, None

# Pair every URL with its position in the list; the position ends up in the output file name
id_url_pairs = list(enumerate(list_of_url))

# process_url writes to data/boliga. Create the folder up front: if it is missing,
# every page is still downloaded but then fails on saving and is only logged as an error.
Path('data/boliga').mkdir(parents=True, exist_ok=True)

## NB! Do not overload Boliga's servers - scrape at odd hours and/or adjust max_workers accordingly!
with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
    # list(...) drains the lazy map so all tasks finish and tqdm can report progress
    results = list(tqdm.tqdm(executor.map(process_url, id_url_pairs), total=len(id_url_pairs)))

100%|██████████| 396/396 [04:28<00:00,  1.48it/s]


## <a id='toc1_2_'></a>[Merge and clean data from Boliga](#toc0_)

In [9]:
data_dir = Path('data/boliga')

# Sort the paths: glob order is not guaranteed, and a fixed order keeps the
# row order of the combined frame identical between runs
pq_files = sorted(data_dir.glob('*.pq'))
full_df1 = pd.concat(pd.read_parquet(pq_file) for pq_file in pq_files)

# Work on a re-indexed copy; full_df1 keeps the concatenated frame as loaded
df = full_df1.reset_index(drop=True)
df['soldDate'] = pd.to_datetime(df['soldDate'])

# Calendar components of the sale date
df['year'] = df.soldDate.dt.year
df['month'] = df.soldDate.dt.month
# ISO week number of the sale. `.dt.weekday` would give the day of the week (0-6),
# which is not what a column called 'week' should hold. Note that the first days of
# January can belong to ISO week 52/53 of the previous ISO year.
df['week'] = df.soldDate.dt.isocalendar().week
df['time_q'] = pd.PeriodIndex(df['soldDate'], freq='Q')

df = df.drop(columns=['change'])

## <a id='toc1_3_'></a>[Read BBR urls to scrape](#toc0_)

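Some houses are sold more than once in the period, so the same `guid` can appear in several rows. Only the unique `guid`s are used to build the BBR urls, so each property is requested once.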

In [None]:
# Distinct guids only, so each one gets exactly one BBR URL
list_of_guids = list(df['guid'].unique())

list_of_bbr_urls = [
    f'https://api.boliga.dk/api/v2/bbrinfo/bbr?id={guid}'
    for guid in list_of_guids
]

# Store the URL list on disk so the BBR scrape can be started from this file
bbr_url_table = pd.DataFrame(list_of_bbr_urls, columns=['bbr_url'])
bbr_url_table.to_parquet('data/bbr_ids_scraper.pq')

In [16]:

# Reload the stored URL table and pull the 'bbr_url' column out as a plain list
temp_df = pd.read_parquet('data/bbr_ids_scraper.pq')
list_of_bbr_urls = temp_df['bbr_url'].tolist()

# Number of BBR URLs to scrape (shown as the cell output)
len(list_of_bbr_urls)

519604

# <a id='toc2_'></a>[Webscrape BBR data from boliga.dk](#toc0_)
NB! This takes a long time to run: about 24 hours on an 8-core laptop with 16 GB RAM.

In [24]:
# URLs whose download or processing failed; appended to by process_url
errors = []

# Columns kept from the flattened BBR response
BBR_COLUMNS = [
    'unitId',
    'evaluationInfos',
    'bbrInfoBox.lotSize',
    'bbrInfoBox.area',
    'bbrInfoBox.evaluationPrice',
    'unitInfo.toiletQuantity',
    'unitInfo.bathroomQuantity',
    'unitInfo.propertyUnitType',
    'bfenr',
]

# NOTE: this definition replaces the process_url defined earlier in the notebook
def process_url(id_url_pair):
    """Fetch the BBR record behind one URL and store it as a parquet file.

    Parameters
    ----------
    id_url_pair : tuple
        ``(id_, url)``. ``id_`` becomes part of the output file name and
        ``url`` is passed to ``tools.get_json`` together with the
        module-level ``header``.

    Returns
    -------
    None
        On success, after writing ``data/bbr/bbr_{id_}.pq``.
    tuple
        ``(id_, None)`` if any step failed, including a response that lacks
        one of ``BBR_COLUMNS``. The URL is then also appended to the
        module-level ``errors`` list so it can be inspected or retried.
    """
    id_, url = id_url_pair
    try:
        # Pause before every request to limit the load on the server
        time.sleep(0.5)

        # Get the JSON file
        out = tools.get_json(url, header)

        # Normalize/flatten the JSON and keep the selected columns. The explicit
        # copy makes the column assignments below act on an independent frame.
        df = pd.json_normalize(out)[BBR_COLUMNS].copy()

        # Select numeric cols: i) Floats ii) Integers
        fcols = df.select_dtypes('float').columns
        icols = df.select_dtypes('integer').columns

        # Downcast to the smallest suitable dtype to conserve memory
        df[fcols] = df[fcols].apply(pd.to_numeric, downcast='float')
        df[icols] = df[icols].apply(pd.to_numeric, downcast='integer')

        # Save in data folder
        df.to_parquet(f'data/bbr/bbr_{id_}.pq')
        return None

    except Exception as exc:
        # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit are
        # not swallowed; the exception itself is printed to make failures diagnosable
        print(f'Error encountered on url {url}: {exc!r}')
        errors.append(url)
        return id_, None

# Pair every BBR URL with its position in the list; the position ends up in the output file name
id_url_pairs = list(enumerate(list_of_bbr_urls))

# process_url writes to data/bbr. Create the folder up front: if it is missing,
# every record is still downloaded but then fails on saving and is only logged as an error.
Path('data/bbr').mkdir(parents=True, exist_ok=True)

with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
    # list(...) drains the lazy map so all tasks finish and tqdm can report progress
    results = list(tqdm.tqdm(executor.map(process_url, id_url_pairs), total=len(id_url_pairs)))

100%|██████████| 10/10 [00:01<00:00,  5.76it/s]
