In [1]:
import tools as tools

from pathlib import Path

# Standard imports 
import numpy as np
import pandas as pd


# OS and time packages 
import time
import tqdm
import concurrent.futures

# HTML and text processing 
import json

import time

# Plotting 
import matplotlib.pyplot as plt 
import seaborn as sns

plt.style.use('seaborn-whitegrid')
%matplotlib inline

plt.rc('font', size=14)             # controls default text sizes
plt.rc('axes', titlesize=18)        # fontsize of the axes title
plt.rc('axes', labelsize=18)        # fontsize of the x and y labels
plt.rc('xtick', labelsize=14)       # fontsize of the tick labels
plt.rc('ytick', labelsize=14)       # fontsize of the tick labels
plt.rc('legend', fontsize=14)       # legend fontsize
plt.rc('figure', titlesize=20)      # fontsize of the figure title

plt.rcParams['figure.figsize'] = 10, 4 # set default size of plots

# Filter warnings 
pd.options.mode.chained_assignment = None
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

  plt.style.use('seaborn-whitegrid')


Header (state non-commercial/academic intentions)

In [2]:
# Identification passed along with every request: who is scraping and why.
header = dict(
    name='Jørgen Baun Høst',
    email='pjz633@econ.ku.dk',
    intention='Scrape Boliga for academic purposes',
)

Meta data

In [None]:
# First page of the sold-listings search; its 'meta' entry is displayed below
# and later supplies the total page count.
url = (
    'https://api.boliga.dk/api/v2/sold/search/results'
    '?pageSize=2000&page=1'
    '&salesDateMin=1996&salesDateMax=2012'
    '&propertytype=1&saleType=1&sort=date-d&buildYearMax=2005'
)
# Sample BBR lookup URL kept for manual testing.
bbr_test_url = (
    'https://api.boliga.dk/api/v2/bbrinfo/bbr'
    '?id=69cd6d3d-e858-43aa-b530-bd20f132e3b8'
)
output = tools.get_json(url=url, header=header)
output['meta']

In [None]:
# Preview only the first rows: the request uses pageSize=2000, so echoing the
# raw payload would flood the notebook output.
pd.DataFrame(output['results']).head()

Make list of urls

In [None]:
# Number of result pages reported by the first request.
total_pages = output['meta']['totalPages']

# One search URL per page; pages are numbered from 1.
list_of_url = []
for page in range(1, total_pages + 1):
    url = (
        'https://api.boliga.dk/api/v2/sold/search/results'
        f'?pageSize=2000&page={page}'
        '&salesDateMin=1996&salesDateMax=2012&propertytype=1&saleType=1&sort=date-d&buildYearMax=2005'
    )
    list_of_url += [url]

In [None]:
# URLs whose download failed; process_url appends to this list.
errors = []


def process_url(id_url_pair, delay=0.5):
    """Download one page of search results.

    Parameters
    ----------
    id_url_pair : tuple
        ``(id_, url)`` pair; ``id_`` is passed through unchanged so the caller
        can match the result to its request.
    delay : float, optional
        Seconds to sleep before issuing the request (default 0.5).

    Returns
    -------
    tuple
        ``(id_, results)`` where ``results`` is the ``'results'`` entry of the
        JSON returned by ``tools.get_json``, or ``(id_, None)`` if anything
        went wrong. Failed URLs are recorded in the module-level ``errors``.
    """
    id_, url = id_url_pair
    try:
        time.sleep(delay)
        out = tools.get_json(url, header)
        return id_, out['results']
    except Exception as exc:
        # Report the cause instead of swallowing it, and keep the URL so the
        # request can be inspected or retried later.
        print(f'Error encountered on url {url}: {exc!r}')
        errors.append(url)
        return id_, None

# Pair each URL with its position; the position becomes part of the file name.
id_url_pairs = list(enumerate(list_of_url))

with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
    results = list(tqdm.tqdm(executor.map(process_url, id_url_pairs), total=len(id_url_pairs)))

# open() does not create missing folders, so make sure the target exists
# before writing; otherwise the whole scrape is lost to a FileNotFoundError.
out_dir = Path('data/boliga')
out_dir.mkdir(parents=True, exist_ok=True)

for id_, data in results:
    if data is not None:
        with open(out_dir / f'boliga_{id_}.json', "w") as fp:
            json.dump(data, fp)

# Persist the failed URLs (if any) so they can be retried later.
if errors:
    pd.DataFrame(errors, columns=['url']).to_csv(out_dir / 'failed_urls.csv', index=False)

100%|██████████| 396/396 [04:28<00:00,  1.48it/s]


## Merge and clean data from Boliga

In [None]:
data_dir = Path('data/boliga')


def _load_json_frame(json_file):
    """Read one scraped JSON file into a DataFrame, closing the file afterwards."""
    with open(json_file) as fp:
        return pd.DataFrame(json.load(fp))


# glob() yields paths in filesystem-dependent order; sorting makes the row
# order of the merged frame reproducible between runs.
full_df1 = pd.concat(
    _load_json_frame(json_file)
    for json_file in sorted(data_dir.glob('*.json'))
)

# Derive calendar features from the sale date.
df=full_df1.reset_index(drop=True)
df['soldDate']=pd.to_datetime(df['soldDate'])
df['year']=df.soldDate.dt.year
df['month']=df.soldDate.dt.month
# NOTE(review): dt.weekday is the day of the week (Monday=0), not a week
# number -- confirm that 'week' is the intended content of this column.
df['week']=df.soldDate.dt.weekday
df['time_q']=pd.PeriodIndex(df['soldDate'], freq='Q')

df=df.drop(columns=['change'])

## Build BBR URLs to scrape

In [None]:
# Distinct property identifiers, so each property is looked up only once.
list_of_guids = list(df['guid'].unique())

# One BBR lookup URL per distinct guid.
list_of_bbr_urls = [
    f'https://api.boliga.dk/api/v2/bbrinfo/bbr?id={guid}'
    for guid in list_of_guids
]

# Store the URL list on disk so the scrape can start from a clean workspace.
pd.DataFrame(list_of_bbr_urls, columns=['bbr_url']).to_parquet('data/bbr_ids_scraper.pq')

Clear variables in the workspace and read in the URLs to scrape. Some houses are sold multiple times, so it is unnecessary to scrape them more than once.

In [3]:
# Drop only the large intermediates from the earlier cells. A bare `%reset`
# asks for interactive confirmation and, once confirmed, also removes the
# imports and `header`, so the lines below (and the scraping cell that follows)
# would fail with NameError.
for _name in ('output', 'results', 'result', 'data', 'full_df1', 'df',
              'list_of_url', 'list_of_guids', 'id_url_pairs'):
    globals().pop(_name, None)

# Reload the URL list that was written to disk earlier.
temp_df = pd.read_parquet('data/bbr_ids_scraper.pq')
list_of_bbr_urls = list(temp_df['bbr_url'])
len(list_of_bbr_urls)

519604

In [4]:
# URLs whose download or save failed; process_url appends to this list.
errors = []


def process_url(id_url_pair, delay=0.5):
    """Download one BBR record and save it as ``data/bbr/bbr{id_}.json``.

    Parameters
    ----------
    id_url_pair : tuple
        ``(id_, url)`` pair; ``id_`` is used in the output file name and is
        passed through unchanged in the return value.
    delay : float, optional
        Seconds to sleep before issuing the request (default 0.5).

    Returns
    -------
    tuple
        ``(id_, out)`` where ``out`` is the JSON returned by
        ``tools.get_json``, or ``(id_, None)`` if the download or the write
        failed. Failed URLs are recorded in the module-level ``errors``.
    """
    id_, url = id_url_pair
    try:
        time.sleep(delay)
        out = tools.get_json(url, header)
        with open(f'data/bbr/bbr{id_}.json', "w") as fp:
            # Write the payload that was just fetched. The previous code dumped
            # the global `data`, which is either undefined or a stale leftover.
            json.dump(out, fp)
        return id_, out
    except Exception as exc:
        # Report the cause instead of swallowing it, and keep the URL so the
        # request can be inspected or retried later.
        print(f'Error encountered on url {url}: {exc!r}')
        errors.append(url)
        return id_, None

# Pair every BBR URL with its position in the list.
id_url_pairs = list(enumerate(list_of_bbr_urls))

# Fetch with six worker threads; tqdm reports progress as results arrive.
with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
    fetched = executor.map(process_url, id_url_pairs)
    progress = tqdm.tqdm(fetched, total=len(id_url_pairs))
    results = list(progress)

 32%|███▏      | 166793/519604 [5:46:40<27:28:21,  3.57it/s]

Error encountered on url https://api.boliga.dk/api/v2/bbrinfo/bbr?id=BD1739EB-EF62-4E57-8513-4E4029D0C24A
Error encountered on url https://api.boliga.dk/api/v2/bbrinfo/bbr?id=38CB7F4A-7ADD-48CD-BD19-7FA987D2F790
Error encountered on url https://api.boliga.dk/api/v2/bbrinfo/bbr?id=A852F4F3-7E10-4285-BD81-19A3BDF05529


 58%|█████▊    | 302788/519604 [10:34:23<381:12:55,  6.33s/it]

Error encountered on url https://api.boliga.dk/api/v2/bbrinfo/bbr?id=C30F38D6-6C73-4D69-A02E-FB6D7E5F8615Error encountered on url https://api.boliga.dk/api/v2/bbrinfo/bbr?id=5311E0B5-219C-4EFF-82FD-D52A92B063E6



 76%|███████▌  | 392810/519604 [14:09:36<845:37:42, 24.01s/it]

Error encountered on url https://api.boliga.dk/api/v2/bbrinfo/bbr?id=C6404315-15DB-4E58-973A-2449D1A1F2FB
Error encountered on url https://api.boliga.dk/api/v2/bbrinfo/bbr?id=B4F3C9FF-D25A-41FD-B36E-B1482C4BDC0D
Error encountered on url https://api.boliga.dk/api/v2/bbrinfo/bbr?id=748D647D-9CE4-478A-B149-8D293CFDD855


100%|██████████| 519604/519604 [18:56:07<00:00,  7.62it/s]    
